from itertools import product
from string import ascii_lowercase, ascii_uppercase

import pandas as pd
import tqdm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Make large CLD alphabet: the 52 ASCII letters first, followed by 52
# two-character labels built from an "@" prefix plus a letter ("@a", "@b", ...).
single_chars = list(ascii_lowercase) + list(ascii_uppercase)
underscore_chars = [''.join(p) for p in product(['@'], single_chars)]
CLD_ALPHABET = single_chars + underscore_chars


def asserts_non_significance(col: list[bool], i: int, j: int) -> bool:
    """Check whether the column groups treatments i and j together

    A column that has both entries set states that the two treatments are
    *not* significantly different (they would share that column's letter).

    Parameters
    ----------
    col : list[bool]
        current column
    i : int
        index of first treatment
    j : int
        index of second treatment

    Returns
    -------
    bool
        True if both entries i and j of the column are set
    """
    return col[i] and col[j]


def insert(column: list[bool], i: int, j: int):
    """Duplicate a column, clearing entry i in one copy and entry j in the other

    The input column is left untouched.

    Parameters
    ----------
    column : list[bool]
        Original column
    i : int
        Index of first group
    j : int
        Index of second group

    Returns
    -------
    list[bool], list[bool]
        The copy with entry i cleared, then the copy with entry j cleared
    """
    col_i = column.copy()
    col_j = column.copy()
    col_i[i] = False
    col_j[j] = False
    return col_i, col_j


def can_be_absorbed(new_col: list[bool], ref_col: list[bool]) -> bool:
    """Check whether an old column covers every set entry of a new column

    An old column absorbs the new column if it has a 1 in every row in which
    the new column has one.

    Parameters
    ----------
    new_col : list[bool]
        Column to add
    ref_col : list[bool]
        Old column we are checking if it can absorb new_col

    Returns
    -------
    bool
        Whether the old column can absorb new_col
    """
    return all(ref_col[i] for i, x in enumerate(new_col) if x)


def absorb(new_column: list[bool], columns: list[list[bool]]) -> list[list[bool]]:
    """Add a new column unless one of the existing columns absorbs it

    Parameters
    ----------
    new_column : list[bool]
        Column to add
    columns : list[list[bool]]
        existing columns

    Returns
    -------
    list[list[bool]]
        ``columns`` itself when the new column is absorbed, otherwise a new
        list consisting of ``columns`` followed by ``new_column``
    """
    if any(can_be_absorbed(new_column, c) for c in columns):
        return columns
    return columns + [new_column]


def cld(comparisons: pd.DataFrame) -> dict[str, list[str]]:
    """
    Compact Letter Display

    Compute the compact letter display using the insert-absorb algorithm.

    See the following papers for more information:
    (1) https://doi.org/10.1016/j.csda.2006.09.035
    (2) https://doi.org/10.1198/1061860043515

    Parameters
    ----------
    comparisons : pd.DataFrame
        A DataFrame containing the pairwise comparisons produced by:
        https://www.statsmodels.org/dev/generated/statsmodels.stats.multicomp.pairwise_tukeyhsd.html
        The columns "group1", "group2" and "reject" are read.

    Returns
    -------
    dict[str, list[str]]
        For every group, the sorted list of labels (entries of
        ``CLD_ALPHABET``) assigned to it. Groups appear in the order in which
        they are first seen in "group1" and then "group2".

    Raises
    ------
    IndexError
        If more columns are needed than ``CLD_ALPHABET`` has labels.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result does not depend on set iteration order.
    unique_groups = list(
        dict.fromkeys(
            [*comparisons["group1"].unique(), *comparisons["group2"].unique()]
        )
    )
    group_index = {group: pos for pos, group in enumerate(unique_groups)}

    sig_diff = comparisons[comparisons["reject"]]
    print(f"Found {len(sig_diff)} significantly different pairs")

    # Initialize CLD matrix for all unique groups/models, with "columns" as rows
    solution = [[True] * len(unique_groups)]

    pairs = zip(sig_diff["group1"], sig_diff["group2"])
    for group1, group2 in tqdm.tqdm(pairs, total=len(sig_diff)):
        i = group_index[group1]
        j = group_index[group2]

        # Keep splitting until no column groups i and j together any more.
        while True:
            offending = next(
                (
                    pos
                    for pos, col in enumerate(solution)
                    if asserts_non_significance(col, i, j)
                ),
                None,
            )
            if offending is None:
                break

            # Remove the old column and duplicate it
            col_i, col_j = insert(solution.pop(offending), i, j)

            # Try absorb the column in an old column
            # Simply add it to the solution otherwise
            solution = absorb(col_i, solution)
            solution = absorb(col_j, solution)

    # Assign letters. Labels are kept as whole tokens because labels past the
    # 52nd column are two characters long ("@a") and must not be split up.
    letters: list[list[str]] = [[] for _ in unique_groups]
    for ci, col in enumerate(solution):
        letter = CLD_ALPHABET[ci]
        for idx, has_letter in enumerate(col):
            if has_letter:
                letters[idx].append(letter)

    return {group: sorted(tokens) for group, tokens in zip(unique_groups, letters)}


def add_cld_to_leaderboard(
    leaderboard: pd.DataFrame,
    scores: pd.DataFrame,
    metric: str,
) -> pd.DataFrame:
    """Add the compact letter display to the leaderboard.

    Parameters
    ----------
    leaderboard : pd.DataFrame
        The full leaderboard DataFrame. Its "user" column gives the order of
        the methods; a "CLD" column is written to it in place.
    scores : pd.DataFrame
        The **raw** scores DataFrame, with all replicates from bootstrapping.
        The columns "Sample", "user" and ``metric`` are read; the caller's
        frame is not modified.
    metric : str
        The metric label to calculate CLD for.

    Returns
    -------
    pd.DataFrame
        The same ``leaderboard`` object, now with the "CLD" column. Methods
        without a CLD entry get the string "None".
    """
    ordered_methods = leaderboard["user"].values

    # Work on an explicit copy so the float conversion below neither touches
    # the caller's frame nor triggers pandas' SettingWithCopyWarning.
    scores = scores[["Sample", "user", metric]].copy()
    scores[metric] = scores[metric].astype(float)

    # We compared methods using bootstrapping and the Tukey HSD test, presenting results via Compact Letter Display (CLD).
    # While acknowledging that bootstrapping likely underestimates variance,
    # we are not aware of better sampling techniques that fit the challenge format.
    tukey = pairwise_tukeyhsd(endog=scores[metric], groups=scores["user"])

    # comparisons = stats.summary_frame()
    # The version of statsmodel is for some reason not the latest, so we have to do small workaround to get summary_frame
    summary_table = tukey.summary()
    # data attribute is a list of lists with column names as first element
    header, *rows = summary_table.data
    comparisons = pd.DataFrame(data=rows, columns=header)

    letter_code = cld(comparisons)

    # Relabel so that labels are handed out in leaderboard order: the first
    # label met while walking down the leaderboard becomes CLD_ALPHABET[0].
    letter_mapping: dict[str, str] = {}
    cld_column = []
    for method in ordered_methods:
        letters = letter_code.get(str(method))
        if letters is None:
            # Error with CLD for openadmet-dummy
            cld_column.append("None")
            continue

        relabelled = []
        for letter in letters:
            if letter not in letter_mapping:
                letter_mapping[letter] = CLD_ALPHABET[len(letter_mapping)]
            relabelled.append(letter_mapping[letter])
        cld_column.append("".join(relabelled))

    leaderboard["CLD"] = cld_column
    return leaderboard