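"""TabArena leaderboard Space: a Gradio app that renders the public leaderboard for tabular methods."""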
from __future__ import annotations

from pathlib import Path

import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from constants import Constants, model_type_emoji
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

TITLE = """<h1 align="center" id="space-title">TabArena: Public leaderboard for Tabular methods</h1>"""

INTRODUCTION_TEXT = (
    "The TabArena leaderboard measures the performance of tabular models on a manually curated "
    "collection of tabular datasets. The datasets are selected to ensure that they are truly "
    "tabular, carry a permissive license, and are free of ethical issues; we refer to the paper "
    "for a full description of our curation approach."
)

ABOUT_TEXT = """
## How It Works

To reproduce the leaderboard evaluation, follow the installation instructions at
`https://github.com/autogluon/tabrepo/tree/tabarena` and run
`https://github.com/autogluon/tabrepo/blob/tabarena/examples/tabarena/run_tabarena_eval.py`.
This will generate the leaderboard locally. You can add your own method and contact the authors
if you want it added to the leaderboard. We require methods to have publicly available code to
be considered for the leaderboard.
"""

CITATION_BUTTON_LABEL = (
    "If you use this leaderboard in your research, please cite the following:"
)
CITATION_BUTTON_TEXT = r"""
@article{
TBA,
}
"""


def get_model_family(model_name: str) -> str:
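    """Return the model-family constant for ``model_name`` via case-insensitive substring
    matching against known model-name prefixes; falls back to ``Constants.other``."""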
    prefixes_mapping = {
        Constants.automl: ["AutoGluon"],
        Constants.neural_network: ["REALMLP", "TabM", "FASTAI", "MNCA", "NN_TORCH"],
        Constants.tree: ["GBM", "CAT", "EBM", "XGB", "XT", "RF"],
        Constants.foundational: ["TABDPT", "TABICL", "TABPFN"],
        Constants.baseline: ["KNN", "LR"],
    }
    for method_type, prefixes in prefixes_mapping.items():
        for prefix in prefixes:
            if prefix.lower() in model_name.lower():
                return method_type
    return Constants.other


def rename_map(model_name: str) -> str:
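    """Rewrite internal model identifiers (e.g. ``GBM``) to display names (e.g. ``LightGBM``);
    names without a known identifier are returned unchanged."""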
    mapping = {
        "TABM": "TabM",
        "REALMLP": "RealMLP",
        "GBM": "LightGBM",
        "CAT": "CatBoost",
        "XGB": "XGBoost",
        "XT": "ExtraTrees",
        "RF": "RandomForest",
        "MNCA": "ModernNCA",
        "NN_TORCH": "TorchMLP",
        "FASTAI": "FastaiMLP",
        "TABPFNV2": "TabPFNv2",
        "EBM": "EBM",
        "TABDPT": "TabDPT",
        "TABICL": "TabICL",
        "KNN": "KNN",
        "LR": "Linear",
    }
    for prefix in mapping:
        if prefix in model_name:
            return model_name.replace(prefix, mapping[prefix])
    return model_name


def load_data(filename: str) -> pd.DataFrame:
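    """Load ``data/<filename>.csv.zip``, derive display columns (type, Elo CI), round values,
    and rename columns to their display names."""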
    df_leaderboard = pd.read_csv(Path(__file__).parent / "data" / f"{filename}.csv.zip")
    print(
        f"Loaded dataframe with {len(df_leaderboard)} rows and columns {df_leaderboard.columns}"
    )

    # Add model family information.
    df_leaderboard["Type"] = df_leaderboard.loc[:, "method"].apply(
        lambda s: model_type_emoji[get_model_family(s)]
    )
    df_leaderboard["TypeName"] = df_leaderboard.loc[:, "method"].apply(
        lambda s: get_model_family(s)
    )
    df_leaderboard["method"] = df_leaderboard["method"].apply(rename_map)

    # Build a display string for the Elo 95% confidence interval from the elo+/elo- columns.
    df_leaderboard["Elo 95% CI"] = (
        "+"
        + df_leaderboard["elo+"].round(0).astype(int).astype(str)
        + "/-"
        + df_leaderboard["elo-"].round(0).astype(int).astype(str)
    )

    # Select only the columns we want to display.
    df_leaderboard = df_leaderboard.loc[
        :,
        [
            "Type",
            "TypeName",
            "method",
            "elo",
            "Elo 95% CI",
            "rank",
            "normalized-error",
            "median_time_train_s_per_1K",
            "median_time_infer_s_per_1K",
        ],
    ]

    # Round for better display.
    df_leaderboard["elo"] = df_leaderboard["elo"].round(0)
    df_leaderboard[["median_time_train_s_per_1K", "rank"]] = df_leaderboard[
        ["median_time_train_s_per_1K", "rank"]
    ].round(2)
    df_leaderboard[["normalized-error", "median_time_infer_s_per_1K"]] = df_leaderboard[
        ["normalized-error", "median_time_infer_s_per_1K"]
    ].round(3)

    df_leaderboard = df_leaderboard.sort_values(by="elo", ascending=False)
    df_leaderboard = df_leaderboard.reset_index(drop=True)
    df_leaderboard = df_leaderboard.reset_index(names="#")

    # Rename columns to display names; [⬆️]/[⬇️] indicate whether higher or lower is better.
    return df_leaderboard.rename(
        columns={
            "median_time_train_s_per_1K": "Median Train Time (s/1K) [⬇️]",
            "median_time_infer_s_per_1K": "Median Predict Time (s/1K) [⬇️]",
            "method": "Model",
            "elo": "Elo [⬆️]",
            "rank": "Rank [⬇️]",
            "normalized-error": "Normalized Error [⬇️]",
        }
    )


# TODO show ELO +/- sem
# TODO: rename and re-order columns
def make_leaderboard(df_leaderboard: pd.DataFrame) -> Leaderboard:
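    """Wrap the prepared dataframe in a ``gradio_leaderboard.Leaderboard`` with type filters,
    custom views, and imputation annotations."""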
| df_leaderboard["TypeFiler"] = df_leaderboard["TypeName"].apply( | |
| lambda m: f"{m} {model_type_emoji[m]}" | |
| ) | |
| # De-selects but does not filter... | |
| # default = df_leaderboard["TypeFiler"].unique().tolist() | |
| # default = [(s, s) for s in default if "AutoML" not in s] | |
| df_leaderboard["Only Default"] = df_leaderboard["Model"].str.endswith("(default)") | |
| df_leaderboard["Only Tuned"] = df_leaderboard["Model"].str.endswith("(tuned)") | |
| df_leaderboard["Only Tuned + Ensemble"] = df_leaderboard["Model"].str.endswith( | |
| "(tuned + ensemble)" | |
| ) | df_leaderboard["Model"].str.endswith("(4h)") | |
| # Add Imputed count postfix | |
| mask = df_leaderboard["Model"].str.startswith("TabPFNv2") | |
| df_leaderboard.loc[mask, "Model"] = ( | |
| df_leaderboard.loc[mask, "Model"] + " [35.29% IMPUTED]" | |
| ) | |
| mask = df_leaderboard["Model"].str.startswith("TabICL") | |
| df_leaderboard.loc[mask, "Model"] = ( | |
| df_leaderboard.loc[mask, "Model"] + " [29.41% IMPUTED]" | |
| ) | |
| df_leaderboard["Imputed"] = df_leaderboard["Model"].str.startswith( | |
| "TabPFNv2" | |
| ) | df_leaderboard["Model"].str.startswith("TabICL") | |
| df_leaderboard["Imputed"] = df_leaderboard["Imputed"].replace( | |
| { | |
| True: "Imputed", | |
| False: "Not Imputed", | |
| } | |
| ) | |
    return Leaderboard(
        value=df_leaderboard,
        select_columns=SelectColumns(
            default_selection=list(df_leaderboard.columns),
            cant_deselect=["Type", "Model"],
            label="Select Columns to Display:",
        ),
        hide_columns=[
            "TypeName",
            "TypeFilter",
            "RefModel",
            "Only Default",
            "Only Tuned",
            "Only Tuned + Ensemble",
            "Imputed",
        ],
        search_columns=["Model", "Type"],
        filter_columns=[
            ColumnFilter("TypeFilter", type="checkboxgroup", label="Model Types."),
            ColumnFilter("Only Default", type="boolean", default=False),
            ColumnFilter("Only Tuned", type="boolean", default=False),
            ColumnFilter("Only Tuned + Ensemble", type="boolean", default=False),
            ColumnFilter(
                "Imputed",
                type="checkboxgroup",
                label="(Not) Imputed Models.",
                info="We impute the performance of models that cannot run on all"
                " datasets due to task or dataset size constraints (e.g. TabPFN,"
                " TabICL). We impute with the performance of a default"
                " RandomForest and add the postfix [X% IMPUTED] to a model's name"
                " if any of its results were imputed, where X% is the percentage"
                " of datasets that were imputed. In general, imputation"
                " under-represents the model's performance, punishing it for not"
                " being able to run on all datasets.",
            ),
        ],
        bool_checkboxgroup_label="Custom Views (exclusive, only toggle one at a time):",
    )


def main():
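    """Assemble the Gradio Blocks UI (leaderboard, about, citation) and launch the app."""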
    demo = gr.Blocks()
    with demo:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        with gr.Tabs(elem_classes="tab-buttons"):
            with gr.TabItem("Overall", elem_id="llm-benchmark-tab-table", id=2):
                df_leaderboard = load_data("tabarena_leaderboard")
                make_leaderboard(df_leaderboard)
            # TODO: decide on which subsets we want to support here.
            # with gr.TabItem("Regression", elem_id="llm-benchmark-tab-table", id=0):
            #     df_leaderboard = load_data("leaderboard-regression")
            #     leaderboard = make_leaderboard(df_leaderboard)
            #
            # with gr.TabItem("Classification", elem_id="llm-benchmark-tab-table", id=1):
            #     df_leaderboard = load_data("leaderboard-classification")
            #     leaderboard = make_leaderboard(df_leaderboard)
            #
            # with gr.TabItem("TabPFNv2-Compatible", elem_id="llm-benchmark-tab-table", id=1):
            #     df_leaderboard = load_data("leaderboard-classification")
            #     leaderboard = make_leaderboard(df_leaderboard)
            #
            # with gr.TabItem("TabICL-Compatible", elem_id="llm-benchmark-tab-table", id=1):
            #     df_leaderboard = load_data("leaderboard-classification")
            #     leaderboard = make_leaderboard(df_leaderboard)
            with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=4):
                gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
        with gr.Row(), gr.Accordion("Citation", open=False):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

    scheduler = BackgroundScheduler()
    # scheduler.add_job(restart_space, "interval", seconds=1800)
    scheduler.start()
    demo.queue(default_concurrency_limit=40).launch()


if __name__ == "__main__":
    main()