import gradio as gr
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import numpy as np


def create_benchmark_plot(df):
    """Grouped bar chart comparing the top models across all benchmark columns."""
    if df.empty:
        return None

    df_copy = df.copy()
    score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                     'mmlu_high_school', 'mmlu_other']
    for col in score_columns:
        df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce').fillna(0)
    df_copy['Total_Score'] = df_copy[score_columns].sum(axis=1)
    df_sorted = df_copy.sort_values(by='Total_Score', ascending=False)

    # Keep the plot readable by limiting it to the ten best models.
    if len(df_sorted) > 10:
        top_models = df_sorted.head(10)
    else:
        top_models = df_sorted

    benchmarks = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                  'mmlu_high_school', 'mmlu_other']
    models = top_models['Model'].unique()
    x = np.arange(len(benchmarks))
    width = 0.8 / len(models) if len(models) > 0 else 0.8

    fig, ax = plt.subplots(figsize=(30, 10))
    all_scores = []
    for i, model in enumerate(models):
        model_data = top_models[top_models['Model'] == model]
        scores = [model_data[benchmark].values[0] if not model_data[benchmark].empty else 0
                  for benchmark in benchmarks]
        all_scores.extend(scores)
        # Center each model's group of bars on the benchmark tick.
        offset = width * i - (width * (len(models) - 1) / 2)
        rects = ax.bar(x + offset, scores, width, label=model)
        ax.bar_label(rects, padding=3)

    ax.set_ylabel('Scores')
    ax.set_xticks(x)
    ax.set_xticklabels(benchmarks, rotation=45, ha="right")
    ax.legend(loc='lower right')
    if all_scores:
        ax.set_ylim(top=max(all_scores) * 1.15)
    plt.tight_layout()
    return fig


def load_leaderboard_data():
    """Read all results_*.json files from the benchmarks directory into a DataFrame."""
    data = []
    benchmarks_dir = "benchmarks"

    # MMLU subtasks grouped into the aggregate columns shown on the leaderboard.
    mmlu_categories = {
        "mmlu_professional": [
            "mmlu_professional_accounting", "mmlu_professional_law",
            "mmlu_professional_medicine", "mmlu_professional_psychology"
        ],
        "mmlu_college": [
            "mmlu_college_biology", "mmlu_college_chemistry",
            "mmlu_college_computer_science", "mmlu_college_mathematics",
            "mmlu_college_medicine", "mmlu_college_physics"
        ],
        "mmlu_high_school": [
            "mmlu_high_school_biology", "mmlu_high_school_chemistry",
            "mmlu_high_school_computer_science", "mmlu_high_school_european_history",
            "mmlu_high_school_geography", "mmlu_high_school_government_and_politics",
            "mmlu_high_school_macroeconomics", "mmlu_high_school_mathematics",
            "mmlu_high_school_microeconomics", "mmlu_high_school_physics",
            "mmlu_high_school_psychology", "mmlu_high_school_statistics",
            "mmlu_high_school_us_history", "mmlu_high_school_world_history"
        ]
    }

    all_mmlu_scores = [
        "mmlu_abstract_algebra", "mmlu_anatomy", "mmlu_astronomy", "mmlu_business_ethics",
        "mmlu_clinical_knowledge", "mmlu_college_biology", "mmlu_college_chemistry",
        "mmlu_college_computer_science", "mmlu_college_mathematics", "mmlu_college_medicine",
        "mmlu_college_physics", "mmlu_computer_security", "mmlu_conceptual_physics",
        "mmlu_econometrics", "mmlu_electrical_engineering", "mmlu_elementary_mathematics",
        "mmlu_formal_logic", "mmlu_global_facts", "mmlu_high_school_biology",
        "mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
        "mmlu_high_school_european_history", "mmlu_high_school_geography",
        "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
        "mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
        "mmlu_high_school_physics", "mmlu_high_school_psychology",
        "mmlu_high_school_statistics", "mmlu_high_school_us_history",
        "mmlu_high_school_world_history", "mmlu_human_aging", "mmlu_human_sexuality",
        "mmlu_humanities", "mmlu_international_law", "mmlu_jurisprudence",
        "mmlu_logical_fallacies", "mmlu_machine_learning", "mmlu_management",
        "mmlu_marketing", "mmlu_medical_genetics", "mmlu_miscellaneous",
        "mmlu_moral_disputes", "mmlu_moral_scenarios", "mmlu_nutrition", "mmlu_other",
        "mmlu_philosophy", "mmlu_prehistory", "mmlu_professional_accounting",
        "mmlu_professional_law", "mmlu_professional_medicine",
        "mmlu_professional_psychology", "mmlu_public_relations", "mmlu_security_studies",
        "mmlu_social_sciences", "mmlu_sociology", "mmlu_stem", "mmlu_us_foreign_policy",
        "mmlu_virology", "mmlu_world_religions"
    ]
"mmlu_moral_disputes", "mmlu_moral_scenarios", "mmlu_nutrition", "mmlu_other", "mmlu_philosophy", "mmlu_prehistory", "mmlu_professional_accounting", "mmlu_professional_law", "mmlu_professional_medicine", "mmlu_professional_psychology", "mmlu_public_relations", "mmlu_security_studies", "mmlu_social_sciences", "mmlu_sociology", "mmlu_stem", "mmlu_us_foreign_policy", "mmlu_virology", "mmlu_world_religions" ] other_mmlu_scores = [s for s in all_mmlu_scores if s not in sum(mmlu_categories.values(), [])] mmlu_categories["mmlu_other"] = other_mmlu_scores for filename in os.listdir(benchmarks_dir): if filename.endswith(".json") and filename.startswith("results_"): filepath = os.path.join(benchmarks_dir, filename) with open(filepath, 'r') as f: content = json.load(f) model_name = content.get("model_name") if not model_name: model_name = os.path.splitext(filename)[0] if model_name.endswith('/'): model_name = model_name.rstrip('/') model_name = os.path.basename(model_name) results = content.get("results", {}) ifeval_score = results.get("ifeval", {}).get("prompt_level_strict_acc,none") mmlu_score = results.get("mmlu", {}).get("acc,none") row = {"Model": model_name, "IFEval": ifeval_score, "MMLU": mmlu_score} for score_name in all_mmlu_scores: row[score_name] = results.get(score_name, {}).get("acc,none") for category, scores in mmlu_categories.items(): category_scores = [pd.to_numeric(row.get(s), errors='coerce') for s in scores] category_scores = [s for s in category_scores if pd.notna(s)] if category_scores: row[category] = sum(category_scores) / len(category_scores) else: row[category] = np.nan data.append(row) df_raw = pd.DataFrame(data) numeric_cols = [col for col in df_raw.columns if col != 'Model'] for col in numeric_cols: df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce') score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other'] for col in score_columns: df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce').fillna(0) df_raw['Total_Score'] = df_raw[score_columns].sum(axis=1) df_sorted = df_raw.sort_values(by='Total_Score', ascending=False) df = df_sorted.drop_duplicates(subset=['Model'], keep='first').copy() df = df.drop(columns=['Total_Score']) for col in numeric_cols: df[col] = df[col].apply(lambda x: round(x, 4) if pd.notna(x) else x) df.fillna(0, inplace=True) return df def style_diff(df, all_data_df): def highlight_max(s): s_numeric = pd.to_numeric(s, errors='coerce') max_val = s_numeric.max() return ['background-color: #68a055' if v == max_val else '' for v in s_numeric] def highlight_min(s): s_numeric = pd.to_numeric(s, errors='coerce') s_filtered = s_numeric[s_numeric > 0] if s_filtered.empty: return ['' for _ in s_numeric] min_val = s_filtered.min() return ['background-color: #d4605b' if v == min_val else '' for v in s_numeric] df_styler = df.style for col in df.columns: if col != 'Model': numeric_col = pd.to_numeric(df[col], errors='coerce') if not numeric_col.isnull().all(): df_styler = df_styler.apply(highlight_max, subset=[col], axis=0) df_styler = df_styler.apply(highlight_min, subset=[col], axis=0) return df_styler def prepare_plot_data(df, all_cols=False): df_plot = df.copy() if not df_plot.empty: if all_cols: score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other'] for col in score_columns: df_plot[col] = pd.to_numeric(df_plot[col], errors='coerce').fillna(0) df_plot['Total_Score'] = df_plot[score_columns].sum(axis=1) df_plot = df_plot.sort_values(by='Total_Score', 
            df_plot = df_plot.sort_values(by='Total_Score', ascending=False).reset_index(drop=True)
            df_plot = df_plot.head(10)
            df_plot['Ranked_Model'] = [f"{i+1:02d}. {model}" for i, model in enumerate(df_plot['Model'])]
        else:
            df_plot['MMLU_IFEval_Combined'] = df_plot['MMLU'].fillna(0) + df_plot['IFEval'].fillna(0)
            df_plot = df_plot.sort_values(by='MMLU_IFEval_Combined', ascending=False).reset_index(drop=True)
    return df_plot


initial_df = load_leaderboard_data()
display_cols = ['Model', 'IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                'mmlu_high_school', 'mmlu_other']
display_df = initial_df[display_cols].copy()
for col in display_df.columns:
    if col != 'Model':
        display_df[col] = pd.to_numeric(display_df[col], errors='coerce').fillna(0)

with gr.Blocks() as demo:
    gr.Markdown("# Model Leaderboard")

    def update_plots(selected_models):
        """Refresh the plots and table for the selected models (all models if none are selected)."""
        if not selected_models:
            df_to_plot = initial_df
        else:
            df_to_plot = initial_df[initial_df['Model'].isin(selected_models)]

        # Pad the scatter-plot axes so points never sit on the border.
        scatter_plot_df = prepare_plot_data(df_to_plot.copy(), all_cols=False)
        padding_factor = 0.1
        min_padding = 0.05
        if not scatter_plot_df.empty:
            x_min, x_max = scatter_plot_df['MMLU'].min(), scatter_plot_df['MMLU'].max()
            x_range = x_max - x_min
            x_padding = max(x_range * padding_factor, min_padding) if x_range > 0 else min_padding
            x_lim = [x_min - x_padding, x_max + x_padding]
            y_min, y_max = scatter_plot_df['IFEval'].min(), scatter_plot_df['IFEval'].max()
            y_range = y_max - y_min
            y_padding = max(y_range * padding_factor, min_padding) if y_range > 0 else min_padding
            y_lim = [y_min - y_padding, y_max + y_padding]
        else:
            x_lim = [0, 1]
            y_lim = [0, 1]
            scatter_plot_df = pd.DataFrame(columns=['Model', 'MMLU', 'IFEval', 'MMLU_IFEval_Combined'])

        scatter_plot_update = gr.ScatterPlot(
            value=scatter_plot_df,
            x="MMLU",
            y="IFEval",
            color="Model",
            title="Model Performance",
            x_lim=x_lim,
            y_lim=y_lim,
        )

        bar_plot_df = prepare_plot_data(df_to_plot.copy(), all_cols=True)
        if not bar_plot_df.empty:
            value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                          'mmlu_high_school', 'mmlu_other']
            melted_df = bar_plot_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
                                         var_name='Benchmark', value_name='Score')
        else:
            melted_df = pd.DataFrame(columns=['Ranked_Model', 'Benchmark', 'Score'])

        bar_plot_update = gr.BarPlot(
            value=melted_df,
            x="Score",
            y="Ranked_Model",
            color="Benchmark",
            title="MMLU and IFEval Scores by Model",
            x_title="Score",
            y_title="Model",
            color_legend_title="Benchmark",
            vertical=False,
        )

        benchmark_plot_update = create_benchmark_plot(df_to_plot)

        if not selected_models:
            df_to_display = display_df
        else:
            df_to_display = display_df[display_df['Model'].isin(selected_models)]
        styled_df = style_diff(df_to_display, initial_df)

        return scatter_plot_update, bar_plot_update, benchmark_plot_update, styled_df

    with gr.Accordion("Plots", open=True):
        with gr.Tabs():
            with gr.TabItem("Summary Plots"):
                with gr.Row():
                    scatter_plot_df = prepare_plot_data(initial_df.copy(), all_cols=False)
                    padding_factor = 0.1
                    min_padding = 0.05
                    x_min, x_max = scatter_plot_df['MMLU'].min(), scatter_plot_df['MMLU'].max()
                    x_range = x_max - x_min
                    x_padding = max(x_range * padding_factor, min_padding)
                    x_lim = [x_min - x_padding, x_max + x_padding]
                    y_min, y_max = scatter_plot_df['IFEval'].min(), scatter_plot_df['IFEval'].max()
                    y_range = y_max - y_min
                    y_padding = max(y_range * padding_factor, min_padding)
                    y_lim = [y_min - y_padding, y_max + y_padding]
                    scatterplot = gr.ScatterPlot(
                        value=scatter_plot_df,
                        x="MMLU",
                        y="IFEval",
                        color="Model",
                        title="Model Performance",
                        x_lim=x_lim,
                        y_lim=y_lim,
                    )
                    bar_plot_df = prepare_plot_data(initial_df.copy(), all_cols=True)
                    value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college',
                                  'mmlu_high_school', 'mmlu_other']
                    melted_df = bar_plot_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
                                                 var_name='Benchmark', value_name='Score')
                    barplot = gr.BarPlot(
                        value=melted_df,
                        x="Score",
                        y="Ranked_Model",
                        color="Benchmark",
                        title="MMLU and IFEval Scores by Model",
                        x_title="Score",
                        y_title="Model",
                        color_legend_title="Benchmark",
                        vertical=False,
                    )
            with gr.TabItem("Benchmark Comparison"):
                with gr.Row():
                    benchmark_plot = gr.Plot(value=create_benchmark_plot(initial_df))

    model_names = initial_df["Model"].tolist()
    model_selector = gr.Dropdown(
        choices=model_names,
        label="Select Models to Display",
        multiselect=True,
        info="Select one or more models to display on the plots. If none are selected, all models will be shown."
    )

    with gr.Row():
        dataframe = gr.DataFrame(
            value=style_diff(display_df, initial_df),
            type="pandas",
            column_widths=["30%", "10%", "10%", "12%", "10%", "10%", "10%"],
            wrap=True
        )

    model_selector.change(update_plots, inputs=model_selector,
                          outputs=[scatterplot, barplot, benchmark_plot, dataframe])

if __name__ == "__main__":
    demo.launch()