# Gradio app for the RubricEval leaderboard: displays scores read from leaderboard.csv
# and lets users submit new model evaluation files.
import gradio as gr
import numpy as np
import pandas as pd
import scipy.stats as st

LEADERBOARD_FILE = "leaderboard.csv"
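# Assumed schema of leaderboard.csv, inferred from new_entry in process_upload below:
# one row per model with the columns "Model", "Score", and "95% CI".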


def get_leaderboard_df():
    df = pd.read_csv(LEADERBOARD_FILE)
    df = df.sort_values(by=["Score"], ascending=False)
    df = df.reset_index(drop=True)
    return df


def get_model_stats(uploaded_df):
    overall_score = uploaded_df["avg_score"].mean()
    data = uploaded_df["avg_score"].to_numpy()
    # Percentile bootstrap with 10,000 resamples for a 95% CI around the mean score.
    bootstrap_res = st.bootstrap((data,),
                                 np.mean,
                                 confidence_level=0.95,
                                 n_resamples=10000,
                                 method="percentile")
    ci_high = bootstrap_res.confidence_interval.high
    ci_low = bootstrap_res.confidence_interval.low
    # Report the CI as asymmetric offsets from the mean, e.g. "+0.12/-0.15".
    formatted_upper_diff = str(round(ci_high - overall_score, 2))
    formatted_lower_diff = str(round(overall_score - ci_low, 2))
    formatted_score = round(overall_score, 2)
    formatted_ci = f"+{formatted_upper_diff}/-{formatted_lower_diff}"
    return (formatted_score, formatted_ci)
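
# Illustrative usage of get_model_stats (toy numbers, not real evaluation data):
#   toy = pd.DataFrame({"avg_score": [7.0, 8.0, 7.5, 7.5]})
#   score, ci = get_model_stats(toy)
#   # score -> 7.5; ci is a string such as "+0.25/-0.25" (exact bounds vary per bootstrap resample)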


def process_upload(file):
    uploaded_df = pd.read_csv(file.name).dropna()
    if "avg_score" not in list(uploaded_df.columns):
        return "Upload failed: file must have column 'avg_score'."
    overall_score, confidence_interval = get_model_stats(uploaded_df)
    leaderboard_df = get_leaderboard_df()
    # Gradio saves uploads under a path containing "gradio/<hash>/<file name>";
    # use the file name (minus the .csv extension) as the model name.
    model_name = file.name.split("gradio/")[1].split("/")[1].split(".csv")[0]
    new_entry = {"Model": model_name, "Score": overall_score, "95% CI": confidence_interval}
    # DataFrame.append was removed in pandas 2.0, so build the new row with pd.concat instead.
    leaderboard_df = pd.concat([leaderboard_df, pd.DataFrame([new_entry])], ignore_index=True)
    leaderboard_df.to_csv(LEADERBOARD_FILE, index=False)
    return "Upload complete! The leaderboard has been updated."
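
# Note on the upload flow: the evaluation CSV is assumed to contain (at least) an "avg_score"
# column with one numeric score per evaluated instruction; only that column is used above, and
# any other columns RubricEval writes are ignored by this app.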


# theme = gr.themes.Default(radius_size="none")
def create_ui():
    text_size = gr.themes.sizes.text_lg
    # load theme from theme.json
    theme = gr.themes.Default.load("theme.json")
    # set text size to large
    theme.text_size = text_size
    with gr.Blocks(theme=theme) as demo:
        with gr.Row():
            gr.Image("https://ai.stanford.edu/wp-content/themes/sail/img/logo.png",
                     show_label=False,
                     show_download_button=False,
                     show_share_button=False,
                     container=False,
                     min_width=200,
                     scale=0)
            gr.Image("https://crfm.stanford.edu/static/img/header/crfm-rgb.png",
                     show_label=False,
                     show_download_button=False,
                     show_share_button=False,
                     container=False,
                     min_width=200,
                     scale=0)
        gr.Markdown(
            """
            # **RubricEval: A Scalable Human-LLM Evaluation Framework for Open-Ended Tasks**
            ######
            """)
        with gr.TabItem("Leaderboard"):
            # Re-load the leaderboard from disk every 5 seconds so new submissions appear automatically.
            overall_leaderboard_table = gr.Dataframe(get_leaderboard_df,
                                                     every=gr.Timer(5),
                                                     column_widths=["33.3%", "33.3%", "33.3%"],
                                                     height=600)
            gr.Markdown(
                """
                ######
                ## RubricEval leaderboard statistics (Overall)
                """
            )
            gr.Image("lb_stats.png",
                     show_label=False,
                     show_download_button=False,
                     show_share_button=False,
                     width=800)
            gr.Markdown(
                """
                ######
                ## RubricEval scores by category
                """
            )
            gr.Image("category_scores.png",
                     show_label=False,
                     show_download_button=False,
                     show_share_button=False)
        with gr.TabItem("About"):
            gr.Image("eval_about.jpg",
                     show_label=False,
                     show_download_button=False,
                     show_share_button=False)
            with gr.Accordion("What is RubricEval?"):
                gr.Markdown(
                    """
                    ######
                    #### Overview
                    RubricEval is a framework for evaluating instruction-following models.
                    The core idea is to create example-specific rubrics designed by human experts, which are then applied by GPT-4o to evaluate model outputs at scale. This process results in more scalable, trustworthy, and interpretable evaluations of language models.
                    #### Features
                    **Open-Ended:** The responses of chat models are open-ended in nature, and a small set of reference
                    answers often can’t capture all acceptable responses. This is a key limitation of reference-based
                    evaluators like BLEU and BERTScore.
                    **Multidimensional:** Responses can be good and bad in different ways, which isn’t captured by "head
                    to head" evaluators like Chatbot Arena and AlpacaEval that simply decide whether one response is better
                    than another overall.
                    **Absolute:** Evaluators like Chatbot Arena and AlpacaEval use win rates based on pairwise comparisons.
                    This means that we don’t know how good a model is in absolute terms. For example, a model may
                    have a low win rate against GPT-4o but still be formidable, and the model with the highest win rate may not
                    be perfect despite topping the leaderboard.
                    **Varying Criteria:** The criteria for what makes a good response are different for each instruction. While
                    HELM Instruct is open-ended, multidimensional, and absolute, it uses the same set of scoring criteria
                    for every instruction, missing nuances at the instruction level. Most pairwise comparison evaluators
                    may implicitly consider varying criteria for each instruction, but these criteria are not explicitly laid
                    out (WildBench is a notable exception).
                    **Feedback:** To the best of our knowledge, no current language model evaluation system provides
                    textual feedback on a model’s overall strengths and weaknesses with respect to some set of
                    instructions. However, we believe that such feedback would be highly valuable for model developers.
                    Evaluation is a key part of iterative model development, and textual feedback could show
                    what exactly needs to be improved, rather than only a score that is hard to interpret.
                    ######
                    """)
                gr.Image("feature_comp.png",
                         show_label=False,
                         show_download_button=False,
                         show_share_button=False)
            with gr.Accordion("Where do evaluation instructions come from?"):
                gr.Markdown(
                    """
                    ######
                    We use a set of approximately 1,000 publicly released instructions from WildBench ([https://huggingface.co/spaces/allenai/WildBench](https://huggingface.co/spaces/allenai/WildBench)). From these, the 392 hardest instructions were selected via a GPT-4-based pairwise comparison method.
                    Using the WildBench dataset has three primary benefits:
                    1) It contains a manually curated selection of instructions from real users.
                    2) The instructions are well spread across 11 categories, which is useful for benchmarking.
                    3) Each instruction comes with user-defined criteria describing what the user is looking for, which we can use directly in our framework.
                    ######
                    """)
            with gr.Accordion("How does RubricEval correlate with human preferences?"):
                gr.Markdown(
                    """
                    ######
                    We used RubricEval to score 13 leading large language models across 11 categories and 392 instructions from WildBench.
                    Notably, the ranking of these models by RubricEval score correlates highly with their ranking by Chatbot Arena ELO rating (Spearman ρ = 0.98).
                    The main discordance is the ranking of Claude 3 Opus, which RubricEval ranks lower than Chatbot Arena does.
                    RubricEval’s correlation of ρ = 0.98 with human preferences matches the 0.98 of length-corrected AlpacaEval and exceeds that of regular AlpacaEval (ρ = 0.94), MT-Bench (ρ = 0.94), and MMLU (ρ = 0.87).
                    ######
                    """)
            with gr.Accordion("Additional details"):
                gr.Markdown(
                    """
                    ######
                    See our detailed report at [insert blog link].
                    ######
                    """)
            with gr.Accordion("Citation"):
                gr.Markdown(
                    """
                    ######
                    [insert citation]
                    ######
                    """)
        with gr.TabItem("Submit Model"):
            gr.Markdown(
                """
                ######
                #### Want to add a model to this leaderboard?
                #### 1. Run RubricEval locally for <$x (see [insert github link]).
                #### 2. Upload the evaluation file generated by RubricEval below. Note: the file name will be used as the model name.
                #### 3. Wait ~5 seconds and refresh the leaderboard page to see that your model has been added!
                ######
                """)
            model_submission = gr.File(file_types=[".csv"], file_count="single")
            # Surface the status string returned by process_upload (success message or failure reason).
            upload_status = gr.Markdown()
            model_submission.upload(fn=process_upload, inputs=[model_submission], outputs=[upload_status])

    demo.launch()


if __name__ == "__main__":
    create_ui()