import gradio as gr import pandas as pd ########################################### # Load Data # ########################################### llm_judge_filename = "llm_judge_results.jsonl" response_generation_filename = "report_generation_w_docs.jsonl" def load_filename_into_df(filename): df = pd.read_json(filename, lines=True) return df color_map = { "Closed-source Instruct": "#B8D2F5" , "Open-weight Instruct": "#6f96e5", "Closed-source Reasoning": "#fce8c5" , "Open-weight Reasoning": "#ffcd75", } CAPTION_V2 = f"""**ProfBench**: Human-annotated rubrics on addressing professional tasks across PhD STEM (Chemistry, Physics) and MBA Services (Finance, Consulting) domains. \n [Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench)\n Want to see your favorite models added? Run it with our code, send us the scores or ping us to run it for you!""" def color_model_type_column(df, color_map): """ Apply color to the 'Model Type' column of the DataFrame based on a given color mapping. Parameters: df (pd.DataFrame): The DataFrame containing the 'Model Type' column. color_map (dict): A dictionary mapping model types to colors. Returns: pd.Styler: The styled DataFrame. """ # Function to apply color based on the model type def apply_color(val): color = color_map.get(val, "default") # Default color if not specified in color_map return f"background-color: {color}" # # Format for different columns # format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Average", "Model", "Model Type"]} # format_dict["Average"] = "{:.2f}" # format_dict[""] = "{:d}" format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Model", "Category", "Input Tokens", "Output Tokens", "Cost"]} format_dict["Response Characters"] = "{:d}" format_dict["Input Tokens"] = "{:d}" format_dict["Output Tokens"] = "{:d}" format_dict[""] = "{:d}" format_dict["Cost"] = "{:.2f}" return df.style.applymap(apply_color, subset=["Category"]).format(format_dict, na_rep="") def regex_table(dataframe, regex, filter_button, style=True): """ Takes a model name as a regex, then returns only the rows that has that in it. """ # Split regex statement by comma and trim whitespace around regexes regex_list = [x.strip() for x in regex.split(",")] # Join the list into a single regex pattern with '|' acting as OR combined_regex = "|".join(regex_list) if isinstance(filter_button, list) or isinstance(filter_button, str): if "Open-weight" not in filter_button: dataframe = dataframe[~dataframe["Category"].str.contains("Open-weight", case=False, na=False)] if "Closed-source" not in filter_button: dataframe = dataframe[~dataframe["Category"].str.contains("Closed-source", case=False, na=False)] if "Reasoning" not in filter_button: dataframe = dataframe[~dataframe["Category"].str.contains("Reasoning", case=False, na=False)] if "Instruct" not in filter_button: dataframe = dataframe[~dataframe["Category"].str.contains("Instruct", case=False, na=False)] data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)] # if update the score to not use prior sets, do so data = data.sort_values(by="Overall", ascending=False) data.reset_index(drop=True, inplace=True) data.insert(0, "", range(1, 1 + len(data))) if style: # apply color data = color_model_type_column(data, color_map) return data # Using a string for a predefined color theme = gr.themes.Default(primary_hue="blue") ############################################# # Gradio App # ############################################# with gr.Blocks(theme=theme) as app: # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About" with gr.Row(): with gr.Column(scale=6): gr.Markdown(CAPTION_V2) with gr.Tabs(elem_id="outer-tabs", elem_classes="tabs-big") as tabs_big: with gr.TabItem("Report Generation w Docs"): with gr.Row(): with gr.Column(scale=7): gr.Markdown("Report Generation Leaderboard with Grounding Documents") with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs: with gr.TabItem("Leaderboard"): with gr.Row(): search_1 = gr.Textbox( label="Model Search (delimit with , )", placeholder="Model Search (delimit with , )", show_label=False, scale=8, ) model_types_1 = gr.CheckboxGroup( ["Open-weight", "Closed-source", "Reasoning", "Instruct"], value=["Open-weight", "Closed-source", "Reasoning", "Instruct"], show_label=False, scale=8, ) with gr.Row(): col_types_response_generation = ["number"] + ["markdown"] + ["str"] + ["number"] * 12 df_response_generation = load_filename_into_df(response_generation_filename) rewardbench_table_hidden = gr.Dataframe( df_response_generation.values, datatype=col_types_response_generation, headers=df_response_generation.columns.tolist(), visible=False, ) rewardbench_table = gr.Dataframe( regex_table( df_response_generation.copy(), "", ["Open-weight", "Closed-source", "Reasoning", "Instruct"] ), datatype=col_types_response_generation, headers=df_response_generation.columns.tolist(), elem_id="response_generation_dataframe", height=800, # 800 px ≈ ~25 rows on default row-height ) with gr.TabItem("LLM Judge"): with gr.Row(): gr.Markdown("LLM Judge Leaderboard") with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs: with gr.TabItem("Leaderboard"): with gr.Row(): search_1_v1 = gr.Textbox( label="Model Search (delimit with , )", placeholder="Model Search (delimit with , )", show_label=False, ) model_types_1_v1 = gr.CheckboxGroup( ["Open-weight", "Closed-source", "Reasoning", "Instruct"], value=["Open-weight", "Closed-source", "Reasoning", "Instruct"], label="Model Types", show_label=False, # info="Which model types to include.", ) with gr.Row(): col_types_llm_judge = ["number"] + ["markdown"] + ["str"] + ["number"] * 16 df_llm_judge = load_filename_into_df(llm_judge_filename) rewardbench_table_hidden_v1 = gr.Dataframe( df_llm_judge.values, datatype=col_types_llm_judge, headers=df_llm_judge.columns.tolist(), visible=False, ) rewardbench_table_v1 = gr.Dataframe( regex_table( df_llm_judge.copy(), "", ["Open-weight", "Closed-source", "Reasoning", "Instruct"], ), datatype=col_types_llm_judge, headers=df_llm_judge.columns.tolist(), elem_id="llm_judge_dataframe", height=800, # 800 px ≈ ~25 rows on default row-height ) search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table) search_1_v1.change( regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1 ) model_types_1.change( regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table ) model_types_1_v1.change( regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1 ) with gr.Row(): with gr.Accordion("📚 Citation and Credits", open=False): citation_button = gr.Textbox( value=r"""@misc{wang2025profbenchmultidomainrubricsrequiring, title={ProfBench: Multi-Domain Rubrics requiring Professional Knowledge to Answer and Judge}, author={Zhilin Wang and Jaehun Jung and Ximing Lu and Shizhe Diao and Ellie Evans and Jiaqi Zeng and Pavlo Molchanov and Yejin Choi and Jan Kautz and Yi Dong}, year={2025}, eprint={2510.18941}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2510.18941}, }""", lines=10, label="If you find the results helpful, please cite the following. ", elem_id="citation-button", show_copy_button=True, ) gr.Textbox("Leaderboard adapted from allenai/reward-bench ", label="Leaderboard credits",) app.launch() # had .queue() before launch before... not sure if that's necessary