import gradio as gr
import pandas as pd
###########################################
# Load Data #
###########################################
llm_judge_filename = "llm_judge_results.jsonl"
response_generation_filename = "report_generation.jsonl"
response_generation_w_docs_filename = "report_generation_w_docs.jsonl"
def load_filename_into_df(filename):
df = pd.read_json(filename, lines=True)
return df
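# Each results file is in JSON Lines format, one record per model, containing the columns the
# app references below ("Model", "Category", "Overall", "Input Tokens", "Output Tokens",
# "Cost", ...). An illustrative (made-up) record:
# {"Model": "example-model", "Category": "Open-weight Reasoning", "Overall": 52.3,
#  "Input Tokens": 1200000, "Output Tokens": 340000, "Cost": 1.25}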
color_map = {
"Closed-source Instruct": "#4492F7" ,
"Open-weight Instruct": "#0856f1",
"Closed-source Reasoning": "#fac05d" ,
"Open-weight Reasoning": "#f59c03",
}
CAPTION_V2 = f"""**ProfBench**: Over 7,000 brand-new expert-authored response–criterion pairs across 80 professional tasks across PhD STEM (Chemistry, Physics) and MBA Services (Finance, Consulting) domains. \n
ProfBench is a high-quality, text-only dataset that represent the complex reasoning tasks faced by professionals in fields like finance and chemistry. We're not talking about simple Q&A or retrieval-based tasks. We're talking about multi-page assignments that require deep domain knowledge and reasoning. Can AI generate comprehensive reports by applying the nuanced reasoning that a PhD-level physicist/chemist or an MBA-level consultant/financier would have? \n
[Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench) | [Nemo Evaluator SDK](https://github.com/NVIDIA-NeMo/Evaluator)\n
Want to see your favorite models added? Run it with [Nemo Evaluator SDK for scalable evaluation](https://github.com/NVIDIA-NeMo/Evaluator) or [ProfBench code for quick evaluation](https://github.com/NVlabs/ProfBench), send us the scores or ping zhilinw/viviennez [at] nvidia.com to run it for you!"""
def color_model_type_column(df, color_map):
"""
Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
Parameters:
df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
color_map (dict): A dictionary mapping model types to colors.
Returns:
pd.Styler: The styled DataFrame.
"""
# Function to apply color based on the model type
def apply_color(val):
color = color_map.get(val, "default") # Default color if not specified in color_map
return f"background-color: {color}"
# # Format for different columns
# format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Average", "Model", "Model Type"]}
# format_dict["Average"] = "{:.2f}"
# format_dict[""] = "{:d}"
format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Model", "Category", "Input Tokens", "Output Tokens", "Cost"]}
format_dict["Response Characters"] = "{:d}"
format_dict["Input Tokens"] = "{:d}"
format_dict["Output Tokens"] = "{:d}"
format_dict[""] = "{:d}"
format_dict["Cost"] = "{:.2f}"
return df.style.map(apply_color, subset=["Category"]).format(format_dict, na_rep="")
def regex_table(dataframe, regex, filter_button, style=True):
"""
Takes a model name as a regex, then returns only the rows that has that in it.
"""
# Split regex statement by comma and trim whitespace around regexes
regex_list = [x.strip() for x in regex.split(",")]
# Join the list into a single regex pattern with '|' acting as OR
combined_regex = "|".join(regex_list)
    if isinstance(filter_button, (list, str)):
if "Open-weight" not in filter_button:
dataframe = dataframe[~dataframe["Category"].str.contains("Open-weight", case=False, na=False)]
if "Closed-source" not in filter_button:
dataframe = dataframe[~dataframe["Category"].str.contains("Closed-source", case=False, na=False)]
if "Reasoning" not in filter_button:
dataframe = dataframe[~dataframe["Category"].str.contains("Reasoning", case=False, na=False)]
if "Instruct" not in filter_button:
dataframe = dataframe[~dataframe["Category"].str.contains("Instruct", case=False, na=False)]
data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
    # sort by the overall score and prepend a rank column
data = data.sort_values(by="Overall", ascending=False)
data.reset_index(drop=True, inplace=True)
data.insert(0, "", range(1, 1 + len(data)))
if style:
# apply color
data = color_model_type_column(data, color_map)
return data
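# Example (illustrative only, not used by the app): a search string of "gpt, claude" becomes the
# regex "gpt|claude" and is matched case-insensitively against the "Model" column, e.g.
#   regex_table(load_filename_into_df(response_generation_filename), "gpt, claude",
#               ["Open-weight", "Closed-source", "Reasoning", "Instruct"], style=False)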
# use a predefined color name for the theme's primary hue
theme = gr.themes.Default(primary_hue="blue")
#############################################
# Gradio App #
#############################################
with gr.Blocks(theme=theme) as app:
    # header caption on top, followed by one outer tab per leaderboard
with gr.Row():
with gr.Column(scale=6):
gr.Markdown(CAPTION_V2)
with gr.Tabs(elem_id="outer-tabs", elem_classes="tabs-big") as tabs_big:
with gr.TabItem("Report Generation"):
with gr.Row():
with gr.Column(scale=7):
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 29 Oct 2025.")
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
with gr.TabItem("Leaderboard"):
with gr.Row():
search_1 = gr.Textbox(
label="Model Search (delimit with , )",
placeholder="Model Search (delimit with , )",
show_label=False,
scale=8,
)
model_types_1 = gr.CheckboxGroup(
["Open-weight", "Closed-source", "Reasoning", "Instruct"],
value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
show_label=False,
scale=8,
)
with gr.Row():
col_types_response_generation = ["number"] + ["markdown"] + ["str"] + ["number"] * 12
df_response_generation = load_filename_into_df(response_generation_filename)
rewardbench_table_hidden = gr.Dataframe(
df_response_generation.values,
datatype=col_types_response_generation,
headers=df_response_generation.columns.tolist(),
visible=False,
)
rewardbench_table = gr.Dataframe(
regex_table(
df_response_generation.copy(),
"",
["Open-weight", "Closed-source", "Reasoning", "Instruct"]
),
datatype=col_types_response_generation,
headers=df_response_generation.columns.tolist(),
elem_id="response_generation_dataframe",
row_count=(25, "dynamic"),
)
with gr.TabItem("LLM Judge"):
with gr.Row():
gr.Markdown("LLM Judge Leaderboard: LLM Judges are evaluated based on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 across 3486 samples but also whether LLM-Judge display bias towards/against any models using a Bias Index. The Overall score is calculated based on Overall F1 - Bias Index. \nEvaluation and cost estimations last performed on 20 Sep 2025.")
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
with gr.TabItem("Leaderboard"):
with gr.Row():
search_1_v1 = gr.Textbox(
label="Model Search (delimit with , )",
placeholder="Model Search (delimit with , )",
show_label=False,
)
model_types_1_v1 = gr.CheckboxGroup(
["Open-weight", "Closed-source", "Reasoning", "Instruct"],
value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
label="Model Types",
show_label=False,
# info="Which model types to include.",
)
with gr.Row():
col_types_llm_judge = ["number"] + ["markdown"] + ["str"] + ["number"] * 16
df_llm_judge = load_filename_into_df(llm_judge_filename)
rewardbench_table_hidden_v1 = gr.Dataframe(
df_llm_judge.values,
datatype=col_types_llm_judge,
headers=df_llm_judge.columns.tolist(),
visible=False,
)
rewardbench_table_v1 = gr.Dataframe(
regex_table(
df_llm_judge.copy(),
"",
["Open-weight", "Closed-source", "Reasoning", "Instruct"],
),
datatype=col_types_llm_judge,
headers=df_llm_judge.columns.tolist(),
elem_id="llm_judge_dataframe",
row_count=(25, "dynamic"),
)
with gr.TabItem("Report Generation w Docs"):
with gr.Row():
with gr.Column(scale=7):
gr.Markdown("Report Generation Leaderboard with Grounding Documents: LLMs generate reports with the human-curated reference documents as context. Results below are based on the full dataset and gpt-oss-120b (mixed) as judge. \nEvaluation and cost estimations last performed on 20 Sep 2025.")
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
with gr.TabItem("Leaderboard"):
with gr.Row():
search_1_v2 = gr.Textbox(
label="Model Search (delimit with , )",
placeholder="Model Search (delimit with , )",
show_label=False,
scale=8,
)
model_types_1_v2 = gr.CheckboxGroup(
["Open-weight", "Closed-source", "Reasoning", "Instruct"],
value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
show_label=False,
scale=8,
)
with gr.Row():
col_types_response_generation = ["number"] + ["markdown"] + ["str"] + ["number"] * 12
df_response_generation_w_docs = load_filename_into_df(response_generation_w_docs_filename)
rewardbench_table_hidden_v2 = gr.Dataframe(
df_response_generation_w_docs.values,
datatype=col_types_response_generation,
headers=df_response_generation_w_docs.columns.tolist(),
visible=False,
)
rewardbench_table_v2 = gr.Dataframe(
regex_table(
df_response_generation_w_docs.copy(),
"",
["Open-weight", "Closed-source", "Reasoning", "Instruct"]
),
datatype=col_types_response_generation,
headers=df_response_generation_w_docs.columns.tolist(),
elem_id="response_generation_dataframe",
row_count=(25, "dynamic"),
)
search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
search_1_v1.change(
regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
)
search_1_v2.change(
regex_table, inputs=[rewardbench_table_hidden_v2, search_1_v2, model_types_1_v2], outputs=rewardbench_table_v2
)
model_types_1.change(
regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
)
model_types_1_v1.change(
regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
)
model_types_1_v2.change(
regex_table, inputs=[rewardbench_table_hidden_v2, search_1_v2, model_types_1_v2], outputs=rewardbench_table_v2
)
with gr.Row():
with gr.Accordion("📚 Frequently Asked Questions", open=False):
            faq_box = gr.Textbox(
                value=r"""1. How is the cost calculated? We take the per-token prices from https://openrouter.ai/models and multiply them by the total input/output tokens used in each evaluation.
2. How can I run the Report Generation Leaderboard with Grounding Documents? This benchmark cannot be run externally at the moment because we are not yet able to release the required grounding documents. We are working on it.""",
lines=2,
label="FAQ",
elem_id="faq_box",
)
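            # Illustrative sketch of the cost estimate described in the FAQ above (not executed by
            # the app). The per-million-token prices are hypothetical placeholders; real prices
            # come from https://openrouter.ai/models.
            #   usd_per_m_input, usd_per_m_output = 0.15, 0.60
            #   cost = input_tokens / 1e6 * usd_per_m_input + output_tokens / 1e6 * usd_per_m_output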
with gr.Row():
with gr.Accordion("📚 Understand the Metrics", open=False):
            metrics_box = gr.Textbox(
                value=r"""Response Generation (w Docs): We first generate the response, then grade it against the human-annotated rubrics. Finally, we calculate the proportion of rubrics satisfied by each response, weighted by their criterion weights, to derive a score for each response.
LLM Judge: We calculate the macro-F1 of the LLM-judge-predicted criterion fulfilment against the human-labelled criterion fulfilment to get Overall F1. We then calculate the bias for each model as the mean of predicted fulfilment minus the mean of human-labelled fulfilment. The Bias Index is max(bias) - min(bias) across models. Overall is Overall F1 - Bias Index.""",
lines=4,
label="Metrics",
elem_id="metrics_box",
)
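            # Illustrative sketch of the rubric-weighted response score described above (not
            # executed by the app; the rubric structure shown is an assumption based on the text).
            #   rubrics = [{"satisfied": True, "weight": 3}, {"satisfied": False, "weight": 1},
            #              {"satisfied": True, "weight": 2}]
            #   score = sum(r["weight"] for r in rubrics if r["satisfied"]) / sum(r["weight"] for r in rubrics)
            #   # -> 5/6 ≈ 0.83 for this response; leaderboard numbers aggregate such per-response scores.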
with gr.Row():
with gr.Accordion("📚 Citation and Credits", open=False):
citation_button = gr.Textbox(
value=r"""@misc{wang2025profbenchmultidomainrubricsrequiring,
title={ProfBench: Multi-Domain Rubrics requiring Professional Knowledge to Answer and Judge},
author={Zhilin Wang and Jaehun Jung and Ximing Lu and Shizhe Diao and Ellie Evans and Jiaqi Zeng and Pavlo Molchanov and Yejin Choi and Jan Kautz and Yi Dong},
year={2025},
eprint={2510.18941},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2510.18941},
}""",
lines=10,
label="If you find the results helpful, please cite the following. ",
elem_id="citation-button",
show_copy_button=True,
)
gr.Textbox("Leaderboard adapted from allenai/reward-bench ", label="Leaderboard credits",)
app.launch()  # previously used .queue() before launch(); unclear whether it is necessary