zhilinw committed · verified
Commit d43cc64 · 1 Parent(s): da1271f

Upload app.py

Files changed (1): app.py (+26 -5)
app.py CHANGED
@@ -22,8 +22,9 @@ color_map = {
 }
 
 CAPTION_V2 = f"""**ProfBench**: Human-annotated rubrics on addressing professional tasks across PhD STEM (Chemistry, Physics) and MBA Services (Finance, Consulting) domains. \n
-[Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench)\n
-Want to see your favorite models added? Run it with our code, send us the scores or ping us to run it for you!"""
+ProfBench is a high-quality, text-only dataset that represents the complex reasoning tasks faced by professionals in fields like finance and chemistry. We're not talking about simple Q&A or retrieval-based tasks. We're talking about multi-page assignments that require deep domain knowledge and reasoning. Can AI generate comprehensive reports by applying the nuanced reasoning that a PhD-level physicist/chemist or an MBA-level consultant/financier would have? \n
+[Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench) | [Nemo Evaluator SDK](https://github.com/NVIDIA-NeMo/Evaluator)\n
+Want to see your favorite models added? Run it with the [Nemo Evaluator SDK for scalable evaluation](https://github.com/NVIDIA-NeMo/Evaluator) or the [ProfBench code for quick evaluation](https://github.com/NVlabs/ProfBench), send us the scores, or ping us to run it for you!"""
 
 
 def color_model_type_column(df, color_map):
@@ -110,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
 with gr.TabItem("Report Generation"):
 with gr.Row():
 with gr.Column(scale=7):
-gr.Markdown("Report Generation Leaderboard")
+gr.Markdown("Report Generation Leaderboard: LLMs generate reports from the prompt alone; the reports are then evaluated by the gpt-oss-120b (mixed) judge on the lite dataset (160 samples).")
 
 with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
 with gr.TabItem("Leaderboard"):
@@ -153,7 +154,7 @@ with gr.Blocks(theme=theme) as app:
 
 with gr.TabItem("LLM Judge"):
 with gr.Row():
-gr.Markdown("LLM Judge Leaderboard")
+gr.Markdown("LLM Judge Leaderboard: LLM judges are evaluated on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 but also whether an LLM judge displays bias towards/against any model, using a Bias Index. The Overall score is calculated as Overall F1 - Bias Index.")
 with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
 with gr.TabItem("Leaderboard"):
 with gr.Row():
@@ -196,7 +197,7 @@ with gr.Blocks(theme=theme) as app:
 with gr.TabItem("Report Generation w Docs"):
 with gr.Row():
 with gr.Column(scale=7):
-gr.Markdown("Report Generation Leaderboard with Grounding Documents")
+gr.Markdown("Report Generation Leaderboard with Grounding Documents: LLMs generate reports with the human-curated reference documents as context. Results below are based on the full dataset and gpt-oss-120b (mixed) as judge.")
 
 with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
 with gr.TabItem("Leaderboard"):
@@ -256,6 +257,26 @@ with gr.Blocks(theme=theme) as app:
 regex_table, inputs=[rewardbench_table_hidden_v2, search_1_v2, model_types_1_v2], outputs=rewardbench_table_v2
 )
 
+with gr.Row():
+with gr.Accordion("📚 Frequently Asked Questions", open=False):
+citation_button = gr.Textbox(
+value=r"""1. How is the cost calculated? We use the per-token cost from https://openrouter.ai/models multiplied by the total input/output tokens in each evaluation.""",
+lines=1,
+label="FAQ",
+elem_id="faq_box",
+)
+
+with gr.Row():
+with gr.Accordion("📚 Understand our metrics", open=False):
+citation_button = gr.Textbox(
+value=r"""Response Generation (w Docs): We first generate the response. Then we grade the response against the human-annotated rubrics. Finally, we calculate the proportion of rubrics satisfied by each response, weighted by their criterion weight, to derive a score for each response.
+LLM Judge: We calculate the macro-F1 of the LLM-judge-predicted criterion fulfilment against the human-labelled criterion fulfilment to get Overall F1. We then calculate the bias for each model by taking the mean of predicted fulfilment minus the mean of human-labelled fulfilment. We calculate the Bias Index by taking max(bias) - min(bias) across models. Overall is calculated as Overall F1 - Bias Index.""",
+lines=4,
+label="Metrics",
+elem_id="metrics_box",
+)
+
+
 with gr.Row():
 with gr.Accordion("📚 Citation and Credits", open=False):
 citation_button = gr.Textbox(
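The FAQ entry added in this commit says cost is the per-token price from https://openrouter.ai/models multiplied by the total input/output tokens used in an evaluation. A minimal sketch of that arithmetic, with placeholder prices and token counts (not the app's actual accounting code):

```python
# Hedged sketch: estimate evaluation cost from per-token prices and token counts.
# Prices below are placeholders; real numbers come from https://openrouter.ai/models.

def estimate_cost(input_tokens: int, output_tokens: int,
                  usd_per_input_token: float, usd_per_output_token: float) -> float:
    """Total cost = input tokens * input price + output tokens * output price."""
    return input_tokens * usd_per_input_token + output_tokens * usd_per_output_token

# Example with made-up numbers: 2M input tokens and 500k output tokens
# at $1.00 / $3.00 per million tokens respectively.
print(f"${estimate_cost(2_000_000, 500_000, 1.00e-6, 3.00e-6):.2f}")  # -> $3.50
```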
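The "Understand our metrics" text added above describes two computations: a per-response score (the weighted proportion of rubric criteria a response satisfies) and the LLM-judge metrics (macro-F1 against human labels, per-model bias, Bias Index = max(bias) - min(bias), Overall = Overall F1 - Bias Index). A minimal sketch of those formulas, with hypothetical function names and inputs (not the ProfBench evaluation code):

```python
from sklearn.metrics import f1_score  # macro-F1 over binary criterion labels


def response_score(satisfied: list[bool], weights: list[float]) -> float:
    """Weighted proportion of rubric criteria satisfied by one response."""
    return sum(w for ok, w in zip(satisfied, weights) if ok) / sum(weights)


def judge_metrics(human: dict[str, list[int]], judge: dict[str, list[int]]) -> dict:
    """human/judge map a report-generating model (e.g. 'o3', 'Grok4', 'R1-0528')
    to 0/1 criterion-fulfilment labels over the same criteria."""
    all_human = [y for m in human for y in human[m]]
    all_judge = [y for m in judge for y in judge[m]]
    overall_f1 = f1_score(all_human, all_judge, average="macro")
    # Bias per model: mean predicted fulfilment minus mean human-labelled fulfilment.
    bias = {m: sum(judge[m]) / len(judge[m]) - sum(human[m]) / len(human[m]) for m in human}
    bias_index = max(bias.values()) - min(bias.values())
    return {"overall_f1": overall_f1, "bias_index": bias_index, "overall": overall_f1 - bias_index}
```

Under this sketch, a judge that systematically over-predicts fulfilment for one model and under-predicts it for another sees its Overall score drop even when its macro-F1 is high, which is the point of subtracting the Bias Index.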