Upload app.py
app.py
CHANGED
@@ -22,8 +22,9 @@ color_map = {
 }

 CAPTION_V2 = f"""**ProfBench**: Human-annotated rubrics on addressing professional tasks across PhD STEM (Chemistry, Physics) and MBA Services (Finance, Consulting) domains. \n
-
-
+ProfBench is a high-quality, text-only dataset that represents the complex reasoning tasks faced by professionals in fields like finance and chemistry. We're not talking about simple Q&A or retrieval-based tasks. We're talking about multi-page assignments that require deep domain knowledge and reasoning. Can AI generate comprehensive reports by applying the nuanced reasoning that a PhD-level physicist/chemist or an MBA-level consultant/financier would have? \n
+[Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench) | [Nemo Evaluator SDK](https://github.com/NVIDIA-NeMo/Evaluator)\n
+Want to see your favorite models added? Run it with the [Nemo Evaluator SDK for scalable evaluation](https://github.com/NVIDIA-NeMo/Evaluator) or the [ProfBench code for quick evaluation](https://github.com/NVlabs/ProfBench), then send us the scores or ping us to run it for you!"""


 def color_model_type_column(df, color_map):
@@ -110,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
     with gr.TabItem("Report Generation"):
         with gr.Row():
             with gr.Column(scale=7):
-                gr.Markdown("Report Generation Leaderboard")
+                gr.Markdown("Report Generation Leaderboard: LLMs generate reports from just the prompt, which are then evaluated by the gpt-oss-120b (mixed) judge on the lite dataset (160 samples)")

         with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
             with gr.TabItem("Leaderboard"):
@@ -153,7 +154,7 @@ with gr.Blocks(theme=theme) as app:

     with gr.TabItem("LLM Judge"):
         with gr.Row():
-            gr.Markdown("LLM Judge Leaderboard")
+            gr.Markdown("LLM Judge Leaderboard: LLM judges are evaluated on whether they can accurately predict the human-labelled criterion fulfillment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 but also whether an LLM judge displays bias towards or against any model, using a Bias Index. The Overall score is calculated as Overall F1 - Bias Index.")
         with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
             with gr.TabItem("Leaderboard"):
                 with gr.Row():
@@ -196,7 +197,7 @@ with gr.Blocks(theme=theme) as app:
     with gr.TabItem("Report Generation w Docs"):
         with gr.Row():
             with gr.Column(scale=7):
-                gr.Markdown("Report Generation Leaderboard with Grounding Documents")
+                gr.Markdown("Report Generation Leaderboard with Grounding Documents: LLMs generate reports with the human-curated reference documents as context. Results below are based on the full dataset, with gpt-oss-120b (mixed) as the judge.")

         with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
             with gr.TabItem("Leaderboard"):
@@ -256,6 +257,26 @@ with gr.Blocks(theme=theme) as app:
         regex_table, inputs=[rewardbench_table_hidden_v2, search_1_v2, model_types_1_v2], outputs=rewardbench_table_v2
     )

+    with gr.Row():
+        with gr.Accordion("π Frequently Asked Questions", open=False):
+            citation_button = gr.Textbox(
+                value=r"""1. How is the cost calculated? We use the token cost from https://openrouter.ai/models multiplied by the total input/output tokens in each evaluation.""",
+                lines=1,
+                label="FAQ",
+                elem_id="faq_box",
+            )
+
+    with gr.Row():
+        with gr.Accordion("π Understand our metrics", open=False):
+            citation_button = gr.Textbox(
+                value=r"""Report Generation (w Docs): We first generate the response. Then we grade the response against the human-annotated rubrics. Finally, we calculate the proportion of rubrics satisfied by each response, weighted by their criterion weight, to derive a score for each response.
+LLM Judge: We calculate the macro-F1 of the LLM-judge-predicted criterion fulfillment against the human-labelled criterion fulfillment to get Overall F1. We then calculate the bias for each model by taking the mean of predicted fulfillment minus the mean of human-labelled fulfillment. The Bias Index is max(bias) - min(bias) across models, and Overall is Overall F1 - Bias Index.""",
+                lines=4,
+                label="Metrics",
+                elem_id="metrics_box",
+            )
+
+
     with gr.Row():
         with gr.Accordion("π Citation and Credits", open=False):
             citation_button = gr.Textbox(
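For reference, a minimal sketch of the cost calculation described in the FAQ box added above, assuming per-million-token prices are looked up by hand on https://openrouter.ai/models; the prices and token counts below are illustrative placeholders, not leaderboard numbers.

```python
# Illustrative sketch of the FAQ's cost formula: OpenRouter per-token price
# multiplied by the total input/output tokens used during an evaluation.
# The prices and token counts here are made-up placeholders.

def evaluation_cost(input_tokens: int, output_tokens: int,
                    usd_per_m_input: float, usd_per_m_output: float) -> float:
    """Total USD cost = input tokens * input price + output tokens * output price."""
    return (input_tokens / 1e6) * usd_per_m_input + (output_tokens / 1e6) * usd_per_m_output

# Example: 3.2M input tokens and 0.9M output tokens at $1.10/$4.40 per million tokens.
print(f"${evaluation_cost(3_200_000, 900_000, 1.10, 4.40):.2f}")
```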
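A minimal sketch of the report-generation scoring described in the metrics box above, assuming each rubric criterion carries a weight and a binary satisfied/not-satisfied judgement; the data layout and helper name are illustrative, not ProfBench's actual code.

```python
# Illustrative weighted rubric scoring: a response's score is the
# weight-normalised proportion of rubric criteria the judge marks as satisfied.
# The data structure and function name are assumptions, not ProfBench code.

def rubric_score(criteria: list[dict]) -> float:
    """criteria: [{"weight": float, "satisfied": bool}, ...] for one response."""
    total_weight = sum(c["weight"] for c in criteria)
    earned = sum(c["weight"] for c in criteria if c["satisfied"])
    return earned / total_weight if total_weight else 0.0

# Example: three criteria, the heavier two satisfied -> 0.8
print(rubric_score([
    {"weight": 0.5, "satisfied": True},
    {"weight": 0.3, "satisfied": True},
    {"weight": 0.2, "satisfied": False},
]))
```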
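And a minimal sketch of the LLM Judge metric from the same box: macro-F1 of judge-predicted vs. human-labelled criterion fulfillment gives Overall F1, per-model bias is the mean predicted fulfillment minus the mean human-labelled fulfillment, the Bias Index is max(bias) - min(bias) across the judged models, and Overall = Overall F1 - Bias Index. The fulfillment labels below are toy values, not real annotations.

```python
# Illustrative computation of Overall F1, Bias Index and Overall for an LLM judge.
# Labels are binary criterion-fulfillment values; the data here is a toy example.

def f1_for_class(y_true, y_pred, cls):
    """F1 score treating `cls` as the positive class."""
    tp = sum(1 for t, p in zip(y_true, y_pred) if p == cls and t == cls)
    fp = sum(1 for t, p in zip(y_true, y_pred) if p == cls and t != cls)
    fn = sum(1 for t, p in zip(y_true, y_pred) if p != cls and t == cls)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0

def macro_f1(y_true, y_pred):
    """Unweighted mean of the per-class F1 scores (classes 0 and 1)."""
    return sum(f1_for_class(y_true, y_pred, cls) for cls in (0, 1)) / 2

# Toy criterion-fulfillment labels for the three judged report generators.
judgements = {
    "o3":      {"human": [1, 0, 1, 1], "judge": [1, 1, 1, 1]},
    "grok4":   {"human": [0, 1, 0, 1], "judge": [0, 1, 0, 0]},
    "r1-0528": {"human": [1, 1, 0, 0], "judge": [1, 1, 0, 0]},
}

all_human = [y for m in judgements.values() for y in m["human"]]
all_judge = [y for m in judgements.values() for y in m["judge"]]
overall_f1 = macro_f1(all_human, all_judge)

# Bias per model: mean predicted fulfillment minus mean human-labelled fulfillment.
bias = {name: sum(m["judge"]) / len(m["judge"]) - sum(m["human"]) / len(m["human"])
        for name, m in judgements.items()}
bias_index = max(bias.values()) - min(bias.values())

print(f"Overall F1: {overall_f1:.3f}, Bias Index: {bias_index:.3f}, Overall: {overall_f1 - bias_index:.3f}")
```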