Upload app.py
app.py
CHANGED
@@ -22,8 +22,9 @@ color_map = {
 }

 CAPTION_V2 = f"""**ProfBench**: Human-annotated rubrics on addressing professional tasks across PhD STEM (Chemistry, Physics) and MBA Services (Finance, Consulting) domains. \n
-
-
+ProfBench is a high-quality, text-only dataset that represents the complex reasoning tasks faced by professionals in fields like finance and chemistry. We're not talking about simple Q&A or retrieval-based tasks. We're talking about multi-page assignments that require deep domain knowledge and reasoning. Can AI generate comprehensive reports by applying the nuanced reasoning that a PhD-level physicist/chemist or an MBA-level consultant/financier would have? \n
+[Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench) | [Nemo Evaluator SDK](https://github.com/NVIDIA-NeMo/Evaluator)\n
+Want to see your favorite models added? Run it with the [Nemo Evaluator SDK for scalable evaluation](https://github.com/NVIDIA-NeMo/Evaluator) or the [ProfBench code for quick evaluation](https://github.com/NVlabs/ProfBench), then send us the scores or ping us to run it for you!"""


 def color_model_type_column(df, color_map):
@@ -110,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
     with gr.TabItem("Report Generation"):
         with gr.Row():
             with gr.Column(scale=7):
-                gr.Markdown("Report Generation Leaderboard")
+                gr.Markdown("Report Generation Leaderboard: LLMs generate reports from just the prompt, which are then evaluated by the gpt-oss-120b (mixed) judge on the lite dataset (160 samples)")

         with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
             with gr.TabItem("Leaderboard"):
@@ -153,7 +154,7 @@ with gr.Blocks(theme=theme) as app:

     with gr.TabItem("LLM Judge"):
         with gr.Row():
-            gr.Markdown("LLM Judge Leaderboard")
+            gr.Markdown("LLM Judge Leaderboard: LLM judges are evaluated on whether they can accurately predict the human-labelled criterion fulfillment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 but also whether an LLM judge displays bias towards or against any model, using a Bias Index. The Overall score is calculated as Overall F1 - Bias Index.")
         with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
             with gr.TabItem("Leaderboard"):
                 with gr.Row():
@@ -196,7 +197,7 @@ with gr.Blocks(theme=theme) as app:
     with gr.TabItem("Report Generation w Docs"):
         with gr.Row():
             with gr.Column(scale=7):
-                gr.Markdown("Report Generation Leaderboard with Grounding Documents")
+                gr.Markdown("Report Generation Leaderboard with Grounding Documents: LLMs generate reports with the human-curated reference documents as context. Results below are based on the full dataset, with gpt-oss-120b (mixed) as the judge.")

         with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
             with gr.TabItem("Leaderboard"):
@@ -256,6 +257,26 @@ with gr.Blocks(theme=theme) as app:
         regex_table, inputs=[rewardbench_table_hidden_v2, search_1_v2, model_types_1_v2], outputs=rewardbench_table_v2
     )

+    with gr.Row():
+        with gr.Accordion("π Frequently Asked Questions", open=False):
+            citation_button = gr.Textbox(
+                value=r"""1. How is the cost calculated? We use the token cost from https://openrouter.ai/models multiplied by the total input/output tokens in each evaluation.""",
+                lines=1,
+                label="FAQ",
+                elem_id="faq_box",
+            )
+
+    with gr.Row():
+        with gr.Accordion("π Understand our metrics", open=False):
+            citation_button = gr.Textbox(
+                value=r"""Report Generation (w Docs): We first generate the response. Then we grade the response against the human-annotated rubrics. Finally, we calculate the proportion of rubrics satisfied by each response, weighted by their criterion weight, to derive a score for each response.
+LLM Judge: We calculate the macro-F1 of the LLM-judge-predicted criterion fulfillment against the human-labelled criterion fulfillment to get Overall F1. We then calculate the bias for each model by taking the mean of predicted fulfillment minus the mean of human-labelled fulfillment. The Bias Index is max(bias) - min(bias) across models, and Overall is Overall F1 - Bias Index.""",
+                lines=4,
+                label="Metrics",
+                elem_id="metrics_box",
+            )
+
+
     with gr.Row():
         with gr.Accordion("π Citation and Credits", open=False):
             citation_button = gr.Textbox(
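For reference, a minimal sketch of the cost calculation described in the FAQ box added above, assuming per-million-token prices are looked up by hand on https://openrouter.ai/models; the prices and token counts below are illustrative placeholders, not leaderboard numbers.

```python
# Illustrative sketch of the FAQ's cost formula: OpenRouter per-token price
# multiplied by the total input/output tokens used during an evaluation.
# The prices and token counts here are made-up placeholders.

def evaluation_cost(input_tokens: int, output_tokens: int,
                    usd_per_m_input: float, usd_per_m_output: float) -> float:
    """Total USD cost = input tokens * input price + output tokens * output price."""
    return (input_tokens / 1e6) * usd_per_m_input + (output_tokens / 1e6) * usd_per_m_output

# Example: 3.2M input tokens and 0.9M output tokens at $1.10/$4.40 per million tokens.
print(f"${evaluation_cost(3_200_000, 900_000, 1.10, 4.40):.2f}")
```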
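A minimal sketch of the report-generation scoring described in the metrics box above, assuming each rubric criterion carries a weight and a binary satisfied/not-satisfied judgement; the data layout and helper name are illustrative, not ProfBench's actual code.

```python
# Illustrative weighted rubric scoring: a response's score is the
# weight-normalised proportion of rubric criteria the judge marks as satisfied.
# The data structure and function name are assumptions, not ProfBench code.

def rubric_score(criteria: list[dict]) -> float:
    """criteria: [{"weight": float, "satisfied": bool}, ...] for one response."""
    total_weight = sum(c["weight"] for c in criteria)
    earned = sum(c["weight"] for c in criteria if c["satisfied"])
    return earned / total_weight if total_weight else 0.0

# Example: three criteria, the heavier two satisfied -> 0.8
print(rubric_score([
    {"weight": 0.5, "satisfied": True},
    {"weight": 0.3, "satisfied": True},
    {"weight": 0.2, "satisfied": False},
]))
```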
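And a minimal sketch of the LLM Judge metric from the same box: macro-F1 of judge-predicted vs. human-labelled criterion fulfillment gives Overall F1, per-model bias is the mean predicted fulfillment minus the mean human-labelled fulfillment, the Bias Index is max(bias) - min(bias) across the judged models, and Overall = Overall F1 - Bias Index. The fulfillment labels below are toy values, not real annotations.

```python
# Illustrative computation of Overall F1, Bias Index and Overall for an LLM judge.
# Labels are binary criterion-fulfillment values; the data here is a toy example.

def f1_for_class(y_true, y_pred, cls):
    """F1 score treating `cls` as the positive class."""
    tp = sum(1 for t, p in zip(y_true, y_pred) if p == cls and t == cls)
    fp = sum(1 for t, p in zip(y_true, y_pred) if p == cls and t != cls)
    fn = sum(1 for t, p in zip(y_true, y_pred) if p != cls and t == cls)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0

def macro_f1(y_true, y_pred):
    """Unweighted mean of the per-class F1 scores (classes 0 and 1)."""
    return sum(f1_for_class(y_true, y_pred, cls) for cls in (0, 1)) / 2

# Toy criterion-fulfillment labels for the three judged report generators.
judgements = {
    "o3":      {"human": [1, 0, 1, 1], "judge": [1, 1, 1, 1]},
    "grok4":   {"human": [0, 1, 0, 1], "judge": [0, 1, 0, 0]},
    "r1-0528": {"human": [1, 1, 0, 0], "judge": [1, 1, 0, 0]},
}

all_human = [y for m in judgements.values() for y in m["human"]]
all_judge = [y for m in judgements.values() for y in m["judge"]]
overall_f1 = macro_f1(all_human, all_judge)

# Bias per model: mean predicted fulfillment minus mean human-labelled fulfillment.
bias = {name: sum(m["judge"]) / len(m["judge"]) - sum(m["human"]) / len(m["human"])
        for name, m in judgements.items()}
bias_index = max(bias.values()) - min(bias.values())

print(f"Overall F1: {overall_f1:.3f}, Bias Index: {bias_index:.3f}, Overall: {overall_f1 - bias_index:.3f}")
```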