zhilinw committed on
Commit
5f536dc
·
verified ·
1 Parent(s): 41e89c2

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -2
app.py CHANGED
@@ -154,7 +154,7 @@ with gr.Blocks(theme=theme) as app:
154
 
155
  with gr.TabItem("LLM Judge"):
156
  with gr.Row():
157
- gr.Markdown("LLM Judge Leaderboard: LLM Judges are evaluated based on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 across 3486 samples but also whether LLM-Judge display bias towards/against any models using a Bias Index. The Overall score is calculated based on Overall F1 - Bias Index. \nEvaluation and cost estimations last performed on 20 Sep 2025.")
158
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
159
  with gr.TabItem("Leaderboard"):
160
  with gr.Row():
@@ -197,7 +197,7 @@ with gr.Blocks(theme=theme) as app:
197
  with gr.TabItem("Report Generation w Docs"):
198
  with gr.Row():
199
  with gr.Column(scale=7):
200
- gr.Markdown("Report Generation Leaderboard with Grounding Documents: LLMs generate reports with the human-curated reference documents as context. Results below are based on the full dataset and gpt-oss-120b (mixed) as judge. \nEvaluation and cost estimations last performed on 20 Sep 2025.")
201
 
202
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
203
  with gr.TabItem("Leaderboard"):
 
154
 
155
  with gr.TabItem("LLM Judge"):
156
  with gr.Row():
157
+ gr.Markdown("LLM Judge Leaderboard: LLM Judges are evaluated based on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 across 3486 samples but also whether LLM-Judge display bias towards/against any models using a Bias Index. The Overall score is calculated based on Overall F1 - Bias Index. \nEvaluation and cost estimation last performed on 20 Sep 2025.")
158
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
159
  with gr.TabItem("Leaderboard"):
160
  with gr.Row():
 
197
  with gr.TabItem("Report Generation w Docs"):
198
  with gr.Row():
199
  with gr.Column(scale=7):
200
+ gr.Markdown("Report Generation Leaderboard with Grounding Documents: LLMs generate reports with the human-curated reference documents as context. Results below are based on the full dataset and gpt-oss-120b (mixed) as judge. \nEvaluation and cost estimation last performed on 20 Sep 2025.")
201
 
202
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
203
  with gr.TabItem("Leaderboard"):