zhilinw committed on
Commit
5f536dc
·
verified ·
1 Parent(s): 41e89c2

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -2
app.py CHANGED
@@ -154,7 +154,7 @@ with gr.Blocks(theme=theme) as app:
154
 
155
  with gr.TabItem("LLM Judge"):
156
  with gr.Row():
157
- gr.Markdown("LLM Judge Leaderboard: LLM Judges are evaluated based on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 across 3486 samples but also whether LLM-Judge display bias towards/against any models using a Bias Index. The Overall score is calculated based on Overall F1 - Bias Index. \nEvaluation and cost estimations last performed on 20 Sep 2025.")
158
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
159
  with gr.TabItem("Leaderboard"):
160
  with gr.Row():
@@ -197,7 +197,7 @@ with gr.Blocks(theme=theme) as app:
197
  with gr.TabItem("Report Generation w Docs"):
198
  with gr.Row():
199
  with gr.Column(scale=7):
200
- gr.Markdown("Report Generation Leaderboard with Grounding Documents: LLMs generate reports with the human-curated reference documents as context. Results below are based on the full dataset and gpt-oss-120b (mixed) as judge. \nEvaluation and cost estimations last performed on 20 Sep 2025.")
201
 
202
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
203
  with gr.TabItem("Leaderboard"):
 
154
 
155
  with gr.TabItem("LLM Judge"):
156
  with gr.Row():
157
+ gr.Markdown("LLM Judge Leaderboard: LLM Judges are evaluated based on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 across 3486 samples but also whether LLM-Judge display bias towards/against any models using a Bias Index. The Overall score is calculated based on Overall F1 - Bias Index. \nEvaluation and cost estimation last performed on 20 Sep 2025.")
158
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
159
  with gr.TabItem("Leaderboard"):
160
  with gr.Row():
 
197
  with gr.TabItem("Report Generation w Docs"):
198
  with gr.Row():
199
  with gr.Column(scale=7):
200
+ gr.Markdown("Report Generation Leaderboard with Grounding Documents: LLMs generate reports with the human-curated reference documents as context. Results below are based on the full dataset and gpt-oss-120b (mixed) as judge. \nEvaluation and cost estimation last performed on 20 Sep 2025.")
201
 
202
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
203
  with gr.TabItem("Leaderboard"):