Upload app.py
Browse files
app.py
CHANGED
|
@@ -154,7 +154,7 @@ with gr.Blocks(theme=theme) as app:
|
|
| 154 |
|
| 155 |
with gr.TabItem("LLM Judge"):
|
| 156 |
with gr.Row():
|
| 157 |
-
gr.Markdown("LLM Judge Leaderboard: LLM Judges are evaluated based on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 across 3486 samples but also whether LLM-Judge display bias towards/against any models using a Bias Index. The Overall score is calculated based on Overall F1 - Bias Index. \nEvaluation and cost
|
| 158 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 159 |
with gr.TabItem("Leaderboard"):
|
| 160 |
with gr.Row():
|
|
@@ -197,7 +197,7 @@ with gr.Blocks(theme=theme) as app:
|
|
| 197 |
with gr.TabItem("Report Generation w Docs"):
|
| 198 |
with gr.Row():
|
| 199 |
with gr.Column(scale=7):
|
| 200 |
-
gr.Markdown("Report Generation Leaderboard with Grounding Documents: LLMs generate reports with the human-curated reference documents as context. Results below are based on the full dataset and gpt-oss-120b (mixed) as judge. \nEvaluation and cost
|
| 201 |
|
| 202 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 203 |
with gr.TabItem("Leaderboard"):
|
|
|
|
| 154 |
|
| 155 |
with gr.TabItem("LLM Judge"):
|
| 156 |
with gr.Row():
|
| 157 |
+
gr.Markdown("LLM Judge Leaderboard: LLM Judges are evaluated based on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 across 3486 samples but also whether LLM-Judge display bias towards/against any models using a Bias Index. The Overall score is calculated based on Overall F1 - Bias Index. \nEvaluation and cost estimation last performed on 20 Sep 2025.")
|
| 158 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 159 |
with gr.TabItem("Leaderboard"):
|
| 160 |
with gr.Row():
|
|
|
|
| 197 |
with gr.TabItem("Report Generation w Docs"):
|
| 198 |
with gr.Row():
|
| 199 |
with gr.Column(scale=7):
|
| 200 |
+
gr.Markdown("Report Generation Leaderboard with Grounding Documents: LLMs generate reports with the human-curated reference documents as context. Results below are based on the full dataset and gpt-oss-120b (mixed) as judge. \nEvaluation and cost estimation last performed on 20 Sep 2025.")
|
| 201 |
|
| 202 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 203 |
with gr.TabItem("Leaderboard"):
|