Upload app.py
app.py
CHANGED
@@ -154,7 +154,7 @@ with gr.Blocks(theme=theme) as app:
 
         with gr.TabItem("LLM Judge"):
             with gr.Row():
-                gr.Markdown("LLM Judge Leaderboard: LLM Judges are evaluated on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 but also whether the LLM Judge displays bias towards/against any model, using a Bias Index. The Overall score is calculated as Overall F1 - Bias Index.")
+                gr.Markdown("LLM Judge Leaderboard: LLM Judges are evaluated on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 across 3486 samples but also whether the LLM Judge displays bias towards/against any model, using a Bias Index. The Overall score is calculated as Overall F1 - Bias Index.")
             with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                 with gr.TabItem("Leaderboard"):
                     with gr.Row():
@@ -260,14 +260,15 @@ with gr.Blocks(theme=theme) as app:
     with gr.Row():
         with gr.Accordion("π Frequently Asked Questions", open=False):
             citation_button = gr.Textbox(
-                value=r"""1. How is the cost calculated?: We use the token cost from https://openrouter.ai/models multiplied by the total input/output tokens in each evaluation.
-""",
+                value=r"""1. How is the cost calculated?: We use the token cost from https://openrouter.ai/models multiplied by the total input/output tokens in each evaluation.
+2. How can I run the Report Generation Leaderboard with Grounding Documents?: This benchmark cannot be run externally at the moment, since we are unable to release the required grounding documents. We are working on it.""",
+                lines=2,
                 label="FAQ",
                 elem_id="faq_box",
             )
 
     with gr.Row():
-        with gr.Accordion("π Understand
+        with gr.Accordion("π Understand the Metrics", open=False):
             citation_button = gr.Textbox(
                 value=r"""Response Generation (w Docs): We first generate the response. Then we grade the response against the human-annotated rubrics. Finally, we calculate the proportion of rubrics satisfied by each response, weighted by their criterion weight, to derive a score for each response.
 LLM Judge: We calculate the macro-F1 of the LLM-Judge-predicted criterion fulfilment against the human-labelled criterion fulfilment to get Overall F1. We then calculate the bias for each model by taking the mean of predicted fulfilment minus the mean of human-labelled fulfilment. We calculate the Bias Index as max(bias) - min(bias) across models. Overall is calculated as Overall F1 - Bias Index.""",
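For concreteness, the cost formula in FAQ item 1 is just per-token pricing multiplied by token usage. A minimal sketch, assuming made-up prices rather than actual OpenRouter rates:

# Hypothetical sketch of the FAQ's cost formula: per-token prices
# (as listed on https://openrouter.ai/models) times tokens used.
def evaluation_cost(input_tokens: int, output_tokens: int,
                    usd_per_input_token: float, usd_per_output_token: float) -> float:
    """Total cost of one evaluation in USD."""
    return (input_tokens * usd_per_input_token
            + output_tokens * usd_per_output_token)

# Made-up prices of $2 (input) / $8 (output) per million tokens:
print(f"${evaluation_cost(120_000, 15_000, 2 / 1e6, 8 / 1e6):.2f}")  # $0.36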
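The Response Generation score described in the metrics accordion is a criterion-weighted proportion of satisfied rubrics. A minimal sketch, with invented rubric data:

# Hypothetical sketch of the weighted rubric score: the proportion of
# rubric criteria a response satisfies, weighted by criterion weight.
def rubric_score(satisfied: list[bool], weights: list[float]) -> float:
    """Weighted fraction of rubric criteria the response fulfils."""
    assert len(satisfied) == len(weights)
    earned = sum(w for ok, w in zip(satisfied, weights) if ok)
    return earned / sum(weights)

# Example: 3 of 4 criteria met, with the unmet one weighted heavily.
print(rubric_score([True, True, False, True], [1.0, 2.0, 3.0, 1.0]))  # 4/7 ~= 0.571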
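The LLM Judge metric chain (Overall F1, per-model bias, Bias Index, Overall) can be sketched the same way. The fulfilment labels below are invented, and scikit-learn's macro-F1 stands in for whatever implementation the leaderboard actually uses:

from sklearn.metrics import f1_score

# Hypothetical per-criterion fulfilment labels (1 = criterion met) for the
# three judged models; human = annotator labels, judge = LLM Judge predictions.
human = {"o3": [1, 0, 1, 1], "Grok4": [0, 1, 1, 0], "R1-0528": [1, 1, 0, 0]}
judge = {"o3": [1, 0, 1, 1], "Grok4": [1, 1, 1, 0], "R1-0528": [1, 0, 0, 0]}

# Overall F1: macro-F1 of predicted vs human labels pooled over all samples.
overall_f1 = f1_score(sum(human.values(), []), sum(judge.values(), []), average="macro")

# Per-model bias: mean predicted fulfilment minus mean human fulfilment,
# so over-grading a model is positive and under-grading is negative.
bias = {m: sum(judge[m]) / len(judge[m]) - sum(human[m]) / len(human[m]) for m in human}
bias_index = max(bias.values()) - min(bias.values())

overall = overall_f1 - bias_index
print(overall_f1, bias_index, overall)  # ~0.829, 0.5, ~0.329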