zhilinw committed (verified)
Commit 6e03470 · 1 Parent(s): d43cc64

Upload app.py

Files changed (1):
  app.py +5 -4
app.py CHANGED
@@ -154,7 +154,7 @@ with gr.Blocks(theme=theme) as app:
 
     with gr.TabItem("LLM Judge"):
         with gr.Row():
-            gr.Markdown("LLM Judge Leaderboard: LLM Judges are evaluated based on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 but also whether LLM-Judge display bias towards/against any models using a Bias Index. The Overall score is calculated based on Overall F1 - Bias Index.")
+            gr.Markdown("LLM Judge Leaderboard: LLM Judges are evaluated based on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 across 3486 samples but also whether LLM-Judge display bias towards/against any models using a Bias Index. The Overall score is calculated based on Overall F1 - Bias Index.")
         with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
             with gr.TabItem("Leaderboard"):
                 with gr.Row():
@@ -260,14 +260,15 @@ with gr.Blocks(theme=theme) as app:
     with gr.Row():
         with gr.Accordion("📚 Frequently Asked Questions", open=False):
             citation_button = gr.Textbox(
-                value=r"""1. How is the cost calculated?: We use the token cost from https://openrouter.ai/models multipled by the total input/output tokens in each evaluation.""",
-                lines=1,
+                value=r"""1. How is the cost calculated?: We use the token cost from https://openrouter.ai/models multipled by the total input/output tokens in each evaluation.
+2. How can I run Report Generation Leaderboard with Grounding Documents: This benchmark is unable to be run externally at the moment since we are unable to release the required grounding documents. We are working on it.""",
+                lines=2,
                 label="FAQ",
                 elem_id="faq_box",
             )
 
     with gr.Row():
-        with gr.Accordion("📚 Understand our metrics", open=False):
+        with gr.Accordion("📚 Understand the Metrics", open=False):
             citation_button = gr.Textbox(
                 value=r"""Response Generation (w Docs): We first generate the response. Then we grade the response against the human-annotated rubrics. Finally, we calculate the proportion of rubrics satisfied by each response, weighted by their criterion-weight to derive a score for each response.
 LLM Judge: We calculate macro-F1 of the LLM-judge predicted criteria-fulfillment against the human-labelled criterion fulfillment to get Overall F1. We then calculate the bias for each model by taking mean of predicted fulfilment minus mean of human-labelled fulfilment. We calculate Bias Index by taking max(bias) - min(bias) across models. Overall is calculated by Overall F1 - Bias Index.""",
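The FAQ entry touched by this commit states the cost formula only in words. As a rough illustration (not part of app.py), the arithmetic it describes would look like the sketch below; the function name, price arguments, and example numbers are hypothetical, with per-million-token prices read manually from https://openrouter.ai/models.

# Illustrative sketch of the FAQ's cost formula (not leaderboard code):
# cost = OpenRouter per-token price multiplied by total input/output tokens per evaluation.
def evaluation_cost(input_tokens: int, output_tokens: int,
                    input_price_per_mtok: float, output_price_per_mtok: float) -> float:
    """Estimated USD cost of one evaluation run (prices quoted per million tokens)."""
    return (input_tokens / 1e6) * input_price_per_mtok + (output_tokens / 1e6) * output_price_per_mtok

# Example with made-up prices and token counts:
print(round(evaluation_cost(2_000_000, 500_000, input_price_per_mtok=2.0, output_price_per_mtok=8.0), 2))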
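The "Understand the Metrics" text in the diff defines both scores only in prose. A minimal sketch of how those definitions could be computed, assuming per-model lists of 0/1 criterion-fulfilment labels; the data layout, function names, and the use of scikit-learn's f1_score are assumptions for illustration, not the leaderboard's actual implementation.

# Illustrative sketch of the quoted metric definitions (assumed data layout):
# - Response Generation (w Docs): criterion-weighted proportion of rubrics satisfied.
# - LLM Judge: Overall = macro-F1 (predicted vs. human fulfilment) - Bias Index,
#   where per-model bias = mean(predicted) - mean(human) and
#   Bias Index = max(bias) - min(bias) across the graded models.
from sklearn.metrics import f1_score


def response_score(satisfied: list[bool], weights: list[float]) -> float:
    """Criterion-weighted proportion of rubric criteria satisfied by one response."""
    return sum(w for ok, w in zip(satisfied, weights) if ok) / sum(weights)


def judge_scores(pred: dict[str, list[int]], human: dict[str, list[int]]) -> dict[str, float]:
    """Overall F1, Bias Index, and Overall for one LLM judge, per the quoted definitions."""
    all_pred = [p for model in pred for p in pred[model]]
    all_human = [h for model in human for h in human[model]]
    overall_f1 = f1_score(all_human, all_pred, average="macro")

    bias = {m: sum(pred[m]) / len(pred[m]) - sum(human[m]) / len(human[m]) for m in pred}
    bias_index = max(bias.values()) - min(bias.values())
    return {"overall_f1": overall_f1, "bias_index": bias_index, "overall": overall_f1 - bias_index}

Under these assumptions, a judge that over-grades one model and under-grades another is penalised through the Bias Index even when its Overall F1 is high.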