Upload 2 files
Browse files- app.py +1 -1
- report_generation.jsonl +1 -0
app.py
CHANGED
|
@@ -111,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
|
|
| 111 |
with gr.TabItem("Report Generation"):
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=7):
|
| 114 |
-
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on
|
| 115 |
|
| 116 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 117 |
with gr.TabItem("Leaderboard"):
|
|
|
|
| 111 |
with gr.TabItem("Report Generation"):
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=7):
|
| 114 |
+
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 28 Nov 2025.")
|
| 115 |
|
| 116 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 117 |
with gr.TabItem("Leaderboard"):
|
report_generation.jsonl
CHANGED
|
@@ -41,3 +41,4 @@
|
|
| 41 |
{"Model": "xAI/grok-4.1-fast (Thinking)", "Category": "Closed-source Reasoning", "Overall": 44.8, "Physics": 27.8, "Chemistry": 56.3, "Finance": 25.0, "Consulting": 70.2, "Extraction": 36.9, "Reasoning": 45.8, "Style": 61.6, "Response Characters": 3630, "Input Tokens": 598, "Output Tokens": 11826, "Cost": 0.97}
|
| 42 |
{"Model": "xAI/grok-4.1-fast", "Category": "Closed-source Instruct", "Overall": 44.4, "Physics": 28.0, "Chemistry": 53.1, "Finance": 28.9, "Consulting": 67.5, "Extraction": 36.4, "Reasoning": 44.9, "Style": 64.0, "Response Characters": 3906, "Input Tokens": 531, "Output Tokens": 11320, "Cost": 0.92}
|
| 43 |
{"Model": "OpenAI/GPT-5.1 (high)", "Category": "Closed-source Reasoning", "Overall": 54.9, "Physics": 42.7, "Chemistry": 66.5, "Finance": 42.9, "Consulting": 67.6, "Extraction": 49.5, "Reasoning": 58.7, "Style": 61.6, "Response Characters": 11627, "Input Tokens": 467, "Output Tokens": 17148, "Cost": 27.53}
|
|
|
|
|
|
| 41 |
{"Model": "xAI/grok-4.1-fast (Thinking)", "Category": "Closed-source Reasoning", "Overall": 44.8, "Physics": 27.8, "Chemistry": 56.3, "Finance": 25.0, "Consulting": 70.2, "Extraction": 36.9, "Reasoning": 45.8, "Style": 61.6, "Response Characters": 3630, "Input Tokens": 598, "Output Tokens": 11826, "Cost": 0.97}
|
| 42 |
{"Model": "xAI/grok-4.1-fast", "Category": "Closed-source Instruct", "Overall": 44.4, "Physics": 28.0, "Chemistry": 53.1, "Finance": 28.9, "Consulting": 67.5, "Extraction": 36.4, "Reasoning": 44.9, "Style": 64.0, "Response Characters": 3906, "Input Tokens": 531, "Output Tokens": 11320, "Cost": 0.92}
|
| 43 |
{"Model": "OpenAI/GPT-5.1 (high)", "Category": "Closed-source Reasoning", "Overall": 54.9, "Physics": 42.7, "Chemistry": 66.5, "Finance": 42.9, "Consulting": 67.6, "Extraction": 49.5, "Reasoning": 58.7, "Style": 61.6, "Response Characters": 11627, "Input Tokens": 467, "Output Tokens": 17148, "Cost": 27.53}
|
| 44 |
+
{"Model": "Anthropic/claude-opus-4.5 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 54.2, "Physics": 40.5, "Chemistry": 68.5, "Finance": 39.0, "Consulting": 68.7, "Extraction": 48.6, "Reasoning": 56.1, "Style": 65.7, "Response Characters": 9400, "Input Tokens": 560, "Output Tokens": 17627, "Cost": 70.96}
|