Upload 2 files
Browse files
- app.py +1 -1
- report_generation.jsonl +8 -0
app.py
CHANGED
|
@@ -111,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
|
|
| 111 |
with gr.TabItem("Report Generation"):
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=7):
|
| 114 |
-
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on
|
| 115 |
|
| 116 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 117 |
with gr.TabItem("Leaderboard"):
|
|
|
|
| 111 |
with gr.TabItem("Report Generation"):
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column(scale=7):
|
| 114 |
+
gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 5 Nov 2025.")
|
| 115 |
|
| 116 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 117 |
with gr.TabItem("Leaderboard"):
|
report_generation.jsonl
CHANGED
|
@@ -26,3 +26,11 @@
|
|
| 26 |
{"Model": "MoonshotAI/Kimi-K2-Instruct-0905", "Category": "Open-weight Instruct", "Overall": 44.2, "Physics": 33.7, "Chemistry": 47.3, "Finance": 32.2, "Consulting": 63.6, "Extraction": 36.6, "Reasoning": 43.2, "Style": 58.8, "Response Characters": 5322, "Input Tokens": 481, "Output Tokens": 1709, "Cost": 0.55}
|
| 27 |
{"Model": "Meta/llama-4-maverick", "Category": "Open-weight Instruct", "Overall": 34.4, "Physics": 31.3, "Chemistry": 35.4, "Finance": 22.1, "Consulting": 48.8, "Extraction": 27.2, "Reasoning": 34.6, "Style": 32.6, "Response Characters": 4532, "Input Tokens": 479, "Output Tokens": 1292, "Cost": 0.14}
|
| 28 |
{"Model": "Meta/llama-4-scout", "Category": "Open-weight Instruct", "Overall": 31.2, "Physics": 19.2, "Chemistry": 30.0, "Finance": 19.5, "Consulting": 55.9, "Extraction": 26.2, "Reasoning": 29.9, "Style": 38.9, "Response Characters": 4200, "Input Tokens": 457, "Output Tokens": 1197, "Cost": 0.06}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
{"Model": "MoonshotAI/Kimi-K2-Instruct-0905", "Category": "Open-weight Instruct", "Overall": 44.2, "Physics": 33.7, "Chemistry": 47.3, "Finance": 32.2, "Consulting": 63.6, "Extraction": 36.6, "Reasoning": 43.2, "Style": 58.8, "Response Characters": 5322, "Input Tokens": 481, "Output Tokens": 1709, "Cost": 0.55}
|
| 27 |
{"Model": "Meta/llama-4-maverick", "Category": "Open-weight Instruct", "Overall": 34.4, "Physics": 31.3, "Chemistry": 35.4, "Finance": 22.1, "Consulting": 48.8, "Extraction": 27.2, "Reasoning": 34.6, "Style": 32.6, "Response Characters": 4532, "Input Tokens": 479, "Output Tokens": 1292, "Cost": 0.14}
|
| 28 |
{"Model": "Meta/llama-4-scout", "Category": "Open-weight Instruct", "Overall": 31.2, "Physics": 19.2, "Chemistry": 30.0, "Finance": 19.5, "Consulting": 55.9, "Extraction": 26.2, "Reasoning": 29.9, "Style": 38.9, "Response Characters": 4200, "Input Tokens": 457, "Output Tokens": 1197, "Cost": 0.06}
|
| 29 |
+
{"Model": "xAI/grok-4-fast (Thinking)", "Category": "Closed-source Reasoning", "Overall": 48.7, "Physics": 34.3, "Chemistry": 57.9, "Finance": 34.2, "Consulting": 68.5, "Extraction": 40.4, "Reasoning": 49.9, "Style": 64.5, "Response Characters": 6333, "Input Tokens": 598, "Output Tokens": 11536, "Cost": 0.94}
|
| 30 |
+
{"Model": "xAI/grok-4-fast", "Category": "Closed-source Instruct", "Overall": 45.9, "Physics": 31.7, "Chemistry": 54.2, "Finance": 30.5, "Consulting": 67.3, "Extraction": 40.2, "Reasoning": 48.7, "Style": 58.3, "Response Characters": 6625, "Input Tokens": 598, "Output Tokens": 11733, "Cost": 0.96}
|
| 31 |
+
{"Model": "Anthropic/claude-haiku-4.5 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 39.6, "Physics": 32.4, "Chemistry": 48.7, "Finance": 20.9, "Consulting": 56.3, "Extraction": 29.6, "Reasoning": 39.4, "Style": 52.8, "Response Characters": 9694, "Input Tokens": 559, "Output Tokens": 17673, "Cost": 14.23}
|
| 32 |
+
{"Model": "Anthropic/claude-haiku-4.5", "Category": "Closed-source Instruct", "Overall": 38.0, "Physics": 25.6, "Chemistry": 47.3, "Finance": 18.0, "Consulting": 61.3, "Extraction": 28.9, "Reasoning": 38.5, "Style": 55.0, "Response Characters": 10776, "Input Tokens": 531, "Output Tokens": 3439, "Cost": 2.84}
|
| 33 |
+
{"Model": "Anthropic/claude-sonnet-4.5 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 44.8, "Physics": 34.5, "Chemistry": 63.7, "Finance": 27.5, "Consulting": 53.3, "Extraction": 36.6, "Reasoning": 45.7, "Style": 49.3, "Response Characters": 8356, "Input Tokens": 559, "Output Tokens": 11358, "Cost": 27.53}
|
| 34 |
+
{"Model": "Anthropic/claude-sonnet-4.5", "Category": "Closed-source Instruct", "Overall": 48.1, "Physics": 35.5, "Chemistry": 55.9, "Finance": 34.7, "Consulting": 66.1, "Extraction": 44.8, "Reasoning": 49.9, "Style": 51.2, "Response Characters": 7987, "Input Tokens": 531, "Output Tokens": 2299, "Cost": 5.77}
|
| 35 |
+
{"Model": "MiniMax/M2 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 34.1, "Physics": 22.6, "Chemistry": 38.1, "Finance": 25.2, "Consulting": 50.5, "Extraction": 25.4, "Reasoning": 31.9, "Style": 45.9, "Response Characters": 6485, "Input Tokens": 433, "Output Tokens": 14932, "Cost": 1.09}
|
| 36 |
+
{"Model": "MiniMax/M2", "Category": "Open-weight Instruct", "Overall": 37.7, "Physics": 25.2, "Chemistry": 49.5, "Finance": 25.8, "Consulting": 50.3, "Extraction": 30.1, "Reasoning": 37.3, "Style": 50.9, "Response Characters": 6312, "Input Tokens": 481, "Output Tokens": 8876, "Cost": 0.65}
|