zhilinw committed
Commit d0543d1 · verified · 1 Parent(s): c894cad

Upload 2 files

Files changed (2)
  1. app.py +1 -1
  2. report_generation.jsonl +4 -0
app.py CHANGED
@@ -111,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
     with gr.TabItem("Report Generation"):
         with gr.Row():
             with gr.Column(scale=7):
-                gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 12 Nov 2025.")
+                gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by gpt-oss-120b (mixed) judge with the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 21 Nov 2025.")
 
         with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
             with gr.TabItem("Leaderboard"):
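For context on how the two changed files fit together, here is a minimal, hypothetical sketch of a Gradio tab that reads report_generation.jsonl into the Leaderboard view. The load_leaderboard helper, the pandas-based loading, and the sort order are assumptions for illustration only; they are not taken from app.py, which this diff shows only in part.

import json

import gradio as gr
import pandas as pd


def load_leaderboard(path: str = "report_generation.jsonl") -> pd.DataFrame:
    # Each non-empty line of the JSONL file is one model entry (see the diff below).
    with open(path, encoding="utf-8") as f:
        rows = [json.loads(line) for line in f if line.strip()]
    # Sorting by "Overall" is an assumed display choice, not taken from app.py.
    return pd.DataFrame(rows).sort_values("Overall", ascending=False)


with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Report Generation"):
            gr.Markdown("Report Generation Leaderboard (sketch)")
            with gr.Tabs(elem_id="inner-tabs"):
                with gr.TabItem("Leaderboard"):
                    gr.Dataframe(value=load_leaderboard())

demo.launch()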
report_generation.jsonl CHANGED
@@ -37,3 +37,7 @@
 {"Model": "MoonshotAI/Kimi-K2-Thinking", "Category": "Open-weight Reasoning", "Overall": 48.9, "Physics": 34.1, "Chemistry": 51.4, "Finance": 40.1, "Consulting": 69.9, "Extraction": 42.4, "Reasoning": 49.3, "Style": 63.5, "Response Characters": 6383, "Input Tokens": 469, "Output Tokens": 14221, "Cost": 5.16}
 {"Model": "DeepSeek-AI/DeepSeek-V3.2-Exp (Thinking)", "Category": "Open-weight Reasoning", "Overall": 41.4, "Physics": 26.2, "Chemistry": 52.4, "Finance": 26.3, "Consulting": 60.8, "Extraction": 31.6, "Reasoning": 41.5, "Style": 56.4, "Response Characters": 5242, "Input Tokens": 458, "Output Tokens": 9938, "Cost": 0.66}
 {"Model": "DeepSeek-AI/DeepSeek-V3.2-Exp", "Category": "Open-weight Instruct", "Overall": 48.2, "Physics": 41.0, "Chemistry": 57.2, "Finance": 32.7, "Consulting": 61.7, "Extraction": 41.9, "Reasoning": 49.3, "Style": 55.5, "Response Characters": 7671, "Input Tokens": 456, "Output Tokens": 2423, "Cost": 0.17}
+{"Model": "Google/Gemini-3-Pro-Preview", "Category": "Closed-source Reasoning", "Overall": 51.7, "Physics": 36.6, "Chemistry": 66.3, "Finance": 34.4, "Consulting": 69.4, "Extraction": 46.8, "Reasoning": 53.2, "Style": 64.6, "Response Characters": 5360, "Input Tokens": 479, "Output Tokens": 9131, "Cost": 17.68}
+{"Model": "xAI/grok-4.1-fast (Thinking)", "Category": "Closed-source Reasoning", "Overall": 44.8, "Physics": 27.8, "Chemistry": 56.3, "Finance": 25.0, "Consulting": 70.2, "Extraction": 36.9, "Reasoning": 45.8, "Style": 61.6, "Response Characters": 3630, "Input Tokens": 598, "Output Tokens": 11826, "Cost": 0.97}
+{"Model": "xAI/grok-4.1-fast", "Category": "Closed-source Instruct", "Overall": 44.4, "Physics": 28.0, "Chemistry": 53.1, "Finance": 28.9, "Consulting": 67.5, "Extraction": 36.4, "Reasoning": 44.9, "Style": 64.0, "Response Characters": 3906, "Input Tokens": 531, "Output Tokens": 11320, "Cost": 0.92}
+{"Model": "OpenAI/GPT-5.1 (high)", "Category": "Closed-source Reasoning", "Overall": 54.9, "Physics": 42.7, "Chemistry": 66.5, "Finance": 42.9, "Consulting": 67.6, "Extraction": 49.5, "Reasoning": 58.7, "Style": 61.6, "Response Characters": 11627, "Input Tokens": 467, "Output Tokens": 17148, "Cost": 27.53}