Upload 2 files
Browse files- app.py +55 -3
- report_generation.jsonl +28 -0
app.py
CHANGED
|
@@ -6,7 +6,8 @@ import pandas as pd
|
|
| 6 |
###########################################
|
| 7 |
|
| 8 |
llm_judge_filename = "llm_judge_results.jsonl"
|
| 9 |
-
response_generation_filename = "
|
|
|
|
| 10 |
|
| 11 |
def load_filename_into_df(filename):
|
| 12 |
df = pd.read_json(filename, lines=True)
|
|
@@ -106,10 +107,10 @@ with gr.Blocks(theme=theme) as app:
|
|
| 106 |
gr.Markdown(CAPTION_V2)
|
| 107 |
|
| 108 |
with gr.Tabs(elem_id="outer-tabs", elem_classes="tabs-big") as tabs_big:
|
| 109 |
-
with gr.TabItem("Report Generation
|
| 110 |
with gr.Row():
|
| 111 |
with gr.Column(scale=7):
|
| 112 |
-
gr.Markdown("Report Generation Leaderboard
|
| 113 |
|
| 114 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 115 |
with gr.TabItem("Leaderboard"):
|
|
@@ -191,11 +192,58 @@ with gr.Blocks(theme=theme) as app:
|
|
| 191 |
elem_id="llm_judge_dataframe",
|
| 192 |
row_count=(25, "dynamic"),
|
| 193 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
|
| 196 |
search_1_v1.change(
|
| 197 |
regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
|
| 198 |
)
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
model_types_1.change(
|
| 201 |
regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
|
|
@@ -204,6 +252,10 @@ with gr.Blocks(theme=theme) as app:
|
|
| 204 |
regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
|
| 205 |
)
|
| 206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
with gr.Row():
|
| 208 |
with gr.Accordion("📚 Citation and Credits", open=False):
|
| 209 |
citation_button = gr.Textbox(
|
|
|
|
| 6 |
###########################################
|
| 7 |
|
| 8 |
llm_judge_filename = "llm_judge_results.jsonl"
|
| 9 |
+
response_generation_filename = "report_generation.jsonl"
|
| 10 |
+
response_generation_w_docs_filename = "report_generation_w_docs.jsonl"
|
| 11 |
|
| 12 |
def load_filename_into_df(filename):
|
| 13 |
df = pd.read_json(filename, lines=True)
|
|
|
|
| 107 |
gr.Markdown(CAPTION_V2)
|
| 108 |
|
| 109 |
with gr.Tabs(elem_id="outer-tabs", elem_classes="tabs-big") as tabs_big:
|
| 110 |
+
with gr.TabItem("Report Generation"):
|
| 111 |
with gr.Row():
|
| 112 |
with gr.Column(scale=7):
|
| 113 |
+
gr.Markdown("Report Generation Leaderboard")
|
| 114 |
|
| 115 |
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 116 |
with gr.TabItem("Leaderboard"):
|
|
|
|
| 192 |
elem_id="llm_judge_dataframe",
|
| 193 |
row_count=(25, "dynamic"),
|
| 194 |
)
|
| 195 |
+
|
| 196 |
+
with gr.TabItem("Report Generation w Docs"):
|
| 197 |
+
with gr.Row():
|
| 198 |
+
with gr.Column(scale=7):
|
| 199 |
+
gr.Markdown("Report Generation Leaderboard with Grounding Documents")
|
| 200 |
+
|
| 201 |
+
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
| 202 |
+
with gr.TabItem("Leaderboard"):
|
| 203 |
+
with gr.Row():
|
| 204 |
+
search_1_v2 = gr.Textbox(
|
| 205 |
+
label="Model Search (delimit with , )",
|
| 206 |
+
placeholder="Model Search (delimit with , )",
|
| 207 |
+
show_label=False,
|
| 208 |
+
scale=8,
|
| 209 |
+
)
|
| 210 |
+
model_types_1_v2 = gr.CheckboxGroup(
|
| 211 |
+
["Open-weight", "Closed-source", "Reasoning", "Instruct"],
|
| 212 |
+
value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
|
| 213 |
+
show_label=False,
|
| 214 |
+
scale=8,
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
with gr.Row():
|
| 218 |
+
col_types_response_generation = ["number"] + ["markdown"] + ["str"] + ["number"] * 12
|
| 219 |
+
df_response_generation_w_docs = load_filename_into_df(response_generation_w_docs_filename)
|
| 220 |
+
|
| 221 |
+
rewardbench_table_hidden_v2 = gr.Dataframe(
|
| 222 |
+
df_response_generation_w_docs.values,
|
| 223 |
+
datatype=col_types_response_generation,
|
| 224 |
+
headers=df_response_generation_w_docs.columns.tolist(),
|
| 225 |
+
visible=False,
|
| 226 |
+
)
|
| 227 |
+
|
| 228 |
+
rewardbench_table_v2 = gr.Dataframe(
|
| 229 |
+
regex_table(
|
| 230 |
+
df_response_generation_w_docs.copy(),
|
| 231 |
+
"",
|
| 232 |
+
["Open-weight", "Closed-source", "Reasoning", "Instruct"]
|
| 233 |
+
),
|
| 234 |
+
datatype=col_types_response_generation,
|
| 235 |
+
headers=df_response_generation_w_docs.columns.tolist(),
|
| 236 |
+
elem_id="response_generation_dataframe",
|
| 237 |
+
row_count=(25, "dynamic"),
|
| 238 |
+
)
|
| 239 |
|
| 240 |
search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
|
| 241 |
search_1_v1.change(
|
| 242 |
regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
|
| 243 |
)
|
| 244 |
+
search_1_v2.change(
|
| 245 |
+
regex_table, inputs=[rewardbench_table_hidden_v2, search_1_v2, model_types_1_v2], outputs=rewardbench_table_v2
|
| 246 |
+
)
|
| 247 |
|
| 248 |
model_types_1.change(
|
| 249 |
regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
|
|
|
|
| 252 |
regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
|
| 253 |
)
|
| 254 |
|
| 255 |
+
model_types_1_v2.change(
|
| 256 |
+
regex_table, inputs=[rewardbench_table_hidden_v2, search_1_v2, model_types_1_v2], outputs=rewardbench_table_v2
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
with gr.Row():
|
| 260 |
with gr.Accordion("📚 Citation and Credits", open=False):
|
| 261 |
citation_button = gr.Textbox(
|
report_generation.jsonl
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"Model": "OpenAI/GPT-5 (high)", "Category": "Closed-source Reasoning", "Overall": 49.4, "Physics": 45.7, "Chemistry": 62.6, "Finance": 29.9, "Consulting": 59.2, "Extraction": 37.1, "Reasoning": 51.7, "Style": 65.5, "Response Characters": 5876, "Input Tokens": 467, "Output Tokens": 16123, "Cost": 25.89}
|
| 2 |
+
{"Model": "OpenAI/GPT-5-mini (high)", "Category": "Closed-source Reasoning", "Overall": 41.6, "Physics": 40.0, "Chemistry": 59.5, "Finance": 11.8, "Consulting": 55.1, "Extraction": 32.7, "Reasoning": 42.1, "Style": 65.9, "Response Characters": 7870, "Input Tokens": 956, "Output Tokens": 15280, "Cost": 4.93}
|
| 3 |
+
{"Model": "OpenAI/GPT-5-nano (high)", "Category": "Closed-source Reasoning", "Overall": 36.9, "Physics": 29.1, "Chemistry": 37.9, "Finance": 23.6, "Consulting": 56.9, "Extraction": 26.5, "Reasoning": 35.6, "Style": 58.0, "Response Characters": 8915, "Input Tokens": 467, "Output Tokens": 23008, "Cost": 1.48}
|
| 4 |
+
{"Model": "OpenAI/o3", "Category": "Closed-source Reasoning", "Overall": 52.4, "Physics": 38.6, "Chemistry": 57.2, "Finance": 44.1, "Consulting": 69.8, "Extraction": 43.0, "Reasoning": 54.1, "Style": 59.2, "Response Characters": 4226, "Input Tokens": 467, "Output Tokens": 5569, "Cost": 7.28}
|
| 5 |
+
{"Model": "OpenAI/o4-mini", "Category": "Closed-source Reasoning", "Overall": 47.5, "Physics": 34.6, "Chemistry": 50.1, "Finance": 38.1, "Consulting": 67.2, "Extraction": 37.2, "Reasoning": 47.7, "Style": 60.4, "Response Characters": 3046, "Input Tokens": 467, "Output Tokens": 4335, "Cost": 0.77}
|
| 6 |
+
{"Model": "Google/Gemini-2.5-Pro", "Category": "Closed-source Reasoning", "Overall": 52.1, "Physics": 40.4, "Chemistry": 63.8, "Finance": 36.7, "Consulting": 67.5, "Extraction": 45.9, "Reasoning": 53.1, "Style": 62.6, "Response Characters": 8492, "Input Tokens": 480, "Output Tokens": 9102, "Cost": 14.66}
|
| 7 |
+
{"Model": "Google/Gemini-2.5-Flash (Thinking)", "Category": "Closed-source Reasoning", "Overall": 49.2, "Physics": 35.9, "Chemistry": 63.9, "Finance": 33.2, "Consulting": 63.8, "Extraction": 43.6, "Reasoning": 51.4, "Style": 57.3, "Response Characters": 18559, "Input Tokens": 480, "Output Tokens": 12943, "Cost": 5.2}
|
| 8 |
+
{"Model": "Google/Gemini-2.5-Flash-Lite (Thinking)", "Category": "Closed-source Reasoning", "Overall": 44.3, "Physics": 32.3, "Chemistry": 52.7, "Finance": 31.3, "Consulting": 61.0, "Extraction": 35.8, "Reasoning": 43.3, "Style": 56.7, "Response Characters": 12153, "Input Tokens": 480, "Output Tokens": 17302, "Cost": 1.12}
|
| 9 |
+
{"Model": "xAI/grok-4-0709", "Category": "Closed-source Reasoning", "Overall": 45.1, "Physics": 20.6, "Chemistry": 59.8, "Finance": 29.4, "Consulting": 70.5, "Extraction": 40.1, "Reasoning": 48.4, "Style": 65.2, "Response Characters": 4977, "Input Tokens": 1126, "Output Tokens": 17957, "Cost": 43.64}
|
| 10 |
+
{"Model": "Anthropic/claude-sonnet-4 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 42.5, "Physics": 39.5, "Chemistry": 53.3, "Finance": 21.2, "Consulting": 56.1, "Extraction": 29.5, "Reasoning": 42.5, "Style": 66.1, "Response Characters": 3621, "Input Tokens": 559, "Output Tokens": 7924, "Cost": 19.29}
|
| 11 |
+
{"Model": "OpenAI/gpt-oss-120b", "Category": "Open-weight Reasoning", "Overall": 50.0, "Physics": 43.6, "Chemistry": 53.5, "Finance": 35.3, "Consulting": 67.6, "Extraction": 39.7, "Reasoning": 51.5, "Style": 63.4, "Response Characters": 8657, "Input Tokens": 530, "Output Tokens": 4817, "Cost": 0.31}
|
| 12 |
+
{"Model": "OpenAI/gpt-oss-20b", "Category": "Open-weight Reasoning", "Overall": 42.3, "Physics": 33.6, "Chemistry": 40.5, "Finance": 28.7, "Consulting": 66.4, "Extraction": 29.9, "Reasoning": 44.1, "Style": 59.1, "Response Characters": 5609, "Input Tokens": 508, "Output Tokens": 5375, "Cost": 0.12}
|
| 13 |
+
{"Model": "DeepSeek-AI/DeepSeek-V3.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 45.9, "Physics": 32.3, "Chemistry": 53.9, "Finance": 35.6, "Consulting": 61.9, "Extraction": 39.4, "Reasoning": 50.0, "Style": 59.1, "Response Characters": 5760, "Input Tokens": 415, "Output Tokens": 6253, "Cost": 0.81}
|
| 14 |
+
{"Model": "Qwen/Qwen3-235B-A22B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 41.1, "Physics": 32.7, "Chemistry": 46.4, "Finance": 23.5, "Consulting": 61.9, "Extraction": 35.0, "Reasoning": 46.1, "Style": 63.4, "Response Characters": 11390, "Input Tokens": 490, "Output Tokens": 5568, "Cost": 0.54}
|
| 15 |
+
{"Model": "Qwen/Qwen3-30B-A3B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 37.6, "Physics": 19.0, "Chemistry": 44.7, "Finance": 25.6, "Consulting": 61.3, "Extraction": 29.4, "Reasoning": 42.4, "Style": 73.1, "Response Characters": 5892, "Input Tokens": 469, "Output Tokens": 6376, "Cost": 0.3}
|
| 16 |
+
{"Model": "OpenAI/GPT-4.1", "Category": "Closed-source Instruct", "Overall": 46.3, "Physics": 34.3, "Chemistry": 48.5, "Finance": 36.4, "Consulting": 65.9, "Extraction": 36.5, "Reasoning": 47.4, "Style": 55.7, "Response Characters": 7386, "Input Tokens": 468, "Output Tokens": 2394, "Cost": 3.21}
|
| 17 |
+
{"Model": "OpenAI/GPT-4.1-mini", "Category": "Closed-source Instruct", "Overall": 42.8, "Physics": 40.5, "Chemistry": 50.6, "Finance": 23.1, "Consulting": 57.0, "Extraction": 33.5, "Reasoning": 43.1, "Style": 58.7, "Response Characters": 7550, "Input Tokens": 468, "Output Tokens": 2322, "Cost": 0.62}
|
| 18 |
+
{"Model": "OpenAI/GPT-4.1-nano", "Category": "Closed-source Instruct", "Overall": 33.2, "Physics": 21.4, "Chemistry": 35.1, "Finance": 23.3, "Consulting": 53.2, "Extraction": 24.9, "Reasoning": 32.9, "Style": 48.3, "Response Characters": 6198, "Input Tokens": 468, "Output Tokens": 1799, "Cost": 0.12}
|
| 19 |
+
{"Model": "Google/Gemini-2.5-Flash", "Category": "Closed-source Instruct", "Overall": 47.8, "Physics": 37.0, "Chemistry": 57.6, "Finance": 36.7, "Consulting": 60.0, "Extraction": 41.8, "Reasoning": 50.3, "Style": 53.6, "Response Characters": 24479, "Input Tokens": 480, "Output Tokens": 6255, "Cost": 2.53}
|
| 20 |
+
{"Model": "Google/Gemini-2.5-Flash-Lite", "Category": "Closed-source Instruct", "Overall": 41.8, "Physics": 28.8, "Chemistry": 51.2, "Finance": 25.0, "Consulting": 62.1, "Extraction": 35.2, "Reasoning": 41.3, "Style": 52.1, "Response Characters": 26746, "Input Tokens": 480, "Output Tokens": 8723, "Cost": 0.57}
|
| 21 |
+
{"Model": "Anthropic/claude-sonnet-4", "Category": "Closed-source Instruct", "Overall": 41.2, "Physics": 34.8, "Chemistry": 47.1, "Finance": 18.9, "Consulting": 63.9, "Extraction": 32.6, "Reasoning": 40.9, "Style": 58.3, "Response Characters": 4047, "Input Tokens": 531, "Output Tokens": 1375, "Cost": 3.55}
|
| 22 |
+
{"Model": "Anthropic/claude-3.5-haiku", "Category": "Closed-source Instruct", "Overall": 21.2, "Physics": 6.9, "Chemistry": 25.2, "Finance": 8.5, "Consulting": 44.3, "Extraction": 16.6, "Reasoning": 19.7, "Style": 41.7, "Response Characters": 1618, "Input Tokens": 531, "Output Tokens": 519, "Cost": 0.4}
|
| 23 |
+
{"Model": "Qwen/Qwen3-235B-A22B-Instruct-2507", "Category": "Open-weight Instruct", "Overall": 45.0, "Physics": 31.9, "Chemistry": 52.4, "Finance": 32.9, "Consulting": 63.0, "Extraction": 38.1, "Reasoning": 48.4, "Style": 54.5, "Response Characters": 14314, "Input Tokens": 487, "Output Tokens": 5000, "Cost": 0.45}
|
| 24 |
+
{"Model": "Qwen/Qwen3-30B-A3B-Instruct-2507", "Category": "Open-weight Instruct", "Overall": 40.0, "Physics": 26.6, "Chemistry": 47.2, "Finance": 24.9, "Consulting": 61.4, "Extraction": 31.2, "Reasoning": 40.1, "Style": 50.7, "Response Characters": 10488, "Input Tokens": 487, "Output Tokens": 3654, "Cost": 0.2}
|
| 25 |
+
{"Model": "DeepSeek-AI/DeepSeek-V3.1", "Category": "Open-weight Instruct", "Overall": 46.6, "Physics": 37.1, "Chemistry": 51.7, "Finance": 34.4, "Consulting": 63.1, "Extraction": 40.8, "Reasoning": 47.2, "Style": 55.9, "Response Characters": 8740, "Input Tokens": 456, "Output Tokens": 2717, "Cost": 0.36}
|
| 26 |
+
{"Model": "MoonshotAI/Kimi-K2-Instruct-0905", "Category": "Open-weight Instruct", "Overall": 44.2, "Physics": 33.7, "Chemistry": 47.3, "Finance": 32.2, "Consulting": 63.6, "Extraction": 36.6, "Reasoning": 43.2, "Style": 58.8, "Response Characters": 5322, "Input Tokens": 481, "Output Tokens": 1709, "Cost": 0.55}
|
| 27 |
+
{"Model": "Meta/llama-4-maverick", "Category": "Open-weight Instruct", "Overall": 34.4, "Physics": 31.3, "Chemistry": 35.4, "Finance": 22.1, "Consulting": 48.8, "Extraction": 27.2, "Reasoning": 34.6, "Style": 32.6, "Response Characters": 4532, "Input Tokens": 479, "Output Tokens": 1292, "Cost": 0.14}
|
| 28 |
+
{"Model": "Meta/llama-4-scout", "Category": "Open-weight Instruct", "Overall": 31.2, "Physics": 19.2, "Chemistry": 30.0, "Finance": 19.5, "Consulting": 55.9, "Extraction": 26.2, "Reasoning": 29.9, "Style": 38.9, "Response Characters": 4200, "Input Tokens": 457, "Output Tokens": 1197, "Cost": 0.06}
|