Upload 4 files
- app.py +226 -0
- llm_judge_results.jsonl +59 -0
- report_generation_w_docs.jsonl +28 -0
- requirements.txt +2 -0
app.py
ADDED
@@ -0,0 +1,226 @@
import gradio as gr
import pandas as pd

###########################################
# Load Data #
###########################################

llm_judge_filename = "llm_judge_results.jsonl"
response_generation_filename = "report_generation_w_docs.jsonl"


def load_filename_into_df(filename):
    df = pd.read_json(filename, lines=True)
    return df


color_map = {
    "Closed-source Instruct": "#B8D2F5",
    "Open-weight Instruct": "#6f96e5",
    "Closed-source Reasoning": "#fce8c5",
    "Open-weight Reasoning": "#ffcd75",
}

CAPTION_V2 = """**ProfBench**: Human-annotated rubrics on addressing professional tasks across PhD STEM (Chemistry, Physics) and MBA Services (Finance, Consulting) domains. \n
[Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench)\n
Want to see your favorite models added? Run them with our code and send us the scores, or ping us to run them for you!"""


def color_model_type_column(df, color_map):
    """
    Apply color to the 'Category' column of the DataFrame based on a given color mapping.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the 'Category' column.
        color_map (dict): A dictionary mapping model categories to colors.

    Returns:
        pd.Styler: The styled DataFrame.
    """

    # Background color per model category; unmapped categories stay unstyled
    def apply_color(val):
        color = color_map.get(val)
        return f"background-color: {color}" if color else ""

    # One decimal for score columns, integers for counts and the rank column, two decimals for cost
    format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Model", "Category", "Input Tokens", "Output Tokens", "Cost"]}
    format_dict["Response Characters"] = "{:d}"
    format_dict["Input Tokens"] = "{:d}"
    format_dict["Output Tokens"] = "{:d}"
    format_dict[""] = "{:d}"
    format_dict["Cost"] = "{:.2f}"

    return df.style.applymap(apply_color, subset=["Category"]).format(format_dict, na_rep="")


def regex_table(dataframe, regex, filter_button, style=True):
    """
    Takes a comma-separated list of model-name regexes and returns only the matching rows.
    """
    # Split the regex statement on commas and trim whitespace around each term
    regex_list = [x.strip() for x in regex.split(",")]
    # Join the list into a single regex pattern with '|' acting as OR
    combined_regex = "|".join(regex_list)

    # Drop rows whose Category does not match any checked model type
    if isinstance(filter_button, (list, str)):
        if "Open-weight" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Open-weight", case=False, na=False)]
        if "Closed-source" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Closed-source", case=False, na=False)]
        if "Reasoning" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Reasoning", case=False, na=False)]
        if "Instruct" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Instruct", case=False, na=False)]

    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]

    # Rank by Overall score, best first
    data = data.sort_values(by="Overall", ascending=False)
    data.reset_index(drop=True, inplace=True)

    # Rank column with a blank header
    data.insert(0, "", range(1, 1 + len(data)))

    if style:
        # Apply category colors and number formats
        data = color_model_type_column(data, color_map)

    return data


# Using a string for a predefined color
theme = gr.themes.Default(primary_hue="blue")

#############################################
# Gradio App #
#############################################

with gr.Blocks(theme=theme) as app:
    # Caption header at the top, followed by one outer tab per leaderboard
    with gr.Row():
        with gr.Column(scale=6):
            gr.Markdown(CAPTION_V2)

    with gr.Tabs(elem_id="outer-tabs", elem_classes="tabs-big") as tabs_big:
        with gr.TabItem("Report Generation w Docs"):
            with gr.Row():
                with gr.Column(scale=7):
                    gr.Markdown("Report Generation Leaderboard with Grounding Documents")

            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                with gr.TabItem("Leaderboard"):
                    with gr.Row():
                        search_1 = gr.Textbox(
                            label="Model Search (delimit with , )",
                            placeholder="Model Search (delimit with , )",
                            show_label=False,
                            scale=8,
                        )
                        model_types_1 = gr.CheckboxGroup(
                            ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            show_label=False,
                            scale=8,
                        )

                    with gr.Row():
                        # rank + Model (markdown) + Category (str) + 12 numeric columns
                        col_types_response_generation = ["number"] + ["markdown"] + ["str"] + ["number"] * 12
                        df_response_generation = load_filename_into_df(response_generation_filename)

                        # Hidden, unfiltered copy; the change() callbacks below re-filter from it
                        rewardbench_table_hidden = gr.Dataframe(
                            df_response_generation.values,
                            datatype=col_types_response_generation,
                            headers=df_response_generation.columns.tolist(),
                            visible=False,
                        )

                        rewardbench_table = gr.Dataframe(
                            regex_table(
                                df_response_generation.copy(),
                                "",
                                ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            ),
                            datatype=col_types_response_generation,
                            headers=df_response_generation.columns.tolist(),
                            elem_id="response_generation_dataframe",
                            height=800,  # 800 px ≈ ~25 rows at the default row height
                        )

        with gr.TabItem("LLM Judge"):
            with gr.Row():
                gr.Markdown("LLM Judge Leaderboard")
            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                with gr.TabItem("Leaderboard"):
                    with gr.Row():
                        search_1_v1 = gr.Textbox(
                            label="Model Search (delimit with , )",
                            placeholder="Model Search (delimit with , )",
                            show_label=False,
                        )
                        model_types_1_v1 = gr.CheckboxGroup(
                            ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            label="Model Types",
                            show_label=False,
                        )

                    with gr.Row():
                        # rank + Model (markdown) + Category (str) + 16 numeric columns
                        col_types_llm_judge = ["number"] + ["markdown"] + ["str"] + ["number"] * 16
                        df_llm_judge = load_filename_into_df(llm_judge_filename)

                        rewardbench_table_hidden_v1 = gr.Dataframe(
                            df_llm_judge.values,
                            datatype=col_types_llm_judge,
                            headers=df_llm_judge.columns.tolist(),
                            visible=False,
                        )

                        rewardbench_table_v1 = gr.Dataframe(
                            regex_table(
                                df_llm_judge.copy(),
                                "",
                                ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            ),
                            datatype=col_types_llm_judge,
                            headers=df_llm_judge.columns.tolist(),
                            elem_id="llm_judge_dataframe",
                            height=800,  # 800 px ≈ ~25 rows at the default row height
                        )

    # Re-filter the visible tables whenever a search box or checkbox group changes
    search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
    search_1_v1.change(
        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
    )

    model_types_1.change(
        regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
    )
    model_types_1_v1.change(
        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
    )

    with gr.Row():
        with gr.Accordion("📚 Citation and Credits", open=False):
            citation_button = gr.Textbox(
                value=r"""@misc{wang2025profbenchmultidomainrubricsrequiring,
      title={ProfBench: Multi-Domain Rubrics requiring Professional Knowledge to Answer and Judge},
      author={Zhilin Wang and Jaehun Jung and Ximing Lu and Shizhe Diao and Ellie Evans and Jiaqi Zeng and Pavlo Molchanov and Yejin Choi and Jan Kautz and Yi Dong},
      year={2025},
      eprint={2510.18941},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2510.18941},
}""",
                lines=10,
                label="If you find the results helpful, please cite the following.",
                elem_id="citation-button",
                show_copy_button=True,
            )
        gr.Textbox("Leaderboard adapted from allenai/reward-bench", label="Leaderboard credits")

app.launch()  # .queue() was called before launch() previously; it does not seem necessary here
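The search box drives regex_table: comma-separated terms are trimmed, OR-joined into one case-insensitive pattern, and matched against the Model column. A minimal standalone sketch of that logic on a toy frame (values taken from the tables below; the logic is inlined here because importing app.py would call app.launch()):

import pandas as pd

# Toy frame mirroring the leaderboard schema.
df = pd.DataFrame({
    "Model": ["OpenAI/GPT-4.1", "Qwen/Qwen3-235B-A22B-Instruct-2507", "xAI/grok-4"],
    "Category": ["Closed-source Instruct", "Open-weight Instruct", "Closed-source Reasoning"],
    "Overall": [75.4, 75.1, 75.9],
})

# "gpt, qwen" becomes the pattern "gpt|qwen", so either term matches.
combined_regex = "|".join(term.strip() for term in "gpt, qwen".split(","))
hits = df[df["Model"].str.contains(combined_regex, case=False, na=False)]
print(hits.sort_values(by="Overall", ascending=False))  # two rows, GPT-4.1 first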
llm_judge_results.jsonl
ADDED
@@ -0,0 +1,59 @@
{"Model": "OpenAI/GPT-4.1", "Category": "Closed-source Instruct", "Overall": 75.4, "Physics": 80.9, "Chemistry": 69.2, "Finance": 71.0, "Consulting": 80.0, "Extraction": 79.8, "Reasoning": 74.4, "Style": 65.8, "F1": 76.3, "o3": 5.5, "R1-0528": 4.6, "Grok4": 5.0, "Bias-Index": 0.9, "Input Tokens": 1619.0, "Output Tokens": 1.0, "Cost": 11.31}
{"Model": "OpenAI/GPT-4.1-mini", "Category": "Closed-source Instruct", "Overall": 74.9, "Physics": 83.9, "Chemistry": 67.3, "Finance": 69.1, "Consulting": 80.6, "Extraction": 79.2, "Reasoning": 74.7, "Style": 69.8, "F1": 76.4, "o3": -0.2, "R1-0528": 1.2, "Grok4": -0.3, "Bias-Index": 1.5, "Input Tokens": 1619.0, "Output Tokens": 1.0, "Cost": 2.26}
{"Model": "OpenAI/GPT-4.1-nano", "Category": "Closed-source Instruct", "Overall": 54.1, "Physics": 69.8, "Chemistry": 62.9, "Finance": 66.7, "Consulting": 68.4, "Extraction": 71.0, "Reasoning": 65.6, "Style": 63.5, "F1": 67.9, "o3": -14.5, "R1-0528": -2.1, "Grok4": -0.7, "Bias-Index": 13.8, "Input Tokens": 1619.0, "Output Tokens": 1.0, "Cost": 0.56}
{"Model": "Google/Gemini-2.5-Flash", "Category": "Closed-source Instruct", "Overall": 73.4, "Physics": 82.9, "Chemistry": 67.3, "Finance": 70.8, "Consulting": 79.6, "Extraction": 79.2, "Reasoning": 74.5, "Style": 67.7, "F1": 76.3, "o3": -4.2, "R1-0528": -6.6, "Grok4": -7.1, "Bias-Index": 2.9, "Input Tokens": 1779.0, "Output Tokens": 1.0, "Cost": 1.87}
{"Model": "Google/Gemini-2.5-Flash-Lite", "Category": "Closed-source Instruct", "Overall": 73.3, "Physics": 83.6, "Chemistry": 68.2, "Finance": 68.2, "Consulting": 80.6, "Extraction": 77.9, "Reasoning": 75.0, "Style": 71.0, "F1": 76.4, "o3": -1.1, "R1-0528": 2.0, "Grok4": 0.6, "Bias-Index": 3.1, "Input Tokens": 1779.0, "Output Tokens": 1.0, "Cost": 0.62}
{"Model": "Anthropic/claude-sonnet-4", "Category": "Closed-source Instruct", "Overall": 70.2, "Physics": 85.0, "Chemistry": 66.9, "Finance": 68.1, "Consulting": 76.3, "Extraction": 77.6, "Reasoning": 73.3, "Style": 64.1, "F1": 75.2, "o3": -6.5, "R1-0528": -5.2, "Grok4": -10.2, "Bias-Index": 5.0, "Input Tokens": 1913.0, "Output Tokens": 1.0, "Cost": 20.06}
{"Model": "anthropic/claude-3.5-haiku", "Category": "Closed-source Instruct", "Overall": 72.5, "Physics": 78.9, "Chemistry": 67.2, "Finance": 71.2, "Consulting": 76.7, "Extraction": 76.9, "Reasoning": 73.3, "Style": 65.4, "F1": 74.9, "o3": -1.7, "R1-0528": 0.7, "Grok4": -1.4, "Bias-Index": 2.4, "Input Tokens": 1913.0, "Output Tokens": 1.0, "Cost": 5.35}
{"Model": "Qwen/Qwen3-235B-A22B-Instruct-2507", "Category": "Open-weight Instruct", "Overall": 75.1, "Physics": 86.5, "Chemistry": 69.3, "Finance": 69.3, "Consulting": 79.6, "Extraction": 79.2, "Reasoning": 76.0, "Style": 64.6, "F1": 77.3, "o3": 3.8, "R1-0528": 2.2, "Grok4": 1.6, "Bias-Index": 2.2, "Input Tokens": 1779.0, "Output Tokens": 1.0, "Cost": 0.48}
{"Model": "Qwen/Qwen3-30B-A3B-instruct-2507", "Category": "Open-weight Instruct", "Overall": 73.1, "Physics": 82.0, "Chemistry": 68.3, "Finance": 67.3, "Consulting": 79.7, "Extraction": 76.5, "Reasoning": 74.5, "Style": 64.7, "F1": 75.5, "o3": 4.7, "R1-0528": 7.1, "Grok4": 5.3, "Bias-Index": 2.4, "Input Tokens": 1778.0, "Output Tokens": 1.0, "Cost": 0.32}
{"Model": "MoonshotAI/Kimi-K2-Instruct-0905", "Category": "Open-weight Instruct", "Overall": 74.7, "Physics": 84.5, "Chemistry": 69.9, "Finance": 67.5, "Consulting": 81.9, "Extraction": 80.2, "Reasoning": 75.5, "Style": 65.9, "F1": 77.0, "o3": 7.5, "R1-0528": 6.1, "Grok4": 5.2, "Bias-Index": 2.3, "Input Tokens": 1623.0, "Output Tokens": 1.0, "Cost": 0.81}
{"Model": "MoonshotAI/Kimi-K2-Instruct-0711", "Category": "Open-weight Instruct", "Overall": 75.2, "Physics": 85.3, "Chemistry": 69.5, "Finance": 68.3, "Consulting": 82.3, "Extraction": 80.3, "Reasoning": 76.1, "Style": 66.4, "F1": 77.6, "o3": 7.1, "R1-0528": 6.1, "Grok4": 4.7, "Bias-Index": 2.4, "Input Tokens": 1636.0, "Output Tokens": 1.0, "Cost": 0.81}
{"Model": "DeepSeek-AI/DeepSeek-V3.1", "Category": "Open-weight Instruct", "Overall": 72.8, "Physics": 79.6, "Chemistry": 68.2, "Finance": 68.3, "Consulting": 78.7, "Extraction": 77.4, "Reasoning": 73.9, "Style": 65.8, "F1": 75.2, "o3": 0.2, "R1-0528": -1.5, "Grok4": -2.2, "Bias-Index": 2.4, "Input Tokens": 1586.0, "Output Tokens": 1.0, "Cost": 1.11}
{"Model": "DeepSeek-AI/DeepSeek-V3-0324", "Category": "Open-weight Instruct", "Overall": 72.6, "Physics": 84.5, "Chemistry": 68.0, "Finance": 67.0, "Consulting": 78.3, "Extraction": 77.7, "Reasoning": 74.6, "Style": 63.5, "F1": 75.7, "o3": 1.5, "R1-0528": 2.4, "Grok4": -0.7, "Bias-Index": 3.1, "Input Tokens": 1585.0, "Output Tokens": 1.0, "Cost": 1.11}
{"Model": "nvidia/llama-3.1-nemotron-nano-8b-v1", "Category": "Open-weight Instruct", "Overall": 55.8, "Physics": 56.5, "Chemistry": 59.5, "Finance": 57.3, "Consulting": 56.7, "Extraction": 61.3, "Reasoning": 58.6, "Style": 59.1, "F1": 59.3, "o3": -28.5, "R1-0528": -26.5, "Grok4": -30.0, "Bias-Index": 3.5, "Input Tokens": 1633.0, "Output Tokens": 1.0, "Cost": 0.09}
{"Model": "nvidia/llama-3.3-nemotron-super-49b-v1", "Category": "Open-weight Instruct", "Overall": 68.8, "Physics": 77.2, "Chemistry": 65.1, "Finance": 70.2, "Consulting": 72.1, "Extraction": 74.1, "Reasoning": 70.7, "Style": 64.1, "F1": 72.3, "o3": -15.7, "R1-0528": -12.2, "Grok4": -13.0, "Bias-Index": 3.5, "Input Tokens": 1637.0, "Output Tokens": 1.0, "Cost": 0.74}
{"Model": "nvidia/llama-3.1-nemotron-ultra-253b-v1", "Category": "Open-weight Instruct", "Overall": 67.4, "Physics": 84.8, "Chemistry": 63.6, "Finance": 66.6, "Consulting": 61.8, "Extraction": 72.6, "Reasoning": 67.8, "Style": 57.8, "F1": 69.6, "o3": -10.0, "R1-0528": -11.4, "Grok4": -9.2, "Bias-Index": 2.2, "Input Tokens": 1637.0, "Output Tokens": 1.0, "Cost": 3.43}
{"Model": "meta/llama-4-maverick-17b-128e-instruct", "Category": "Open-weight Instruct", "Overall": 67.9, "Physics": 64.9, "Chemistry": 66.7, "Finance": 73.4, "Consulting": 76.4, "Extraction": 76.5, "Reasoning": 70.4, "Style": 67.9, "F1": 72.4, "o3": -14.3, "R1-0528": -10.5, "Grok4": -9.8, "Bias-Index": 4.5, "Input Tokens": 1566.0, "Output Tokens": 1.0, "Cost": 0.82}
{"Model": "meta/llama-4-scout-17b-16e-instruct", "Category": "Open-weight Instruct", "Overall": 65.9, "Physics": 60.4, "Chemistry": 69.4, "Finance": 71.3, "Consulting": 75.6, "Extraction": 76.2, "Reasoning": 69.9, "Style": 62.0, "F1": 71.8, "o3": -14.5, "R1-0528": -10.2, "Grok4": -8.6, "Bias-Index": 5.9, "Input Tokens": 1565.0, "Output Tokens": 1.0, "Cost": 0.44}
{"Model": "meta/llama-3.1-405b-instruct", "Category": "Open-weight Instruct", "Overall": 71.6, "Physics": 85.1, "Chemistry": 69.1, "Finance": 67.6, "Consulting": 81.7, "Extraction": 77.7, "Reasoning": 75.5, "Style": 65.5, "F1": 77.0, "o3": 11.5, "R1-0528": 6.1, "Grok4": 9.4, "Bias-Index": 5.4, "Input Tokens": 1628.0, "Output Tokens": 1.0, "Cost": 4.54}
{"Model": "meta/llama-3.3-70b-instruct", "Category": "Open-weight Instruct", "Overall": 74.1, "Physics": 84.6, "Chemistry": 66.5, "Finance": 71.6, "Consulting": 79.1, "Extraction": 78.1, "Reasoning": 75.4, "Style": 64.6, "F1": 76.7, "o3": -3.1, "R1-0528": -0.8, "Grok4": -3.4, "Bias-Index": 2.6, "Input Tokens": 1628.0, "Output Tokens": 1.0, "Cost": 0.22}
{"Model": "meta/llama-3.1-70b-instruct", "Category": "Open-weight Instruct", "Overall": 70.7, "Physics": 82.1, "Chemistry": 66.7, "Finance": 72.6, "Consulting": 76.0, "Extraction": 77.5, "Reasoning": 73.9, "Style": 64.7, "F1": 75.4, "o3": -6.2, "R1-0528": -1.5, "Grok4": -4.1, "Bias-Index": 4.7, "Input Tokens": 1628.0, "Output Tokens": 1.0, "Cost": 0.22}
{"Model": "meta/llama-3.1-8b-instruct", "Category": "Open-weight Instruct", "Overall": 63.1, "Physics": 76.2, "Chemistry": 69.3, "Finance": 70.2, "Consulting": 71.0, "Extraction": 76.6, "Reasoning": 71.5, "Style": 61.7, "F1": 73.2, "o3": -4.0, "R1-0528": 6.1, "Grok4": -1.5, "Bias-Index": 10.1, "Input Tokens": 1628.0, "Output Tokens": 1.0, "Cost": 0.09}
{"Model": "meta/llama-3.2-3b-instruct", "Category": "Open-weight Instruct", "Overall": 58.3, "Physics": 67.6, "Chemistry": 63.8, "Finance": 59.7, "Consulting": 66.1, "Extraction": 68.8, "Reasoning": 64.6, "Style": 54.6, "F1": 66.2, "o3": 8.8, "R1-0528": 16.7, "Grok4": 13.1, "Bias-Index": 7.9, "Input Tokens": 1628.0, "Output Tokens": 1.0, "Cost": 0.02}
{"Model": "meta/llama-3.1-1b-instruct", "Category": "Open-weight Instruct", "Overall": 39.5, "Physics": 31.9, "Chemistry": 48.4, "Finance": 44.9, "Consulting": 55.8, "Extraction": 47.8, "Reasoning": 43.2, "Style": 46.2, "F1": 45.7, "o3": 31.0, "R1-0528": 33.1, "Grok4": 37.2, "Bias-Index": 6.2, "Input Tokens": 1628.0, "Output Tokens": 1.0, "Cost": 0.02}
{"Model": "OpenAI/GPT-5 (high)", "Category": "Closed-source Reasoning", "Overall": 76.0, "Physics": 90.2, "Chemistry": 68.2, "Finance": 69.4, "Consulting": 80.9, "Extraction": 78.3, "Reasoning": 76.7, "Style": 79.1, "F1": 78.3, "o3": 1.0, "R1-0528": -0.8, "Grok4": -1.3, "Bias-Index": 2.3, "Input Tokens": 1618.0, "Output Tokens": 668.0, "Cost": 30.34}
{"Model": "OpenAI/GPT-5 (med)", "Category": "Closed-source Reasoning", "Overall": 76.7, "Physics": 89.2, "Chemistry": 67.9, "Finance": 69.0, "Consulting": 80.9, "Extraction": 78.1, "Reasoning": 76.3, "Style": 77.3, "F1": 77.9, "o3": 0.0, "R1-0528": -0.9, "Grok4": -1.2, "Bias-Index": 1.2, "Input Tokens": 1619.0, "Output Tokens": 287.0, "Cost": 17.06}
{"Model": "OpenAI/GPT-5 (low)", "Category": "Closed-source Reasoning", "Overall": 76.3, "Physics": 88.6, "Chemistry": 69.3, "Finance": 69.0, "Consulting": 80.9, "Extraction": 78.1, "Reasoning": 76.6, "Style": 79.4, "F1": 78.1, "o3": 0.3, "R1-0528": -1.5, "Grok4": -1.4, "Bias-Index": 1.8, "Input Tokens": 1618.0, "Output Tokens": 130.0, "Cost": 11.58}
{"Model": "OpenAI/GPT-5 (minimal)", "Category": "Closed-source Reasoning", "Overall": 71.9, "Physics": 86.8, "Chemistry": 68.6, "Finance": 71.2, "Consulting": 77.5, "Extraction": 78.9, "Reasoning": 75.2, "Style": 64.8, "F1": 77.0, "o3": -0.5, "R1-0528": -5.6, "Grok4": -5.0, "Bias-Index": 5.1, "Input Tokens": 1618.0, "Output Tokens": 7.0, "Cost": 7.29}
{"Model": "OpenAI/GPT-5-mini (high)", "Category": "Closed-source Reasoning", "Overall": 75.3, "Physics": 84.5, "Chemistry": 69.2, "Finance": 70.4, "Consulting": 82.8, "Extraction": 78.4, "Reasoning": 75.9, "Style": 74.1, "F1": 77.7, "o3": 6.6, "R1-0528": 4.2, "Grok4": 4.6, "Bias-Index": 2.4, "Input Tokens": 1619.0, "Output Tokens": 497.0, "Cost": 4.88}
{"Model": "OpenAI/GPT-5-mini (med)", "Category": "Closed-source Reasoning", "Overall": 74.4, "Physics": 83.3, "Chemistry": 68.2, "Finance": 69.9, "Consulting": 81.5, "Extraction": 78.1, "Reasoning": 74.6, "Style": 72.8, "F1": 76.7, "o3": 6.3, "R1-0528": 4.0, "Grok4": 4.3, "Bias-Index": 2.3, "Input Tokens": 1618.0, "Output Tokens": 228.0, "Cost": 3.0}
{"Model": "OpenAI/GPT-5-mini (low)", "Category": "Closed-source Reasoning", "Overall": 74.7, "Physics": 82.9, "Chemistry": 68.5, "Finance": 70.3, "Consulting": 81.7, "Extraction": 77.4, "Reasoning": 74.6, "Style": 78.0, "F1": 76.8, "o3": 5.9, "R1-0528": 3.8, "Grok4": 4.6, "Bias-Index": 2.1, "Input Tokens": 1618.0, "Output Tokens": 92.0, "Cost": 2.05}
{"Model": "OpenAI/GPT-5-mini (minimal)", "Category": "Closed-source Reasoning", "Overall": 66.7, "Physics": 81.7, "Chemistry": 64.0, "Finance": 69.1, "Consulting": 76.0, "Extraction": 75.9, "Reasoning": 72.5, "Style": 58.8, "F1": 73.8, "o3": -4.0, "R1-0528": -6.2, "Grok4": -11.1, "Bias-Index": 7.1, "Input Tokens": 1618.0, "Output Tokens": 7.0, "Cost": 1.46}
{"Model": "OpenAI/GPT-5-nano (high)", "Category": "Closed-source Reasoning", "Overall": 71.9, "Physics": 86.8, "Chemistry": 67.6, "Finance": 68.7, "Consulting": 79.8, "Extraction": 77.6, "Reasoning": 75.1, "Style": 74.0, "F1": 76.9, "o3": 5.3, "R1-0528": 0.3, "Grok4": 3.1, "Bias-Index": 5.0, "Input Tokens": 1618.0, "Output Tokens": 1309.0, "Cost": 2.11}
{"Model": "OpenAI/GPT-5-nano (med)", "Category": "Closed-source Reasoning", "Overall": 72.7, "Physics": 85.6, "Chemistry": 67.0, "Finance": 68.7, "Consulting": 79.7, "Extraction": 77.1, "Reasoning": 74.3, "Style": 78.3, "F1": 76.4, "o3": 3.4, "R1-0528": -0.3, "Grok4": 1.7, "Bias-Index": 3.7, "Input Tokens": 1618.0, "Output Tokens": 479.0, "Cost": 0.95}
{"Model": "OpenAI/GPT-5-nano (low)", "Category": "Closed-source Reasoning", "Overall": 73.6, "Physics": 83.5, "Chemistry": 67.6, "Finance": 68.6, "Consulting": 77.7, "Extraction": 76.9, "Reasoning": 73.5, "Style": 70.9, "F1": 75.4, "o3": 2.4, "R1-0528": 0.6, "Grok4": 1.9, "Bias-Index": 1.8, "Input Tokens": 1619.0, "Output Tokens": 141.0, "Cost": 0.48}
{"Model": "OpenAI/GPT-5-nano (minimal)", "Category": "Closed-source Reasoning", "Overall": 55.0, "Physics": 68.8, "Chemistry": 55.3, "Finance": 60.9, "Consulting": 63.0, "Extraction": 65.8, "Reasoning": 62.1, "Style": 54.3, "F1": 63.2, "o3": -18.7, "R1-0528": -19.6, "Grok4": -26.9, "Bias-Index": 8.2, "Input Tokens": 1618.0, "Output Tokens": 7.0, "Cost": 0.29}
{"Model": "OpenAI/o3 (high)", "Category": "Closed-source Reasoning", "Overall": 76.4, "Physics": 88.3, "Chemistry": 68.2, "Finance": 69.3, "Consulting": 81.1, "Extraction": 79.1, "Reasoning": 76.1, "Style": 75.3, "F1": 77.9, "o3": 2.0, "R1-0528": 0.5, "Grok4": 0.8, "Bias-Index": 1.5, "Input Tokens": 1618.0, "Output Tokens": 350.0, "Cost": 21.04}
{"Model": "OpenAI/o3 (med)", "Category": "Closed-source Reasoning", "Overall": 76.0, "Physics": 89.3, "Chemistry": 69.1, "Finance": 68.9, "Consulting": 81.0, "Extraction": 79.3, "Reasoning": 76.4, "Style": 76.9, "F1": 78.2, "o3": 3.0, "R1-0528": 0.8, "Grok4": 1.5, "Bias-Index": 2.2, "Input Tokens": 1618.0, "Output Tokens": 207.0, "Cost": 17.05}
{"Model": "OpenAI/o3 (low)", "Category": "Closed-source Reasoning", "Overall": 76.4, "Physics": 88.9, "Chemistry": 69.3, "Finance": 70.3, "Consulting": 81.9, "Extraction": 79.7, "Reasoning": 76.8, "Style": 76.7, "F1": 78.7, "o3": 3.8, "R1-0528": 1.5, "Grok4": 2.6, "Bias-Index": 2.3, "Input Tokens": 1618.0, "Output Tokens": 98.0, "Cost": 14.01}
{"Model": "OpenAI/o4-mini (high)", "Category": "Closed-source Reasoning", "Overall": 75.8, "Physics": 88.5, "Chemistry": 68.9, "Finance": 70.5, "Consulting": 81.5, "Extraction": 78.7, "Reasoning": 76.8, "Style": 76.5, "F1": 78.4, "o3": 4.5, "R1-0528": 2.7, "Grok4": 1.9, "Bias-Index": 2.6, "Input Tokens": 1618.0, "Output Tokens": 308.0, "Cost": 10.93}
{"Model": "OpenAI/o4-mini (med)", "Category": "Closed-source Reasoning", "Overall": 75.8, "Physics": 88.1, "Chemistry": 69.6, "Finance": 70.8, "Consulting": 81.6, "Extraction": 78.9, "Reasoning": 76.8, "Style": 74.1, "F1": 78.6, "o3": 4.0, "R1-0528": 2.8, "Grok4": 1.2, "Bias-Index": 2.8, "Input Tokens": 1618.0, "Output Tokens": 228.0, "Cost": 9.7}
{"Model": "OpenAI/o4-mini (low)", "Category": "Closed-source Reasoning", "Overall": 76.8, "Physics": 88.6, "Chemistry": 70.1, "Finance": 70.1, "Consulting": 81.0, "Extraction": 78.8, "Reasoning": 76.8, "Style": 74.1, "F1": 78.5, "o3": 3.4, "R1-0528": 3.3, "Grok4": 1.7, "Bias-Index": 1.7, "Input Tokens": 1618.0, "Output Tokens": 104.0, "Cost": 7.8}
{"Model": "xAI/grok-4", "Category": "Closed-source Reasoning", "Overall": 75.9, "Physics": 86.1, "Chemistry": 68.5, "Finance": 70.7, "Consulting": 80.8, "Extraction": 78.5, "Reasoning": 76.3, "Style": 75.2, "F1": 77.7, "o3": 0.7, "R1-0528": 2.5, "Grok4": 1.8, "Bias-Index": 1.8, "Input Tokens": 1549.0, "Output Tokens": 812.0, "Cost": 58.7}
{"Model": "xAI/grok-3-mini", "Category": "Closed-source Reasoning", "Overall": 75.1, "Physics": 85.8, "Chemistry": 66.9, "Finance": 69.4, "Consulting": 82.0, "Extraction": 78.1, "Reasoning": 75.3, "Style": 75.2, "F1": 77.2, "o3": 4.5, "R1-0528": 2.4, "Grok4": 2.9, "Bias-Index": 2.1, "Input Tokens": 1549.0, "Output Tokens": 633.0, "Cost": 2.72}
{"Model": "Anthropic/claude-sonnet-4-20250514", "Category": "Closed-source Reasoning", "Overall": 70.9, "Physics": 75.7, "Chemistry": 66.3, "Finance": 69.9, "Consulting": 77.8, "Extraction": 77.5, "Reasoning": 72.3, "Style": 66.0, "F1": 74.0, "o3": -11.2, "R1-0528": -8.1, "Grok4": -10.7, "Bias-Index": 3.1, "Input Tokens": 1940.0, "Output Tokens": 810.0, "Cost": 62.64}
{"Model": "Google/Gemini-2.5-Pro", "Category": "Closed-source Reasoning", "Overall": 78.2, "Physics": 87.3, "Chemistry": 70.2, "Finance": 71.9, "Consulting": 82.6, "Extraction": 81.3, "Reasoning": 77.4, "Style": 76.8, "F1": 79.2, "o3": 3.1, "R1-0528": 2.8, "Grok4": 2.1, "Bias-Index": 1.0, "Input Tokens": 1779.0, "Output Tokens": 967.0, "Cost": 41.46}
{"Model": "Google/Gemini-2.5-Flash (Thinking)", "Category": "Closed-source Reasoning", "Overall": 78.1, "Physics": 87.0, "Chemistry": 68.7, "Finance": 71.6, "Consulting": 81.2, "Extraction": 80.1, "Reasoning": 76.7, "Style": 74.6, "F1": 78.4, "o3": 2.3, "R1-0528": 2.5, "Grok4": 2.2, "Bias-Index": 0.3, "Input Tokens": 1779.0, "Output Tokens": 695.0, "Cost": 7.92}
{"Model": "Google/Gemini-2.5-Flash-Lite (Thinking)", "Category": "Closed-source Reasoning", "Overall": 74.7, "Physics": 83.7, "Chemistry": 67.0, "Finance": 72.2, "Consulting": 81.9, "Extraction": 78.7, "Reasoning": 75.9, "Style": 79.1, "F1": 77.5, "o3": -1.1, "R1-0528": 0.2, "Grok4": -2.6, "Bias-Index": 2.8, "Input Tokens": 1779.0, "Output Tokens": 1670.0, "Cost": 2.95}
{"Model": "OpenAI/gpt-oss-20b (high)", "Category": "Open-weight Reasoning", "Overall": 74.4, "Physics": 89.3, "Chemistry": 68.7, "Finance": 68.5, "Consulting": 80.7, "Extraction": 77.8, "Reasoning": 76.5, "Style": 77.7, "F1": 77.9, "o3": 3.3, "R1-0528": -0.2, "Grok4": 0.9, "Bias-Index": 3.5, "Input Tokens": 1679.0, "Output Tokens": 465.0, "Cost": 0.46}
{"Model": "OpenAI/gpt-oss-20b (medium)", "Category": "Open-weight Reasoning", "Overall": 74.8, "Physics": 87.7, "Chemistry": 68.3, "Finance": 69.7, "Consulting": 80.9, "Extraction": 78.5, "Reasoning": 76.3, "Style": 76.2, "F1": 77.8, "o3": 3.6, "R1-0528": 1.1, "Grok4": 0.6, "Bias-Index": 3.0, "Input Tokens": 1683.0, "Output Tokens": 216.0, "Cost": 0.35}
{"Model": "OpenAI/gpt-oss-20b (low)", "Category": "Open-weight Reasoning", "Overall": 75.6, "Physics": 85.4, "Chemistry": 69.3, "Finance": 70.8, "Consulting": 79.2, "Extraction": 77.6, "Reasoning": 76.3, "Style": 71.1, "F1": 77.5, "o3": 0.4, "R1-0528": -0.3, "Grok4": 1.6, "Bias-Index": 1.9, "Input Tokens": 1677.0, "Output Tokens": 85.0, "Cost": 0.28}
{"Model": "OpenAI/gpt-oss-120b (high)", "Category": "Open-weight Reasoning", "Overall": 75.4, "Physics": 89.5, "Chemistry": 68.9, "Finance": 69.7, "Consulting": 80.8, "Extraction": 78.9, "Reasoning": 76.7, "Style": 80.8, "F1": 78.4, "o3": 1.6, "R1-0528": -1.4, "Grok4": 0.3, "Bias-Index": 3.0, "Input Tokens": 1683.0, "Output Tokens": 439.0, "Cost": 0.88}
{"Model": "OpenAI/gpt-oss-120b (med)", "Category": "Open-weight Reasoning", "Overall": 75.8, "Physics": 88.1, "Chemistry": 67.4, "Finance": 70.5, "Consulting": 79.9, "Extraction": 79.6, "Reasoning": 76.0, "Style": 75.3, "F1": 77.7, "o3": 0.6, "R1-0528": -1.3, "Grok4": -0.9, "Bias-Index": 1.9, "Input Tokens": 1683.0, "Output Tokens": 196.0, "Cost": 0.63}
{"Model": "OpenAI/gpt-oss-120b (low)", "Category": "Open-weight Reasoning", "Overall": 76.7, "Physics": 86.0, "Chemistry": 67.2, "Finance": 72.1, "Consulting": 79.0, "Extraction": 79.2, "Reasoning": 75.7, "Style": 72.4, "F1": 77.3, "o3": -1.0, "R1-0528": -1.6, "Grok4": -1.5, "Bias-Index": 0.6, "Input Tokens": 1683.0, "Output Tokens": 84.0, "Cost": 0.5}
{"Model": "OpenAI/gpt-oss-120b (mixed)", "Category": "Open-weight Reasoning", "Overall": 78.2, "Physics": 89.5, "Chemistry": 68.9, "Finance": 72.2, "Consulting": 79.7, "Extraction": 79.7, "Reasoning": 76.9, "Style": 80.8, "F1": 78.7, "o3": -0.5, "R1-0528": -0.9, "Grok4": -1.0, "Bias-Index": 0.5, "Input Tokens": 1683.0, "Output Tokens": 282.0, "Cost": 0.7}
{"Model": "DeepSeek-AI/DeepSeek-V3.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 76.6, "Physics": 84.3, "Chemistry": 69.3, "Finance": 70.8, "Consulting": 80.3, "Extraction": 78.9, "Reasoning": 75.6, "Style": 72.0, "F1": 77.3, "o3": 3.2, "R1-0528": 3.3, "Grok4": 2.6, "Bias-Index": 0.7, "Input Tokens": 1587.0, "Output Tokens": 657.0, "Cost": 2.94}
{"Model": "DeepSeek-AI/DeepSeek-R1-0528", "Category": "Open-weight Reasoning", "Overall": 69.4, "Physics": 79.6, "Chemistry": 65.1, "Finance": 68.5, "Consulting": 71.6, "Extraction": 74.7, "Reasoning": 70.9, "Style": 64.1, "F1": 72.2, "o3": -11.6, "R1-0528": -9.3, "Grok4": -8.8, "Bias-Index": 2.8, "Input Tokens": 1601.0, "Output Tokens": 693.0, "Cost": 3.05}
{"Model": "Qwen/Qwen3-30B-A3B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 39.8, "Physics": 46.7, "Chemistry": 35.9, "Finance": 45.4, "Consulting": 35.8, "Extraction": 42.1, "Reasoning": 41.2, "Style": 35.3, "F1": 41.5, "o3": -0.2, "R1-0528": -1.3, "Grok4": 0.4, "Bias-Index": 1.7, "Input Tokens": 1780.0, "Output Tokens": 742.0, "Cost": 1.1}
{"Model": "Qwen/Qwen3-235B-A22B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 76.5, "Physics": 87.2, "Chemistry": 67.9, "Finance": 69.0, "Consulting": 80.4, "Extraction": 79.3, "Reasoning": 75.6, "Style": 74.3, "F1": 77.3, "o3": -1.0, "R1-0528": -1.8, "Grok4": -1.5, "Bias-Index": 0.8, "Input Tokens": 1782.0, "Output Tokens": 1245.0, "Cost": 1.84}
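Each line of llm_judge_results.jsonl above is one judge model's record: per-domain and per-criterion scores, agreement deltas against the o3/R1-0528/Grok4 references, a Bias-Index, token counts, and cost. pd.read_json(..., lines=True), the same call app.py uses, loads it directly. A quick sketch, assuming it is run from the repo root:

import pandas as pd

# One JSON object per line -> one leaderboard row per judge model.
judges = pd.read_json("llm_judge_results.jsonl", lines=True)
ranked = judges.sort_values(by="Overall", ascending=False)
print(ranked[["Model", "Category", "Overall", "Cost"]].head())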
report_generation_w_docs.jsonl
ADDED
@@ -0,0 +1,28 @@
{"Model": "OpenAI/GPT-5 (high)", "Category": "Closed-source Reasoning", "Overall": 65.9, "Physics": 49.3, "Chemistry": 70.6, "Finance": 63.7, "Consulting": 80.0, "Extraction": 64.4, "Reasoning": 66.2, "Style": 65.3, "Response Characters": 5451.0, "Input Tokens": 23758.0, "Output Tokens": 14583.0, "Cost": 112.34}
{"Model": "OpenAI/GPT-5-mini (high)", "Category": "Closed-source Reasoning", "Overall": 60.3, "Physics": 50.8, "Chemistry": 63.6, "Finance": 51.6, "Consulting": 75.4, "Extraction": 56.7, "Reasoning": 60.1, "Style": 68.2, "Response Characters": 9018.0, "Input Tokens": 26859.0, "Output Tokens": 18038.0, "Cost": 27.39}
{"Model": "OpenAI/GPT-5-nano (high)", "Category": "Closed-source Reasoning", "Overall": 50.1, "Physics": 42.2, "Chemistry": 44.6, "Finance": 44.6, "Consulting": 69.0, "Extraction": 46.6, "Reasoning": 48.3, "Style": 58.9, "Response Characters": 9796.0, "Input Tokens": 28549.0, "Output Tokens": 25189.0, "Cost": 7.36}
{"Model": "OpenAI/o3", "Category": "Closed-source Reasoning", "Overall": 61.4, "Physics": 46.1, "Chemistry": 61.8, "Finance": 60.9, "Consulting": 76.8, "Extraction": 60.4, "Reasoning": 61.8, "Style": 63.0, "Response Characters": 4158.0, "Input Tokens": 18445.0, "Output Tokens": 4709.0, "Cost": 47.72}
{"Model": "OpenAI/o4-mini", "Category": "Closed-source Reasoning", "Overall": 58.2, "Physics": 45.5, "Chemistry": 58.5, "Finance": 54.7, "Consulting": 74.4, "Extraction": 55.8, "Reasoning": 58.3, "Style": 61.0, "Response Characters": 3886.0, "Input Tokens": 31679.0, "Output Tokens": 4763.0, "Cost": 35.71}
{"Model": "Google/Gemini-2.5-Pro", "Category": "Closed-source Reasoning", "Overall": 60.3, "Physics": 46.8, "Chemistry": 66.3, "Finance": 54.0, "Consulting": 74.2, "Extraction": 61.4, "Reasoning": 59.3, "Style": 66.8, "Response Characters": 7449.0, "Input Tokens": 6086.0, "Output Tokens": 7950.0, "Cost": 55.75}
{"Model": "Google/Gemini-2.5-Flash (Thinking)", "Category": "Closed-source Reasoning", "Overall": 57.6, "Physics": 45.0, "Chemistry": 61.8, "Finance": 53.5, "Consulting": 69.9, "Extraction": 58.0, "Reasoning": 57.6, "Style": 61.1, "Response Characters": 12047.0, "Input Tokens": 6086.0, "Output Tokens": 12030.0, "Cost": 20.42}
{"Model": "Google/Gemini-2.5-Flash-Lite (Thinking)", "Category": "Closed-source Reasoning", "Overall": 49.4, "Physics": 31.7, "Chemistry": 53.1, "Finance": 44.6, "Consulting": 68.0, "Extraction": 48.3, "Reasoning": 48.8, "Style": 54.0, "Response Characters": 10058.0, "Input Tokens": 6086.0, "Output Tokens": 18584.0, "Cost": 5.15}
{"Model": "xAI/grok-4-0709", "Category": "Closed-source Reasoning", "Overall": 53.4, "Physics": 33.6, "Chemistry": 62.2, "Finance": 44.3, "Consulting": 73.4, "Extraction": 51.9, "Reasoning": 51.6, "Style": 64.1, "Response Characters": 5380.0, "Input Tokens": 13481.0, "Output Tokens": 9885.0, "Cost": 122.78}
{"Model": "Anthropic/claude-sonnet-4 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 55.8, "Physics": 43.9, "Chemistry": 57.1, "Finance": 50.8, "Consulting": 71.4, "Extraction": 53.8, "Reasoning": 54.0, "Style": 61.8, "Response Characters": 3866.0, "Input Tokens": 51044.0, "Output Tokens": 6916.0, "Cost": 164.39}
{"Model": "OpenAI/gpt-oss-120b", "Category": "Open-weight Reasoning", "Overall": 54.9, "Physics": 49.1, "Chemistry": 55.3, "Finance": 45.5, "Consulting": 69.4, "Extraction": 48.7, "Reasoning": 55.5, "Style": 59.0, "Response Characters": 7442.0, "Input Tokens": 11606.0, "Output Tokens": 4572.0, "Cost": 1.35}
{"Model": "OpenAI/gpt-oss-20b", "Category": "Open-weight Reasoning", "Overall": 48.4, "Physics": 41.4, "Chemistry": 46.5, "Finance": 39.8, "Consulting": 66.0, "Extraction": 40.9, "Reasoning": 48.2, "Style": 56.2, "Response Characters": 5331.0, "Input Tokens": 11600.0, "Output Tokens": 4705.0, "Cost": 0.75}
{"Model": "DeepSeek-AI/DeepSeek-V3.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 53.8, "Physics": 44.8, "Chemistry": 59.8, "Finance": 43.3, "Consulting": 67.4, "Extraction": 51.1, "Reasoning": 53.0, "Style": 60.5, "Response Characters": 5239.0, "Input Tokens": 11258.0, "Output Tokens": 7486.0, "Cost": 5.27}
{"Model": "Qwen/Qwen3-235B-A22B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 54.0, "Physics": 45.1, "Chemistry": 61.4, "Finance": 42.3, "Consulting": 67.3, "Extraction": 51.4, "Reasoning": 51.6, "Style": 61.9, "Response Characters": 6046.0, "Input Tokens": 12442.0, "Output Tokens": 9256.0, "Cost": 2.47}
{"Model": "Qwen/Qwen3-30B-A3B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 44.6, "Physics": 34.4, "Chemistry": 45.4, "Finance": 36.8, "Consulting": 61.8, "Extraction": 40.4, "Reasoning": 42.3, "Style": 63.9, "Response Characters": 4757.0, "Input Tokens": 12339.0, "Output Tokens": 9027.0, "Cost": 2.16}
{"Model": "OpenAI/GPT-4.1", "Category": "Closed-source Instruct", "Overall": 56.8, "Physics": 44.7, "Chemistry": 55.2, "Finance": 54.0, "Consulting": 73.2, "Extraction": 56.7, "Reasoning": 56.7, "Style": 58.4, "Response Characters": 6451.0, "Input Tokens": 18427.0, "Output Tokens": 2152.0, "Cost": 34.6}
{"Model": "OpenAI/GPT-4.1-mini", "Category": "Closed-source Instruct", "Overall": 53.7, "Physics": 45.1, "Chemistry": 53.0, "Finance": 49.1, "Consulting": 67.5, "Extraction": 50.3, "Reasoning": 53.2, "Style": 52.8, "Response Characters": 6921.0, "Input Tokens": 29469.0, "Output Tokens": 2218.0, "Cost": 9.82}
{"Model": "OpenAI/GPT-4.1-nano", "Category": "Closed-source Instruct", "Overall": 39.3, "Physics": 24.8, "Chemistry": 40.8, "Finance": 33.4, "Consulting": 58.2, "Extraction": 34.9, "Reasoning": 38.4, "Style": 53.5, "Response Characters": 6359.0, "Input Tokens": 35561.0, "Output Tokens": 1966.0, "Cost": 2.78}
{"Model": "Google/Gemini-2.5-Flash", "Category": "Closed-source Instruct", "Overall": 56.8, "Physics": 44.6, "Chemistry": 59.4, "Finance": 54.3, "Consulting": 68.8, "Extraction": 57.1, "Reasoning": 56.1, "Style": 53.2, "Response Characters": 21612.0, "Input Tokens": 6086.0, "Output Tokens": 5936.0, "Cost": 10.67}
{"Model": "Google/Gemini-2.5-Flash-Lite", "Category": "Closed-source Instruct", "Overall": 46.6, "Physics": 29.8, "Chemistry": 49.0, "Finance": 44.0, "Consulting": 63.7, "Extraction": 47.4, "Reasoning": 45.0, "Style": 48.6, "Response Characters": 24167.0, "Input Tokens": 6086.0, "Output Tokens": 7787.0, "Cost": 2.33}
{"Model": "Anthropic/claude-sonnet-4", "Category": "Closed-source Instruct", "Overall": 53.5, "Physics": 40.7, "Chemistry": 54.2, "Finance": 49.5, "Consulting": 69.6, "Extraction": 55.3, "Reasoning": 51.1, "Style": 54.2, "Response Characters": 4068.0, "Input Tokens": 51016.0, "Output Tokens": 1398.0, "Cost": 111.37}
{"Model": "Anthropic/claude-3.5-haiku", "Category": "Closed-source Instruct", "Overall": 27.6, "Physics": 12.0, "Chemistry": 24.7, "Finance": 27.7, "Consulting": 46.3, "Extraction": 31.2, "Reasoning": 24.7, "Style": 49.4, "Response Characters": 1784.0, "Input Tokens": 34475.0, "Output Tokens": 576.0, "Cost": 19.13}
{"Model": "Qwen/Qwen3-235B-A22B-Instruct-2507", "Category": "Open-weight Instruct", "Overall": 54.2, "Physics": 45.6, "Chemistry": 55.8, "Finance": 45.7, "Consulting": 69.6, "Extraction": 51.0, "Reasoning": 52.9, "Style": 66.2, "Response Characters": 11400.0, "Input Tokens": 12450.0, "Output Tokens": 4244.0, "Cost": 1.47}
{"Model": "Qwen/Qwen3-30B-A3B-Instruct-2507", "Category": "Open-weight Instruct", "Overall": 49.3, "Physics": 41.6, "Chemistry": 47.9, "Finance": 42.3, "Consulting": 65.5, "Extraction": 44.5, "Reasoning": 48.0, "Style": 59.1, "Response Characters": 11167.0, "Input Tokens": 12490.0, "Output Tokens": 4021.0, "Cost": 0.95}
{"Model": "MoonshotAI/Kimi-K2-Instruct-0905", "Category": "Open-weight Instruct", "Overall": 51.3, "Physics": 40.4, "Chemistry": 50.2, "Finance": 48.8, "Consulting": 65.9, "Extraction": 51.2, "Reasoning": 50.0, "Style": 63.4, "Response Characters": 4817.0, "Input Tokens": 11462.0, "Output Tokens": 1562.0, "Cost": 3.36}
{"Model": "DeepSeek-AI/DeepSeek-V3.1", "Category": "Open-weight Instruct", "Overall": 53.5, "Physics": 45.8, "Chemistry": 55.9, "Finance": 45.2, "Consulting": 67.1, "Extraction": 50.8, "Reasoning": 52.7, "Style": 59.1, "Response Characters": 7792.0, "Input Tokens": 11231.0, "Output Tokens": 2407.0, "Cost": 2.67}
{"Model": "Meta/llama-4-maverick", "Category": "Open-weight Instruct", "Overall": 39.4, "Physics": 35.2, "Chemistry": 35.8, "Finance": 34.2, "Consulting": 52.5, "Extraction": 39.3, "Reasoning": 36.5, "Style": 46.2, "Response Characters": 4223.0, "Input Tokens": 14604.0, "Output Tokens": 1191.0, "Cost": 1.86}
{"Model": "meta/llama-4-scout", "Category": "Open-weight Instruct", "Overall": 35.4, "Physics": 23.4, "Chemistry": 34.6, "Finance": 33.4, "Consulting": 50.3, "Extraction": 35.1, "Reasoning": 33.3, "Style": 42.3, "Response Characters": 3612.0, "Input Tokens": 16675.0, "Output Tokens": 1039.0, "Cost": 1.05}
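The report-generation table above carries token counts and a Cost column next to the rubric scores, so cost-adjusted views fall out directly. A sketch, assuming the file sits in the working directory ("Overall per $" is an illustrative derived column, not in the file):

import pandas as pd

reports = pd.read_json("report_generation_w_docs.jsonl", lines=True)
# Overall rubric score per dollar of inference spend, best value first.
reports["Overall per $"] = reports["Overall"] / reports["Cost"]
print(reports.sort_values(by="Overall per $", ascending=False)[["Model", "Overall", "Cost"]].head())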
requirements.txt
ADDED
@@ -0,0 +1,2 @@
gradio
pandas
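requirements.txt leaves gradio and pandas unpinned, so current releases of both are assumed. A small pre-launch smoke test (a sketch, not part of the upload) that confirms both data files parse before running python app.py:

import pandas as pd

# Check that each leaderboard file parses and carries the columns the app sorts and formats.
for name in ("llm_judge_results.jsonl", "report_generation_w_docs.jsonl"):
    df = pd.read_json(name, lines=True)
    assert {"Model", "Category", "Overall"} <= set(df.columns), name
    print(name, df.shape)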