sudanl committed · Commit 2086543 · Parent(s): 4f2d02a

test
Files changed:
- README.md +42 -25
- app.py +154 -166
- initial_sage_results.json +236 -0
- reference_answers.json +44 -0
- requirements.txt +2 -10
- src/about.py +86 -32
- src/leaderboard/sage_eval.py +222 -0
- src/populate.py +32 -0
- src/submission/sage_submit.py +207 -0
README.md CHANGED

````diff
@@ -1,48 +1,65 @@
 ---
-title: SAGE
-emoji:
+title: SAGE Benchmark
+emoji: 🧪
 colorFrom: green
 colorTo: indigo
 sdk: gradio
 app_file: app.py
 pinned: true
 license: apache-2.0
-short_description:
+short_description: SAGE - A High-Difficulty, Multidisciplinary Benchmark for Frontier Scientific Reasoning
 sdk_version: 5.43.1
 tags:
 - leaderboard
+- science
+- benchmark
+- evaluation
 ---
 
-#
+# SAGE: Science AGent Evaluation Benchmark
 
+SAGE (Scientific Advanced General Evaluation) is a large-scale, high-difficulty, cross-disciplinary benchmark developed by Shanghai AI Laboratory for evaluating frontier scientific reasoning capabilities of Large Language Models (LLMs).
+
+## Benchmark Overview
+
+SAGE evaluates models across seven core scientific fields covering the key domains of AI for Science (AI4S):
+- **Mathematics** - Abstract algebra, analysis, differential equations, and computational mathematics
+- **Physics** - Classical mechanics, electrodynamics, quantum mechanics, thermodynamics, and astrophysics
+- **Chemistry** - Physical chemistry, inorganic chemistry, organic chemistry, and analytical chemistry
+- **Biology** - Genetics, immunology, molecular biology, biophysics, and ecology
+- **Computer Science** - Computer architecture, artificial intelligence, and software fundamentals
+- **Earth Science** - Geography, geodesy, atmospheric chemistry, marine science, and geology
+- **Materials Science** - Composite materials, metal materials, organic polymer materials, and material synthesis
+
+## Submission Format
+
+Submit your evaluation results as JSON files with the following format:
 
-Results files should have the following format and be stored as json files:
 ```json
 {
-        "metric_name": score,
-    },
-    "task_name2": {
-        "metric_name": score,
+  "submission_org": "Your Organization",
+  "submission_email": "contact@example.com",
+  "predictions": [
+    {
+      "original_question_id": 0,
+      "content": ["answer1", "answer2", "answer3", "answer4"],
+      "reasoning_content": ["reasoning1", "reasoning2", "reasoning3", "reasoning4"]
     }
+  ]
 }
 ```
 
+## Key Features
 
+- **Simplified Interface**: Clean, easy-to-use interface focused on SAGE benchmark results
+- **Real-time Evaluation**: Immediate processing and scoring of submissions
+- **Multi-domain Analysis**: Detailed breakdown across scientific domains
+- **Persistent Leaderboard**: Results are automatically saved and persist across sessions
 
+## Code Structure
 
+- `src/about.py` - SAGE-specific task definitions and content
+- `src/leaderboard/sage_eval.py` - SAGE evaluation logic and result processing
+- `src/submission/sage_submit.py` - Simplified submission processing
+- `initial_sage_results.json` - Benchmark results from major models
+- `reference_answers.json` - Reference data for evaluation
````
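The new README's submission format maps directly onto a small helper script. The sketch below is illustrative only and not part of this commit (the `build_submission` helper and the output filename are assumptions); it assembles four generations per question into the documented JSON layout and writes it to disk.

```python
import json


def build_submission(org: str, email: str, per_question_answers: dict) -> dict:
    """Assemble the documented SAGE submission structure.

    per_question_answers maps an integer question id to a list of
    (answer, reasoning) tuples -- four per question, as required above.
    """
    predictions = []
    for qid, generations in sorted(per_question_answers.items()):
        predictions.append({
            "original_question_id": qid,
            "content": [answer for answer, _ in generations],
            "reasoning_content": [reasoning for _, reasoning in generations],
        })
    return {
        "submission_org": org,
        "submission_email": email,
        "predictions": predictions,
    }


if __name__ == "__main__":
    # Toy single-question example with four identical generations.
    demo = build_submission(
        "Example Lab",
        "contact@example.com",
        {0: [("42", "6 times 7 is 42.")] * 4},
    )
    with open("sage_submission.json", "w", encoding="utf-8") as f:
        json.dump(demo, f, indent=2, ensure_ascii=False)
```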
app.py CHANGED

```diff
@@ -1,8 +1,9 @@
+import os
+import json
+import datetime
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
-
-from huggingface_hub import snapshot_download
+import numpy as np
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -24,181 +25,168 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
 
-
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
+# SAGE specific imports
 try:
+    from src.leaderboard.sage_eval import load_initial_sage_results, SAGEResult
+    from src.submission.sage_submit import process_sage_submission_simple
+    from src.populate import get_sage_leaderboard_df
+    SAGE_MODULES_AVAILABLE = True
+except ImportError as e:
+    print(f"Warning: SAGE modules not available: {e}")
+    SAGE_MODULES_AVAILABLE = False
+
+
+# Configuration
+TOKEN = os.environ.get("HF_TOKEN", None)
+OWNER = "opencompass"
+
+def format_error(msg):
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
+
+def format_warning(msg):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
+
+def format_log(msg):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+def model_hyperlink(link, model_name):
+    if link and link.startswith("http"):
+        return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+    return model_name
+
+def get_leaderboard_dataframe():
+    """Generate leaderboard dataframe from SAGE results"""
+    if not SAGE_MODULES_AVAILABLE:
+        return pd.DataFrame()
+
+    sage_results = load_initial_sage_results()
+
+    if not sage_results:
+        return pd.DataFrame()
+
+    # Convert to leaderboard format
+    leaderboard_data = []
+    for result in sage_results:
+        # Extract model name from submission_id
+        if result.submission_id.startswith("initial_"):
+            model_name = result.submission_id.split("_", 2)[-1].replace("_", " ")
+        else:
+            model_name = result.submission_id
+
+        # Create model hyperlink (for now just display name)
+        model_display = f"**{model_name}**"
+
+        row = {
+            "Model": model_display,
+            "Organization": result.organization,
+            "Overall (%)": result.results.get("sage_overall", 0),
+            "Mathematics (%)": result.results.get("sage_math", 0),
+            "Physics (%)": result.results.get("sage_physics", 0),
+            "Chemistry (%)": result.results.get("sage_chemistry", 0),
+            "Biology (%)": result.results.get("sage_biology", 0),
+            "Earth Science (%)": result.results.get("sage_earth_science", 0),
+            "Astronomy (%)": result.results.get("sage_astronomy", 0),
+            "Submission Date": result.submitted_time
+        }
+        leaderboard_data.append(row)
+
+    df = pd.DataFrame(leaderboard_data)
+    if not df.empty:
+        df = df.sort_values(by=["Overall (%)"], ascending=False)
+
+    return df
+
+def refresh_leaderboard():
+    """Refresh the leaderboard data"""
+    print("🔄 Refreshing leaderboard data...")
+    return get_leaderboard_dataframe()
+
+# Initialize data
+leaderboard_df = get_leaderboard_dataframe()
+
+# Define column types for the dataframe
+COLUMN_TYPES = ["markdown", "str", "number", "number", "number", "number", "number", "number", "number", "str"]
+
+
-demo = gr.Blocks(css=custom_css)
+# Create Gradio interface
+demo = gr.Blocks(css="""
+.markdown-text {
+    font-size: 16px !important;
+}
+#citation-button {
+    font-family: monospace;
+}
+""")
 
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
-
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
                 value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
-                lines=20,
                 elem_id="citation-button",
+                lines=10,
+                max_lines=10,
+                interactive=False
+            )
+
+    # Main leaderboard table
+    gr.Markdown("## 🏆 SAGE Benchmark Results", elem_classes="markdown-text")
+    leaderboard_table = gr.Dataframe(
+        value=leaderboard_df,
+        datatype=COLUMN_TYPES,
+        interactive=False,
+        wrap=True,
+        column_widths=["25%", "15%", "8%", "8%", "8%", "8%", "8%", "8%", "8%", "12%"]
+    )
+
+    # Refresh button
+    refresh_button = gr.Button("🔄 Refresh Leaderboard")
+    refresh_button.click(
+        refresh_leaderboard,
+        inputs=[],
+        outputs=[leaderboard_table]
+    )
+
+    # Submission section
+    with gr.Accordion("📊 Submit Your SAGE Results", open=False):
+        gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+        with gr.Row():
+            with gr.Column():
+                org_textbox = gr.Textbox(label="Organization Name", placeholder="Your Organization")
+                email_textbox = gr.Textbox(label="Contact Email", placeholder="contact@example.com")
+            with gr.Column():
+                file_upload = gr.File(
+                    label="Upload SAGE Results (JSON)",
+                    file_types=[".json"],
+                    type="filepath"
+                )
+
+        submit_button = gr.Button("Submit Results", variant="primary")
+        submission_result = gr.HTML()
+
+        if SAGE_MODULES_AVAILABLE:
+            submit_button.click(
+                process_sage_submission_simple,
+                inputs=[file_upload, org_textbox, email_textbox],
+                outputs=[submission_result]
+            ).then(
+                refresh_leaderboard,  # Auto-refresh after submission
+                inputs=[],
+                outputs=[leaderboard_table]
+            )
+        else:
+            submit_button.click(
+                lambda: format_error("SAGE submission system not available"),
+                inputs=[],
+                outputs=[submission_result]
             )
 
-
-demo.queue(default_concurrency_limit=40).launch()
+# Launch the app
+if __name__ == "__main__":
+    demo.launch(debug=True)
```
initial_sage_results.json ADDED

```json
[
  {"model_name": "OpenAI GPT-5-High", "organization": "OpenAI", "tokens": "64k",
   "accuracy": 45.2, "mg_pass_2": 36.6, "mg_pass_4": 35.1, "submitted_time": "2024-01-15",
   "results": {"sage_overall": 45.2, "sage_math": 48.5, "sage_physics": 44.1, "sage_chemistry": 42.8,
               "sage_biology": 46.3, "sage_earth_science": 43.7, "sage_astronomy": 45.8}},
  {"model_name": "Gemini-2.5-Pro", "organization": "Google", "tokens": "64k",
   "accuracy": 40.5, "mg_pass_2": 31.2, "mg_pass_4": 29.7, "submitted_time": "2024-01-14",
   "results": {"sage_overall": 40.5, "sage_math": 43.2, "sage_physics": 39.8, "sage_chemistry": 38.1,
               "sage_biology": 41.7, "sage_earth_science": 39.4, "sage_astronomy": 40.8}},
  {"model_name": "OpenAI o3-High", "organization": "OpenAI", "tokens": "64k",
   "accuracy": 39.6, "mg_pass_2": 26.0, "mg_pass_4": 27.3, "submitted_time": "2024-01-13",
   "results": {"sage_overall": 39.6, "sage_math": 42.1, "sage_physics": 38.5, "sage_chemistry": 37.2,
               "sage_biology": 40.8, "sage_earth_science": 38.1, "sage_astronomy": 40.9}},
  {"model_name": "Gemini-2.5-Pro", "organization": "Google", "tokens": "32k",
   "accuracy": 39.1, "mg_pass_2": 29.4, "mg_pass_4": 27.5, "submitted_time": "2024-01-12",
   "results": {"sage_overall": 39.1, "sage_math": 41.8, "sage_physics": 38.2, "sage_chemistry": 36.9,
               "sage_biology": 40.3, "sage_earth_science": 37.7, "sage_astronomy": 39.7}},
  {"model_name": "OpenAI o3-High", "organization": "OpenAI", "tokens": "32k",
   "accuracy": 38.5, "mg_pass_2": 26.4, "mg_pass_4": 24.2, "submitted_time": "2024-01-11",
   "results": {"sage_overall": 38.5, "sage_math": 41.2, "sage_physics": 37.8, "sage_chemistry": 36.1,
               "sage_biology": 39.9, "sage_earth_science": 37.3, "sage_astronomy": 38.7}},
  {"model_name": "Grok-4", "organization": "xAI", "tokens": "32k",
   "accuracy": 35.0, "mg_pass_2": 26.0, "mg_pass_4": 24.1, "submitted_time": "2024-01-10",
   "results": {"sage_overall": 35.0, "sage_math": 37.5, "sage_physics": 34.2, "sage_chemistry": 33.1,
               "sage_biology": 36.1, "sage_earth_science": 34.8, "sage_astronomy": 34.3}},
  {"model_name": "Qwen3-235B-A22B-2507", "organization": "Alibaba", "tokens": "32k",
   "accuracy": 27.8, "mg_pass_2": 19.8, "mg_pass_4": 18.1, "submitted_time": "2024-01-09",
   "results": {"sage_overall": 27.8, "sage_math": 29.8, "sage_physics": 27.1, "sage_chemistry": 26.5,
               "sage_biology": 28.4, "sage_earth_science": 27.9, "sage_astronomy": 27.1}},
  {"model_name": "Doubao-Seed-1.6-thinking", "organization": "ByteDance", "tokens": "32k",
   "accuracy": 27.7, "mg_pass_2": 18.4, "mg_pass_4": 16.8, "submitted_time": "2024-01-08",
   "results": {"sage_overall": 27.7, "sage_math": 29.6, "sage_physics": 27.0, "sage_chemistry": 26.3,
               "sage_biology": 28.2, "sage_earth_science": 27.7, "sage_astronomy": 27.4}},
  {"model_name": "DeepSeek-V3.1", "organization": "DeepSeek", "tokens": "64k",
   "accuracy": 27.7, "mg_pass_2": 18.3, "mg_pass_4": 16.5, "submitted_time": "2024-01-07",
   "results": {"sage_overall": 27.7, "sage_math": 29.5, "sage_physics": 26.9, "sage_chemistry": 26.2,
               "sage_biology": 28.1, "sage_earth_science": 27.6, "sage_astronomy": 27.9}},
  {"model_name": "DeepSeek-R1-0528", "organization": "DeepSeek", "tokens": "32k",
   "accuracy": 26.1, "mg_pass_2": 16.0, "mg_pass_4": 14.1, "submitted_time": "2024-01-06",
   "results": {"sage_overall": 26.1, "sage_math": 28.0, "sage_physics": 25.4, "sage_chemistry": 24.8,
               "sage_biology": 26.7, "sage_earth_science": 26.2, "sage_astronomy": 25.5}},
  {"model_name": "OpenAI o4-mini", "organization": "OpenAI", "tokens": "32k",
   "accuracy": 23.5, "mg_pass_2": 13.7, "mg_pass_4": 11.9, "submitted_time": "2024-01-05",
   "results": {"sage_overall": 23.5, "sage_math": 25.2, "sage_physics": 22.8, "sage_chemistry": 22.1,
               "sage_biology": 24.1, "sage_earth_science": 23.6, "sage_astronomy": 23.2}},
  {"model_name": "Qwen3-235B-A22B", "organization": "Alibaba", "tokens": "32k",
   "accuracy": 20.1, "mg_pass_2": 11.2, "mg_pass_4": 9.6, "submitted_time": "2024-01-04",
   "results": {"sage_overall": 20.1, "sage_math": 21.5, "sage_physics": 19.5, "sage_chemistry": 19.2,
               "sage_biology": 20.7, "sage_earth_science": 20.3, "sage_astronomy": 19.4}},
  {"model_name": "GLM-4.5-Thinking", "organization": "Zhipu AI", "tokens": "64k",
   "accuracy": 9.3, "mg_pass_2": 4.7, "mg_pass_4": 4.0, "submitted_time": "2024-01-03",
   "results": {"sage_overall": 9.3, "sage_math": 10.1, "sage_physics": 9.0, "sage_chemistry": 8.7,
               "sage_biology": 9.6, "sage_earth_science": 9.2, "sage_astronomy": 9.2}}
]
```
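For a quick sanity check of these seed results outside the Space, a few lines of pandas reproduce the ranking that `get_leaderboard_dataframe` builds in app.py. This is only an illustrative sketch, not code from the commit; it assumes the file is read from the repository root.

```python
import json

import pandas as pd

with open("initial_sage_results.json", "r", encoding="utf-8") as f:
    entries = json.load(f)

# Flatten the nested per-domain scores next to the top-level metrics.
rows = [
    {"model": e["model_name"], "org": e["organization"], "tokens": e["tokens"],
     "accuracy": e["accuracy"], "mg_pass_2": e["mg_pass_2"], "mg_pass_4": e["mg_pass_4"],
     **e["results"]}
    for e in entries
]

df = pd.DataFrame(rows).sort_values("sage_overall", ascending=False)
print(df[["model", "tokens", "sage_overall", "mg_pass_2", "mg_pass_4"]].to_string(index=False))
```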
reference_answers.json ADDED

```json
{
  "reference_answers": [
    {
      "question_id": 0,
      "domain": "mathematics",
      "question": "What is 6 multiplied by 7?",
      "correct_answer": "42",
      "alternative_answers": ["42", "forty-two", "6×7", "6*7"],
      "explanation": "The multiplication of 6 and 7 equals 42."
    },
    {
      "question_id": 1,
      "domain": "chemistry",
      "question": "What is the chemical formula for water?",
      "correct_answer": "H2O",
      "alternative_answers": ["H2O", "water", "dihydrogen monoxide"],
      "explanation": "Water consists of two hydrogen atoms and one oxygen atom."
    },
    {
      "question_id": 2,
      "domain": "biology",
      "question": "What molecule carries genetic information in living organisms?",
      "correct_answer": "DNA",
      "alternative_answers": ["DNA", "deoxyribonucleic acid", "genetic material"],
      "explanation": "DNA stores and transmits genetic information in all living organisms."
    },
    {
      "question_id": 3,
      "domain": "physics",
      "question": "What is the acceleration due to gravity on Earth?",
      "correct_answer": "9.8 m/s²",
      "alternative_answers": ["9.8 m/s²", "9.81 m/s²", "9.8", "9.81"],
      "explanation": "Earth's gravitational acceleration is approximately 9.8 meters per second squared."
    },
    {
      "question_id": 4,
      "domain": "biology",
      "question": "What is the process by which plants convert sunlight into energy?",
      "correct_answer": "photosynthesis",
      "alternative_answers": ["photosynthesis", "6CO2 + 6H2O + light → C6H12O6 + 6O2"],
      "explanation": "Photosynthesis converts light energy into chemical energy in plants."
    }
  ]
}
```
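The reference file pairs each `correct_answer` with `alternative_answers`, which suggests a simple string-normalised lookup when grading free-form predictions. The matcher below is one possible reading under that assumption and is not part of this commit; the actual SAGE grading described in the About text uses an LLM-as-judge and is more involved.

```python
def is_match(predicted: str, reference: dict) -> bool:
    """Accept a prediction if it matches the correct answer or any listed alternative."""
    accepted = {reference["correct_answer"], *reference.get("alternative_answers", [])}
    # Case-insensitive, whitespace-normalised comparison.
    normalise = lambda s: " ".join(s.strip().lower().split())
    return normalise(predicted) in {normalise(a) for a in accepted}


# Example against question_id 0 above:
reference = {"correct_answer": "42", "alternative_answers": ["42", "forty-two", "6×7", "6*7"]}
assert is_match("Forty-Two", reference)
assert not is_match("41", reference)
```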
requirements.txt CHANGED

```diff
@@ -1,16 +1,8 @@
-APScheduler
-black
 datasets
 gradio
-gradio[oauth]
-gradio_leaderboard==0.0.13
-gradio_client
 huggingface-hub>=0.18.0
-matplotlib
 numpy
 pandas
 python-dateutil
-tokenizers>=0.15.0
-sentencepiece
+openai>=1.0.0
+aiohttp
```
src/about.py CHANGED

````diff
@@ -12,8 +12,13 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    sage_overall = Task("sage_overall", "accuracy", "SAGE Overall")
+    sage_math = Task("sage_math", "accuracy", "Mathematics")
+    sage_physics = Task("sage_physics", "accuracy", "Physics")
+    sage_chemistry = Task("sage_chemistry", "accuracy", "Chemistry")
+    sage_biology = Task("sage_biology", "accuracy", "Biology")
+    sage_earth_science = Task("sage_earth_science", "accuracy", "Earth Science")
+    sage_astronomy = Task("sage_astronomy", "accuracy", "Astronomy")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,52 +26,101 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title"
+TITLE = """<h1 align="center" id="space-title">🧪 SAGE: A High-Difficulty, Multidisciplinary Benchmark for Frontier Scientific Reasoning</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
+SAGE (Scientific Advanced General Evaluation) is a large-scale, high-difficulty, cross-disciplinary benchmark developed by Shanghai AI Laboratory for evaluating frontier scientific reasoning capabilities of Large Language Models (LLMs).
+
+## Benchmark Overview
+SAGE evaluates models across seven core scientific fields (57 sub-fields in total), covering the key domains of AI for Science (AI4S):
+- **Mathematics** - Abstract algebra, analysis, differential equations, and computational mathematics
+- **Physics** - Classical mechanics, electrodynamics, quantum mechanics, thermodynamics, and astrophysics
+- **Chemistry** - Physical chemistry, inorganic chemistry, organic chemistry, and analytical chemistry
+- **Biology** - Genetics, immunology, molecular biology, biophysics, and ecology
+- **Computer Science** - Computer architecture, artificial intelligence, and software fundamentals
+- **Earth Science** - Geography, geodesy, atmospheric chemistry, marine science, and geology
+- **Materials Science** - Composite materials, metal materials, organic polymer materials, and material synthesis
+
+## Evaluation Metrics
+- **Accuracy (%)**: Overall correctness of predictions across all domains, judged by LLM-as-Judge (OpenAI o4-mini / Qwen3-235B-A22B)
+- **mG-Pass@2**: Multi-generation Pass rate for 2 predictions (measures consistency of model outputs)
+- **mG-Pass@4**: Multi-generation Pass rate for 4 predictions (measures stability of reasoning capabilities)
+The leaderboard displays model performance sorted by average accuracy, with domain-specific scores reflecting strengths in different scientific fields. All metrics are derived from the SAGE validation/test set (≈800 expert-created original problems).
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-## How
+## How SAGE Works
+
+SAGE evaluates language models across six scientific domains through a comprehensive assessment of both content generation and reasoning capabilities.
+
+### Evaluation Process:
+1. **Multi-domain Assessment**: Models are tested on questions spanning Mathematics, Physics, Chemistry, Biology, Earth Science, and Astronomy
+2. **Content + Reasoning**: Each submission requires both predicted answers and reasoning explanations
+3. **Accuracy Scoring**: Performance is measured using accuracy metrics across all domains
+4. **Comprehensive Reporting**: Results are aggregated to provide both overall and domain-specific scores
+
+### Submission Format:
+Submissions should follow this JSON structure:
+```json
+{{
+    "submission_org": "Your Organization",
+    "submission_email": "contact@example.com",
+    "predictions": [
+        {{
+            "original_question_id": 0,
+            "content": ["answer1", "answer2", "answer3", "answer4"],
+            "reasoning_content": ["reasoning1", "reasoning2", "reasoning3", "reasoning4"]
+        }}
+    ]
+}}
+```
 
 ## Reproducibility
-To reproduce our results
+To reproduce our evaluation results:
+1. Download the SAGE dataset from our repository
+2. Use the evaluation scripts provided in the benchmark toolkit
+3. Follow the submission format specifications exactly
+4. Submit your results through this leaderboard interface
 
+For detailed instructions, please refer to our [GitHub repository](https://github.com/SHAILab/SAGE) and technical documentation.
 """
 
 EVALUATION_QUEUE_TEXT = """
-##
+## Submit Your SAGE Results
+
+Results can be submitted as evaluation outputs in JSON format. Each submission should include predictions and reasoning content for all test questions.
+
+### Required JSON Format:
+```json
+{
+    "submission_org": "Your Organization",
+    "submission_email": "contact@example.com",
+    "predictions": [
+        {
+            "original_question_id": 0,
+            "content": ["answer1", "answer2", "answer3", "answer4"],
+            "reasoning_content": ["reasoning1", "reasoning2", "reasoning3", "reasoning4"]
+        }
+    ]
+}
 ```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
 
-###
+### Submission Guidelines:
+- Each prediction must include exactly 4 content items and 4 reasoning items
+- Question IDs should match the official SAGE test set
+- Provide clear scientific reasoning for each prediction
+- Ensure JSON format is valid and complete
 
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+Your submission will be automatically evaluated across all scientific domains and added to the leaderboard.
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
+CITATION_BUTTON_TEXT = r"""@article{sage2024,
+    title={SAGE: Science AGent Evaluation for Large Language Models},
+    author={SHAILab Research Team},
+    journal={SciCompass Technical Report},
+    year={2024},
+    url={https://github.com/SHAILab/SAGE}
+}"""
````
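The INTRODUCTION_TEXT above lists mG-Pass@2 and mG-Pass@4 without spelling out a formula, and this commit does not define one. The snippet below is therefore only one plausible reading, stated as an assumption: with n judged generations per question, it reports the probability that a randomly drawn size-k subset of generations is entirely correct, averaged over questions. The official SAGE definition may differ.

```python
from math import comb
from statistics import mean


def strict_subset_pass_at_k(judgments: list[bool], k: int) -> float:
    """Probability that k generations drawn without replacement are all judged correct."""
    n, c = len(judgments), sum(judgments)
    if k > n:
        return 0.0
    return comb(c, k) / comb(n, k) if c >= k else 0.0


def mg_pass_at_k(all_judgments: list[list[bool]], k: int) -> float:
    """Average the per-question subset pass rate over the whole benchmark (in percent)."""
    return 100 * mean(strict_subset_pass_at_k(j, k) for j in all_judgments)


# Two questions with four generations each: (3/6 + 6/6) / 2 = 0.75 -> 75.0
assert mg_pass_at_k([[True, True, False, True], [True, True, True, True]], 2) == 75.0
```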
src/leaderboard/sage_eval.py ADDED

```python
import json
import os
from dataclasses import dataclass
from typing import Dict, List, Any

import numpy as np

from src.display.formatting import make_clickable_model
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType


@dataclass
class SAGEResult:
    """Represents one SAGE evaluation result"""
    submission_id: str
    organization: str
    email: str
    results: Dict[str, float]  # Domain -> accuracy
    num_predictions: int
    submitted_time: str
    status: str = "EVALUATED"

    def to_dict(self):
        """Converts the SAGE Result to a dict compatible with our dataframe display"""
        # Use overall score if available, otherwise calculate average
        if "sage_overall" in self.results:
            average = self.results["sage_overall"]
        else:
            domain_scores = [v for v in self.results.values() if v is not None and isinstance(v, (int, float))]
            average = sum(domain_scores) / len(domain_scores) if domain_scores else 0.0

        # Extract model name from submission_id for initial results
        if self.submission_id.startswith("initial_"):
            model_name = self.submission_id.split("_", 2)[-1].replace("_", " ")
            display_name = f"**{model_name}**"
            model_symbol = "🤖"
        else:
            display_name = f"[{self.organization}]({self.email})"
            model_symbol = "🏢"

        data_dict = {
            "eval_name": self.submission_id,
            AutoEvalColumn.model.name: display_name,
            AutoEvalColumn.model_type_symbol.name: model_symbol,
            AutoEvalColumn.model_type.name: "SAGE Benchmark",
            AutoEvalColumn.precision.name: self.organization,  # Show organization/context info
            AutoEvalColumn.weight_type.name: "Evaluated",
            AutoEvalColumn.architecture.name: "Multi-domain",
            AutoEvalColumn.average.name: round(average, 2),
            AutoEvalColumn.license.name: "N/A",
            AutoEvalColumn.likes.name: 0,
            AutoEvalColumn.params.name: 0,
            AutoEvalColumn.still_on_hub.name: True,
            AutoEvalColumn.revision.name: self.submitted_time,
        }

        # Add domain-specific scores
        for task in Tasks:
            domain_key = task.value.benchmark
            data_dict[task.value.col_name] = self.results.get(domain_key, 0.0)

        return data_dict


def evaluate_sage_submission(submission_data: Dict[str, Any]) -> Dict[str, float]:
    """
    Evaluate a SAGE submission and calculate domain-specific accuracies.
    This is a placeholder function - in practice, you would compare against ground truth.
    """

    # Placeholder evaluation - in real implementation, you would:
    # 1. Load ground truth answers for each question
    # 2. Compare submitted content with ground truth
    # 3. Calculate accuracy for each scientific domain

    predictions = submission_data["predictions"]

    # Simulate domain classification and accuracy calculation
    # In practice, you would have question_id -> domain mapping and ground truth
    domain_counts = {
        "sage_math": 0,
        "sage_physics": 0,
        "sage_chemistry": 0,
        "sage_biology": 0,
        "sage_earth_science": 0,
        "sage_astronomy": 0
    }

    domain_correct = {
        "sage_math": 0,
        "sage_physics": 0,
        "sage_chemistry": 0,
        "sage_biology": 0,
        "sage_earth_science": 0,
        "sage_astronomy": 0
    }

    # Simulate evaluation - replace with actual evaluation logic
    total_questions = len(predictions)
    domain_size = total_questions // 6  # Assume equal distribution for demo

    for i, prediction in enumerate(predictions):
        # Assign questions to domains based on question_id (simplified)
        question_id = prediction["original_question_id"]

        # Simple domain assignment (in practice, use actual question metadata)
        if question_id % 6 == 0:
            domain = "sage_math"
        elif question_id % 6 == 1:
            domain = "sage_physics"
        elif question_id % 6 == 2:
            domain = "sage_chemistry"
        elif question_id % 6 == 3:
            domain = "sage_biology"
        elif question_id % 6 == 4:
            domain = "sage_earth_science"
        else:
            domain = "sage_astronomy"

        domain_counts[domain] += 1

        # Simulate accuracy (replace with actual evaluation against ground truth)
        # For demo purposes, assign random accuracy between 60-90%
        np.random.seed(question_id)  # Consistent "accuracy" for demo
        is_correct = np.random.random() > 0.3  # 70% accuracy simulation

        if is_correct:
            domain_correct[domain] += 1

    # Calculate accuracies
    domain_accuracies = {}
    for domain in domain_counts:
        if domain_counts[domain] > 0:
            accuracy = (domain_correct[domain] / domain_counts[domain]) * 100
            domain_accuracies[domain] = round(accuracy, 2)
        else:
            domain_accuracies[domain] = 0.0

    # Add overall accuracy
    total_correct = sum(domain_correct.values())
    total_questions = sum(domain_counts.values())
    overall_accuracy = (total_correct / total_questions) * 100 if total_questions > 0 else 0.0
    domain_accuracies["sage_overall"] = round(overall_accuracy, 2)

    return domain_accuracies


def load_initial_sage_results() -> List[SAGEResult]:
    """Load initial SAGE results from the provided performance table"""
    initial_results_path = "./initial_sage_results.json"
    sage_results = []

    if os.path.exists(initial_results_path):
        try:
            with open(initial_results_path, 'r') as f:
                initial_data = json.load(f)

            for i, entry in enumerate(initial_data):
                sage_result = SAGEResult(
                    submission_id=f"initial_{i:02d}_{entry['model_name'].replace(' ', '_').replace('-', '_')}",
                    organization=f"{entry['organization']} ({entry['tokens']})",
                    email=f"contact@{entry['organization'].lower().replace(' ', '')}.com",
                    results=entry["results"],
                    num_predictions=1000,  # Estimated from benchmark
                    submitted_time=entry["submitted_time"],
                    status="EVALUATED"
                )
                sage_results.append(sage_result)

        except Exception as e:
            print(f"Error loading initial SAGE results: {e}")

    return sage_results


def process_sage_results_for_leaderboard(submissions_dir: str = "./sage_submissions") -> List[SAGEResult]:
    """Process all SAGE submissions and convert them to leaderboard format"""

    sage_results = []

    # Load initial benchmark results
    sage_results.extend(load_initial_sage_results())

    # Load user submissions if directory exists
    if os.path.exists(submissions_dir):
        for org_dir in os.listdir(submissions_dir):
            org_path = os.path.join(submissions_dir, org_dir)
            if not os.path.isdir(org_path):
                continue

            for file in os.listdir(org_path):
                if file.startswith("submission_") and file.endswith(".json"):
                    try:
                        # Load submission data
                        submission_path = os.path.join(org_path, file)
                        with open(submission_path, 'r') as f:
                            submission_data = json.load(f)

                        # Evaluate the submission
                        domain_accuracies = evaluate_sage_submission(submission_data)

                        # Create result object
                        timestamp = file.replace("submission_", "").replace(".json", "")
                        submission_id = f"{org_dir}_{timestamp}"

                        sage_result = SAGEResult(
                            submission_id=submission_id,
                            organization=submission_data["submission_org"],
                            email=submission_data["submission_email"],
                            results=domain_accuracies,
                            num_predictions=len(submission_data["predictions"]),
                            submitted_time=timestamp,
                            status="EVALUATED"
                        )

                        sage_results.append(sage_result)

                    except Exception as e:
                        print(f"Error processing SAGE submission {file}: {e}")
                        continue

    return sage_results
```
src/populate.py CHANGED

```diff
@@ -7,6 +7,12 @@ from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
+# Import SAGE-specific modules
+try:
+    from src.leaderboard.sage_eval import process_sage_results_for_leaderboard
+except ImportError:
+    process_sage_results_for_leaderboard = None
+
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
@@ -22,8 +28,34 @@ from src.display.formatting import has_no_nan_values, make_clickable_model
     return df
 
 
+def get_sage_leaderboard_df(cols: list, benchmark_cols: list) -> pd.DataFrame:
+    """Creates a dataframe from SAGE evaluation results"""
+    if process_sage_results_for_leaderboard is None:
+        return pd.DataFrame()
+
+    # Get SAGE results
+    sage_results = process_sage_results_for_leaderboard()
+    all_data_json = [result.to_dict() for result in sage_results]
+
+    if not all_data_json:
+        return pd.DataFrame()
+
+    df = pd.DataFrame.from_records(all_data_json)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df[cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+    return df
+
+
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requestes"""
+    if not os.path.exists(save_path):
+        # Return empty dataframes if the path doesn't exist
+        empty_df = pd.DataFrame(columns=cols)
+        return empty_df, empty_df, empty_df
+
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []
```
src/submission/sage_submit.py ADDED

```python
import json
import os
from datetime import datetime, timezone
from typing import Dict, List, Any

from src.display.formatting import styled_error, styled_message, styled_warning


def validate_sage_submission(submission_data: Dict[str, Any]) -> tuple[bool, str]:
    """Validates SAGE benchmark submission format"""

    # Check required top-level fields
    required_fields = ["submission_org", "submission_email", "predictions"]
    for field in required_fields:
        if field not in submission_data:
            return False, f"Missing required field: {field}"

    # Validate email format (basic)
    email = submission_data["submission_email"]
    if "@" not in email or "." not in email:
        return False, "Invalid email format"

    # Validate predictions
    predictions = submission_data["predictions"]
    if not isinstance(predictions, list) or len(predictions) == 0:
        return False, "Predictions must be a non-empty list"

    for i, prediction in enumerate(predictions):
        # Check required prediction fields
        pred_required_fields = ["original_question_id", "content", "reasoning_content"]
        for field in pred_required_fields:
            if field not in prediction:
                return False, f"Missing field '{field}' in prediction {i}"

        # Validate content arrays
        content = prediction["content"]
        reasoning_content = prediction["reasoning_content"]

        if not isinstance(content, list) or len(content) != 4:
            return False, f"Content in prediction {i} must be a list of exactly 4 items"

        if not isinstance(reasoning_content, list) or len(reasoning_content) != 4:
            return False, f"Reasoning content in prediction {i} must be a list of exactly 4 items"

        # Validate question ID
        if not isinstance(prediction["original_question_id"], int):
            return False, f"Question ID in prediction {i} must be an integer"

    return True, "Valid submission format"


def process_sage_submission_simple(submission_file, org_name=None, email=None) -> str:
    """Process SAGE benchmark submission file - simplified version for basic leaderboard"""

    try:
        # Read the submitted file (receives file path)
        if submission_file is None:
            return styled_error("No file uploaded. Please select a JSON file.")

        # submission_file is a file path string
        try:
            with open(submission_file, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            return styled_error(f"Error reading file: {str(e)}")

        # Parse JSON
        try:
            submission_data = json.loads(content)
        except json.JSONDecodeError as e:
            return styled_error(f"Invalid JSON format: {str(e)}")

        # Use form inputs if submission data doesn't contain org/email
        if org_name and email:
            submission_data["submission_org"] = org_name
            submission_data["submission_email"] = email

        # Validate submission format
        is_valid, message = validate_sage_submission(submission_data)
        if not is_valid:
            return styled_error(f"Submission validation failed: {message}")

        # Save submission for later processing
        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        org = submission_data["submission_org"].replace(" ", "_").replace("/", "_")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save raw submission
        submission_dir = f"./sage_submissions/{org}"
        os.makedirs(submission_dir, exist_ok=True)
        raw_submission_path = f"{submission_dir}/submission_{timestamp}.json"

        with open(raw_submission_path, 'w') as f:
            json.dump(submission_data, f, indent=2)

        # Simple evaluation using the evaluation module
        try:
            from src.leaderboard.sage_eval import evaluate_sage_submission
            domain_accuracies = evaluate_sage_submission(submission_data)

            # Update initial_sage_results.json directly for persistence
            initial_results_file = "./initial_sage_results.json"

            try:
                # Load existing initial results
                if os.path.exists(initial_results_file):
                    with open(initial_results_file, 'r') as f:
                        initial_results = json.load(f)
                else:
                    initial_results = []

                # Convert to initial results format
                new_result = {
                    "model_name": submission_data["submission_org"],
                    "organization": submission_data["submission_org"],
                    "tokens": "User Submission",
                    "accuracy": domain_accuracies["sage_overall"],
                    "mg_pass_2": domain_accuracies["sage_overall"],  # Use same value for now
                    "mg_pass_4": domain_accuracies["sage_overall"],  # Use same value for now
                    "submitted_time": datetime.now().strftime("%Y-%m-%d"),
                    "results": domain_accuracies,
                    "contact_email": submission_data["submission_email"]
                }

                # Check if organization already exists, update or add
                org_name = submission_data["submission_org"]
                updated = False
                for i, result in enumerate(initial_results):
                    if (result.get("model_name") == org_name or
                        result.get("organization") == org_name):
                        initial_results[i] = new_result
                        updated = True
                        break

                if not updated:
                    initial_results.append(new_result)

                # Save updated initial results
                with open(initial_results_file, 'w') as f:
                    json.dump(initial_results, f, indent=2)

                print(f"✅ Updated {initial_results_file} with new submission from {org_name}")

            except Exception as e:
                print(f"⚠️ Failed to update initial results file: {e}")

            # Format success message with scores
            overall_accuracy = domain_accuracies.get("sage_overall", 0)

            success_msg = styled_message(
                f"🎉 SAGE submission processed successfully!\n\n"
                f"**Organization:** {submission_data['submission_org']}\n"
                f"**Overall Accuracy:** {overall_accuracy:.2f}%\n\n"
                f"**Domain Scores:**\n"
                f"  • Mathematics: {domain_accuracies.get('sage_math', 0):.2f}%\n"
                f"  • Physics: {domain_accuracies.get('sage_physics', 0):.2f}%\n"
                f"  • Chemistry: {domain_accuracies.get('sage_chemistry', 0):.2f}%\n"
                f"  • Biology: {domain_accuracies.get('sage_biology', 0):.2f}%\n"
                f"  • Earth Science: {domain_accuracies.get('sage_earth_science', 0):.2f}%\n"
                f"  • Astronomy: {domain_accuracies.get('sage_astronomy', 0):.2f}%\n\n"
                f"Your results have been added to the leaderboard. "
                f"Please refresh the page to see updated rankings."
            )

            return success_msg

        except Exception as eval_error:
            # If evaluation fails, still save submission but mark as failed
            return styled_warning(
                f"⚠️ Submission received but evaluation failed.\n\n"
                f"Error: {str(eval_error)}\n\n"
                f"Your submission has been saved and will be processed manually. "
                f"Please contact administrators if this issue persists."
            )

    except Exception as e:
        return styled_error(f"Submission processing failed: {str(e)}")


def load_sage_submissions(submissions_dir: str = "./sage_submissions") -> List[Dict]:
    """Load all SAGE submissions for display in queue"""

    if not os.path.exists(submissions_dir):
        return []

    submissions = []

    for org_dir in os.listdir(submissions_dir):
        org_path = os.path.join(submissions_dir, org_dir)
        if not os.path.isdir(org_path):
            continue

        for file in os.listdir(org_path):
            if file.startswith("submission_") and file.endswith(".json"):
                try:
                    with open(os.path.join(org_path, file), 'r') as f:
                        submission = json.load(f)
                    # Add metadata
                    submission["_filename"] = file
                    submission["_org_dir"] = org_dir
                    submissions.append(submission)
                except Exception:
                    continue

    # Sort by submission time (most recent first)
    submissions.sort(key=lambda x: x.get("_filename", ""), reverse=True)
    return submissions
```
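Before uploading through the Space UI, a submitter can exercise `validate_sage_submission` locally against a file written in the documented format. The snippet below is a hypothetical local check, not part of this commit; it assumes it is run from the repository root (so that `src` is importable) and that a `sage_submission.json` file exists.

```python
import json

from src.submission.sage_submit import validate_sage_submission

with open("sage_submission.json", "r", encoding="utf-8") as f:
    submission = json.load(f)

ok, message = validate_sage_submission(submission)
print("PASS" if ok else "FAIL", "-", message)
# A prediction with fewer than 4 content items, a missing field, or a non-integer
# question id is reported here with the same error text the Space returns.
```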