🧪 SAGE: A High-Difficulty, Multidisciplinary Benchmark for Frontier Scientific Reasoning

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard 
    sage_overall = Task("sage_overall", "accuracy", "SAGE Overall")
    sage_math = Task("sage_math", "accuracy", "Mathematics")
    sage_physics = Task("sage_physics", "accuracy", "Physics")
    sage_chemistry = Task("sage_chemistry", "accuracy", "Chemistry")
    sage_biology = Task("sage_biology", "accuracy", "Biology")
    sage_earth_science = Task("sage_earth_science", "accuracy", "Earth Science")
    sage_astronomy = Task("sage_astronomy", "accuracy", "Astronomy")

NUM_FEWSHOT = 0 # Change with your few shot
# ---------------------------------------------------


# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">🧪 SAGE: A High-Difficulty, Multidisciplinary Benchmark for Frontier Scientific Reasoning</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
SAGE (Scientific Advanced General Evaluation) is a large-scale, high-difficulty, cross-disciplinary benchmark developed by Shanghai AI Laboratory for evaluating frontier scientific reasoning capabilities of Large Language Models (LLMs).

## Benchmark Overview
SAGE evaluates models across seven core scientific fields (57 sub-fields in total), covering the key domains of AI for Science (AI4S):
- **Mathematics** - Abstract algebra, analysis, differential equations, and computational mathematics
- **Physics** - Classical mechanics, electrodynamics, quantum mechanics, thermodynamics, and astrophysics
- **Chemistry** - Physical chemistry, inorganic chemistry, organic chemistry, and analytical chemistry
- **Biology** - Genetics, immunology, molecular biology, biophysics, and ecology
- **Computer Science** - Computer architecture, artificial intelligence, and software fundamentals
- **Earth Science** - Geography, geodesy, atmospheric chemistry, marine science, and geology
- **Materials Science** - Composite materials, metal materials, organic polymer materials, and material synthesis

## Evaluation Metrics
- **Accuracy (%)**: Overall correctness of predictions across all domains, judged by LLM-as-Judge (OpenAI o4-mini / Qwen3-235B-A22B)
- **mG-Pass@2**: Multi-generation Pass rate for 2 predictions (measures consistency of model outputs)
- **mG-Pass@4**: Multi-generation Pass rate for 4 predictions (measures stability of reasoning capabilities)
The leaderboard displays model performance sorted by average accuracy, with domain-specific scores reflecting strengths in different scientific fields. All metrics are derived from the SAGE validation/test set (≈800 expert-created original problems).
"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## How SAGE Works

SAGE evaluates language models across six scientific domains through a comprehensive assessment of both content generation and reasoning capabilities.

### Evaluation Process:
1. **Multi-domain Assessment**: Models are tested on questions spanning Mathematics, Physics, Chemistry, Biology, Earth Science, and Astronomy
2. **Content + Reasoning**: Each submission requires both predicted answers and reasoning explanations  
3. **Accuracy Scoring**: Performance is measured using accuracy metrics across all domains
4. **Comprehensive Reporting**: Results are aggregated to provide both overall and domain-specific scores

### Submission Format:
Submissions should follow this JSON structure:
```json
{{
    "submission_org": "Your Organization",
    "submission_email": "contact@example.com",
    "predictions": [
        {{
            "original_question_id": 0,
            "content": ["answer1", "answer2", "answer3", "answer4"],
            "reasoning_content": ["reasoning1", "reasoning2", "reasoning3", "reasoning4"]
        }}
    ]
}}
```

## Reproducibility
To reproduce our evaluation results:
1. Download the SAGE dataset from our repository
2. Use the evaluation scripts provided in the benchmark toolkit
3. Follow the submission format specifications exactly
4. Submit your results through this leaderboard interface

For detailed instructions, please refer to our [GitHub repository](https://github.com/SHAILab/SAGE) and technical documentation.
"""

EVALUATION_QUEUE_TEXT = """
## Submit Your SAGE Results

Results can be submitted as evaluation outputs in JSON format. Each submission should include predictions and reasoning content for all test questions.

### Required JSON Format:
```json
{
    "submission_org": "Your Organization",
    "submission_email": "contact@example.com",
    "predictions": [
        {
            "original_question_id": 0,
            "content": ["answer1", "answer2", "answer3", "answer4"],
            "reasoning_content": ["reasoning1", "reasoning2", "reasoning3", "reasoning4"]
        }
    ]
}
```

### Submission Guidelines:
- Each prediction must include exactly 4 content items and 4 reasoning items
- Question IDs should match the official SAGE test set
- Provide clear scientific reasoning for each prediction
- Ensure JSON format is valid and complete

Your submission will be automatically evaluated across all scientific domains and added to the leaderboard.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@article{sage2024,
    title={SAGE: Science AGent Evaluation for Large Language Models},
    author={SHAILab Research Team},
    journal={SciCompass Technical Report},
    year={2024},
    url={https://github.com/SHAILab/SAGE}
}"""