SAGE-Bench

Sleeping

App Files Community

SAGE OSS Evaluator committed on Sep 8

Commit

b1a0fd2

1 Parent(s): 7844386

update

Browse files

Files changed (3) hide show

src/leaderboard/read_evals.py +0 -196
src/leaderboard/sage_eval.py +0 -238
src/populate.py +11 -106

src/leaderboard/read_evals.py DELETED Viewed

@@ -1,196 +0,0 @@
-import glob
-import json
-import math
-import os
-from dataclasses import dataclass
-import dateutil
-import numpy as np
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-from src.submission.check_validity import is_model_on_hub
-@dataclass
-class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
-    org: str
-    model: str
-    revision: str # commit hash, "" if main
-    results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = "" # submission date of request file
-    still_on_hub: bool = False
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-        config = data.get("config")
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
-        )
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        }
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-        return data_dict
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict() # we test if the dict version is complete
-            results.append(v)
-        except KeyError:  # not all eval values present
-            continue
-    return results

src/leaderboard/sage_eval.py DELETED Viewed

@@ -1,238 +0,0 @@
-import json
-import os
-from dataclasses import dataclass
-from typing import Dict, List, Any
-import numpy as np
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-@dataclass
-class SAGEResult:
-    """Represents one SAGE evaluation result"""
-    submission_id: str
-    organization: str
-    email: str
-    results: Dict[str, float]  # Domain -> accuracy
-    num_predictions: int
-    submitted_time: str
-    status: str = "EVALUATED"
-    def to_dict(self):
-        """Converts the SAGE Result to a dict compatible with our dataframe display"""
-        # Use overall score if available, otherwise calculate average
-        if "sage_overall" in self.results:
-            average = self.results["sage_overall"]
-        else:
-            domain_scores = [v for v in self.results.values() if v is not None and isinstance(v, (int, float))]
-            average = sum(domain_scores) / len(domain_scores) if domain_scores else 0.0
-        # Extract model name from submission_id for initial results
-        if self.submission_id.startswith("initial_"):
-            model_name = self.submission_id.split("_", 2)[-1].replace("_", " ")
-            display_name = f"**{model_name}**"
-            model_symbol = "🤖"
-        else:
-            display_name = f"[{self.organization}]({self.email})"
-            model_symbol = "🏢"
-        data_dict = {
-            "eval_name": self.submission_id,
-            AutoEvalColumn.model.name: display_name,
-            AutoEvalColumn.model_type_symbol.name: model_symbol,
-            AutoEvalColumn.model_type.name: "SAGE Benchmark",
-            AutoEvalColumn.precision.name: self.organization,  # Show organization/context info
-            AutoEvalColumn.weight_type.name: "Evaluated",
-            AutoEvalColumn.architecture.name: "Multi-domain",
-            AutoEvalColumn.average.name: round(average, 2),
-            AutoEvalColumn.license.name: "N/A",
-            AutoEvalColumn.likes.name: 0,
-            AutoEvalColumn.params.name: 0,
-            AutoEvalColumn.still_on_hub.name: True,
-            AutoEvalColumn.revision.name: self.submitted_time,
-        }
-        # Add domain-specific scores
-        for task in Tasks:
-            domain_key = task.value.benchmark
-            data_dict[task.value.col_name] = self.results.get(domain_key, 0.0)
-        return data_dict
-def evaluate_sage_submission(submission_data: Dict[str, Any]) -> Dict[str, float]:
-    """
-    Evaluate a SAGE submission and calculate domain-specific accuracies.
-    This is a placeholder function - in practice, you would compare against ground truth.
-    """
-    # Placeholder evaluation - in real implementation, you would:
-    # 1. Load ground truth answers for each question
-    # 2. Compare submitted content with ground truth
-    # 3. Calculate accuracy for each scientific domain
-    predictions = submission_data["predictions"]
-    # Simulate domain classification and accuracy calculation
-    # In practice, you would have question_id -> domain mapping and ground truth
-    domain_counts = {
-        "sage_math": 0,
-        "sage_physics": 0,
-        "sage_chemistry": 0,
-        "sage_biology": 0,
-        "sage_earth_science": 0,
-        "sage_astronomy": 0
-    }
-    domain_correct = {
-        "sage_math": 0,
-        "sage_physics": 0,
-        "sage_chemistry": 0,
-        "sage_biology": 0,
-        "sage_earth_science": 0,
-        "sage_astronomy": 0
-    }
-    # Simulate evaluation - replace with actual evaluation logic
-    total_questions = len(predictions)
-    domain_size = total_questions // 6  # Assume equal distribution for demo
-    for i, prediction in enumerate(predictions):
-        # Assign questions to domains based on question_id (simplified)
-        question_id = prediction["original_question_id"]
-        # Simple domain assignment (in practice, use actual question metadata)
-        if question_id % 6 == 0:
-            domain = "sage_math"
-        elif question_id % 6 == 1:
-            domain = "sage_physics"
-        elif question_id % 6 == 2:
-            domain = "sage_chemistry"
-        elif question_id % 6 == 3:
-            domain = "sage_biology"
-        elif question_id % 6 == 4:
-            domain = "sage_earth_science"
-        else:
-            domain = "sage_astronomy"
-        domain_counts[domain] += 1
-        # Simulate accuracy (replace with actual evaluation against ground truth)
-        # For demo purposes, assign random accuracy between 60-90%
-        np.random.seed(question_id)  # Consistent "accuracy" for demo
-        is_correct = np.random.random() > 0.3  # 70% accuracy simulation
-        if is_correct:
-            domain_correct[domain] += 1
-    # Calculate accuracies
-    domain_accuracies = {}
-    for domain in domain_counts:
-        if domain_counts[domain] > 0:
-            accuracy = (domain_correct[domain] / domain_counts[domain]) * 100
-            domain_accuracies[domain] = round(accuracy, 2)
-        else:
-            domain_accuracies[domain] = 0.0
-    # Add overall accuracy
-    total_correct = sum(domain_correct.values())
-    total_questions = sum(domain_counts.values())
-    overall_accuracy = (total_correct / total_questions) * 100 if total_questions > 0 else 0.0
-    domain_accuracies["sage_overall"] = round(overall_accuracy, 2)
-    return domain_accuracies
-def load_initial_sage_results() -> List[SAGEResult]:
-    """Load initial SAGE results from the provided performance table"""
-    # Try multiple possible paths for the initial results file
-    possible_paths = [
-        "./initial_sage_results.json",
-        "initial_sage_results.json",
-        os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "initial_sage_results.json")
-    ]
-    initial_results_path = None
-    for path in possible_paths:
-        if os.path.exists(path):
-            initial_results_path = path
-            break
-    sage_results = []
-    if initial_results_path:
-        try:
-            with open(initial_results_path, 'r') as f:
-                initial_data = json.load(f)
-            for i, entry in enumerate(initial_data):
-                sage_result = SAGEResult(
-                    submission_id=f"initial_{i:02d}_{entry['model_name'].replace(' ', '_').replace('-', '_')}",
-                    organization=f"{entry['organization']} ({entry['tokens']})",
-                    email=f"contact@{entry['organization'].lower().replace(' ', '')}.com",
-                    results=entry["results"],
-                    num_predictions=1000,  # Estimated from benchmark
-                    submitted_time=entry["submitted_time"],
-                    status="EVALUATED"
-                )
-                sage_results.append(sage_result)
-        except Exception as e:
-            print(f"Error loading initial SAGE results from {initial_results_path}: {e}")
-    else:
-        print(f"Initial SAGE results file not found. Tried paths: {possible_paths}")
-        print(f"Current working directory: {os.getcwd()}")
-        print(f"Files in current directory: {os.listdir('.')}")
-    return sage_results
-def process_sage_results_for_leaderboard(submissions_dir: str = "./sage_submissions") -> List[SAGEResult]:
-    """Process all SAGE submissions and convert them to leaderboard format"""
-    sage_results = []
-    # Load initial benchmark results
-    sage_results.extend(load_initial_sage_results())
-    # Load user submissions if directory exists
-    if os.path.exists(submissions_dir):
-        for org_dir in os.listdir(submissions_dir):
-            org_path = os.path.join(submissions_dir, org_dir)
-            if not os.path.isdir(org_path):
-                continue
-            for file in os.listdir(org_path):
-                if file.startswith("submission_") and file.endswith(".json"):
-                    try:
-                        # Load submission data
-                        submission_path = os.path.join(org_path, file)
-                        with open(submission_path, 'r') as f:
-                            submission_data = json.load(f)
-                        # Evaluate the submission
-                        domain_accuracies = evaluate_sage_submission(submission_data)
-                        # Create result object
-                        timestamp = file.replace("submission_", "").replace(".json", "")
-                        submission_id = f"{org_dir}_{timestamp}"
-                        sage_result = SAGEResult(
-                            submission_id=submission_id,
-                            organization=submission_data["submission_org"],
-                            email=submission_data["submission_email"],
-                            results=domain_accuracies,
-                            num_predictions=len(submission_data["predictions"]),
-                            submitted_time=timestamp,
-                            status="EVALUATED"
-                        )
-                        sage_results.append(sage_result)
-                    except Exception as e:
-                        print(f"Error processing SAGE submission {file}: {e}")
-                        continue
-    return sage_results

src/populate.py CHANGED Viewed

@@ -5,8 +5,7 @@ import pandas as pd
 from typing import List
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-# from src.leaderboard.read_evals import get_raw_eval_results  # Removed to avoid transformers dependency
 # Import SAGE-specific modules - avoid transformers dependency
 process_sage_results_for_leaderboard = None
@@ -73,13 +72,12 @@ try:
             return data_dict
-    def load_initial_sage_results_local() -> List[SAGEResult]:
-        """Load initial SAGE results from OSS or local files"""
         sage_results = []
-        # 尝试从OSS加载
         try:
-            # 导入OSS排行榜管理器（现在在本地oss目录中）
             from src.oss.oss_leaderboard_manager import OSSLeaderboardManager
             # 从OSS加载排行榜数据
@@ -100,78 +98,26 @@ try:
                         status="EVALUATED"
                     )
                     sage_results.append(sage_result)
-                return sage_results
             else:
-                print("⚠️ OSS中未找到排行榜数据，尝试本地文件")
         except Exception as e:
-            print(f"⚠️ 从OSS加载排行榜失败: {e}")
-            print("🔄 回退到本地文件模式")
-        # 回退到本地文件模式
-        possible_paths = [
-            "./initial_sage_results.json",
-            "initial_sage_results.json",
-            os.path.join(os.path.dirname(os.path.dirname(__file__)), "initial_sage_results.json")
-        ]
-        initial_results_path = None
-        for path in possible_paths:
-            if os.path.exists(path):
-                initial_results_path = path
-                break
-        if initial_results_path:
-            try:
-                with open(initial_results_path, 'r') as f:
-                    initial_data = json.load(f)
-                print(f"✅ 从本地文件加载了 {len(initial_data)} 条排行榜记录: {initial_results_path}")
-                for i, entry in enumerate(initial_data):
-                    sage_result = SAGEResult(
-                        submission_id=f"local_{i:02d}_{entry['model_name'].replace(' ', '_').replace('-', '_')}",
-                        organization=f"{entry['organization']} ({entry.get('tokens', 'N/A')})",
-                        email=entry.get('contact_email', f"contact@{entry['organization'].lower().replace(' ', '')}.com"),
-                        results=entry["results"],
-                        num_predictions=1000,
-                        submitted_time=entry["submitted_time"],
-                        status="EVALUATED"
-                    )
-                    sage_results.append(sage_result)
-            except Exception as e:
-                print(f"❌ 从本地文件加载排行榜失败 {initial_results_path}: {e}")
-        else:
-            print(f"❌ 未找到排行榜文件。尝试过的路径: {possible_paths}")
         return sage_results
-    def process_sage_results_for_leaderboard_local(submissions_dir: str = "./sage_submissions") -> List[SAGEResult]:
-        """Process all SAGE submissions without external dependencies"""
-        sage_results = []
-        # Load initial benchmark results
-        sage_results.extend(load_initial_sage_results_local())
-        return sage_results
     # Set the function
-    process_sage_results_for_leaderboard = process_sage_results_for_leaderboard_local
 except ImportError as e:
     print(f"Could not set up SAGE results processing: {e}")
     process_sage_results_for_leaderboard = None
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    """Creates a dataframe from all the individual experiment results - disabled for SAGE"""
-    # For SAGE, we use get_sage_leaderboard_df instead
-    print("⚠️ get_leaderboard_df called - use get_sage_leaderboard_df for SAGE instead")
-    return pd.DataFrame()
 def get_sage_leaderboard_df(cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from SAGE evaluation results"""
     if process_sage_results_for_leaderboard is None:
@@ -190,45 +136,4 @@ def get_sage_leaderboard_df(cols: list, benchmark_cols: list) -> pd.DataFrame:
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
-def get_evaluation_queue_df(save_path: str, cols: list) -> List[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    if not os.path.exists(save_path):
-        # Return empty dataframes if the path doesn't exist
-        empty_df = pd.DataFrame(columns=cols)
-        return empty_df, empty_df, empty_df
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]

 from typing import List
 from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.utils import AutoEvalColumn
 # Import SAGE-specific modules - avoid transformers dependency
 process_sage_results_for_leaderboard = None
             return data_dict
+    def load_initial_sage_results_from_oss() -> List[SAGEResult]:
+        """Load initial SAGE results from OSS"""
         sage_results = []
         try:
+            # 导入OSS排行榜管理器
             from src.oss.oss_leaderboard_manager import OSSLeaderboardManager
             # 从OSS加载排行榜数据
                         status="EVALUATED"
                     )
                     sage_results.append(sage_result)
             else:
+                print("⚠️ OSS中未找到排行榜数据")
         except Exception as e:
+            print(f"❌ 从OSS加载排行榜失败: {e}")
         return sage_results
+    def process_sage_results_for_leaderboard_oss() -> List[SAGEResult]:
+        """Process all SAGE results from OSS"""
+        return load_initial_sage_results_from_oss()
     # Set the function
+    process_sage_results_for_leaderboard = process_sage_results_for_leaderboard_oss
 except ImportError as e:
     print(f"Could not set up SAGE results processing: {e}")
     process_sage_results_for_leaderboard = None
 def get_sage_leaderboard_df(cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from SAGE evaluation results"""
     if process_sage_results_for_leaderboard is None:
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
+    return df