import json
import os
import pandas as pd
from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results
# Import SAGE-specific modules - avoid transformers dependency
process_sage_results_for_leaderboard = None
try:
    # Import SAGE modules without triggering the transformers dependency
    from dataclasses import dataclass
    from typing import Dict, List
# Copy SAGEResult class locally to avoid import issues
@dataclass
class SAGEResult:
submission_id: str
organization: str
email: str
results: Dict[str, float]
num_predictions: int
submitted_time: str
status: str = "EVALUATED"
def to_dict(self):
"""Converts the SAGE Result to a dict compatible with our dataframe display"""
# Use overall score if available, otherwise calculate average
if "sage_overall" in self.results:
average = self.results["sage_overall"]
else:
domain_scores = [v for v in self.results.values() if v is not None and isinstance(v, (int, float))]
average = sum(domain_scores) / len(domain_scores) if domain_scores else 0.0
# Extract model name from submission_id for initial results
if self.submission_id.startswith("initial_"):
model_name = self.submission_id.split("_", 2)[-1].replace("_", " ")
display_name = f"**{model_name}**"
model_symbol = "🤖"
else:
display_name = f"[{self.organization}]({self.email})"
model_symbol = "🏢"
from src.display.utils import AutoEvalColumn, Tasks
data_dict = {
"eval_name": self.submission_id,
AutoEvalColumn.model.name: display_name,
AutoEvalColumn.model_type_symbol.name: model_symbol,
AutoEvalColumn.model_type.name: "SAGE Benchmark",
AutoEvalColumn.precision.name: self.organization,
AutoEvalColumn.weight_type.name: "Evaluated",
AutoEvalColumn.architecture.name: "Multi-domain",
AutoEvalColumn.average.name: round(average, 2),
AutoEvalColumn.license.name: "N/A",
AutoEvalColumn.likes.name: 0,
AutoEvalColumn.params.name: 0,
AutoEvalColumn.still_on_hub.name: True,
AutoEvalColumn.revision.name: self.submitted_time,
}
# Add domain-specific scores
for task in Tasks:
domain_key = task.value.benchmark
data_dict[task.value.col_name] = self.results.get(domain_key, 0.0)
return data_dict
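    # Worked example of the fallback average above (the domain names are hypothetical):
    # results = {"domain_a": 0.62, "domain_b": 0.48} with no "sage_overall" key
    # gives average = (0.62 + 0.48) / 2 = 0.55, which to_dict() then rounds to
    # two decimals for the leaderboard's average column.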
def load_initial_sage_results_local() -> List[SAGEResult]:
"""Load initial SAGE results without external dependencies"""
possible_paths = [
"./initial_sage_results.json",
"initial_sage_results.json",
os.path.join(os.path.dirname(os.path.dirname(__file__)), "initial_sage_results.json")
]
initial_results_path = None
for path in possible_paths:
if os.path.exists(path):
initial_results_path = path
break
sage_results = []
if initial_results_path:
try:
with open(initial_results_path, 'r') as f:
initial_data = json.load(f)
for i, entry in enumerate(initial_data):
sage_result = SAGEResult(
submission_id=f"initial_{i:02d}_{entry['model_name'].replace(' ', '_').replace('-', '_')}",
organization=f"{entry['organization']} ({entry['tokens']})",
email=f"contact@{entry['organization'].lower().replace(' ', '')}.com",
results=entry["results"],
num_predictions=1000,
submitted_time=entry["submitted_time"],
status="EVALUATED"
)
sage_results.append(sage_result)
except Exception as e:
print(f"Error loading initial SAGE results from {initial_results_path}: {e}")
else:
print(f"Initial SAGE results file not found. Tried paths: {possible_paths}")
return sage_results
    def process_sage_results_for_leaderboard_local(submissions_dir: str = "./sage_submissions") -> List[SAGEResult]:
        """Process SAGE results without external dependencies.

        Currently only the bundled initial benchmark results are loaded;
        user submissions in `submissions_dir` are not processed yet.
        """
        sage_results = []
        # Load initial benchmark results
        sage_results.extend(load_initial_sage_results_local())
        return sage_results
# Set the function
process_sage_results_for_leaderboard = process_sage_results_for_leaderboard_local
except ImportError as e:
print(f"Could not set up SAGE results processing: {e}")
process_sage_results_for_leaderboard = None
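# load_initial_sage_results_local() above expects each entry of initial_sage_results.json
# to look roughly like the sketch below. Only the top-level keys are taken from the code;
# the domain keys inside "results" and the example values are assumptions and must match
# the Tasks benchmarks defined in src.display.utils.
#
# {
#     "model_name": "Example Model",
#     "organization": "Example Org",
#     "tokens": "8K",
#     "submitted_time": "2024-01-01",
#     "results": {"sage_overall": 0.55, "domain_a": 0.62, "domain_b": 0.48}
# }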
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
"""Creates a dataframe from all the individual experiment results"""
raw_data = get_raw_eval_results(results_path, requests_path)
all_data_json = [v.to_dict() for v in raw_data]
df = pd.DataFrame.from_records(all_data_json)
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
df = df[cols].round(decimals=2)
# filter out if any of the benchmarks have not been produced
df = df[has_no_nan_values(df, benchmark_cols)]
return df
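# Illustrative call only -- the paths are placeholders, and the column lists are
# normally the COLS / BENCHMARK_COLS constants the app builds in src.display.utils
# (assumed here, not imported by this module):
#
#     leaderboard_df = get_leaderboard_df(
#         results_path="./eval-results",
#         requests_path="./eval-queue",
#         cols=COLS,
#         benchmark_cols=BENCHMARK_COLS,
#     )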
def get_sage_leaderboard_df(cols: list, benchmark_cols: list) -> pd.DataFrame:
"""Creates a dataframe from SAGE evaluation results"""
if process_sage_results_for_leaderboard is None:
return pd.DataFrame()
# Get SAGE results
sage_results = process_sage_results_for_leaderboard()
all_data_json = [result.to_dict() for result in sage_results]
if not all_data_json:
return pd.DataFrame()
df = pd.DataFrame.from_records(all_data_json)
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
df = df[cols].round(decimals=2)
# filter out if any of the benchmarks have not been produced
df = df[has_no_nan_values(df, benchmark_cols)]
return df
def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Creates the dataframes for the finished, running, and pending evaluation queue requests"""
if not os.path.exists(save_path):
# Return empty dataframes if the path doesn't exist
empty_df = pd.DataFrame(columns=cols)
return empty_df, empty_df, empty_df
entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
all_evals = []
for entry in entries:
if ".json" in entry:
file_path = os.path.join(save_path, entry)
with open(file_path) as fp:
data = json.load(fp)
data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
data[EvalQueueColumn.revision.name] = data.get("revision", "main")
all_evals.append(data)
elif ".md" not in entry:
# this is a folder
            sub_entries = [e for e in os.listdir(os.path.join(save_path, entry)) if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")]
for sub_entry in sub_entries:
file_path = os.path.join(save_path, entry, sub_entry)
with open(file_path) as fp:
data = json.load(fp)
data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
data[EvalQueueColumn.revision.name] = data.get("revision", "main")
all_evals.append(data)
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
df_running = pd.DataFrame.from_records(running_list, columns=cols)
df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
return df_finished[cols], df_running[cols], df_pending[cols]
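
# A minimal local smoke test (illustrative sketch only; the queue path and the column
# choices below are assumptions -- the real app wires these arguments up itself):
if __name__ == "__main__":
    sage_cols = [AutoEvalColumn.model.name, AutoEvalColumn.average.name]
    sage_df = get_sage_leaderboard_df(cols=sage_cols, benchmark_cols=[AutoEvalColumn.average.name])
    print(f"SAGE leaderboard rows: {len(sage_df)}")

    queue_cols = [EvalQueueColumn.model.name, EvalQueueColumn.revision.name]
    finished_df, running_df, pending_df = get_evaluation_queue_df(save_path="./eval-queue", cols=queue_cols)
    print(f"finished={len(finished_df)}, running={len(running_df)}, pending={len(pending_df)}")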