SAGE OSS Evaluator committed
Commit 326dd8e · 1 parent: b3a5ff6

Files changed (4):
  1. README.md +1 -1
  2. app.py +5 -35
  3. src/oss/oss_leaderboard_manager.py +8 -4
  4. src/populate.py +38 -41
README.md CHANGED
@@ -9,7 +9,7 @@ pinned: true
 license: apache-2.0
 short_description: SAGE Scientific Reasoning Benchmark Leaderboard
 sdk_version: 5.43.1
-hf_oauth: true # ← new: enable OAuth
+hf_oauth: true
 tags:
 - leaderboard
 - science
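
The change keeps `hf_oauth: true` in the Space metadata and only drops the inline comment. For context, a minimal sketch of how a Gradio app on an OAuth-enabled Space can read the signed-in user; the greeting function and its wiring are illustrative, not part of this commit:

```python
import gradio as gr

# Minimal sketch, assuming hf_oauth: true in the Space README metadata.
# Gradio injects the OAuth session into any function whose parameter is
# annotated as gr.OAuthProfile | None.
def greet(profile: gr.OAuthProfile | None) -> str:
    if profile is None:
        return "Please sign in with Hugging Face."
    return f"Signed in as {profile.username}"

with gr.Blocks() as demo:
    gr.LoginButton()  # renders the "Sign in with Hugging Face" button
    out = gr.Markdown()
    demo.load(greet, inputs=None, outputs=out)

demo.launch()
```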
app.py CHANGED
@@ -173,43 +173,13 @@ def get_leaderboard_dataframe():
         return pd.DataFrame()

     try:
-        sage_results = process_sage_results_for_leaderboard()
-        print(f"📊 Loaded {len(sage_results)} SAGE results")
+        # Use the updated get_sage_leaderboard_df function
+        df = get_sage_leaderboard_df()

-        if not sage_results:
+        if df.empty:
             print("❌ No SAGE results found")
             return pd.DataFrame()

-        # Convert to leaderboard format
-        leaderboard_data = []
-        for result in sage_results:
-            # Extract model name from submission_id
-            if result.submission_id.startswith("initial_"):
-                model_name = result.submission_id.split("_", 2)[-1].replace("_", " ")
-            else:
-                model_name = result.submission_id
-
-            # Create model hyperlink (for now just display name)
-            model_display = f"**{model_name}**"
-
-            row = {
-                "Model": model_display,
-                "Organization": result.organization,
-                "Overall (%)": result.results.get("sage_overall", 0),
-                "Mathematics (%)": result.results.get("sage_math", 0),
-                "Physics (%)": result.results.get("sage_physics", 0),
-                "Chemistry (%)": result.results.get("sage_chemistry", 0),
-                "Biology (%)": result.results.get("sage_biology", 0),
-                "Earth Science (%)": result.results.get("sage_earth_science", 0),
-                "Astronomy (%)": result.results.get("sage_astronomy", 0),
-                "Submission Date": result.submitted_time
-            }
-            leaderboard_data.append(row)
-
-        df = pd.DataFrame(leaderboard_data)
-        if not df.empty:
-            df = df.sort_values(by=["Overall (%)"], ascending=False)
-
         print(f"✅ Generated dataframe with {len(df)} rows")
         return df

@@ -230,7 +200,7 @@ leaderboard_df = get_leaderboard_dataframe()
 print(f"📈 Leaderboard initialized with {len(leaderboard_df)} rows")

 # Define column types for the dataframe
-COLUMN_TYPES = ["markdown", "str", "number", "number", "number", "number", "number", "number", "number", "str"]
+COLUMN_TYPES = ["str", "markdown", "str", "str", "number", "number", "number", "str"]


 # Create Gradio interface

@@ -269,7 +239,7 @@ with demo:
         datatype=COLUMN_TYPES,
         interactive=False,
         wrap=True,
-        column_widths=["25%", "15%", "8%", "8%", "8%", "8%", "8%", "8%", "8%", "12%"]
+        column_widths=["8%", "25%", "15%", "10%", "12%", "12%", "12%", "12%"]
     )

     # Refresh button
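
The new eight-entry COLUMN_TYPES and column_widths line up one-to-one with the columns the rewritten get_sage_leaderboard_df emits (see src/populate.py below): eval_name, Model, Organization, Tokens, the three metric columns, and Submission Date. A sketch of the resulting table wiring, with a stub dataframe standing in for the app's real loader:

```python
import gradio as gr
import pandas as pd

# Stub standing in for app.py's get_leaderboard_dataframe(); the real one
# delegates to get_sage_leaderboard_df() in src/populate.py.
def get_leaderboard_dataframe() -> pd.DataFrame:
    return pd.DataFrame(columns=[
        "eval_name", "Model", "Organization", "Tokens",
        "Accuracy (%)", "mG-Pass@2 (%)", "mG-Pass@4 (%)", "Submission Date",
    ])

# One datatype and one width per column, in the order above.
COLUMN_TYPES = ["str", "markdown", "str", "str", "number", "number", "number", "str"]

with gr.Blocks() as demo:
    gr.Dataframe(
        value=get_leaderboard_dataframe(),
        datatype=COLUMN_TYPES,
        interactive=False,
        wrap=True,
        column_widths=["8%", "25%", "15%", "10%", "12%", "12%", "12%", "12%"],
    )
```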
src/oss/oss_leaderboard_manager.py CHANGED
@@ -168,9 +168,9 @@ class OSSLeaderboardManager:
         print(f"➕ Adding new leaderboard entry: {result_data.get('organization')}")
         leaderboard_data.append(result_data)

-        # Sort by overall score
+        # Sort by accuracy
         leaderboard_data.sort(
-            key=lambda x: x.get("results", {}).get("sage_overall", 0),
+            key=lambda x: x.get("accuracy", 0),
             reverse=True
         )

@@ -207,8 +207,12 @@ class OSSLeaderboardManager:
         # Get the top scores
         top_scores = {}
         if leaderboard_data:
-            top_entry = leaderboard_data[0]  # already sorted by score
-            top_scores = top_entry.get("results", {})
+            top_entry = leaderboard_data[0]  # already sorted by accuracy
+            top_scores = {
+                "accuracy": top_entry.get("accuracy", 0),
+                "mg_pass_2": top_entry.get("mg_pass_2", 0),
+                "mg_pass_4": top_entry.get("mg_pass_4", 0)
+            }

         return {
             "total_entries": total_entries,
src/populate.py CHANGED
@@ -24,52 +24,39 @@ try:
     submission_id: str
     organization: str
     email: str
-    results: Dict[str, float]
-    num_predictions: int
+    tokens: str
+    accuracy: float
+    mg_pass_2: float
+    mg_pass_4: float
     submitted_time: str
     status: str = "EVALUATED"

     def to_dict(self):
         """Converts the SAGE Result to a dict compatible with our dataframe display"""
-        # Use overall score if available, otherwise calculate average
-        if "sage_overall" in self.results:
-            average = self.results["sage_overall"]
-        else:
-            domain_scores = [v for v in self.results.values() if v is not None and isinstance(v, (int, float))]
-            average = sum(domain_scores) / len(domain_scores) if domain_scores else 0.0
-
-        # Extract model name from submission_id for initial results
-        if self.submission_id.startswith("initial_"):
+        # Extract model name from submission_id or use model_name directly
+        if hasattr(self, 'model_name'):
+            model_name = self.model_name
+        elif self.submission_id.startswith("oss_"):
+            # Extract model name from submission_id
             model_name = self.submission_id.split("_", 2)[-1].replace("_", " ")
-            display_name = f"**{model_name}**"
-            model_symbol = "🤖"
         else:
-            display_name = f"[{self.organization}]({self.email})"
-            model_symbol = "🏢"
+            model_name = self.submission_id

-        from src.display.utils import AutoEvalColumn, Tasks
+        # Create display name
+        display_name = f"**{model_name}**"
+        model_symbol = "🤖"

         data_dict = {
             "eval_name": self.submission_id,
-            AutoEvalColumn.model.name: display_name,
-            AutoEvalColumn.model_type_symbol.name: model_symbol,
-            AutoEvalColumn.model_type.name: "SAGE Benchmark",
-            AutoEvalColumn.precision.name: self.organization,
-            AutoEvalColumn.weight_type.name: "Evaluated",
-            AutoEvalColumn.architecture.name: "Multi-domain",
-            AutoEvalColumn.average.name: round(average, 2),
-            AutoEvalColumn.license.name: "N/A",
-            AutoEvalColumn.likes.name: 0,
-            AutoEvalColumn.params.name: 0,
-            AutoEvalColumn.still_on_hub.name: True,
-            AutoEvalColumn.revision.name: self.submitted_time,
+            "Model": display_name,
+            "Organization": self.organization,
+            "Tokens": self.tokens,
+            "Accuracy (%)": round(self.accuracy, 2),
+            "mG-Pass@2 (%)": round(self.mg_pass_2, 2),
+            "mG-Pass@4 (%)": round(self.mg_pass_4, 2),
+            "Submission Date": self.submitted_time,
         }

-        # Add domain-specific scores
-        for task in Tasks:
-            domain_key = task.value.benchmark
-            data_dict[task.value.col_name] = self.results.get(domain_key, 0.0)
-
         return data_dict

 def load_initial_sage_results_from_oss() -> List[SAGEResult]:

@@ -90,13 +77,17 @@ try:
         for i, entry in enumerate(initial_data):
             sage_result = SAGEResult(
                 submission_id=f"oss_{i:02d}_{entry['model_name'].replace(' ', '_').replace('-', '_')}",
-                organization=f"{entry['organization']} ({entry.get('tokens', 'N/A')})",
+                organization=entry['organization'],
                 email=entry.get('contact_email', f"contact@{entry['organization'].lower().replace(' ', '')}.com"),
-                results=entry["results"],
-                num_predictions=1000,
+                tokens=entry.get('tokens', 'N/A'),
+                accuracy=entry.get('accuracy', 0.0),
+                mg_pass_2=entry.get('mg_pass_2', 0.0),
+                mg_pass_4=entry.get('mg_pass_4', 0.0),
                 submitted_time=entry["submitted_time"],
                 status="EVALUATED"
             )
+            # Add model_name as additional attribute for display
+            sage_result.model_name = entry['model_name']
             sage_results.append(sage_result)
     else:
         print("⚠️ No leaderboard data found in OSS")

@@ -118,7 +109,7 @@ except ImportError as e:
     process_sage_results_for_leaderboard = None


-def get_sage_leaderboard_df(cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_sage_leaderboard_df() -> pd.DataFrame:
     """Creates a dataframe from SAGE evaluation results"""
     if process_sage_results_for_leaderboard is None:
         return pd.DataFrame()

@@ -131,9 +122,15 @@ def get_sage_leaderboard_df() -> pd.DataFrame:
         return pd.DataFrame()

     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)

-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    # Sort by accuracy (descending)
+    if "Accuracy (%)" in df.columns:
+        df = df.sort_values(by=["Accuracy (%)"], ascending=False)
+
+    # Round numeric columns
+    numeric_cols = ["Accuracy (%)", "mG-Pass@2 (%)", "mG-Pass@4 (%)"]
+    for col in numeric_cols:
+        if col in df.columns:
+            df[col] = df[col].round(2)
+
     return df
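
Putting the populate.py changes together: each OSS entry now maps onto a flat SAGEResult, the model name rides along as an ad-hoc attribute (which works because plain dataclasses have no __slots__), and to_dict() prefers it via hasattr. A sketch of one entry flowing into a dataframe row; all field values here are placeholders:

```python
from dataclasses import dataclass

import pandas as pd

# Trimmed stand-in for the SAGEResult dataclass in src/populate.py,
# limited to the fields this commit defines.
@dataclass
class SAGEResult:
    submission_id: str
    organization: str
    email: str
    tokens: str
    accuracy: float
    mg_pass_2: float
    mg_pass_4: float
    submitted_time: str
    status: str = "EVALUATED"

# Hypothetical OSS entry with the keys load_initial_sage_results_from_oss() reads.
entry = {
    "model_name": "Example-Model-7B",
    "organization": "Example Lab",
    "tokens": "32K",
    "accuracy": 61.37,
    "mg_pass_2": 47.05,
    "mg_pass_4": 53.92,
    "submitted_time": "2025-01-01",
}

result = SAGEResult(
    submission_id=f"oss_00_{entry['model_name'].replace(' ', '_').replace('-', '_')}",
    organization=entry["organization"],
    email=entry.get("contact_email", "contact@examplelab.com"),
    tokens=entry.get("tokens", "N/A"),
    accuracy=entry.get("accuracy", 0.0),
    mg_pass_2=entry.get("mg_pass_2", 0.0),
    mg_pass_4=entry.get("mg_pass_4", 0.0),
    submitted_time=entry["submitted_time"],
)
result.model_name = entry["model_name"]  # extra display attribute, as in the commit

# The row to_dict() would emit, then the sort get_sage_leaderboard_df() applies.
row = {
    "eval_name": result.submission_id,
    "Model": f"**{result.model_name}**",
    "Organization": result.organization,
    "Tokens": result.tokens,
    "Accuracy (%)": round(result.accuracy, 2),
    "mG-Pass@2 (%)": round(result.mg_pass_2, 2),
    "mG-Pass@4 (%)": round(result.mg_pass_4, 2),
    "Submission Date": result.submitted_time,
}
df = pd.DataFrame([row]).sort_values(by="Accuracy (%)", ascending=False)
print(df)
```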