sudanl committed on
Commit 1232cb8 · 1 Parent(s): c1060a1

feat: Simplify SAGE-Bench for OSS integration - remove complex validation, add OSS submission

README.md CHANGED
@@ -61,5 +61,4 @@ Submit your evaluation results as JSON files with the following format:
61
  - `src/about.py` - SAGE-specific task definitions and content
62
  - `src/leaderboard/sage_eval.py` - SAGE evaluation logic and result processing
63
  - `src/submission/sage_submit.py` - Simplified submission processing
64
- - `initial_sage_results.json` - Benchmark results from major models
65
- - `reference_answers.json` - Reference data for evaluation
 
61
  - `src/about.py` - SAGE-specific task definitions and content
62
  - `src/leaderboard/sage_eval.py` - SAGE evaluation logic and result processing
63
  - `src/submission/sage_submit.py` - Simplified submission processing
64
+ - `initial_sage_results.json` - Benchmark results from major models
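For reference, the submission JSON that `src/submission/simple_submit.py` processes follows the structure enforced by the old `validate_sage_submission` helper (removed in this commit, see `src/submission/sage_submit.py` below): a `submission_org`, a `submission_email`, and a list of `predictions`, each with an integer `original_question_id` plus four-element `content` and `reasoning_content` lists. A minimal sketch of building such a file — field names are taken from that validation code, the values are placeholders:

```python
import json

# Minimal SAGE submission payload; values are placeholders, field names follow
# the validation logic in src/submission (submission_org, submission_email,
# predictions with original_question_id / content / reasoning_content).
submission = {
    "submission_org": "Example Lab",
    "submission_email": "contact@example.com",
    "predictions": [
        {
            "original_question_id": 0,
            # one entry per sampled attempt (the old validator expected exactly 4)
            "content": ["42", "42", "42", "42"],
            "reasoning_content": ["6*7=42"] * 4,
        }
    ],
}

with open("sage_submission.json", "w", encoding="utf-8") as f:
    json.dump(submission, f, ensure_ascii=False, indent=2)
```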
 
app.py CHANGED
@@ -177,27 +177,76 @@ with demo:
177
  with gr.Accordion("📊 Submit Your SAGE Results", open=False):
178
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
179
180
  with gr.Row():
181
  with gr.Column():
182
- org_textbox = gr.Textbox(label="Organization Name", placeholder="Your Organization")
183
- email_textbox = gr.Textbox(label="Contact Email", placeholder="contact@example.com")
184
  with gr.Column():
185
  file_upload = gr.File(
186
  label="Upload SAGE Results (JSON)",
187
  file_types=[".json"],
188
- type="filepath"
 
189
  )
190
 
191
- submit_button = gr.Button("Submit Results", variant="primary")
192
  submission_result = gr.HTML()
193
 
194
- # File collection submission handling
195
- def handle_submission(file_upload, org_name, email):
196
  try:
197
  from src.submission.simple_submit import process_sage_submission_simple
198
- return process_sage_submission_simple(file_upload, org_name, email)
199
- except ImportError:
200
- return format_error("❌ 提交系统暂时不可用,请稍后再试。")
201
 
202
  submit_button.click(
203
  handle_submission,
 
177
  with gr.Accordion("📊 Submit Your SAGE Results", open=False):
178
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
179
 
180
+ # Submission instructions
181
+ gr.Markdown("""
182
+ ### 📋 Submission Requirements
183
+ - **HuggingFace account**: you must be logged in to a HuggingFace account
184
+ - **Account age**: the account must be more than 60 days old
185
+ - **Submission limit**: each account may submit only once per day
186
+ - **One per organization**: each organization may submit results only once
187
+ """, elem_classes="markdown-text")
188
+
189
  with gr.Row():
190
  with gr.Column():
191
+ org_textbox = gr.Textbox(
192
+ label="Organization Name",
193
+ placeholder="Your Organization",
194
+ info="组织名称将显示在排行榜上"
195
+ )
196
+ email_textbox = gr.Textbox(
197
+ label="Contact Email",
198
+ placeholder="contact@example.com",
199
+ info="邮箱仅用于联系,不会公开显示"
200
+ )
201
  with gr.Column():
202
  file_upload = gr.File(
203
  label="Upload SAGE Results (JSON)",
204
  file_types=[".json"],
205
+ type="filepath",
206
+ info="上传符合SAGE格式的JSON结果文件"
207
  )
208
 
209
+ # HuggingFace login button
210
+ with gr.Row():
211
+ gr.LoginButton(value="🔐 Login with HuggingFace")
212
+ submit_button = gr.Button("Submit Results", variant="primary")
213
+
214
  submission_result = gr.HTML()
215
 
216
+ # Simplified submission handling via OSS
217
+ def handle_submission(file_upload, org_name, email, profile: gr.OAuthProfile):
218
  try:
219
+ # Basic validation
220
+ if not file_upload:
221
+ return format_error("Please select a file to upload")
222
+ if not org_name or not org_name.strip():
223
+ return format_error("Please enter an organization name")
224
+ if not email or not email.strip():
225
+ return format_error("Please enter an email address")
226
+ if not profile:
227
+ return format_error("Please log in to a HuggingFace account first")
228
+
229
+ # Process the file submission (via OSS)
230
  from src.submission.simple_submit import process_sage_submission_simple
231
+ result = process_sage_submission_simple(file_upload, org_name, email)
232
+
233
+ # Add extra details for a successful submission
234
+ success_info = f"""
235
+ <div style="background-color: #d4edda; border: 1px solid #c3e6cb; border-radius: 5px; padding: 15px; margin: 10px 0;">
236
+ <h4 style="color: #155724; margin-top: 0;">🎉 提交成功!</h4>
237
+ <p style="color: #155724; margin: 5px 0;"><strong>组织:</strong> {org_name}</p>
238
+ <p style="color: #155724; margin: 5px 0;"><strong>用户:</strong> {profile.username if profile else 'Unknown'}</p>
239
+ <p style="color: #155724; margin: 5px 0;"><strong>提交时间:</strong> {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
240
+ <p style="color: #155724; margin-bottom: 0;">您的结果已通过OSS提交,将在5-10分钟内完成评测并更新排行榜。</p>
241
+ </div>
242
+ """
243
+
244
+ return success_info + result
245
+
246
+ except ImportError as e:
247
+ return format_error(f"提交系统模块不可用: {e}")
248
+ except Exception as e:
249
+ return format_error(f"提交过程中出现错误: {e}")
250
 
251
  submit_button.click(
252
  handle_submission,
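The new handler relies on Gradio's OAuth integration: when a Space enables `hf_oauth` in its metadata and an event handler declares a parameter annotated with `gr.OAuthProfile`, Gradio fills that argument from the logged-in session rather than from a UI component; annotating it as Optional yields `None` for logged-out users, while the non-Optional form used in `handle_submission` above is documented to raise an error before the `if not profile` check runs, which is worth double-checking. A standalone sketch of the pattern, separate from the SAGE code — the `whoami` function and labels are illustrative only:

```python
import gradio as gr

def whoami(profile: gr.OAuthProfile | None) -> str:
    # Gradio injects `profile` automatically; it is None when nobody is logged in.
    if profile is None:
        return "Please sign in with the button above."
    return f"Signed in as {profile.username}"

with gr.Blocks() as demo:
    gr.LoginButton()                      # renders the "Sign in with Hugging Face" button
    status = gr.Markdown()
    check = gr.Button("Check login")
    check.click(whoami, inputs=None, outputs=status)  # profile is injected, not listed as an input

demo.launch()
```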
initial_sage_results.json CHANGED
@@ -1,4 +1,25 @@
1
  [
2
  {
3
  "model_name": "OpenAI GPT-5-High",
4
  "organization": "OpenAI",
@@ -232,5 +253,68 @@
232
  "sage_earth_science": 9.2,
233
  "sage_astronomy": 9.2
234
  }
235
  }
236
- ]
 
1
  [
2
+ {
3
+ "model_name": "Unknown",
4
+ "organization": "Unknown",
5
+ "tokens": "User Submission",
6
+ "accuracy": 100.0,
7
+ "mg_pass_2": 100.0,
8
+ "mg_pass_4": 100.0,
9
+ "submitted_time": "2025-09-05",
10
+ "results": {
11
+ "sage_mathematics": 100.0,
12
+ "sage_physics": 100.0,
13
+ "sage_chemistry": 100.0,
14
+ "sage_biology": 100.0,
15
+ "sage_earth_science": 0.0,
16
+ "sage_astronomy": 0.0,
17
+ "sage_overall": 100.0
18
+ },
19
+ "contact_email": "",
20
+ "evaluation_timestamp": "2025-09-05T16:14:32.476871",
21
+ "result_file": "results/result_Demo_Test_Org_20250905_161432.json"
22
+ },
23
  {
24
  "model_name": "OpenAI GPT-5-High",
25
  "organization": "OpenAI",
 
253
  "sage_earth_science": 9.2,
254
  "sage_astronomy": 9.2
255
  }
256
+ },
257
+ {
258
+ "model_name": "QuickDemo_TestOrg",
259
+ "organization": "QuickDemo_TestOrg",
260
+ "tokens": "User Submission (Simulated)",
261
+ "accuracy": 0.619,
262
+ "mg_pass_2": 0.619,
263
+ "mg_pass_4": 0.619,
264
+ "submitted_time": "2025-09-05",
265
+ "results": {
266
+ "sage_mathematics": 0.877,
267
+ "sage_physics": 0.895,
268
+ "sage_chemistry": 0.756,
269
+ "sage_biology": 0.316,
270
+ "sage_earth_science": 0.312,
271
+ "sage_astronomy": 0.56,
272
+ "sage_overall": 0.619
273
+ },
274
+ "contact_email": "test@demo.com",
275
+ "evaluation_timestamp": "2025-09-05T16:19:39.864071",
276
+ "result_file": "results/simulated_result_QuickDemo_TestOrg_20250905_161939.json"
277
+ },
278
+ {
279
+ "model_name": "QuickDemo_HighAccuracy",
280
+ "organization": "QuickDemo_HighAccuracy",
281
+ "tokens": "User Submission (Simulated)",
282
+ "accuracy": 0.598,
283
+ "mg_pass_2": 0.598,
284
+ "mg_pass_4": 0.598,
285
+ "submitted_time": "2025-09-05",
286
+ "results": {
287
+ "sage_mathematics": 0.88,
288
+ "sage_physics": 0.331,
289
+ "sage_chemistry": 0.646,
290
+ "sage_biology": 0.501,
291
+ "sage_earth_science": 0.818,
292
+ "sage_astronomy": 0.415,
293
+ "sage_overall": 0.598
294
+ },
295
+ "contact_email": "high@demo.com",
296
+ "evaluation_timestamp": "2025-09-05T16:19:43.874748",
297
+ "result_file": "results/simulated_result_QuickDemo_HighAccuracy_20250905_161943.json"
298
+ },
299
+ {
300
+ "model_name": "QuickDemo_MediumAccuracy",
301
+ "organization": "QuickDemo_MediumAccuracy",
302
+ "tokens": "User Submission (Simulated)",
303
+ "accuracy": 0.516,
304
+ "mg_pass_2": 0.516,
305
+ "mg_pass_4": 0.516,
306
+ "submitted_time": "2025-09-05",
307
+ "results": {
308
+ "sage_mathematics": 0.474,
309
+ "sage_physics": 0.518,
310
+ "sage_chemistry": 0.674,
311
+ "sage_biology": 0.638,
312
+ "sage_earth_science": 0.318,
313
+ "sage_astronomy": 0.473,
314
+ "sage_overall": 0.516
315
+ },
316
+ "contact_email": "medium@demo.com",
317
+ "evaluation_timestamp": "2025-09-05T16:19:41.868409",
318
+ "result_file": "results/simulated_result_QuickDemo_MediumAccuracy_20250905_161941.json"
319
  }
320
+ ]
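The seed file above is a flat JSON list, so a leaderboard only needs to load it and sort by the overall score. A small illustrative sketch of reading it (this is not the app's actual loader; the sorting key simply mirrors the `accuracy` field in the entries):

```python
import json

with open("initial_sage_results.json", encoding="utf-8") as f:
    entries = json.load(f)

# Rank by overall accuracy, highest first; each entry also carries per-domain
# scores under "results" (sage_mathematics, sage_physics, ...).
ranked = sorted(entries, key=lambda e: e.get("accuracy", 0.0), reverse=True)
for rank, entry in enumerate(ranked, start=1):
    print(f"{rank:>2}. {entry['model_name']} ({entry['organization']}): {entry['accuracy']}")
```

Note that the seeded entries mix score scales (e.g. 100.0, 9.2, and 0.619), so any ranking over this file presumably needs to normalize them first.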
reference_answers.json DELETED
@@ -1,44 +0,0 @@
1
- {
2
- "reference_answers": [
3
- {
4
- "question_id": 0,
5
- "domain": "mathematics",
6
- "question": "What is 6 multiplied by 7?",
7
- "correct_answer": "42",
8
- "alternative_answers": ["42", "forty-two", "6×7", "6*7"],
9
- "explanation": "The multiplication of 6 and 7 equals 42."
10
- },
11
- {
12
- "question_id": 1,
13
- "domain": "chemistry",
14
- "question": "What is the chemical formula for water?",
15
- "correct_answer": "H2O",
16
- "alternative_answers": ["H2O", "water", "dihydrogen monoxide"],
17
- "explanation": "Water consists of two hydrogen atoms and one oxygen atom."
18
- },
19
- {
20
- "question_id": 2,
21
- "domain": "biology",
22
- "question": "What molecule carries genetic information in living organisms?",
23
- "correct_answer": "DNA",
24
- "alternative_answers": ["DNA", "deoxyribonucleic acid", "genetic material"],
25
- "explanation": "DNA stores and transmits genetic information in all living organisms."
26
- },
27
- {
28
- "question_id": 3,
29
- "domain": "physics",
30
- "question": "What is the acceleration due to gravity on Earth?",
31
- "correct_answer": "9.8 m/s²",
32
- "alternative_answers": ["9.8 m/s²", "9.81 m/s²", "9.8", "9.81"],
33
- "explanation": "Earth's gravitational acceleration is approximately 9.8 meters per second squared."
34
- },
35
- {
36
- "question_id": 4,
37
- "domain": "biology",
38
- "question": "What is the process by which plants convert sunlight into energy?",
39
- "correct_answer": "photosynthesis",
40
- "alternative_answers": ["photosynthesis", "6CO2 + 6H2O + light → C6H12O6 + 6O2"],
41
- "explanation": "Photosynthesis converts light energy into chemical energy in plants."
42
- }
43
- ]
44
- }
 
src/submission/check_validity.py DELETED
@@ -1,99 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
-
7
- import huggingface_hub
8
- from huggingface_hub import ModelCard
9
- from huggingface_hub.hf_api import ModelInfo
10
- from transformers import AutoConfig
11
- from transformers.models.auto.tokenization_auto import AutoTokenizer
12
-
13
- def check_model_card(repo_id: str) -> tuple[bool, str]:
14
- """Checks if the model card and license exist and have been filled"""
15
- try:
16
- card = ModelCard.load(repo_id)
17
- except huggingface_hub.utils.EntryNotFoundError:
18
- return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
19
-
20
- # Enforce license metadata
21
- if card.data.license is None:
22
- if not ("license_name" in card.data and "license_link" in card.data):
23
- return False, (
24
- "License not found. Please add a license to your model card using the `license` metadata or a"
25
- " `license_name`/`license_link` pair."
26
- )
27
-
28
- # Enforce card content
29
- if len(card.text) < 200:
30
- return False, "Please add a description to your model card, it is too short."
31
-
32
- return True, ""
33
-
34
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
35
- """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
36
- try:
37
- config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
- if test_tokenizer:
39
- try:
40
- tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
- except ValueError as e:
42
- return (
43
- False,
44
- f"uses a tokenizer which is not in a transformers release: {e}",
45
- None
46
- )
47
- except Exception as e:
48
- return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
49
- return True, None, config
50
-
51
- except ValueError:
52
- return (
53
- False,
54
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
55
- None
56
- )
57
-
58
- except Exception as e:
59
- return False, "was not found on hub!", None
60
-
61
-
62
- def get_model_size(model_info: ModelInfo, precision: str):
63
- """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
64
- try:
65
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
- except (AttributeError, TypeError):
67
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
68
-
69
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
70
- model_size = size_factor * model_size
71
- return model_size
72
-
73
- def get_model_arch(model_info: ModelInfo):
74
- """Gets the model architecture from the configuration"""
75
- return model_info.config.get("architectures", "Unknown")
76
-
77
- def already_submitted_models(requested_models_dir: str) -> set[str]:
78
- """Gather a list of already submitted models to avoid duplicates"""
79
- depth = 1
80
- file_names = []
81
- users_to_submission_dates = defaultdict(list)
82
-
83
- for root, _, files in os.walk(requested_models_dir):
84
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
85
- if current_depth == depth:
86
- for file in files:
87
- if not file.endswith(".json"):
88
- continue
89
- with open(os.path.join(root, file), "r") as f:
90
- info = json.load(f)
91
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
92
-
93
- # Select organisation
94
- if info["model"].count("/") == 0 or "submitted_time" not in info:
95
- continue
96
- organisation, _ = info["model"].split("/")
97
- users_to_submission_dates[organisation].append(info["submitted_time"])
98
-
99
- return set(file_names), users_to_submission_dates
 
src/submission/sage_submit.py DELETED
@@ -1,221 +0,0 @@
1
- import json
2
- import os
3
- from datetime import datetime, timezone
4
- from typing import Dict, List, Any
5
-
6
- from src.display.formatting import styled_error, styled_message, styled_warning
7
-
8
-
9
- def validate_sage_submission(submission_data: Dict[str, Any]) -> tuple[bool, str]:
10
- """Validates SAGE benchmark submission format"""
11
-
12
- # Check required top-level fields
13
- required_fields = ["submission_org", "submission_email", "predictions"]
14
- for field in required_fields:
15
- if field not in submission_data:
16
- return False, f"Missing required field: {field}"
17
-
18
- # Validate email format (basic)
19
- email = submission_data["submission_email"]
20
- if "@" not in email or "." not in email:
21
- return False, "Invalid email format"
22
-
23
- # Validate predictions
24
- predictions = submission_data["predictions"]
25
- if not isinstance(predictions, list) or len(predictions) == 0:
26
- return False, "Predictions must be a non-empty list"
27
-
28
- for i, prediction in enumerate(predictions):
29
- # Check required prediction fields
30
- pred_required_fields = ["original_question_id", "content", "reasoning_content"]
31
- for field in pred_required_fields:
32
- if field not in prediction:
33
- return False, f"Missing field '{field}' in prediction {i}"
34
-
35
- # Validate content arrays
36
- content = prediction["content"]
37
- reasoning_content = prediction["reasoning_content"]
38
-
39
- if not isinstance(content, list) or len(content) != 4:
40
- return False, f"Content in prediction {i} must be a list of exactly 4 items"
41
-
42
- if not isinstance(reasoning_content, list) or len(reasoning_content) != 4:
43
- return False, f"Reasoning content in prediction {i} must be a list of exactly 4 items"
44
-
45
- # Validate question ID
46
- if not isinstance(prediction["original_question_id"], int):
47
- return False, f"Question ID in prediction {i} must be an integer"
48
-
49
- return True, "Valid submission format"
50
-
51
-
52
- def process_sage_submission_simple(submission_file, org_name=None, email=None) -> str:
53
- """Process SAGE benchmark submission file - simplified version for basic leaderboard"""
54
-
55
- try:
56
- # Read the submitted file (receives file path)
57
- if submission_file is None:
58
- return styled_error("No file uploaded. Please select a JSON file.")
59
-
60
- # submission_file is a file path string
61
- try:
62
- with open(submission_file, 'r', encoding='utf-8') as f:
63
- content = f.read()
64
- except Exception as e:
65
- return styled_error(f"Error reading file: {str(e)}")
66
-
67
- # Parse JSON
68
- try:
69
- submission_data = json.loads(content)
70
- except json.JSONDecodeError as e:
71
- return styled_error(f"Invalid JSON format: {str(e)}")
72
-
73
- # Use form inputs if submission data doesn't contain org/email
74
- if org_name and email:
75
- submission_data["submission_org"] = org_name
76
- submission_data["submission_email"] = email
77
-
78
- # Validate submission format
79
- is_valid, message = validate_sage_submission(submission_data)
80
- if not is_valid:
81
- return styled_error(f"Submission validation failed: {message}")
82
-
83
- # Save submission for later processing
84
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
85
- org = submission_data["submission_org"].replace(" ", "_").replace("/", "_")
86
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
87
-
88
- # Save raw submission
89
- submission_dir = f"./sage_submissions/{org}"
90
- os.makedirs(submission_dir, exist_ok=True)
91
- raw_submission_path = f"{submission_dir}/submission_{timestamp}.json"
92
-
93
- with open(raw_submission_path, 'w') as f:
94
- json.dump(submission_data, f, indent=2)
95
-
96
- # Simple evaluation using the evaluation module
97
- try:
98
- from src.leaderboard.sage_eval import evaluate_sage_submission
99
- domain_accuracies = evaluate_sage_submission(submission_data)
100
-
101
- # Update initial_sage_results.json directly for persistence
102
- # Try multiple possible paths for the initial results file
103
- possible_paths = [
104
- "./initial_sage_results.json",
105
- "initial_sage_results.json",
106
- os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "initial_sage_results.json")
107
- ]
108
-
109
- initial_results_file = None
110
- for path in possible_paths:
111
- if os.path.exists(path):
112
- initial_results_file = path
113
- break
114
-
115
- if not initial_results_file:
116
- initial_results_file = possible_paths[0] # Use first path as fallback
117
-
118
- try:
119
- # Load existing initial results
120
- if os.path.exists(initial_results_file):
121
- with open(initial_results_file, 'r') as f:
122
- initial_results = json.load(f)
123
- else:
124
- initial_results = []
125
-
126
- # Convert to initial results format
127
- new_result = {
128
- "model_name": submission_data["submission_org"],
129
- "organization": submission_data["submission_org"],
130
- "tokens": "User Submission",
131
- "accuracy": domain_accuracies["sage_overall"],
132
- "mg_pass_2": domain_accuracies["sage_overall"], # Use same value for now
133
- "mg_pass_4": domain_accuracies["sage_overall"], # Use same value for now
134
- "submitted_time": datetime.now().strftime("%Y-%m-%d"),
135
- "results": domain_accuracies,
136
- "contact_email": submission_data["submission_email"]
137
- }
138
-
139
- # Check if organization already exists, update or add
140
- org_name = submission_data["submission_org"]
141
- updated = False
142
- for i, result in enumerate(initial_results):
143
- if (result.get("model_name") == org_name or
144
- result.get("organization") == org_name):
145
- initial_results[i] = new_result
146
- updated = True
147
- break
148
-
149
- if not updated:
150
- initial_results.append(new_result)
151
-
152
- # Save updated initial results
153
- with open(initial_results_file, 'w') as f:
154
- json.dump(initial_results, f, indent=2)
155
-
156
- print(f"✅ Updated {initial_results_file} with new submission from {org_name}")
157
-
158
- except Exception as e:
159
- print(f"⚠️ Failed to update initial results file: {e}")
160
-
161
- # Format success message with scores
162
- overall_accuracy = domain_accuracies.get("sage_overall", 0)
163
-
164
- success_msg = styled_message(
165
- f"🎉 SAGE submission processed successfully!\n\n"
166
- f"**Organization:** {submission_data['submission_org']}\n"
167
- f"**Overall Accuracy:** {overall_accuracy:.2f}%\n\n"
168
- f"**Domain Scores:**\n"
169
- f" • Mathematics: {domain_accuracies.get('sage_math', 0):.2f}%\n"
170
- f" • Physics: {domain_accuracies.get('sage_physics', 0):.2f}%\n"
171
- f" • Chemistry: {domain_accuracies.get('sage_chemistry', 0):.2f}%\n"
172
- f" • Biology: {domain_accuracies.get('sage_biology', 0):.2f}%\n"
173
- f" • Earth Science: {domain_accuracies.get('sage_earth_science', 0):.2f}%\n"
174
- f" • Astronomy: {domain_accuracies.get('sage_astronomy', 0):.2f}%\n\n"
175
- f"Your results have been added to the leaderboard. "
176
- f"Please refresh the page to see updated rankings."
177
- )
178
-
179
- return success_msg
180
-
181
- except Exception as eval_error:
182
- # If evaluation fails, still save submission but mark as failed
183
- return styled_warning(
184
- f"⚠️ Submission received but evaluation failed.\n\n"
185
- f"Error: {str(eval_error)}\n\n"
186
- f"Your submission has been saved and will be processed manually. "
187
- f"Please contact administrators if this issue persists."
188
- )
189
-
190
- except Exception as e:
191
- return styled_error(f"Submission processing failed: {str(e)}")
192
-
193
-
194
- def load_sage_submissions(submissions_dir: str = "./sage_submissions") -> List[Dict]:
195
- """Load all SAGE submissions for display in queue"""
196
-
197
- if not os.path.exists(submissions_dir):
198
- return []
199
-
200
- submissions = []
201
-
202
- for org_dir in os.listdir(submissions_dir):
203
- org_path = os.path.join(submissions_dir, org_dir)
204
- if not os.path.isdir(org_path):
205
- continue
206
-
207
- for file in os.listdir(org_path):
208
- if file.startswith("submission_") and file.endswith(".json"):
209
- try:
210
- with open(os.path.join(org_path, file), 'r') as f:
211
- submission = json.load(f)
212
- # Add metadata
213
- submission["_filename"] = file
214
- submission["_org_dir"] = org_dir
215
- submissions.append(submission)
216
- except Exception:
217
- continue
218
-
219
- # Sort by submission time (most recent first)
220
- submissions.sort(key=lambda x: x.get("_filename", ""), reverse=True)
221
- return submissions
 
src/submission/simple_submit.py CHANGED
@@ -1,112 +1,24 @@
1
  #!/usr/bin/env python3
2
  """
3
- 简化的SAGE提交处理 - 文件收集模式
4
- 只负责接收和验证提交文件,不进行评测
5
  """
6
 
7
  import json
8
  import os
9
- import shutil
10
  from datetime import datetime
11
  from typing import Dict, Any
 
12
 
13
- def try_http_push_to_local(submission_data, filename):
14
- """HTTP直推到本地服务器"""
15
- try:
16
- import requests
17
-
18
- # 本地服务器地址配置(可以根据需要修改)
19
- local_endpoints = [
20
- 'http://127.0.0.1:8080/api/submissions',
21
- 'http://localhost:8080/api/submissions',
22
- ]
23
-
24
- payload = {
25
- "filename": filename,
26
- "content": submission_data,
27
- "timestamp": datetime.now().isoformat(),
28
- "source": "huggingface_spaces",
29
- "organization": submission_data.get("submission_org", "Unknown"),
30
- "email": submission_data.get("submission_email", "")
31
- }
32
-
33
- for endpoint in local_endpoints:
34
- try:
35
- print(f"🔄 尝试HTTP推送到: {endpoint}")
36
- response = requests.post(
37
- endpoint,
38
- json=payload,
39
- timeout=5,
40
- headers={'Content-Type': 'application/json'}
41
- )
42
-
43
- if response.status_code == 200:
44
- result = response.json()
45
- print(f"✅ HTTP推送成功: {result.get('message', 'OK')}")
46
- print(f"📁 本地路径: {result.get('local_path', 'Unknown')}")
47
- return True, result
48
- else:
49
- print(f"⚠️ HTTP响应错误 {response.status_code}: {response.text}")
50
-
51
- except requests.ConnectionError:
52
- print(f"⚠️ 无法连接到本地服务器: {endpoint}")
53
- except requests.Timeout:
54
- print(f"⚠️ 连接超时: {endpoint}")
55
- except Exception as e:
56
- print(f"⚠️ HTTP推送失败 {endpoint}: {e}")
57
-
58
- return False, None
59
-
60
- except ImportError:
61
- print("⚠️ requests模块未安装,无法使用HTTP推送")
62
- return False, None
63
- except Exception as e:
64
- print(f"❌ HTTP推送模块失败: {e}")
65
- return False, None
66
-
67
- def try_git_commit_and_push(saved_path, filename):
68
- """Git提交并推送到远程仓库"""
69
- try:
70
- import subprocess
71
-
72
- # 配置Git用户信息和认证
73
- try:
74
- subprocess.run(["git", "config", "user.email", "sage-bench@huggingface.co"], cwd=".")
75
- subprocess.run(["git", "config", "user.name", "SAGE Bench System"], cwd=".")
76
-
77
- # 配置HuggingFace认证(如果有HF_TOKEN环境变量)
78
- import os
79
- hf_token = os.getenv('HF_TOKEN')
80
- if hf_token:
81
- # 配置Git使用token认证
82
- repo_url = f"https://oauth2:{hf_token}@huggingface.co/spaces/Sudanl/SAGE-Bench"
83
- subprocess.run(["git", "remote", "set-url", "origin", repo_url], cwd=".")
84
- print("✅ Git认证配置完成")
85
-
86
- except Exception as e:
87
- print(f"⚠️ Git配置警告: {e}")
88
- pass # 如果已配置则忽略错误
89
-
90
- # Git操作:add -> commit -> push
91
- subprocess.run(["git", "add", saved_path], check=True, cwd=".")
92
- commit_msg = f"feat: 新用户提交 {filename}"
93
- subprocess.run(["git", "commit", "-m", commit_msg], check=True, cwd=".")
94
-
95
- # 尝试推送到远程仓库
96
- try:
97
- subprocess.run(["git", "push"], check=True, cwd=".", timeout=30)
98
- print(f"✅ Git提交并推送成功: {filename}")
99
- return True
100
- except subprocess.TimeoutExpired:
101
- print(f"⚠️ Git推送超时,但文件已本地提交: {filename}")
102
- return True # 本地提交成功也算部分成功
103
- except Exception as push_error:
104
- print(f"⚠️ Git推送失败但本地已提交: {push_error}")
105
- return True # 本地提交成功也算部分成功
106
-
107
- except Exception as e:
108
- print(f"❌ Git操作失败: {e}")
109
- return False
110
 
111
  def format_error(msg):
112
  return f"<p style='color: red; font-size: 16px;'>{msg}</p>"
@@ -218,32 +130,24 @@ def process_sage_submission_simple(submission_file, org_name=None, email=None) -
218
  saved_path = save_submission_file(submission_data)
219
  print(f"✅ 提交文件已保存到: {saved_path}")
220
 
221
- # 多重传输策略:HTTP主推 + Git备份
222
- filename = os.path.basename(saved_path)
223
- success_methods = []
224
- transfer_details = {}
225
-
226
- # 方法1: HTTP直推到本地(主要方式)
227
- http_success, http_result = try_http_push_to_local(submission_data, filename)
228
- if http_success:
229
- success_methods.append("HTTP直推")
230
- transfer_details["http"] = http_result
231
-
232
- # 方法2: Git提交推送(备用方式)
233
- if try_git_commit_and_push(saved_path, filename):
234
- success_methods.append("Git推送")
 
235
 
236
- # 生成传输状态消息
237
- if success_methods:
238
- if "HTTP直推" in success_methods:
239
- transfer_status = f"✅ 文件已成功传输到本地评测系统 (通过: {', '.join(success_methods)})"
240
- transfer_speed = "⚡ 立即开始评测"
241
- else:
242
- transfer_status = f"✅ 文件已保存 (通过: {', '.join(success_methods)})"
243
- transfer_speed = "⏳ 等待同步到评测系统"
244
- else:
245
- transfer_status = "⚠️ 直接传输失败,文件已保存到HuggingFace Spaces,将通过备用同步机制处理"
246
- transfer_speed = "⏰ 可能需要稍长时间"
247
 
248
  # 生成成功消息
249
  org = submission_data["submission_org"]
@@ -255,17 +159,17 @@ def process_sage_submission_simple(submission_file, org_name=None, email=None) -
255
  📋 <strong>提交信息:</strong><br>
256
  • 组织: {org}<br>
257
  • 邮箱: {email_addr}<br>
258
- • 预测数量: {num_predictions} 个问题<br><br>
259
- 🚀 <strong>传输状态:</strong><br>
260
- {transfer_status}<br>
261
- {transfer_speed}<br><br>
262
  ⏳ <strong>评测流程:</strong><br>
263
  您的提交将使用LLM-as-Judge进行自动评估,包括科学推理能力的全面测试。<br>
264
  评测完成后,结果将自动出现在排行榜中。<br><br>
265
  🕐 <strong>预计时间:</strong><br>
266
- HTTP直推: 5-15分钟<br>
267
- 备用同步: 15-60分钟<br><br>
268
- 感谢您参与SAGE基准测试!🧪
269
  """)
270
 
271
  return success_msg
@@ -317,57 +221,5 @@ def get_submission_stats(submissions_dir: str = "./submissions") -> Dict[str, An
317
  "recent": submissions[:10] # 最近10个
318
  }
319
 
320
- def try_http_push_to_local(submission_data, filename):
321
- """HTTP直推到本地服务器"""
322
- try:
323
- import requests
324
-
325
- # 本地服务器地址配置(可以根据需要修改)
326
- local_endpoints = [
327
- 'http://127.0.0.1:8080/api/submissions',
328
- 'http://localhost:8080/api/submissions',
329
- ]
330
-
331
- payload = {
332
- "filename": filename,
333
- "content": submission_data,
334
- "timestamp": datetime.now().isoformat(),
335
- "source": "huggingface_spaces",
336
- "organization": submission_data.get("submission_org", "Unknown"),
337
- "email": submission_data.get("submission_email", "")
338
- }
339
-
340
- for endpoint in local_endpoints:
341
- try:
342
- print(f"🔄 尝试HTTP推送到: {endpoint}")
343
- response = requests.post(
344
- endpoint,
345
- json=payload,
346
- timeout=5,
347
- headers={'Content-Type': 'application/json'}
348
- )
349
-
350
- if response.status_code == 200:
351
- result = response.json()
352
- print(f"✅ HTTP推送成功: {result.get('message', 'OK')}")
353
- print(f"📁 本地路径: {result.get('local_path', 'Unknown')}")
354
- return True, result
355
- else:
356
- print(f"⚠️ HTTP响应错误 {response.status_code}: {response.text}")
357
-
358
- except requests.ConnectionError:
359
- print(f"⚠️ 无法连接到本地服务器: {endpoint}")
360
- except requests.Timeout:
361
- print(f"⚠️ 连接超时: {endpoint}")
362
- except Exception as e:
363
- print(f"⚠️ HTTP推送失败 {endpoint}: {e}")
364
-
365
- return False, None
366
-
367
- except ImportError:
368
- print("⚠️ requests模块未安装,无法使用HTTP推送")
369
- return False, None
370
- except Exception as e:
371
- print(f"❌ HTTP推送模块失败: {e}")
372
- return False, None
373
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ SAGE submission handling - OSS mode
4
+ Uses Alibaba Cloud OSS in place of the git/HTTP submission paths
5
  """
6
 
7
  import json
8
  import os
9
+ import sys
10
  from datetime import datetime
11
  from typing import Dict, Any
12
+ from pathlib import Path
13
 
14
+ # Import the OSS submission handler
15
+ sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..', 'oss_sage_evaluator'))
16
+ try:
17
+ from oss_submission_handler import OSSSubmissionHandler
18
+ OSS_AVAILABLE = True
19
+ except ImportError as e:
20
+ print(f"⚠️ OSS模块不可用,将使用备用模式: {e}")
21
+ OSS_AVAILABLE = False
22
 
23
  def format_error(msg):
24
  return f"<p style='color: red; font-size: 16px;'>{msg}</p>"
 
130
  saved_path = save_submission_file(submission_data)
131
  print(f"✅ 提交文件已保存到: {saved_path}")
132
 
133
+ # OSS上传策略
134
+ if OSS_AVAILABLE:
135
+ try:
136
+ # 使用OSS提交处理器
137
+ oss_handler = OSSSubmissionHandler()
138
+ result = oss_handler.process_sage_submission(submission_data, org_name, email)
139
+
140
+ # 如果OSS处理成功,直接返回结果
141
+ if "提交成功" in result:
142
+ return result
143
+ else:
144
+ # OSS失败,继续使用备用模式
145
+ print(f"⚠️ OSS提交失败,使用备用模式: {result}")
146
+ except Exception as e:
147
+ print(f"⚠️ OSS提交异常,使用备用模式: {e}")
148
 
149
+ # 备用模式:本地保存
150
+ filename = os.path.basename(saved_path)
 
151
 
152
  # 生成成功消息
153
  org = submission_data["submission_org"]
 
159
  📋 <strong>提交信息:</strong><br>
160
  • 组织: {org}<br>
161
  • 邮箱: {email_addr}<br>
162
+ • 预测数量: {num_predictions} 个问题<br>
163
+ 文件名: {filename}<br><br>
164
+ 🚀 <strong>存储状态:</strong><br>
165
+ 文件已保存到本地存储,等待系统同步到评测环境。<br><br>
166
  ⏳ <strong>评测流程:</strong><br>
167
  您的提交将使用LLM-as-Judge进行自动评估,包括科学推理能力的全面测试。<br>
168
  评测完成后,结果将自动出现在排行榜中。<br><br>
169
  🕐 <strong>预计时间:</strong><br>
170
+ 正常情况: 5-15分钟<br>
171
+ 同步延迟: 15-60分钟<br><br>
172
+ 🧪 感谢您参与SAGE科学推理基准测试!
173
  """)
174
 
175
  return success_msg
 
221
  "recent": submissions[:10] # 最近10个
222
  }
223
 
224
+ # The original HTTP push functions have been removed; OSS mode is used instead
 
225
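The `oss_submission_handler` module imported above lives outside this Space (under `oss_sage_evaluator/`) and is not part of this commit. Purely as an illustration of what such a handler could look like with the Alibaba Cloud `oss2` SDK, mirroring the two calls made from `simple_submit.py` (`OSSSubmissionHandler()` and `process_sage_submission(submission_data, org_name, email)`) — the bucket, endpoint, and environment-variable names here are assumptions, not the project's real configuration:

```python
import json
import os
from datetime import datetime

import oss2  # Alibaba Cloud OSS SDK: pip install oss2


class OSSSubmissionHandler:
    """Hypothetical sketch of the handler imported by simple_submit.py."""

    def __init__(self):
        # Credentials and bucket are read from the environment (assumed variable names).
        auth = oss2.Auth(os.environ["OSS_ACCESS_KEY_ID"], os.environ["OSS_ACCESS_KEY_SECRET"])
        self.bucket_name = os.environ.get("OSS_BUCKET", "sage-bench-submissions")
        self.bucket = oss2.Bucket(
            auth,
            os.environ.get("OSS_ENDPOINT", "https://oss-cn-hangzhou.aliyuncs.com"),
            self.bucket_name,
        )

    def process_sage_submission(self, submission_data: dict, org_name: str, email: str) -> str:
        # Store each submission under a timestamped key so an offline evaluator can poll for it.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        key = f"submissions/{org_name.replace(' ', '_')}_{timestamp}.json"
        payload = dict(submission_data, submission_org=org_name, submission_email=email)
        result = self.bucket.put_object(key, json.dumps(payload, ensure_ascii=False))
        if result.status == 200:
            # The caller checks for the marker "提交成功" ("submission successful") in this message.
            return f"✅ 提交成功: uploaded to oss://{self.bucket_name}/{key}"
        return f"❌ Upload failed with HTTP status {result.status}"
```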
 
src/submission/submit.py DELETED
@@ -1,119 +0,0 @@
1
- import json
2
- import os
3
- from datetime import datetime, timezone
4
-
5
- from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
- from src.submission.check_validity import (
8
- already_submitted_models,
9
- check_model_card,
10
- get_model_size,
11
- is_model_on_hub,
12
- )
13
-
14
- REQUESTED_MODELS = None
15
- USERS_TO_SUBMISSION_DATES = None
16
-
17
- def add_new_eval(
18
- model: str,
19
- base_model: str,
20
- revision: str,
21
- precision: str,
22
- weight_type: str,
23
- model_type: str,
24
- ):
25
- global REQUESTED_MODELS
26
- global USERS_TO_SUBMISSION_DATES
27
- if not REQUESTED_MODELS:
28
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
-
30
- user_name = ""
31
- model_path = model
32
- if "/" in model:
33
- user_name = model.split("/")[0]
34
- model_path = model.split("/")[1]
35
-
36
- precision = precision.split(" ")[0]
37
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
-
39
- if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
41
-
42
- # Does the model actually exist?
43
- if revision == "":
44
- revision = "main"
45
-
46
- # Is the model on the hub?
47
- if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
- if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
-
52
- if not weight_type == "Adapter":
53
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
- if not model_on_hub:
55
- return styled_error(f'Model "{model}" {error}')
56
-
57
- # Is the model info correctly filled?
58
- try:
59
- model_info = API.model_info(repo_id=model, revision=revision)
60
- except Exception:
61
- return styled_error("Could not get your model information. Please fill it up properly.")
62
-
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
-
65
- # Were the model card and license filled?
66
- try:
67
- license = model_info.cardData["license"]
68
- except Exception:
69
- return styled_error("Please select a license for your model")
70
-
71
- modelcard_OK, error_msg = check_model_card(model)
72
- if not modelcard_OK:
73
- return styled_error(error_msg)
74
-
75
- # Seems good, creating the eval
76
- print("Adding new eval")
77
-
78
- eval_entry = {
79
- "model": model,
80
- "base_model": base_model,
81
- "revision": revision,
82
- "precision": precision,
83
- "weight_type": weight_type,
84
- "status": "PENDING",
85
- "submitted_time": current_time,
86
- "model_type": model_type,
87
- "likes": model_info.likes,
88
- "params": model_size,
89
- "license": license,
90
- "private": False,
91
- }
92
-
93
- # Check for duplicate submission
94
- if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
- return styled_warning("This model has been already submitted.")
96
-
97
- print("Creating eval file")
98
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
- os.makedirs(OUT_DIR, exist_ok=True)
100
- out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
-
102
- with open(out_path, "w") as f:
103
- f.write(json.dumps(eval_entry))
104
-
105
- print("Uploading eval file")
106
- API.upload_file(
107
- path_or_fileobj=out_path,
108
- path_in_repo=out_path.split("eval-queue/")[1],
109
- repo_id=QUEUE_REPO,
110
- repo_type="dataset",
111
- commit_message=f"Add {model} to eval queue",
112
- )
113
-
114
- # Remove the local file
115
- os.remove(out_path)
116
-
117
- return styled_message(
118
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
- )