sudanl committed on
Commit
1051335
·
1 Parent(s): bc17e3e

feat: 实现分离式提交评测系统

Browse files

新功能:
🏗️ 系统架构
- 用户提交 → SAGE-Bench收集 → 本地评测 → 更新排行榜
- 保护API密钥,防止在线仓库暴露评测代码

📁 新增文件
- src/submission/simple_submit.py: 文件收集模式的提交处理
- submissions/: 用户提交文件收集目录

🔧 功能特点
- 只验证和保存JSON文件,不进行在线评测
- 用户提交后立即收到确认信息
- 支持组织信息和邮箱验证
- 完整的提交格式验证

🎯 下一步
- 使用local_eval/目录进行本地评测
- 运行update_leaderboard.py更新排行榜

Files changed (2) hide show
  1. app.py +10 -3
  2. src/submission/simple_submit.py +189 -0
app.py CHANGED
@@ -191,10 +191,17 @@ with demo:
191
  submit_button = gr.Button("Submit Results", variant="primary")
192
  submission_result = gr.HTML()
193
 
194
- # Simplified submission handling
 
 
 
 
 
 
 
195
  submit_button.click(
196
- lambda: format_warning("📋 Submission feature coming soon! For now, please contact administrators directly."),
197
- inputs=[],
198
  outputs=[submission_result]
199
  )
200
 
 
191
  submit_button = gr.Button("Submit Results", variant="primary")
192
  submission_result = gr.HTML()
193
 
194
+ # File collection submission handling
195
def handle_submission(file_upload, org_name, email):
    """Forward the submission form values to the file-collection handler.

    The handler is imported lazily on each call; when that import fails the
    user gets an "unavailable" error message instead of a crash.

    Args:
        file_upload: Uploaded file value coming from the form.
        org_name: Organisation name typed into the form.
        email: Contact e-mail typed into the form.

    Returns:
        The HTML string produced by the handler, or an HTML error message
        when the submission module cannot be imported.
    """
    try:
        from src.submission.simple_submit import process_sage_submission_simple
        result_html = process_sage_submission_simple(file_upload, org_name, email)
        return result_html
    except ImportError:
        unavailable_html = format_error("❌ 提交系统暂时不可用,请稍后再试。")
        return unavailable_html
201
+
202
  submit_button.click(
203
+ handle_submission,
204
+ inputs=[file_upload, org_textbox, email_textbox],
205
  outputs=[submission_result]
206
  )
207
 
src/submission/simple_submit.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ 简化的SAGE提交处理 - 文件收集模式
4
+ 只负责接收和验证提交文件,不进行评测
5
+ """
6
+
7
+ import json
8
+ import os
9
+ import shutil
10
+ from datetime import datetime
11
+ from typing import Dict, Any
12
+
13
def format_error(msg):
    """Return *msg* wrapped in a red, 16px HTML paragraph.

    The message is inserted verbatim (no HTML escaping), so callers may pass
    markup and must escape untrusted text themselves.
    """
    template = "<p style='color: red; font-size: 16px;'>{}</p>"
    return template.format(msg)
15
+
16
def format_success(msg):
    """Return *msg* wrapped in a green, 16px HTML paragraph.

    The message is inserted verbatim (no HTML escaping), so callers may pass
    markup and must escape untrusted text themselves.
    """
    template = "<p style='color: green; font-size: 16px;'>{}</p>"
    return template.format(msg)
18
+
19
def format_warning(msg):
    """Return *msg* wrapped in an orange, 16px HTML paragraph.

    The message is inserted verbatim (no HTML escaping), so callers may pass
    markup and must escape untrusted text themselves.
    """
    template = "<p style='color: orange; font-size: 16px;'>{}</p>"
    return template.format(msg)
21
+
22
def validate_sage_submission(submission_data: Dict[str, Any]) -> tuple[bool, str]:
    """Check that a parsed submission follows the SAGE benchmark format.

    The function never raises on malformed input: every structural problem
    (wrong top-level type, missing field, wrong field type) is reported
    through the returned message instead.

    Args:
        submission_data: The decoded JSON document. It is expected to be a
            dict, but any JSON value is tolerated and rejected cleanly.

    Returns:
        A ``(is_valid, message)`` pair. ``message`` is a user-facing
        description of the first problem found, or a confirmation text when
        the submission is valid.
    """
    # json.loads may legally return a list, string, number, ... at top level.
    if not isinstance(submission_data, dict):
        return False, "提交内容必须是JSON对象"

    # Required top-level fields.
    for field in ("submission_org", "submission_email", "predictions"):
        if field not in submission_data:
            return False, f"缺少必需字段: {field}"

    # The organisation name is later used to build a file name, so it has to
    # be real text.
    org = submission_data["submission_org"]
    if not isinstance(org, str) or not org.strip():
        return False, "submission_org必须是非空字符串"

    # Basic e-mail sanity check; the type check keeps `in` from raising on
    # numbers or null.
    email = submission_data["submission_email"]
    if not isinstance(email, str) or "@" not in email or "." not in email:
        return False, "邮箱格式无效"

    predictions = submission_data["predictions"]
    if not isinstance(predictions, list) or not predictions:
        return False, "predictions必须是非空列表"

    for i, prediction in enumerate(predictions):
        if not isinstance(prediction, dict):
            return False, f"预测{i}必须是JSON对象"

        # Required per-prediction fields.
        for field in ("original_question_id", "content", "reasoning_content"):
            if field not in prediction:
                return False, f"预测{i}中缺少字段: {field}"

        content = prediction["content"]
        if not isinstance(content, list) or len(content) != 4:
            return False, f"预测{i}的content必须是包含4个项目的列表"

        reasoning_content = prediction["reasoning_content"]
        if not isinstance(reasoning_content, list) or len(reasoning_content) != 4:
            return False, f"预测{i}的reasoning_content必须是包含4个项目的列表"

        # bool is a subclass of int, so JSON true/false must be excluded
        # explicitly.
        question_id = prediction["original_question_id"]
        if isinstance(question_id, bool) or not isinstance(question_id, int):
            return False, f"预测{i}的question ID必须是整数"

    return True, "提交格式有效"
63
+
64
def save_submission_file(submission_data: Dict[str, Any], submissions_dir: str = "./submissions") -> str:
    """Write a submission as pretty-printed JSON into *submissions_dir*.

    The file is named ``submission_<org>_<YYYYmmdd_HHMMSS>.json``. If a file
    with that name already exists (same organisation, same second) a numeric
    suffix ``_1``, ``_2``, ... is appended instead of overwriting it.

    Args:
        submission_data: The submission to store; must contain the key
            ``"submission_org"`` and be JSON-serialisable.
        submissions_dir: Target directory; created when missing.

    Returns:
        The path of the file that was written.

    Raises:
        KeyError: If ``"submission_org"`` is missing.
        TypeError: If the data cannot be serialised to JSON.
        ValueError: If the data cannot be serialised to JSON.
        OSError: If the directory or file cannot be created.
    """
    max_org_chars = 64  # keeps the final name well below common 255-byte limits

    os.makedirs(submissions_dir, exist_ok=True)

    # Serialise first so a failure cannot leave a truncated file behind.
    payload = json.dumps(submission_data, indent=2, ensure_ascii=False)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Whitelist instead of blacklist: letters/digits (any script), '.', '_'
    # and '-' are kept, everything else (spaces, path separators, control
    # characters, ...) becomes '_'.
    raw_org = str(submission_data["submission_org"])[:max_org_chars]
    org_name = "".join(
        ch if (ch.isalnum() or ch in "._-") else "_" for ch in raw_org
    )
    base_name = f"submission_{org_name}_{timestamp}"

    attempt = 0
    while True:
        suffix = f"_{attempt}" if attempt else ""
        file_path = os.path.join(submissions_dir, f"{base_name}{suffix}.json")
        try:
            # Mode 'x' fails if the file exists, so an earlier submission is
            # never overwritten.
            with open(file_path, "x", encoding="utf-8") as f:
                f.write(payload)
        except FileExistsError:
            attempt += 1
            continue
        return file_path
83
+
84
def process_sage_submission_simple(submission_file, org_name=None, email=None) -> str:
    """Handle an uploaded SAGE submission in file-collection mode.

    The upload is read, parsed as JSON, validated and stored on disk; no
    evaluation is performed here. Every outcome is reported as an HTML
    snippet, and no exception is propagated to the caller.

    Args:
        submission_file: Path of the uploaded JSON file, or an object whose
            ``name`` attribute holds that path (NOTE(review): some UI
            frameworks pass a temp-file wrapper - confirm against the
            caller). ``None`` means nothing was uploaded.
        org_name: Organisation name from the form. When non-blank it
            overrides ``submission_org`` from the file.
        email: Contact e-mail from the form. When non-blank it overrides
            ``submission_email`` from the file.

    Returns:
        An HTML string: a success confirmation or an error description.
    """
    # Local import: the module header does not import ``html``.
    from html import escape

    try:
        if submission_file is None:
            return format_error("❌ 没有上传文件。请选择一个JSON文件。")

        # Accept either a plain path string or a file-like object with .name.
        file_path = getattr(submission_file, "name", submission_file)

        # Read the uploaded file.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            return format_error(f"❌ 读取文件时出错: {escape(str(e))}")

        # Parse the JSON document.
        try:
            submission_data = json.loads(content)
        except json.JSONDecodeError as e:
            return format_error(f"❌ JSON格式无效: {escape(str(e))}")

        # A top-level list/number/string cannot take the form overrides below
        # and is not a valid submission anyway.
        if not isinstance(submission_data, dict):
            return format_error("❌ 提交验证失败: 提交内容必须是JSON对象")

        # Form values take precedence over the file, each one independently,
        # so filling in only one of the two fields is still honoured.
        form_org = (org_name or "").strip()
        form_email = (email or "").strip()
        if form_org:
            submission_data["submission_org"] = form_org
        if form_email:
            submission_data["submission_email"] = form_email

        # Validate the submission format.
        is_valid, message = validate_sage_submission(submission_data)
        if not is_valid:
            return format_error(f"❌ 提交验证失败: {message}")

        # Store the submission.
        try:
            saved_path = save_submission_file(submission_data)
        except Exception as e:
            return format_error(f"❌ 保存提交文件时出错: {escape(str(e))}")
        print(f"✅ 提交文件已保存到: {saved_path}")

        # org and e-mail are user-controlled and end up inside HTML, so they
        # must be escaped before interpolation.
        org = escape(str(submission_data["submission_org"]))
        email_addr = escape(str(submission_data["submission_email"]))
        num_predictions = len(submission_data["predictions"])

        return format_success(f"""
        🎉 <strong>提交成功接收!</strong><br><br>
        📋 <strong>提交信息:</strong><br>
        • 组织: {org}<br>
        • 邮箱: {email_addr}<br>
        • 预测数量: {num_predictions} 个问题<br><br>
        ⏳ <strong>下一步:</strong><br>
        您的提交正在排队等待评测。我们的系统将使用LLM-as-Judge对您的答案进行评估,<br>
        并计算各个科学领域的分数。评测完成后,结果将自动添加到排行榜中。<br><br>
        📧 评测完成后,我们会通过邮箱通知您结果。<br>
        🕐 预计评测时间: 1-24小时(取决于队列长度)<br><br>
        感谢您参与SAGE基准测试!🧪
        """)

    except Exception as e:
        # Last-resort boundary: the UI should always get an HTML answer.
        return format_error(f"❌ 提交处理失败: {escape(str(e))}")
149
+
150
def get_submission_stats(submissions_dir: str = "./submissions") -> Dict[str, Any]:
    """Summarise the submission files stored in *submissions_dir*.

    Only files named ``submission_*.json`` are considered. Files that cannot
    be read, are not valid JSON, or do not contain a JSON object are skipped.

    The submission time is taken from the ``YYYYmmdd_HHMMSS`` stamp at the
    end of the file name (an optional numeric ``_<n>`` suffix after the stamp
    is tolerated). Entries are ordered newest first; files without a
    parseable stamp come last.

    Args:
        submissions_dir: Directory holding the collected submission files.

    Returns:
        ``{"total": <number of readable submissions>, "recent": [...]}``
        where ``recent`` holds at most the ten newest entries, each a dict
        with the keys ``org``, ``email``, ``time`` and ``predictions``.
    """
    # Local import: the module header does not import ``re``.
    import re

    if not os.path.isdir(submissions_dir):
        return {"total": 0, "recent": []}

    # The stamp itself contains an underscore, so splitting the name on "_"
    # and taking the last piece would only yield the HHMMSS half.
    stamp_pattern = re.compile(r"(\d{8}_\d{6})(?:_\d+)?$")
    json_suffix = ".json"

    dated_records = []
    # Sorted listing keeps the result deterministic for equal timestamps.
    for filename in sorted(os.listdir(submissions_dir)):
        if not (filename.startswith("submission_") and filename.endswith(json_suffix)):
            continue

        file_path = os.path.join(submissions_dir, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Unreadable, badly encoded or malformed JSON: best-effort skip.
            continue
        if not isinstance(data, dict):
            continue

        stem = filename[:-len(json_suffix)]
        submitted_at = None
        match = stamp_pattern.search(stem)
        if match:
            try:
                submitted_at = datetime.strptime(match.group(1), "%Y%m%d_%H%M%S")
            except ValueError:
                # Digits in the right shape but not a real date/time.
                submitted_at = None

        if submitted_at is not None:
            formatted_time = submitted_at.strftime("%Y-%m-%d %H:%M")
        else:
            # Fall back to the raw trailing name component.
            formatted_time = stem.split("_")[-1]

        predictions = data.get("predictions")
        record = {
            "org": data.get("submission_org", "Unknown"),
            "email": data.get("submission_email", ""),
            "time": formatted_time,
            "predictions": len(predictions) if isinstance(predictions, list) else 0,
        }
        # Sort on the real datetime (seconds included), not the display text.
        dated_records.append((submitted_at or datetime.min, record))

    # Newest first; undated entries (datetime.min) end up last.
    dated_records.sort(key=lambda pair: pair[0], reverse=True)
    submissions = [record for _, record in dated_records]

    return {
        "total": len(submissions),
        "recent": submissions[:10],  # ten most recent
    }