# SAGE benchmark submission validation and processing utilities.
import json
import os
from datetime import datetime, timezone
from typing import Dict, List, Any
from src.display.formatting import styled_error, styled_message, styled_warning
def validate_sage_submission(submission_data: Dict[str, Any]) -> tuple[bool, str]:
    """Validate the structure of a SAGE benchmark submission payload.

    Args:
        submission_data: Parsed JSON submission object.

    Returns:
        ``(is_valid, message)`` — ``message`` describes the first problem
        found, or confirms the format is valid.
    """
    # Check required top-level fields
    required_fields = ["submission_org", "submission_email", "predictions"]
    for field in required_fields:
        if field not in submission_data:
            return False, f"Missing required field: {field}"
    # Validate email format (basic). Guard the type first: a non-string
    # value (e.g. JSON null) would make `"@" in email` raise TypeError.
    email = submission_data["submission_email"]
    if not isinstance(email, str) or "@" not in email or "." not in email:
        return False, "Invalid email format"
    # Validate predictions
    predictions = submission_data["predictions"]
    if not isinstance(predictions, list) or len(predictions) == 0:
        return False, "Predictions must be a non-empty list"
    for i, prediction in enumerate(predictions):
        # Each prediction must be a mapping; on a string, `field not in
        # prediction` would silently do a substring test, not a key test.
        if not isinstance(prediction, dict):
            return False, f"Prediction {i} must be a JSON object"
        # Check required prediction fields
        pred_required_fields = ["original_question_id", "content", "reasoning_content"]
        for field in pred_required_fields:
            if field not in prediction:
                return False, f"Missing field '{field}' in prediction {i}"
        # Validate content arrays (SAGE expects exactly 4 attempts per question)
        content = prediction["content"]
        reasoning_content = prediction["reasoning_content"]
        if not isinstance(content, list) or len(content) != 4:
            return False, f"Content in prediction {i} must be a list of exactly 4 items"
        if not isinstance(reasoning_content, list) or len(reasoning_content) != 4:
            return False, f"Reasoning content in prediction {i} must be a list of exactly 4 items"
        # Validate question ID. Exclude bool explicitly: bool is an int
        # subclass, so isinstance(True, int) would otherwise pass.
        question_id = prediction["original_question_id"]
        if isinstance(question_id, bool) or not isinstance(question_id, int):
            return False, f"Question ID in prediction {i} must be an integer"
    return True, "Valid submission format"
def process_sage_submission_simple(submission_file, org_name=None, email=None) -> str:
    """Process SAGE benchmark submission file - simplified version for basic leaderboard

    Reads a JSON submission from *submission_file* (a file path), validates it,
    archives the raw payload under ./sage_submissions/<org>/, evaluates it, and
    merges the scores into the initial results file used by the leaderboard.

    Args:
        submission_file: Path to the uploaded JSON file, or None if nothing
            was uploaded.
        org_name: Optional organization name from the UI form; together with
            *email*, overrides the values inside the JSON payload.
        email: Optional contact email from the UI form.

    Returns:
        An HTML-styled status string (success, warning, or error) suitable
        for direct display in the UI. This function never raises: every
        failure path is converted into a styled message.
    """
    try:
        # Read the submitted file (receives file path)
        if submission_file is None:
            return styled_error("No file uploaded. Please select a JSON file.")
        # submission_file is a file path string
        try:
            with open(submission_file, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            return styled_error(f"Error reading file: {str(e)}")
        # Parse JSON
        try:
            submission_data = json.loads(content)
        except json.JSONDecodeError as e:
            return styled_error(f"Invalid JSON format: {str(e)}")
        # Use form inputs if submission data doesn't contain org/email.
        # NOTE(review): this overrides the payload whenever BOTH form fields
        # are truthy, even if the payload already carries its own values.
        if org_name and email:
            submission_data["submission_org"] = org_name
            submission_data["submission_email"] = email
        # Validate submission format
        is_valid, message = validate_sage_submission(submission_data)
        if not is_valid:
            return styled_error(f"Submission validation failed: {message}")
        # Save submission for later processing.
        # NOTE(review): current_time is computed but never used below.
        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        # Sanitize the org name so it is safe as a directory component.
        org = submission_data["submission_org"].replace(" ", "_").replace("/", "_")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Save raw submission
        submission_dir = f"./sage_submissions/{org}"
        os.makedirs(submission_dir, exist_ok=True)
        raw_submission_path = f"{submission_dir}/submission_{timestamp}.json"
        with open(raw_submission_path, 'w') as f:
            json.dump(submission_data, f, indent=2)
        # Simple evaluation using the evaluation module. Imported lazily so
        # a broken/missing evaluator degrades to the warning path below
        # instead of breaking module import.
        try:
            from src.leaderboard.sage_eval import evaluate_sage_submission
            domain_accuracies = evaluate_sage_submission(submission_data)
            # Update initial_sage_results.json directly for persistence
            # Try multiple possible paths for the initial results file
            # (cwd-relative and three directories above this module).
            possible_paths = [
                "./initial_sage_results.json",
                "initial_sage_results.json",
                os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "initial_sage_results.json")
            ]
            initial_results_file = None
            for path in possible_paths:
                if os.path.exists(path):
                    initial_results_file = path
                    break
            if not initial_results_file:
                initial_results_file = possible_paths[0]  # Use first path as fallback
            try:
                # Load existing initial results
                if os.path.exists(initial_results_file):
                    with open(initial_results_file, 'r') as f:
                        initial_results = json.load(f)
                else:
                    initial_results = []
                # Convert to initial results format.
                # NOTE(review): the schema below (mg_pass_2/mg_pass_4 etc.) is
                # assumed to match what the leaderboard loader expects —
                # confirm against the reader of initial_sage_results.json.
                new_result = {
                    "model_name": submission_data["submission_org"],
                    "organization": submission_data["submission_org"],
                    "tokens": "User Submission",
                    "accuracy": domain_accuracies["sage_overall"],
                    "mg_pass_2": domain_accuracies["sage_overall"],  # Use same value for now
                    "mg_pass_4": domain_accuracies["sage_overall"],  # Use same value for now
                    "submitted_time": datetime.now().strftime("%Y-%m-%d"),
                    "results": domain_accuracies,
                    "contact_email": submission_data["submission_email"]
                }
                # Check if organization already exists, update or add.
                # (Re-binds the org_name parameter; intentional shadowing.)
                org_name = submission_data["submission_org"]
                updated = False
                for i, result in enumerate(initial_results):
                    if (result.get("model_name") == org_name or
                        result.get("organization") == org_name):
                        initial_results[i] = new_result
                        updated = True
                        break
                if not updated:
                    initial_results.append(new_result)
                # Save updated initial results
                with open(initial_results_file, 'w') as f:
                    json.dump(initial_results, f, indent=2)
                print(f"✅ Updated {initial_results_file} with new submission from {org_name}")
            except Exception as e:
                # Persistence failure is non-fatal: scores are still shown.
                print(f"⚠️ Failed to update initial results file: {e}")
            # Format success message with scores
            overall_accuracy = domain_accuracies.get("sage_overall", 0)
            success_msg = styled_message(
                f"🎉 SAGE submission processed successfully!\n\n"
                f"**Organization:** {submission_data['submission_org']}\n"
                f"**Overall Accuracy:** {overall_accuracy:.2f}%\n\n"
                f"**Domain Scores:**\n"
                f" • Mathematics: {domain_accuracies.get('sage_math', 0):.2f}%\n"
                f" • Physics: {domain_accuracies.get('sage_physics', 0):.2f}%\n"
                f" • Chemistry: {domain_accuracies.get('sage_chemistry', 0):.2f}%\n"
                f" • Biology: {domain_accuracies.get('sage_biology', 0):.2f}%\n"
                f" • Earth Science: {domain_accuracies.get('sage_earth_science', 0):.2f}%\n"
                f" • Astronomy: {domain_accuracies.get('sage_astronomy', 0):.2f}%\n\n"
                f"Your results have been added to the leaderboard. "
                f"Please refresh the page to see updated rankings."
            )
            return success_msg
        except Exception as eval_error:
            # If evaluation fails, still save submission but mark as failed
            # (the raw file was already written above).
            return styled_warning(
                f"⚠️ Submission received but evaluation failed.\n\n"
                f"Error: {str(eval_error)}\n\n"
                f"Your submission has been saved and will be processed manually. "
                f"Please contact administrators if this issue persists."
            )
    except Exception as e:
        # Catch-all boundary: the UI always gets a styled message.
        return styled_error(f"Submission processing failed: {str(e)}")
def load_sage_submissions(submissions_dir: str = "./sage_submissions") -> List[Dict]:
    """Collect every saved SAGE submission for display in the queue.

    Scans one subdirectory per organization under *submissions_dir* for files
    named ``submission_*.json``. Each loaded record is tagged with its source
    filename (``_filename``) and organization directory (``_org_dir``).
    Unreadable or malformed files are skipped silently. Results are ordered
    most-recent first (the timestamped filenames sort chronologically).
    """
    if not os.path.exists(submissions_dir):
        return []
    loaded: List[Dict] = []
    for org_entry in os.listdir(submissions_dir):
        org_path = os.path.join(submissions_dir, org_entry)
        if not os.path.isdir(org_path):
            continue
        candidates = [
            name for name in os.listdir(org_path)
            if name.startswith("submission_") and name.endswith(".json")
        ]
        for name in candidates:
            try:
                with open(os.path.join(org_path, name), 'r') as handle:
                    record = json.load(handle)
                # Attach provenance metadata for the queue view.
                record["_filename"] = name
                record["_org_dir"] = org_entry
                loaded.append(record)
            except Exception:
                # Best-effort listing: skip anything we cannot parse.
                continue
    # Sort by submission time (most recent first)
    loaded.sort(key=lambda entry: entry.get("_filename", ""), reverse=True)
    return loaded
|