import json
import os
from datetime import datetime, timezone
from typing import Dict, List, Any

from src.display.formatting import styled_error, styled_message, styled_warning


def validate_sage_submission(submission_data: Dict[str, Any]) -> tuple[bool, str]:
    """Validate the structure of a SAGE benchmark submission.

    Checks required top-level fields, does a basic email sanity check, and
    verifies each prediction entry carries exactly 4 content / reasoning
    items plus an integer question id.

    Args:
        submission_data: Parsed JSON payload of a submission.

    Returns:
        (is_valid, message) — ``message`` describes the first failure found,
        or confirms the format is valid.
    """
    # Check required top-level fields
    required_fields = ["submission_org", "submission_email", "predictions"]
    for field in required_fields:
        if field not in submission_data:
            return False, f"Missing required field: {field}"

    # Validate email format (basic sanity check only — not RFC-compliant)
    email = submission_data["submission_email"]
    if "@" not in email or "." not in email:
        return False, "Invalid email format"

    # Validate predictions container
    predictions = submission_data["predictions"]
    if not isinstance(predictions, list) or len(predictions) == 0:
        return False, "Predictions must be a non-empty list"

    for i, prediction in enumerate(predictions):
        # Guard: a non-dict entry (e.g. a string) would otherwise pass the
        # membership checks below with the wrong semantics.
        if not isinstance(prediction, dict):
            return False, f"Missing field 'original_question_id' in prediction {i}"

        # Check required prediction fields
        pred_required_fields = ["original_question_id", "content", "reasoning_content"]
        for field in pred_required_fields:
            if field not in prediction:
                return False, f"Missing field '{field}' in prediction {i}"

        # Validate content arrays — SAGE expects exactly 4 attempts per question
        content = prediction["content"]
        reasoning_content = prediction["reasoning_content"]
        if not isinstance(content, list) or len(content) != 4:
            return False, f"Content in prediction {i} must be a list of exactly 4 items"
        if not isinstance(reasoning_content, list) or len(reasoning_content) != 4:
            return False, f"Reasoning content in prediction {i} must be a list of exactly 4 items"

        # Validate question ID
        if not isinstance(prediction["original_question_id"], int):
            return False, f"Question ID in prediction {i} must be an integer"

    return True, "Valid submission format"


def _find_initial_results_file() -> str:
    """Locate initial_sage_results.json, trying several candidate paths.

    Falls back to the first candidate (cwd-relative) when none exists yet,
    so a fresh file is created there on first write.
    """
    possible_paths = [
        "./initial_sage_results.json",
        "initial_sage_results.json",
        os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            "initial_sage_results.json",
        ),
    ]
    for path in possible_paths:
        if os.path.exists(path):
            return path
    return possible_paths[0]  # Use first path as fallback


def _update_initial_results(submission_data: Dict[str, Any],
                            domain_accuracies: Dict[str, float]) -> None:
    """Persist the evaluated submission into initial_sage_results.json.

    Updates an existing entry for the same organization in place, otherwise
    appends a new one. Raises on I/O / JSON errors; the caller decides how
    to report the failure.
    """
    initial_results_file = _find_initial_results_file()

    # Load existing initial results (empty leaderboard if file absent)
    if os.path.exists(initial_results_file):
        with open(initial_results_file, 'r') as f:
            initial_results = json.load(f)
    else:
        initial_results = []

    # Convert to initial results format
    new_result = {
        "model_name": submission_data["submission_org"],
        "organization": submission_data["submission_org"],
        "tokens": "User Submission",
        "accuracy": domain_accuracies["sage_overall"],
        "mg_pass_2": domain_accuracies["sage_overall"],  # Use same value for now
        "mg_pass_4": domain_accuracies["sage_overall"],  # Use same value for now
        "submitted_time": datetime.now().strftime("%Y-%m-%d"),
        "results": domain_accuracies,
        "contact_email": submission_data["submission_email"],
    }

    # Check if organization already exists, update or add.
    # NOTE: named `submitted_org` (not `org_name`) to avoid shadowing the
    # caller-supplied form parameter of the same name.
    submitted_org = submission_data["submission_org"]
    updated = False
    for i, result in enumerate(initial_results):
        if (result.get("model_name") == submitted_org
                or result.get("organization") == submitted_org):
            initial_results[i] = new_result
            updated = True
            break
    if not updated:
        initial_results.append(new_result)

    # Save updated initial results
    with open(initial_results_file, 'w') as f:
        json.dump(initial_results, f, indent=2)
    print(f"✅ Updated {initial_results_file} with new submission from {submitted_org}")


def _format_success_message(submission_data: Dict[str, Any],
                            domain_accuracies: Dict[str, float]) -> str:
    """Build the styled per-domain score summary shown after a successful run."""
    overall_accuracy = domain_accuracies.get("sage_overall", 0)
    return styled_message(
        f"🎉 SAGE submission processed successfully!\n\n"
        f"**Organization:** {submission_data['submission_org']}\n"
        f"**Overall Accuracy:** {overall_accuracy:.2f}%\n\n"
        f"**Domain Scores:**\n"
        f" • Mathematics: {domain_accuracies.get('sage_math', 0):.2f}%\n"
        f" • Physics: {domain_accuracies.get('sage_physics', 0):.2f}%\n"
        f" • Chemistry: {domain_accuracies.get('sage_chemistry', 0):.2f}%\n"
        f" • Biology: {domain_accuracies.get('sage_biology', 0):.2f}%\n"
        f" • Earth Science: {domain_accuracies.get('sage_earth_science', 0):.2f}%\n"
        f" • Astronomy: {domain_accuracies.get('sage_astronomy', 0):.2f}%\n\n"
        f"Your results have been added to the leaderboard. "
        f"Please refresh the page to see updated rankings."
    )


def process_sage_submission_simple(submission_file, org_name=None, email=None) -> str:
    """Process SAGE benchmark submission file - simplified version for basic leaderboard.

    Reads a JSON submission from ``submission_file`` (a file path), validates
    it, archives the raw payload under ./sage_submissions/<org>/, evaluates it,
    and persists the scores to the leaderboard results file.

    Args:
        submission_file: Path to the uploaded JSON file, or None.
        org_name: Optional organization name from the submission form;
            overrides the value in the file (only when ``email`` is also given).
        email: Optional contact email from the submission form.

    Returns:
        A styled HTML message (success, warning, or error) for display.
    """
    try:
        # Read the submitted file (receives file path)
        if submission_file is None:
            return styled_error("No file uploaded. Please select a JSON file.")

        # submission_file is a file path string
        try:
            with open(submission_file, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            return styled_error(f"Error reading file: {str(e)}")

        # Parse JSON
        try:
            submission_data = json.loads(content)
        except json.JSONDecodeError as e:
            return styled_error(f"Invalid JSON format: {str(e)}")

        # Form inputs (when both are present) take precedence over the file's
        # own org/email fields.
        if org_name and email:
            submission_data["submission_org"] = org_name
            submission_data["submission_email"] = email

        # Validate submission format
        is_valid, message = validate_sage_submission(submission_data)
        if not is_valid:
            return styled_error(f"Submission validation failed: {message}")

        # Save raw submission for later (re)processing, one dir per org.
        org = submission_data["submission_org"].replace(" ", "_").replace("/", "_")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        submission_dir = f"./sage_submissions/{org}"
        os.makedirs(submission_dir, exist_ok=True)
        raw_submission_path = f"{submission_dir}/submission_{timestamp}.json"
        with open(raw_submission_path, 'w') as f:
            json.dump(submission_data, f, indent=2)

        # Simple evaluation using the evaluation module (imported lazily so a
        # missing evaluator degrades to the "saved but not scored" warning).
        try:
            from src.leaderboard.sage_eval import evaluate_sage_submission

            domain_accuracies = evaluate_sage_submission(submission_data)

            # Update initial_sage_results.json directly for persistence.
            # A failure here is non-fatal: the raw submission is already saved.
            try:
                _update_initial_results(submission_data, domain_accuracies)
            except Exception as e:
                print(f"⚠️ Failed to update initial results file: {e}")

            # Format success message with scores
            return _format_success_message(submission_data, domain_accuracies)

        except Exception as eval_error:
            # If evaluation fails, still save submission but mark as failed
            return styled_warning(
                f"⚠️ Submission received but evaluation failed.\n\n"
                f"Error: {str(eval_error)}\n\n"
                f"Your submission has been saved and will be processed manually. "
                f"Please contact administrators if this issue persists."
            )

    except Exception as e:
        return styled_error(f"Submission processing failed: {str(e)}")


def load_sage_submissions(submissions_dir: str = "./sage_submissions") -> List[Dict]:
    """Load all SAGE submissions for display in queue.

    Scans one directory per organization under ``submissions_dir`` for files
    named ``submission_*.json``; unreadable files are skipped (best-effort).

    Returns:
        Submission dicts annotated with ``_filename`` and ``_org_dir``,
        sorted most-recent-first (filenames embed a sortable timestamp).
    """
    if not os.path.exists(submissions_dir):
        return []

    submissions = []
    for org_dir in os.listdir(submissions_dir):
        org_path = os.path.join(submissions_dir, org_dir)
        if not os.path.isdir(org_path):
            continue
        for file in os.listdir(org_path):
            if file.startswith("submission_") and file.endswith(".json"):
                try:
                    with open(os.path.join(org_path, file), 'r') as f:
                        submission = json.load(f)
                    # Add metadata
                    submission["_filename"] = file
                    submission["_org_dir"] = org_dir
                    submissions.append(submission)
                except Exception:
                    # Best-effort: skip corrupt/unreadable files
                    continue

    # Sort by submission time (most recent first)
    submissions.sort(key=lambda x: x.get("_filename", ""), reverse=True)
    return submissions