# SAGE-Bench / app.py
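"""Gradio app for the SAGE-Bench leaderboard.

Displays SAGE evaluation results in a leaderboard table and accepts new
result submissions (JSON uploads), which are stored on OSS and queued for
LLM-based evaluation. Per-user submission history is tracked on OSS to
enforce account-age and daily-rate limits.
"""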
import os
import json
import datetime
import requests
from email.utils import parseaddr
import gradio as gr
import pandas as pd
import numpy as np
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
ModelType,
fields,
WeightType,
Precision
)
# SAGE-specific imports - use the populate module to avoid a transformers dependency
try:
from src.populate import process_sage_results_for_leaderboard, get_sage_leaderboard_df
SAGE_MODULES_AVAILABLE = process_sage_results_for_leaderboard is not None
if SAGE_MODULES_AVAILABLE:
print("✅ SAGE modules loaded successfully")
else:
print("❌ SAGE modules not available")
except ImportError as e:
print(f"Warning: SAGE modules not available: {e}")
SAGE_MODULES_AVAILABLE = False
# Configuration
TOKEN = os.environ.get("HF_TOKEN", None)
OWNER = "opencompass"
# OSS submission tracking paths
SUBMISSION_TRACKING_PATH = "atlas_eval/submissions/user_tracking/"
SUBMISSION_HISTORY_FILE = "submission_history.json"
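# The per-user submission history stored here backs the daily rate limit in
# check_user_submission_eligibility().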
def format_error(msg):
return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
def format_warning(msg):
return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
def format_log(msg):
return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
def model_hyperlink(link, model_name):
if link and link.startswith("http"):
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
return model_name
def load_submission_history():
"""Load user submission history from OSS"""
try:
from src.oss.oss_file_manager import OSSFileManager
oss_manager = OSSFileManager()
# Try to download submission history file
history_content = oss_manager.download_file_content(
SUBMISSION_TRACKING_PATH + SUBMISSION_HISTORY_FILE
)
if history_content:
return json.loads(history_content)
else:
print("📝 Creating new submission history")
return {}
except Exception as e:
print(f"⚠️ Failed to load submission history: {e}")
return {}
def save_submission_history(history):
"""Save user submission history to OSS"""
try:
from src.oss.oss_file_manager import OSSFileManager
oss_manager = OSSFileManager()
# Upload submission history
history_json = json.dumps(history, indent=2, ensure_ascii=False)
success = oss_manager.upload_file_content(
content=history_json,
object_key=SUBMISSION_TRACKING_PATH + SUBMISSION_HISTORY_FILE
)
return success
except Exception as e:
print(f"❌ Failed to save submission history: {e}")
return False
def check_user_submission_eligibility(profile: gr.OAuthProfile, org_name: str):
"""Check user submission eligibility"""
try:
# 1. Check account age limit (60 days)
user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
if user_data.status_code == 200:
creation_date = json.loads(user_data.content)["createdAt"]
account_age = datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ')
if account_age < datetime.timedelta(days=60):
return False, "This account does not meet the submission requirement. Account age must exceed 60 days."
else:
return False, "Unable to verify account information. Please try again later."
# 2. Check daily submission limit
submission_history = load_submission_history()
user_submissions = submission_history.get(profile.username, [])
today = datetime.datetime.today().strftime('%Y-%m-%d')
today_submissions = [s for s in user_submissions if s.get("date", "") == today]
if len(today_submissions) >= 2:
return False, "You have already submitted twice today. Please try again tomorrow."
return True, "Eligibility check passed"
except Exception as e:
print(f"❌ User eligibility check failed: {e}")
return False, f"System check error, please try again later: {str(e)}"
def record_user_submission(profile: gr.OAuthProfile, model_name: str, org_name: str, email: str):
"""Record user submission"""
try:
submission_history = load_submission_history()
if profile.username not in submission_history:
submission_history[profile.username] = []
# Record this submission
submission_record = {
"date": datetime.datetime.today().strftime('%Y-%m-%d'),
"time": datetime.datetime.now().strftime('%H:%M:%S'),
"model": model_name,
"organization": org_name,
"email": email,
"username": profile.username
}
submission_history[profile.username].append(submission_record)
# Save submission history
return save_submission_history(submission_history)
except Exception as e:
print(f"❌ Failed to record submission history: {e}")
return False
def get_leaderboard_dataframe():
"""Generate leaderboard dataframe from SAGE results"""
print("🔄 Loading SAGE leaderboard data...")
if not SAGE_MODULES_AVAILABLE:
print("❌ SAGE modules not available")
return pd.DataFrame()
try:
# Use the updated get_sage_leaderboard_df function
df = get_sage_leaderboard_df()
if df.empty:
print("❌ No SAGE results found")
return pd.DataFrame()
print(f"✅ Generated dataframe with {len(df)} rows")
return df
except Exception as e:
print(f"❌ Error generating leaderboard dataframe: {e}")
import traceback
traceback.print_exc()
return pd.DataFrame()
def refresh_leaderboard():
"""Refresh the leaderboard data"""
print("🔄 Refreshing leaderboard data...")
return get_leaderboard_dataframe()
# Initialize data
print("🚀 Initializing SAGE-Bench leaderboard...")
leaderboard_df = get_leaderboard_dataframe()
print(f"📈 Leaderboard initialized with {len(leaderboard_df)} rows")
# Define column types for the dataframe (Model, Organization, Accuracy, mG-Pass@2, mG-Pass@4, Submission Date)
COLUMN_TYPES = ["markdown", "str", "number", "number", "number", "str"]
# Create Gradio interface
demo = gr.Blocks(css="""
.markdown-text {
font-size: 16px !important;
}
#citation-button {
font-family: monospace;
}
""")
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Row():
with gr.Accordion("📙 Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
elem_id="citation-button",
lines=10,
max_lines=10,
interactive=False
)
# Main leaderboard table
gr.Markdown("## 🏆 SAGE Benchmark Results", elem_classes="markdown-text")
# Debug information - dynamic component
results_count = gr.Markdown(f"📊 **Showing {len(leaderboard_df)} results**")
leaderboard_table = gr.Dataframe(
value=leaderboard_df,
datatype=COLUMN_TYPES,
interactive=False,
wrap=True,
column_widths=["30%", "20%", "12%", "12%", "12%", "14%"]
)
# Refresh button
refresh_button = gr.Button("🔄 Refresh Leaderboard")
def refresh_leaderboard_with_count():
"""Refresh leaderboard and update count display"""
df = refresh_leaderboard()
count_text = f"📊 **Showing {len(df)} results**"
return df, count_text
refresh_button.click(
refresh_leaderboard_with_count,
inputs=[],
outputs=[leaderboard_table, results_count]
)
# Submission section
with gr.Accordion("📊 Submit Your SAGE Results", open=False):
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
        # Submission instructions (login requirements temporarily commented out)
gr.Markdown("""
### 📋 Submission Requirements
<!--
- Login required: You must log in with a Hugging Face account
- Account age: Account must be older than 60 days
- Submission frequency: Each user can submit up to 2 times per day
-->
- File format: Upload a JSON file in the SAGE format
- Organization: Provide the exact organization name (shown on the leaderboard)
- Contact email: Provide a valid email for notifications
- Auto evaluation: After submission, the system will run LLM-based evaluation and update the leaderboard
<!--
### 🔐 Security Policy
To prevent spam and ensure evaluation quality, we enforce:
- New accounts must wait 60 days before submitting (prevents abuse)
- Daily submission limits to ensure leaderboard quality and system stability
- Duplicate checks to avoid multiple submissions for the same organization
-->
""", elem_classes="markdown-text")
with gr.Row():
with gr.Column():
model_textbox = gr.Textbox(
label="Model Name - will be shown on the leaderboard",
placeholder="Your Model Name (e.g., GPT-4, Llama-2-70B)"
)
org_textbox = gr.Textbox(
label="Organization Name - will be shown on the leaderboard",
placeholder="Your Organization"
)
email_textbox = gr.Textbox(
label="Contact Email - used for contact, not publicly visible",
placeholder="contact@example.com"
)
with gr.Column():
file_upload = gr.File(
label="Upload SAGE Results (JSON)",
file_types=[".json"],
type="filepath"
)
        # Submit buttons (login feature temporarily commented out)
with gr.Row():
login_button = gr.LoginButton("🔐 Login with HuggingFace", size="lg")
submit_button = gr.Button("Submit Results", variant="primary", size="lg")
        # Login state and user info
profile_state = gr.State()
login_status = gr.Markdown(visible=True)
# def on_login(profile: gr.OAuthProfile):
# try:
# if profile and getattr(profile, "name", None):
# name = profile.name
# text = f"✅ Logged in as: **{name}**"
# else:
# text = "❌ Login failed, please try again"
# return profile, text
# except Exception:
# return None, "❌ Login failed, please try again"
# login_button.click(on_login, inputs=None, outputs=[profile_state, login_status])
        # Progress and result display areas
progress_info = gr.HTML()
submission_result = gr.HTML()
def show_progress(step, message, total_steps=4):
"""Show progress information"""
progress_percentage = int((step / total_steps) * 100)
progress_html = f"""
<div style="background-color: #e7f3ff; border: 1px solid #4dabf7; border-radius: 5px; padding: 15px; margin: 10px 0;">
<div style="display: flex; align-items: center; margin-bottom: 10px;">
<h4 style="color: #1971c2; margin: 0; flex-grow: 1;">⏳ Processing submission...</h4>
<span style="color: #1971c2; font-weight: bold;">{progress_percentage}%</span>
</div>
<p style="color: #1971c2; margin: 5px 0;"><strong>Step {step}/{total_steps}:</strong> {message}</p>
<div style="background-color: #fff; border-radius: 10px; height: 20px; margin: 10px 0; border: 1px solid #dee2e6;">
<div style="background: linear-gradient(90deg, #4dabf7, #74c0fc); height: 100%; width: {progress_percentage}%; border-radius: 10px; transition: width 0.5s ease; display: flex; align-items: center; justify-content: center;">
{f'<span style="color: white; font-size: 12px; font-weight: bold;">{progress_percentage}%</span>' if progress_percentage > 20 else ''}
</div>
</div>
<p style="color: #495057; font-size: 14px; margin: 5px 0;">
{'✨ Almost done, please wait...' if step >= total_steps else '📤 Please wait, processing your submission...'}
</p>
</div>
"""
return progress_html
def handle_submission(file_upload, model_name, org_name, email, user_profile: gr.OAuthProfile):
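            # Generator handler: each yield updates (progress_info, submission_result).
            # Gradio supplies user_profile automatically from the gr.OAuthProfile
            # type hint, so it is not listed in the click() inputs below.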
try:
                # Step 1: basic validation
yield show_progress(1, "Validating submission info"), ""
                # Verify login status
if user_profile is None or getattr(user_profile, "name", None) is None:
yield "", format_error("Please log in with Hugging Face before submitting")
return
print(f"user_profile: {user_profile}")
print(f"user_profile.name: {user_profile.name}")
if not file_upload:
yield "", format_error("Please select a file to upload")
return
if not model_name or not model_name.strip():
yield "", format_error("Please enter model name")
return
if not org_name or not org_name.strip():
yield "", format_error("Please enter organization name")
return
if not email or not email.strip():
yield "", format_error("Please enter email address")
return
                # Validate email format
_, parsed_email = parseaddr(email)
if "@" not in parsed_email:
yield "", format_warning("Please provide a valid email address")
return
                # Step 2: validate and read the file
yield show_progress(2, "Validating file format and content"), ""
import time
time.sleep(0.5) # allow users to see progress update
                # User eligibility checks (account age / submission frequency / duplicates)
eligible, msg = check_user_submission_eligibility(user_profile, org_name)
if not eligible:
yield "", format_error(msg)
return
                # Step 3: upload to OSS
yield show_progress(3, "Uploading file to OSS storage"), ""
                # Process the file submission
from src.submission.submit import process_sage_submission_simple
result = process_sage_submission_simple(file_upload, model_name, org_name, email)
                # Step 4: finish
yield show_progress(4, "Submission completed, preparing evaluation"), ""
time.sleep(0.5) # allow users to see completion state
                # Record submission history
try:
record_user_submission(user_profile, model_name, org_name, email)
except Exception:
pass
                # Build the success message
success_info = f"""
<div style="background-color: #d4edda; border: 1px solid #c3e6cb; border-radius: 5px; padding: 15px; margin: 10px 0;">
<h4 style="color: #155724; margin-top: 0;">🎉 Submission successful!</h4>
<p style="color: #155724; margin: 5px 0;"><strong>Model:</strong> {model_name}</p>
<p style="color: #155724; margin: 5px 0;"><strong>Organization:</strong> {org_name}</p>
<p style="color: #155724; margin: 5px 0;"><strong>Email:</strong> {email}</p>
<p style="color: #155724; margin: 5px 0;"><strong>Submitted at:</strong> {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<p style="color: #155724; margin-bottom: 0;">Your results have been submitted via OSS. LLM evaluation will complete in 5-10 minutes and the leaderboard will be updated.</p>
</div>
"""
                # Clear the progress bar and show the final result
yield "", success_info + result
except ImportError as e:
yield "", format_error(f"Submission system modules unavailable: {e}")
except Exception as e:
import traceback
traceback.print_exc()
yield "", format_error(f"An error occurred during submission: {str(e)}")
submit_button.click(
handle_submission,
inputs=[file_upload, model_textbox, org_textbox, email_textbox], # profile_state
outputs=[progress_info, submission_result]
)
# Launch the app
if __name__ == "__main__":
# Disable SSR mode for better OAuth compatibility
# Note: OAuth is handled internally via gr.LoginButton, not at launch level
demo.launch(ssr_mode=False)