Spaces:

Fysics-AI
/

FysicsWorld-LeaderBoard

Running

App Files Files Community

FysicsWorld-LeaderBoard / app.py

FRENKIE-CHIANG

Upload app.py with huggingface_hub

c9835d3 verified 17 days ago

raw

history blame contribute delete

10.7 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	import json
	import os
	from pathlib import Path
	from huggingface_hub import snapshot_download, HfApi

	# =========================
	# Basic Config
	# =========================

	DATASET_REPO = "Fysics-AI/FysicsWorld-Leaderborad-Result"
	HF_TOKEN = os.environ.get("HF_TOKEN")

	TRACK_TO_CSV = {
	"omni-mllm": "omni-mllm.csv",
	"image-gen": "image-gen.csv",
	"video-gen": "video-gen.csv",
	}

	# =========================
	# Download Dataset (once)
	# =========================
	LOCAL_DATA_DIR = Path(
	snapshot_download(
	repo_id=DATASET_REPO,
	repo_type="dataset",
	token=HF_TOKEN,
	)
	)

	print("📂 Dataset dir:", LOCAL_DATA_DIR)
	print("📄 Files:", [p.name for p in LOCAL_DATA_DIR.iterdir()])

	# =========================
	# Column Rename Maps (关键修复点)
	# =========================

	OMNI_MLLM_RENAME = {
	"Task1-1": "Image\nUnderstanding",
	"Task1-2": "Video\nUnderstanding",

	"Task2-1": "Speech-Driven\nImage Understanding",
	"Task2-2": "Image-Audio\nReasoning",
	"Task2-3": "Speech-Based\nImage QA",
	"Task2-4": "Speech Generation\nfrom Image",
	"Task2-5": "Audio Matching\nfrom Image",

	"Task3-1": "Speech-Driven\nVideo Understanding",
	"Task3-2": "Video-Audio\nReasoning",
	"Task3-3": "Speech-Based\nVideo QA",
	"Task3-4": "Speech Generation\nfrom Video",
	"Task3-5": "Audio Matching\nfrom Video",
	"Task3-6": "Next-Action\nPrediction",
	}

	AUDIO_RENAME = {
	"Task1-3": "Audio Reasoning"
	}

	IMAGE_GEN_RENAME = {
	"WIScore": "WIScore",
	"SC": "Semantic\nConsistency",
	"PQ": "Perceptual\nQuality",
	"OR": "Overall\nQuality",
	}

	VIDEO_GEN_RENAME = {
	"Imaging": "Imaging",
	"Aesthetic": "Aesthetic",
	"Motion": "Motion",
	"Temporal": "Temporal",
	}

	# =========================
	# Utils
	# =========================
	def format_numeric_columns(df, decimals=2):
	df = df.copy()
	numeric_cols = df.select_dtypes(include=[np.number]).columns
	for col in numeric_cols:
	df[col] = df[col].map(
	lambda x: f"{x:.{decimals}f}" if pd.notnull(x) else ""
	)
	return df


	def load_csv(filename, sort_key=None, ascending=False):
	csv_path = LOCAL_DATA_DIR / filename
	df = pd.read_csv(csv_path)

	if sort_key and sort_key in df.columns:
	df = df.sort_values(sort_key, ascending=ascending)

	df = format_numeric_columns(df, decimals=2)
	return df


	# =========================
	# Submission Logic（不动）
	# =========================
	api = HfApi()


	def parse_submission(file_bytes):
	data = json.loads(file_bytes.decode("utf-8"))

	required = ["benchmark", "track", "model", "type", "metrics"]
	for k in required:
	if k not in data:
	raise ValueError(f"Missing field: {k}")

	if data["benchmark"] != "OmniWorld":
	raise ValueError("Invalid benchmark")

	if data["track"] not in TRACK_TO_CSV:
	raise ValueError("Invalid track")

	return data


	def append_submission(data):
	csv_name = TRACK_TO_CSV[data["track"]]
	csv_path = LOCAL_DATA_DIR / csv_name

	df = pd.read_csv(csv_path)

	if data["model"] in df["Model"].values:
	raise ValueError("Model already exists in leaderboard")

	row = {
	"Model": data["model"],
	"Type": data["type"],
	}
	row.update(data["metrics"])

	df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
	df.to_csv(csv_path, index=False)

	api.upload_file(
	path_or_fileobj=str(csv_path),
	path_in_repo=csv_name,
	repo_id=DATASET_REPO,
	repo_type="dataset",
	token=HF_TOKEN,
	)



	def handle_submit(file):
	if file is None:
	return "❌ No file uploaded"

	try:
	data = parse_submission(file)
	append_submission(data)
	return "✅ Submission successful! Please refresh leaderboard."
	except Exception as e:
	return f"❌ Error: {str(e)}"


	# =========================
	# Gradio UI
	# =========================
	with gr.Blocks(
	theme=gr.themes.Soft(),
	css="""
	.container {
	max-width: 1200px;
	margin: auto;
	}
	.leaderboard-links a {
	display: inline-block;
	margin: 0 8px;
	padding: 6px 12px;
	border-radius: 20px;
	background: #f4f4f5;
	color: #111827;
	text-decoration: none;
	font-weight: 500;
	font-size: 14px;
	}
	.leaderboard-links a:hover {
	background: #e5e7eb;
	}
	.description {
	max-width: 900px;
	margin: 18px auto 30px auto;
	font-size: 16px;
	line-height: 1.7;
	color: #374151;
	text-align: center;
	}
	body, .gradio-container {
	font-family:
	-apple-system,
	BlinkMacSystemFont,
	"Segoe UI",
	Roboto,
	"Helvetica Neue",
	Arial,
	"Noto Sans",
	"Liberation Sans",
	sans-serif;
	}
	/* OmniLLM 表格：第 1 列（Model） */
	table th:nth-child(1),
	table td:nth-child(1) {
	min-width: 220px;
	max-width: 220px;
	white-space: nowrap;
	}

	/* 第 2 列（Type） */
	table th:nth-child(2),
	table td:nth-child(2) {
	min-width: 120px;
	max-width: 120px;
	}

	.overall-definition {
	max-width: 900px;
	margin: 30px auto 40px auto;
	padding: 22px 28px;
	background: #f9fafb;
	border: 1px solid #e5e7eb;
	border-radius: 14px;
	font-size: 15px;
	line-height: 1.7;
	color: #1f2937;
	}

	.overall-definition h3 {
	text-align: center;
	font-size: 22px;
	margin-bottom: 16px;
	}

	.overall-definition strong {
	color: #111827;
	}

	""") as demo:
	gr.Markdown(
	"""
	<h1 style="text-align:center; font-size:42px; margin-bottom:10px;">
	🏆 FysicsWorld Leaderboard
	</h1>

	<div class="leaderboard-links" style="text-align:center; margin-bottom:12px;">
	<a href="https://github.com/Fysics-AI/FysicsWorld" target="_blank"
	style="margin: 0 10px;">
	🏠 Project Page
	</a>
	<a href="https://arxiv.org/pdf/2512.12756" target="_blank"
	style="margin: 0 10px;">
	📖 Paper
	</a>
	<a href="https://huggingface.co/datasets/Fysics-AI/FysicsWorld" target="_blank"
	style="margin: 0 10px;">
	🤗 Dataset
	</a>
	<a href="https://www.modelscope.cn/datasets/Fysics-AI/FysicsWorld" target="_blank"
	style="margin: 0 10px;">
	👾 ModelScope
	</a>
	</div>

	<div class="description">
	We introduce <b><i>FysicsWorld</i></b>, the <b>first</b> unified full-modality benchmark
	that supports bidirectional input-output across <i>image, video, audio, and text</i>,
	enabling comprehensive any-to-any evaluation across understanding, generation, and reasoning.
	Our systematic design spans uni-modal perception tasks to fusion-dependent reasoning
	under strong cross-modal coupling, allowing us to diagnose, with unprecedented clarity,
	the limitations and emerging strengths of modern multimodal and omni-modal architectures.
	</div>
	"""
	)

	with gr.Tabs():

	# ---------- OmniLLM / MLLM ----------
	with gr.Tab("🧠 OmniLLM / MLLM"):
	gr.Markdown("Evaluation results for OmniLLM / MLLM models.")

	df_omni = load_csv("omni-mllm.csv", sort_key="Overall")
	df_omni = df_omni.rename(columns=OMNI_MLLM_RENAME)

	omni_table = gr.Dataframe(
	value=df_omni,
	interactive=False,
	wrap=True
	)

	# ---------- Image Generation ----------
	with gr.Tab("🎨 Image Generation"):
	gr.Markdown("Evaluation results for image generation models.")

	df_img = load_csv("image-gen.csv", sort_key="Overall")
	df_img = df_img.rename(columns=IMAGE_GEN_RENAME)

	image_table = gr.Dataframe(
	value=df_img,
	interactive=False,
	)

	# ---------- Video Generation ----------
	with gr.Tab("🎬 Video Generation"):
	gr.Markdown("Evaluation results for video generation models.")

	df_vid = load_csv("video-gen.csv", sort_key="Overall")
	df_vid = df_vid.rename(columns=VIDEO_GEN_RENAME)

	video_table = gr.Dataframe(
	value=df_vid,
	interactive=False,
	)

	# ---------- Audio Reasoning ----------
	with gr.Tab("🎵 Audio Reasoning"):
	gr.Markdown("Evaluation results for OmniLLMs, MLLMs, and AudioLLMs.")

	df_aud = load_csv("audio-reasoning.csv", sort_key="Task1-3")
	df_aud = df_aud.rename(columns=AUDIO_RENAME)

	audio_table = gr.Dataframe(
	value=df_aud,
	interactive=False,
	)


	# ---------- Refresh ----------
	gr.Button("🔄 Refresh All").click(
	fn=lambda: (
	load_csv("omni-mllm.csv", "Overall").rename(columns=OMNI_MLLM_RENAME),
	load_csv("image-gen.csv", "Overall").rename(columns=IMAGE_GEN_RENAME),
	load_csv("video-gen.csv", "Overall").rename(columns=VIDEO_GEN_RENAME),
	load_csv("audio-reasoning.csv", "Task1-3").rename(columns=AUDIO_RENAME),
	),
	outputs=[omni_table, image_table, video_table, audio_table],
	)

	gr.Markdown(
	r"""
	### 📊 Overall Score Definition

	To facilitate clearer and more consistent comparison across models, we introduce an Overall score for each leaderboard track.

	1. OmniLLM / MLLM
	The Overall score is computed as the arithmetic mean of all reported task-specific scores.

	2. Image Generation
	The evaluation involves metrics defined on different numerical scales. WIScore is used for image generation, while VIEScore (averaged over three dimensions) is used for image editing.
	The Overall score is defined as:

	$$
	\text{Overall}=\frac{(\text{WIScore}\times 10)+\left(\frac{\sum \text{VIEScore}}{3}\right)}{2}
	$$

	This normalization-based formulation ensures a balanced contribution from both image generation and image editing performance.

	3. Video Generation
	The Overall score is calculated as the arithmetic mean of all evaluated dimensions, including imaging quality, aesthetics, motion, and temporal consistency.
	"""
	)

	demo.launch()