|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import json |
|
|
import os |
|
|
from pathlib import Path |
|
|
from huggingface_hub import snapshot_download, HfApi |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face dataset repo that stores the leaderboard result CSVs.
# NOTE(review): "Leaderborad" is misspelled, but this must match the actual
# repo id on the Hub — do not "fix" without renaming the repo.
DATASET_REPO = "Fysics-AI/FysicsWorld-Leaderborad-Result"

# Token with write access to DATASET_REPO; None when the env var is unset
# (reads of a public dataset still work, uploads will fail).
HF_TOKEN = os.environ.get("HF_TOKEN")

# Maps a submission's "track" field to the CSV file holding that track's rows.
# NOTE(review): the audio-reasoning track is rendered in the UI below but has
# no entry here, so it cannot receive submissions — confirm this is intended.
TRACK_TO_CSV = {
    "omni-mllm": "omni-mllm.csv",
    "image-gen": "image-gen.csv",
    "video-gen": "video-gen.csv",
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Download (or reuse the locally cached copy of) the results dataset at
# startup; every CSV read/write below operates on this snapshot directory.
LOCAL_DATA_DIR = Path(
    snapshot_download(
        repo_id=DATASET_REPO,
        repo_type="dataset",
        token=HF_TOKEN,
    )
)

# Startup diagnostics: where the snapshot landed and which files it contains.
print("๐ Dataset dir:", LOCAL_DATA_DIR)
print("๐ Files:", [p.name for p in LOCAL_DATA_DIR.iterdir()])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Column-rename maps: raw CSV column names -> human-readable table headers.
# The embedded "\n" makes the Gradio Dataframe wrap a header over two lines.

# Omni-MLLM track. Task1-* = uni-modal understanding, Task2-* = image+audio
# tasks, Task3-* = video+audio tasks (inferred from the display names —
# confirm against the benchmark definition).
OMNI_MLLM_RENAME = {
    "Task1-1": "Image\nUnderstanding",
    "Task1-2": "Video\nUnderstanding",

    "Task2-1": "Speech-Driven\nImage Understanding",
    "Task2-2": "Image-Audio\nReasoning",
    "Task2-3": "Speech-Based\nImage QA",
    "Task2-4": "Speech Generation\nfrom Image",
    "Task2-5": "Audio Matching\nfrom Image",

    "Task3-1": "Speech-Driven\nVideo Understanding",
    "Task3-2": "Video-Audio\nReasoning",
    "Task3-3": "Speech-Based\nVideo QA",
    "Task3-4": "Speech Generation\nfrom Video",
    "Task3-5": "Audio Matching\nfrom Video",
    "Task3-6": "Next-Action\nPrediction",
}

# Audio-reasoning track exposes a single task column.
AUDIO_RENAME = {
    "Task1-3": "Audio Reasoning"
}

# Image-generation track: WIScore plus the three VIEScore dimensions.
IMAGE_GEN_RENAME = {
    "WIScore": "WIScore",
    "SC": "Semantic\nConsistency",
    "PQ": "Perceptual\nQuality",
    "OR": "Overall\nQuality",
}

# Video-generation track: identity renames kept so all tracks share one
# rename-based display pipeline.
VIDEO_GEN_RENAME = {
    "Imaging": "Imaging",
    "Aesthetic": "Aesthetic",
    "Motion": "Motion",
    "Temporal": "Temporal",
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def format_numeric_columns(df, decimals=2):
    """Return a copy of *df* whose numeric columns are fixed-point strings.

    Every numeric cell is rendered with *decimals* digits after the point;
    missing values (NaN/None) become the empty string. Non-numeric columns
    and the input frame itself are left untouched.
    """
    out = df.copy()
    # Build the format template once instead of re-deriving it per cell.
    template = f"{{:.{decimals}f}}"
    for name in out.select_dtypes(include=[np.number]).columns:
        out[name] = out[name].map(
            lambda value: template.format(value) if pd.notnull(value) else ""
        )
    return out
|
|
|
|
|
|
|
|
def load_csv(filename, sort_key=None, ascending=False):
    """Load a leaderboard CSV from the local snapshot for display.

    The frame is optionally sorted by *sort_key* (silently skipped when the
    column is absent or *sort_key* is falsy) and then run through
    ``format_numeric_columns`` so every number shows two decimals.
    """
    frame = pd.read_csv(LOCAL_DATA_DIR / filename)

    sortable = bool(sort_key) and sort_key in frame.columns
    if sortable:
        frame = frame.sort_values(by=sort_key, ascending=ascending)

    return format_numeric_columns(frame, decimals=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Hub client used to push updated CSVs back to DATASET_REPO after a submission.
api = HfApi()
|
|
|
|
|
|
|
|
def parse_submission(file_bytes):
    """Decode and validate an uploaded JSON submission.

    Parameters
    ----------
    file_bytes : bytes
        Raw UTF-8 encoded JSON payload from the upload widget.

    Returns
    -------
    dict
        The decoded submission with all required fields present.

    Raises
    ------
    ValueError
        If a required field is missing or holds an unexpected value.
    UnicodeDecodeError, json.JSONDecodeError
        If the payload is not valid UTF-8 JSON (surfaced to the user by
        ``handle_submit``'s broad except).
    """
    data = json.loads(file_bytes.decode("utf-8"))

    required = ["benchmark", "track", "model", "type", "metrics"]
    for k in required:
        if k not in data:
            raise ValueError(f"Missing field: {k}")

    # The app is branded "FysicsWorld" everywhere else; accept it alongside
    # the legacy "OmniWorld" name so older submission files keep working.
    if data["benchmark"] not in ("OmniWorld", "FysicsWorld"):
        raise ValueError("Invalid benchmark")

    if data["track"] not in TRACK_TO_CSV:
        raise ValueError("Invalid track")

    # append_submission splices "metrics" straight into a CSV row, so it must
    # be a mapping of column name -> score; fail early with a clear message.
    if not isinstance(data["metrics"], dict):
        raise ValueError("Invalid metrics: expected an object of column -> score")

    return data
|
|
|
|
|
|
|
|
def append_submission(data):
    """Append a validated submission to its track's CSV and push it to the Hub.

    Raises ``ValueError`` when the model name is already on the leaderboard.
    Side effects: rewrites the CSV inside the local snapshot directory and
    uploads the file back to ``DATASET_REPO``.
    """
    csv_name = TRACK_TO_CSV[data["track"]]
    csv_path = LOCAL_DATA_DIR / csv_name

    board = pd.read_csv(csv_path)

    # One row per model: reject duplicates instead of overwriting.
    if data["model"] in board["Model"].values:
        raise ValueError("Model already exists in leaderboard")

    # Metric keys may legitimately repeat CSV columns; later keys win, which
    # matches dict.update semantics.
    new_row = {"Model": data["model"], "Type": data["type"], **data["metrics"]}

    board = pd.concat([board, pd.DataFrame([new_row])], ignore_index=True)
    board.to_csv(csv_path, index=False)

    # Persist the change to the dataset repo so it survives Space restarts.
    api.upload_file(
        path_or_fileobj=str(csv_path),
        path_in_repo=csv_name,
        repo_id=DATASET_REPO,
        repo_type="dataset",
        token=HF_TOKEN,
    )
|
|
|
|
|
|
|
|
|
|
|
def handle_submit(file):
    """Gradio callback for the submission upload button.

    Parameters
    ----------
    file : bytes | None
        Raw uploaded file contents, or None when nothing was uploaded.

    Returns
    -------
    str
        A user-facing status message (success or error).
    """
    if file is None:
        return "❌ No file uploaded"

    try:
        data = parse_submission(file)
        append_submission(data)
        # Original source had this literal broken across two physical lines
        # (a syntax error) with mojibake glyphs; restored as one line.
        return "✅ Submission successful! Please refresh leaderboard."
    except Exception as e:
        # Surface validation/upload failures to the UI instead of crashing.
        return f"❌ Error: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: header, one tab per leaderboard track, a refresh button, and the
# "Overall" score documentation. CSVs are loaded once at build time; the
# refresh button re-reads them from the local snapshot (NOTE(review): it does
# not re-run snapshot_download, so remote-only changes need a restart).
# ---------------------------------------------------------------------------
with gr.Blocks(
    theme=gr.themes.Soft(),
    # CSS: link pills, centered description, fixed widths for the first two
    # table columns (Model/Type), and the score-definition card.
    css="""
.container {
    max-width: 1200px;
    margin: auto;
}
.leaderboard-links a {
    display: inline-block;
    margin: 0 8px;
    padding: 6px 12px;
    border-radius: 20px;
    background: #f4f4f5;
    color: #111827;
    text-decoration: none;
    font-weight: 500;
    font-size: 14px;
}
.leaderboard-links a:hover {
    background: #e5e7eb;
}
.description {
    max-width: 900px;
    margin: 18px auto 30px auto;
    font-size: 16px;
    line-height: 1.7;
    color: #374151;
    text-align: center;
}
body, .gradio-container {
    font-family:
        -apple-system,
        BlinkMacSystemFont,
        "Segoe UI",
        Roboto,
        "Helvetica Neue",
        Arial,
        "Noto Sans",
        "Liberation Sans",
        sans-serif;
}
/* OmniLLM ่กจๆ ผ๏ผ็ฌฌ 1 ๅ๏ผModel๏ผ */
table th:nth-child(1),
table td:nth-child(1) {
    min-width: 220px;
    max-width: 220px;
    white-space: nowrap;
}

/* ็ฌฌ 2 ๅ๏ผType๏ผ */
table th:nth-child(2),
table td:nth-child(2) {
    min-width: 120px;
    max-width: 120px;
}

.overall-definition {
    max-width: 900px;
    margin: 30px auto 40px auto;
    padding: 22px 28px;
    background: #f9fafb;
    border: 1px solid #e5e7eb;
    border-radius: 14px;
    font-size: 15px;
    line-height: 1.7;
    color: #1f2937;
}

.overall-definition h3 {
    text-align: center;
    font-size: 22px;
    margin-bottom: 16px;
}

.overall-definition strong {
    color: #111827;
}
""") as demo:
    # Page header: title, external links, and a short benchmark description.
    gr.Markdown(
        """
<h1 style="text-align:center; font-size:42px; margin-bottom:10px;">
๐ FysicsWorld Leaderboard
</h1>

<div class="leaderboard-links" style="text-align:center; margin-bottom:12px;">
<a href="https://github.com/Fysics-AI/FysicsWorld" target="_blank"
   style="margin: 0 10px;">
๐ Project Page
</a>
<a href="https://arxiv.org/pdf/2512.12756" target="_blank"
   style="margin: 0 10px;">
๐ Paper
</a>
<a href="https://huggingface.co/datasets/Fysics-AI/FysicsWorld" target="_blank"
   style="margin: 0 10px;">
๐ค Dataset
</a>
<a href="https://www.modelscope.cn/datasets/Fysics-AI/FysicsWorld" target="_blank"
   style="margin: 0 10px;">
๐พ ModelScope
</a>
</div>

<div class="description">
We introduce <b><i>FysicsWorld</i></b>, the <b>first</b> unified full-modality benchmark
that supports bidirectional input-output across <i>image, video, audio, and text</i>,
enabling comprehensive any-to-any evaluation across understanding, generation, and reasoning.
Our systematic design spans uni-modal perception tasks to fusion-dependent reasoning
under strong cross-modal coupling, allowing us to diagnose, with unprecedented clarity,
the limitations and emerging strengths of modern multimodal and omni-modal architectures.
</div>
"""
    )

    with gr.Tabs():

        # Track 1: omni-modal / multimodal LLMs, sorted by "Overall" (desc).
        with gr.Tab("๐ง OmniLLM / MLLM"):
            gr.Markdown("Evaluation results for OmniLLM / MLLM models.")

            df_omni = load_csv("omni-mllm.csv", sort_key="Overall")
            df_omni = df_omni.rename(columns=OMNI_MLLM_RENAME)

            # wrap=True so the "\n" in renamed headers breaks across lines.
            omni_table = gr.Dataframe(
                value=df_omni,
                interactive=False,
                wrap=True
            )

        # Track 2: image generation, sorted by "Overall" (desc).
        with gr.Tab("๐จ Image Generation"):
            gr.Markdown("Evaluation results for image generation models.")

            df_img = load_csv("image-gen.csv", sort_key="Overall")
            df_img = df_img.rename(columns=IMAGE_GEN_RENAME)

            image_table = gr.Dataframe(
                value=df_img,
                interactive=False,
            )

        # Track 3: video generation, sorted by "Overall" (desc).
        with gr.Tab("๐ฌ Video Generation"):
            gr.Markdown("Evaluation results for video generation models.")

            df_vid = load_csv("video-gen.csv", sort_key="Overall")
            df_vid = df_vid.rename(columns=VIDEO_GEN_RENAME)

            video_table = gr.Dataframe(
                value=df_vid,
                interactive=False,
            )

        # Track 4: audio reasoning, sorted by its single task column.
        # NOTE(review): this track has no TRACK_TO_CSV entry, so it is
        # display-only and cannot receive submissions.
        with gr.Tab("๐ต Audio Reasoning"):
            gr.Markdown("Evaluation results for OmniLLMs, MLLMs, and AudioLLMs.")

            df_aud = load_csv("audio-reasoning.csv", sort_key="Task1-3")
            df_aud = df_aud.rename(columns=AUDIO_RENAME)

            audio_table = gr.Dataframe(
                value=df_aud,
                interactive=False,
            )

    # Re-read all four CSVs from the local snapshot and repopulate the tables.
    gr.Button("๐ Refresh All").click(
        fn=lambda: (
            load_csv("omni-mllm.csv", "Overall").rename(columns=OMNI_MLLM_RENAME),
            load_csv("image-gen.csv", "Overall").rename(columns=IMAGE_GEN_RENAME),
            load_csv("video-gen.csv", "Overall").rename(columns=VIDEO_GEN_RENAME),
            load_csv("audio-reasoning.csv", "Task1-3").rename(columns=AUDIO_RENAME),
        ),
        outputs=[omni_table, image_table, video_table, audio_table],
    )

    # Footer: how the per-track "Overall" score is defined (raw string so the
    # LaTeX backslashes survive).
    gr.Markdown(
        r"""
### ๐ Overall Score Definition

To facilitate clearer and more consistent comparison across models, we introduce an **Overall** score for each leaderboard track.

**1. OmniLLM / MLLM**
The **Overall** score is computed as the arithmetic mean of all reported task-specific scores.

**2. Image Generation**
The evaluation involves metrics defined on different numerical scales. **WIScore** is used for image generation, while **VIEScore** (averaged over three dimensions) is used for image editing.
The **Overall** score is defined as:

$$
\text{Overall}=\frac{(\text{WIScore}\times 10)+\left(\frac{\sum \text{VIEScore}}{3}\right)}{2}
$$

This normalization-based formulation ensures a balanced contribution from both image generation and image editing performance.

**3. Video Generation**
The **Overall** score is calculated as the arithmetic mean of all evaluated dimensions, including imaging quality, aesthetics, motion, and temporal consistency.
"""
    )

# Start the Gradio server (blocking call).
demo.launch()
|
|
|
|
|
|