# CONSTANTS-URL
URL = "http://opencompass.openxlab.space/assets/OpenVLM_Subjective_Leaderboard.json"
VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
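
# Hedged example (not part of the original constants): a minimal sketch of how the
# leaderboard JSON at URL could be downloaded. The exact schema of the payload
# (typically a mapping of sheet/benchmark name -> list of records) is an assumption.
def _load_leaderboard_json(url=URL):
    """Fetch and parse the leaderboard JSON from `url`."""
    import json
    import urllib.request  # local imports keep this constants module dependency-free

    with urllib.request.urlopen(url, timeout=30) as resp:
        return json.loads(resp.read().decode('utf-8'))
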
# CONSTANTS-CITATION
CITATION_BUTTON_TEXT = r"""@inproceedings{duan2024vlmevalkit,
title={Vlmevalkit: An open-source toolkit for evaluating large multi-modality models},
author={Duan, Haodong and Yang, Junming and Qiao, Yuxuan and Fang, Xinyu and Chen, Lin and Liu, Yuan and Dong, Xiaoyi and Zang, Yuhang and Zhang, Pan and Wang, Jiaqi and others},
booktitle={Proceedings of the 32nd ACM International Conference on Multimedia},
pages={11198--11201},
year={2024}
}"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# CONSTANTS-TEXT
LEADERBORAD_INTRODUCTION = """# OpenVLM Subjective Leaderboard
### Welcome to the OpenVLM Subjective Leaderboard! On this leaderboard we share the evaluation results of VLMs obtained with the open-source framework:
### [*VLMEvalKit*: A Toolkit for Evaluating Large Vision-Language Models](https://github.com/open-compass/VLMEvalKit) 🏆
### Currently, the OpenVLM Subjective Leaderboard covers {} different VLMs (including GPT-4o, Gemini, Qwen2.5-VL, InternVL2.5, etc.) and {} different multi-modal benchmarks.
This leaderboard was last updated: {}.
The OpenVLM Subjective Leaderboard only includes open-source VLMs or API models that are publicly available. To add your own model to the leaderboard, please create a PR in [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to support your VLM; we will then help with the evaluation and update the leaderboard. For any questions or concerns, please feel free to contact us at [opencompass, fangxinyu, dingshengyuan]@pjlab.org.cn.
"""
# CONSTANTS-FIELDS
META_FIELDS = [
'Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Eval Date'
]
MAIN_FIELDS = [
'Creation_MMBench', 'MIA-Bench', 'MM-IFEval',
'MMAlignBench', 'MMVet', 'WildVision'
]
DEFAULT_BENCH = [
'Creation_MMBench', 'MIA-Bench', 'MM-IFEval',
'MMAlignBench', 'MMVet', 'WildVision'
]
MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
MODEL_TYPE = ['API', 'OpenSource']
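
# Hedged example (not part of the original constants): a sketch of how a model's
# 'Param (B)' value could be bucketed into the MODEL_SIZE filter options above.
# Boundary handling and the 'Unknown' fallback for non-numeric values are assumptions.
def _size_bucket(param_b):
    """Map a parameter count in billions to one of the MODEL_SIZE buckets."""
    try:
        p = float(param_b)
    except (TypeError, ValueError):
        return 'Unknown'
    if p < 4:
        return '<4B'
    if p < 10:
        return '4B-10B'
    if p < 20:
        return '10B-20B'
    if p < 40:
        return '20B-40B'
    return '>40B'
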
# The README file for each benchmark
LEADERBOARD_MD = {}
LEADERBOARD_MD['MAIN'] = f"""
## Main Evaluation Results
- Metrics:
- Avg Score: The average score on all VLM Benchmarks (normalized to 0 - 100, the higher the better).
- Avg Rank: The average rank on all VLM Benchmarks (the lower the better).
- Avg Score & Rank are calculated based on the selected benchmarks. **When results for any selected benchmark are missing, Avg Score / Rank will be None!**
- Metrics for each dataset:
- Creation-MMBench: Reward Score/Visual Factuality Score
- MMAlignBench, WildVision: Reward Score
- MIA-Bench, MM-IFEval, MMVet: Overall Score
- By default, we present the overall evaluation results based on {len(DEFAULT_BENCH)} VLM benchmarks, sorted in descending order of Avg Score.
- The following datasets are included in the main results: {', '.join(DEFAULT_BENCH)}.
- Detailed evaluation results for each dataset (whether or not it is included in the main results) are provided in the subsequent tabs.
"""
LEADERBOARD_MD['Creation_MMBench'] = """
## Creation MMBench Evaluation Results
- Creation-MMBench is a multimodal benchmark specifically designed to evaluate the creative capabilities of MLLMs. It consists of **765 test cases**, covering **51 fine-grained tasks** across **4 categories**: *Literary Writing*, *Creative Multimodal Understanding*, *Professional Functionality Writing*, and *Common Functionality Writing*. As an MLLM benchmark, it contains a total of **1001 images** encompassing more than 25 different categories, with some questions incorporating up to 9 images.
- Creation-MMBench includes carefully crafted **instance-specific criteria** for each test case, enabling assessment of both general response quality and visual-factual alignment in model-generated content. It employs a pair-wise comparison approach in which the model's output is compared with the reference answer (taking the input prompt, the visual content, and the ground-truth answer into account) to produce the assessment result. **Dual Evaluation with a GPT-4o judge model** is the evaluation strategy for Creation-MMBench.
- VFS stands for Visual Factuality Score. The rankings in this leaderboard are arranged in descending order based on the overall reward of each model, with **GPT-4o-1120** providing the **Reference Answer** for comparison, thus serving as the Baseline Model.
- View more details on the [**Creation-MMBench Official WebPage**](https://open-compass.github.io/Creation-MMBench/)
"""
LEADERBOARD_MD['MM-IFEval'] = """
## MM-IFEval Evaluation Results
- MM-IFEval is a comprehensive multimodal instruction-following benchmark designed to rigorously assess the capabilities of Multimodal Large Language Models (MLLMs). It includes 400 high-quality questions across two levels: 300 compose-level tasks that emphasize output format and content constraints, and 100 perception-level tasks that require precise visual understanding.
- To ensure accurate evaluation, MM-IFEval employs a hybrid strategy that combines rule-based verification with LLM-based judge models. See https://arxiv.org/abs/2504.07957 for more details.
- Currently, we use GPT-4o (gpt-4o-2024-05-13) whenever an LLM judge model is needed.
"""
LEADERBOARD_MD['MMAlignBench'] = """
## MMAlignBench Evaluation Results
- MM-AlignBench is designed to evaluate MLLMs' alignment with human preferences. It includes 252 high-quality, human-annotated samples with diverse image types and open-ended questions. Modeled after Arena-style benchmarks, it uses GPT-4o as the judge model and Claude-Sonnet-3 as the reference model.
- See https://github.com/PhoenixZ810/OmniAlign-V for more details.
"""
LEADERBOARD_MD['MIA-Bench'] = """
## MIA-Bench Evaluation Results
- MIA-Bench contains 400 carefully crafted image-prompt pairs that stress-test an MLLM's ability to **follow layered, exacting instructions** in its responses. ([MIA-Bench: Towards Better Instruction Following Evaluation of Multimodal LLMs](https://arxiv.org/abs/2407.01509), [project page at Apple ML Research](https://machinelearning.apple.com/research/mia-bench))
- The leaderboard reports the **overall average score**. The judge model is **GPT-4o**.
"""
LEADERBOARD_MD['MMVet'] = """
## MMVet Evaluation Results
- In the MMVet evaluation, we use GPT-4-Turbo (gpt-4-1106-preview) as the judge LLM to assign scores to the VLM outputs. We perform the evaluation only once, given the limited variance originally reported among multiple evaluation passes.
- No specific prompt template is adopted for **ALL VLMs**.
"""
LEADERBOARD_MD['WildVision'] = """
## WildVision Evaluation Results
- WildVision-Bench offers **500 real-world multimodal prompts** curated from the WildVision-Arena crowdsourcing platform to benchmark models **by human preference** in natural conversations.
- The leaderboard reports the **overall reward score**.
- The judge model is **GPT-4o**, and the reference model is **Claude-Sonnet-3**.
"""