|
|
|
|
|
URL = "http://opencompass.openxlab.space/assets/OpenVLM_Subjective_Leaderboard.json" |
|
|
VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md' |
|
|
|
|
|
CITATION_BUTTON_TEXT = r"""@inproceedings{duan2024vlmevalkit, |
|
|
title={Vlmevalkit: An open-source toolkit for evaluating large multi-modality models}, |
|
|
author={Duan, Haodong and Yang, Junming and Qiao, Yuxuan and Fang, Xinyu and Chen, Lin and Liu, Yuan and Dong, Xiaoyi and Zang, Yuhang and Zhang, Pan and Wang, Jiaqi and others}, |
|
|
booktitle={Proceedings of the 32nd ACM International Conference on Multimedia}, |
|
|
pages={11198--11201}, |
|
|
year={2024} |
|
|
}""" |
|
|
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" |
|
|
|
|
|
LEADERBORAD_INTRODUCTION = """# OpenVLM Subjective Leaderboard |
|
|
### Welcome to the OpenVLM Subjective Leaderboard! On this leaderboard, we share the evaluation results of VLMs obtained with the open-source framework:
|
|
### [*VLMEvalKit*: A Toolkit for Evaluating Large Vision-Language Models](https://github.com/open-compass/VLMEvalKit) 🏆 |
|
|
### Currently, the OpenVLM Subjective Leaderboard covers {} different VLMs (including GPT-4o, Gemini, Qwen2.5-VL, InternVL2.5, etc.) and {} different multi-modal benchmarks.
|
|
|
|
|
This leaderboard was last updated: {}. |
|
|
|
|
|
The OpenVLM Subjective Leaderboard only includes open-source VLMs and API models that are publicly available. To add your own model to the leaderboard, please create a PR in [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to support your VLM; we will then help with the evaluation and update the leaderboard. For any questions or concerns, please feel free to contact us at [opencompass, fangxinyu, dingshengyuan]@pjlab.org.cn.
|
|
""" |
|
|
|
|
|
META_FIELDS = [ |
|
|
'Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Eval Date' |
|
|
] |
|
|
MAIN_FIELDS = [ |
|
|
'Creation_MMBench', 'MIA-Bench', 'MM-IFEval', |
|
|
'MMAlignBench', 'MMVet', 'WildVision' |
|
|
] |
|
|
DEFAULT_BENCH = [ |
|
|
'Creation_MMBench', 'MIA-Bench', 'MM-IFEval', |
|
|
'MMAlignBench', 'MMVet', 'WildVision' |
|
|
] |
|
|
MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown'] |
|
|
MODEL_TYPE = ['API', 'OpenSource'] |
|
|
|
|
|
|
|
|
LEADERBOARD_MD = {} |
|
|
|
|
|
LEADERBOARD_MD['MAIN'] = f""" |
|
|
## Main Evaluation Results |
|
|
|
|
|
- Metrics: |
|
|
- Avg Score: The average score on all VLM Benchmarks (normalized to 0 - 100, the higher the better). |
|
|
- Avg Rank: The average rank on all VLM Benchmarks (the lower the better). |
|
|
- Avg Score & Avg Rank are calculated over the selected benchmarks. **If results for any selected benchmark are missing, Avg Score / Avg Rank will be None!**
|
|
- Metrics for each dataset:
|
|
- Creation-MMBench: Reward Score/Visual Factuality Score |
|
|
- MMAlignBench, WildVision: Reward Score |
|
|
- MIA-Bench, MM-IFEval, MMVet: Overall Score |
|
|
- By default, we present the overall evaluation results based on {len(DEFAULT_BENCH)} VLM benchmarks, sorted in descending order of Avg Score.
|
|
- The following datasets are included in the main results: {', '.join(DEFAULT_BENCH)}. |
|
|
- Detailed evaluation results for each dataset (whether or not it is included in the main results) are provided in the subsequent tabs.
|
|
""" |
|
|
|
|
|
LEADERBOARD_MD['Creation_MMBench'] = """ |
|
|
## Creation MMBench Evaluation Results |
|
|
|
|
|
- Creation-MMBench is a multimodal benchmark specifically designed to evaluate the creative capabilities of MLLMs. It consists of **765 test cases**, covering **51 fine-grained tasks** across **4 categories**: *Literary Writing*, *Creative Multimodal Understanding*, *Professional Functionality Writing*, and *Common Functionality Writing*. As an MLLM benchmark, it contains a total of **1001 images** encompassing more than 25 different categories, with some questions incorporating up to 9 images. |
|
|
- Creation-MMBench includes carefully crafted **instance-specific criteria** for each test case, enabling assessment of both general response quality and visual-factual alignment in model-generated content. It employs a pair-wise comparison approach, in which the model's output is compared with the reference answer (considering the true answer, input prompt, and visual content) to obtain the assessment result. Creation-MMBench's evaluation strategy is **dual evaluation with GPT-4o as the judge model**.
|
|
- VFS stands for Visual Factuality Score. The rankings on this leaderboard are arranged in descending order of each model's overall reward, with **GPT-4o-1120** providing the **Reference Answer** for comparison and thus serving as the Baseline Model.
|
|
- See more details on the [**Creation-MMBench official webpage**](https://open-compass.github.io/Creation-MMBench/).
|
|
""" |
|
|
|
|
|
LEADERBOARD_MD['MM-IFEval'] = """ |
|
|
## MM-IFEval Evaluation Results |
|
|
|
|
|
- MM-IFEval is a comprehensive multimodal instruction-following benchmark designed to rigorously assess the capabilities of Multimodal Large Language Models (MLLMs). It includes 400 high-quality questions across two levels: 300 compose-level tasks that emphasize output format and content constraints, and 100 perception-level tasks that require precise visual understanding. |
|
|
- To ensure accurate evaluation, MM-IFEval employs a hybrid strategy that combines rule-based verification with LLM-based judge models. For more details, see https://arxiv.org/abs/2504.07957.
|
|
- Currently, we use GPT-4o (gpt-4o-2024-05-13) when an LLM judge model is needed.
|
|
""" |
|
|
|
|
|
LEADERBOARD_MD['MMAlignBench'] = """ |
|
|
## MMAlignBench Evaluation Results |
|
|
|
|
|
- MM-AlignBench targets the evaluation of MLLMs' alignment with human preferences. It includes 252 high-quality, human-annotated samples with diverse image types and open-ended questions. Modeled after Arena-style benchmarks, it uses GPT-4o as the judge model and Claude-Sonnet-3 as the reference model.
|
|
- For more details, see https://github.com/PhoenixZ810/OmniAlign-V.
|
|
""" |
|
|
|
|
|
LEADERBOARD_MD['MIA-Bench'] = """ |
|
|
## MIA-Bench Evaluation Results |
|
|
|
|
|
- MIA-Bench contains 400 carefully crafted image-prompt pairs that stress-test an MLLM’s ability to **follow layered, exacting instructions** in its responses. ([MIA-Bench: Towards Better Instruction Following Evaluation of Multimodal LLMs](https://arxiv.org/abs/2407.01509), [project page](https://machinelearning.apple.com/research/mia-bench))
|
|
- The leaderboard reports the **overall average score**. The judge model is **GPT-4o**.
|
|
""" |
|
|
|
|
|
LEADERBOARD_MD['MMVet'] = """ |
|
|
## MMVet Evaluation Results |
|
|
|
|
|
- In the MMVet evaluation, we use GPT-4-Turbo (gpt-4-1106-preview) as the judge LLM to assign scores to the VLM outputs. We only perform the evaluation once, due to the limited variance among the results of the multiple evaluation passes originally reported.
|
|
- No specific prompt template is adopted for **any VLM**.
|
|
""" |
|
|
|
|
|
LEADERBOARD_MD['WildVision'] = """ |
|
|
## WildVision Evaluation Results |
|
|
|
|
|
- WildVision-Bench offers **500 real-world multimodal prompts** curated from the WildVision-Arena crowdsourcing platform to benchmark models **by human preference** in natural conversations. |
|
|
- The leaderboard reports the **overall reward score**.
|
|
- The judge model is **GPT-4o**, and the reference model is **Claude-Sonnet-3**.
|
|
""" |