{ "bomFormat": "CycloneDX", "specVersion": "1.6", "serialNumber": "urn:uuid:f23c2a8d-7038-4272-957e-ca97df335610", "version": 1, "metadata": { "timestamp": "2025-06-05T09:38:20.021414+00:00", "component": { "type": "machine-learning-model", "bom-ref": "HuggingFaceM4/idefics2-8b-7bb21b56-dde0-500e-bf25-f4c32e64aa26", "name": "HuggingFaceM4/idefics2-8b", "externalReferences": [ { "url": "https://huggingface.co/HuggingFaceM4/idefics2-8b", "type": "documentation" } ], "modelCard": { "modelParameters": { "task": "image-text-to-text", "architectureFamily": "idefics2", "modelArchitecture": "Idefics2ForConditionalGeneration", "datasets": [ { "ref": "HuggingFaceM4/OBELICS-54e0c87c-8ce6-51eb-af0d-52a7ddb63e49" }, { "ref": "laion/laion-coco-6e73f888-3348-5039-864c-d2250f312f2e" }, { "ref": "wikipedia-8c5eb686-d691-517b-aad7-040fa51febc3" }, { "ref": "facebook/pmd-71fff1f1-79e2-5837-818a-a847941edaff" }, { "ref": "pixparse/idl-wds-5ddb0bef-9a41-5fb6-acce-2d16b0053443" }, { "ref": "pixparse/pdfa-eng-wds-6e24c86a-28c2-50e7-80c1-b74796cd7222" }, { "ref": "wendlerc/RenderedText-2ea0375f-e9a2-55d0-aa1e-734edec2644e" }, { "ref": "HuggingFaceM4/the_cauldron-0b60b937-29a7-5f0c-9fa6-ec10bf894687" }, { "ref": "teknium/OpenHermes-2.5-1a7eb3be-7eaa-5577-91f6-d4ad0d639c6c" }, { "ref": "GAIR/lima-afa8f631-d0ed-59c0-a5a1-170c80a5117e" }, { "ref": "databricks/databricks-dolly-15k-1008cfb1-7624-5c5b-93cc-d856239b86ea" }, { "ref": "meta-math/MetaMathQA-c6cf810a-8b06-5552-a876-53681c5fe9a1" }, { "ref": "TIGER-Lab/MathInstruct-9d9c997d-f6c1-5029-96fd-6003c4f0ec06" }, { "ref": "microsoft/orca-math-word-problems-200k-611afa9f-b6db-5b9f-9a51-598e4ce79d0e" }, { "ref": "camel-ai/math-9f7ffeb0-2fb3-5b72-9ca4-1b461f022e61" }, { "ref": "AtlasUnified/atlas-math-sets-49141027-0c69-515d-8182-254cf889efae" }, { "ref": "tiedong/goat-38642401-79d7-541d-a92f-d3d7acdb8db8" }, { "ref": "Lin-Chen/ShareGPT4V-f6982603-3c10-5cd0-9d2d-83d573b61341" }, { "ref": "jxu124/llava_conversation_58k-7dcd95f0-d26a-5075-89f4-4afc13a5b93f" } ] }, "properties": [ { "name": "library_name", "value": "transformers" } ], "consideration": { "useCases": "`idefics2-8b-base` and `idefics2-8b` can be used to perform inference on multimodal (image + text) tasks in which the input is composed of a text query along with one (or multiple) image(s). Text and images can be arbitrarily interleaved. That includes image captioning, visual question answering, etc. These model does not support image generation.For optimal results, we recommend fine-tuning `idefics2-8b` on one's specific use-case and data. In fact, the instruction-fine-tuned model (`idefics2-8b`) is significantly better at following instructions from users and thus should be preferred when using the models out-of-the-box or as a starting point for fine-tuning.`idefics2-8b` usually generates very short answers. 
For long generations, use `idefics2-8b-chatty`, which was further fine-tuned on long conversations.As a starting point, we provide fine-tuning codes that can be adapted for one's particular scenario:- With the [TRL library](https://github.com/huggingface/trl): [Script](https://gist.github.com/edbeeching/228652fc6c2b29a1641be5a5778223cb)- With the [Hugging Face Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#api-reference%20][%20transformers.Trainer): [Tutorial notebook](https://colab.research.google.com/drive/1NtcTgRbSBKN7pYD3Vdx1j9m8pt3fhFDB?usp=sharing)" } }, "authors": [ { "name": "HuggingFaceM4" } ], "licenses": [ { "license": { "id": "Apache-2.0", "url": "https://spdx.org/licenses/Apache-2.0.html" } } ], "description": "- **Developed by:** Hugging Face- **Model type:** Multi-modal model (image+text)- **Language(s) (NLP):** en- **License:** Apache 2.0- **Parent Models:** [google/siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) and [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)- **Resources for more information:**- Description of [OBELICS](https://huggingface.co/datasets/HuggingFaceM4/OBELICS): [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527)- Paper: [What matters when building vision-language models?](https://huggingface.co/papers/2405.02246)", "tags": [ "transformers", "safetensors", "idefics2", "image-text-to-text", "multimodal", "vision", "en", "dataset:HuggingFaceM4/OBELICS", "dataset:laion/laion-coco", "dataset:wikipedia", "dataset:facebook/pmd", "dataset:pixparse/idl-wds", "dataset:pixparse/pdfa-eng-wds", "dataset:wendlerc/RenderedText", "dataset:HuggingFaceM4/the_cauldron", "dataset:teknium/OpenHermes-2.5", "dataset:GAIR/lima", "dataset:databricks/databricks-dolly-15k", "dataset:meta-math/MetaMathQA", "dataset:TIGER-Lab/MathInstruct", "dataset:microsoft/orca-math-word-problems-200k", "dataset:camel-ai/math", "dataset:AtlasUnified/atlas-math-sets", "dataset:tiedong/goat", "dataset:Lin-Chen/ShareGPT4V", "dataset:jxu124/llava_conversation_58k", "arxiv:2306.16527", "arxiv:2405.02246", "arxiv:2307.06304", "arxiv:2311.07575", "arxiv:2103.03206", "license:apache-2.0", "text-generation-inference", "endpoints_compatible", "region:us" ] } }, "components": [ { "type": "data", "bom-ref": "HuggingFaceM4/OBELICS-54e0c87c-8ce6-51eb-af0d-52a7ddb63e49", "name": "HuggingFaceM4/OBELICS", "data": [ { "type": "dataset", "bom-ref": "HuggingFaceM4/OBELICS-54e0c87c-8ce6-51eb-af0d-52a7ddb63e49", "name": "HuggingFaceM4/OBELICS", "contents": { "url": "https://huggingface.co/datasets/HuggingFaceM4/OBELICS", "properties": [ { "name": "language", "value": "en" }, { "name": "size_categories", "value": "100M