Update license to MIT and clarify component licenses
This PR updates the model card to accurately reflect the project's license. The upstream GitHub repository ([https://github.com/OpenGVLab/InternVL](https://github.com/OpenGVLab/InternVL)) specifies the MIT license in its `LICENSE` file and README.
The metadata `license` tag has been changed from `apache-2.0` to `mit`. Additionally, the "License" section in the Markdown content has been updated to clarify that the overall project is under the MIT license, while still mentioning that components like Qwen3 are licensed under Apache-2.0. A direct link to the MIT license file on GitHub is also included for full transparency.
All other elements of the model card (paper link, code link, project page, sample usage, pipeline tag, library name) were found to be accurate and complete.
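
For quick reference, the core metadata change, as it appears in the diff below, is:

```diff
-license: apache-2.0
+license: mit
```

The frontmatter keys are also re-sorted, and the "License" section prose is rewritten to match; the full diff follows.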
````diff
@@ -1,24 +1,24 @@
 ---
-license: apache-2.0
-pipeline_tag: image-text-to-text
-library_name: transformers
 base_model:
-  - OpenGVLab/InternViT-300M-448px-V2_5
-  - Qwen/Qwen3-0.6B
-base_model_relation: merge
+- OpenGVLab/InternViT-300M-448px-V2_5
+- Qwen/Qwen3-0.6B
 datasets:
-  - OpenGVLab/MMPR-v1.2
-  - OpenGVLab/MMPR-Tiny
+- OpenGVLab/MMPR-v1.2
+- OpenGVLab/MMPR-Tiny
 language:
-  - multilingual
+- multilingual
+library_name: transformers
+license: mit
+pipeline_tag: image-text-to-text
 tags:
-  - internvl
-  - custom_code
+- internvl
+- custom_code
+base_model_relation: merge
 ---
 
 # InternVL3_5-1B-Pretrained
 
-[\[📂 GitHub\]](https://github.com/OpenGVLab/InternVL) [\[📜 InternVL 1.0\]](https://huggingface.co/papers/2312.14238) [\[📜 InternVL 1.5\]](https://huggingface.co/papers/2404.16821) [\[📜 InternVL 2.5\]](https://huggingface.co/papers/2412.05271) [\[📜 InternVL2.5-MPO\]](https://huggingface.co/papers/2411.10442) [\[📜 InternVL3\]](https://huggingface.co/papers/2504.10479) [\[📜 InternVL3.5\]](https://huggingface.co/papers/2508.18265)
+[\[📂 GitHub\]](https://github.com/OpenGVLab/InternVL) [\[📜 InternVL 1.0\]](https://huggingface.co/papers/2312.14238) [\[📜 InternVL 1.5\\]](https://huggingface.co/papers/2404.16821) [\[📜 InternVL 2.5\]](https://huggingface.co/papers/2412.05271) [\[📜 InternVL2.5-MPO\]](https://huggingface.co/papers/2411.10442) [\[📜 InternVL3\]](https://huggingface.co/papers/2504.10479) [\[📜 InternVL3.5\]](https://huggingface.co/papers/2508.18265)
 
 [\[🆕 Blog\]](https://internvl.github.io/blog/) [\[🗨️ Chat Demo\]](https://chat.intern-ai.org.cn/) [\[🚀 Quick Start\]](#quick-start) [\[📖 Documents\]](https://internvl.readthedocs.io/en/latest/)
 
@@ -423,7 +423,7 @@ You are an AI assistant that rigorously follows this response protocol:
 Ensure that the thinking process is thorough but remains focused on the query. The final answer should be standalone and not reference the thinking section.
 """.strip()
 
-model.system_message =
+model.system_message = R1_SYSTEM_PROMPT
 ```
 
 ### Inference with Transformers
@@ -530,40 +530,50 @@ generation_config = dict(max_new_tokens=1024, do_sample=True)
 # pure-text conversation (纯文本对话)
 question = 'Hello, who are you?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'Can you tell me a story?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # single-image single-round conversation (单图单轮对话)
-question = '<image>\nPlease describe the image shortly.'
+question = '<image>
+Please describe the image shortly.'
 response = model.chat(tokenizer, pixel_values, question, generation_config)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # single-image multi-round conversation (单图多轮对话)
-question = '<image>\nPlease describe the image in detail.'
+question = '<image>
+Please describe the image in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'Please write a poem according to the image.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
 pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 
-question = '<image>\nDescribe the two images in detail.'
+question = '<image>
+Describe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'What are the similarities and differences between these two images.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
 pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
@@ -571,17 +581,21 @@ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 
-question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
+question = 'Image-1: <image>
+Image-2: <image>
+Describe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'What are the similarities and differences between these two images.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # batch inference, single image per sample (单图批处理)
 pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
@@ -589,13 +603,15 @@ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 
-questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
+questions = ['<image>
+Describe the image in detail.'] * len(num_patches_list)
 responses = model.batch_chat(tokenizer, pixel_values,
                              num_patches_list=num_patches_list,
                              questions=questions,
                              generation_config=generation_config)
 for question, response in zip(questions, responses):
-    print(f'User: {question}\nAssistant: {response}')
+    print(f'User: {question}
+Assistant: {response}')
 
 # video multi-round conversation (视频多轮对话)
 def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
@@ -633,17 +649,24 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
 video_path = './examples/red-panda.mp4'
 pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
 pixel_values = pixel_values.to(torch.bfloat16).cuda()
-video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
+video_prefix = ''.join([f'Frame{i+1}: <image>
+' for i in range(len(num_patches_list))])
 question = video_prefix + 'What is the red panda doing?'
-# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
+# Frame1: <image>
+Frame2: <image>
+...
+Frame8: <image>
+{question}
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list, history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'Describe this video in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list, history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 ```
 
 #### Streaming Output
@@ -727,7 +750,9 @@ image_urls=[
 
 images = [load_image(img_url) for img_url in image_urls]
 # Numbering images improves multi-image conversations
-response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images', images))
+response = pipe((f'Image-1: {IMAGE_TOKEN}
+Image-2: {IMAGE_TOKEN}
+describe these two images', images))
 print(response.text)
 ```
 
@@ -816,7 +841,7 @@ print(response)
 
 ## License
 
-This project is released under the apache-2.0 license.
+This project is released under the [MIT License](https://github.com/OpenGVLab/InternVL/blob/main/LICENSE). Parts of this project, such as the pre-trained Qwen3 component, are licensed under the Apache-2.0 License.
 
 ## Citation
 
@@ -829,4 +854,4 @@ If you find this project useful in your research, please consider citing:
   journal={arXiv preprint arXiv:2508.18265},
   year={2025}
 }
-```
+```
````