Update license to MIT and clarify component licenses
This PR updates the model card to accurately reflect the project's license. The upstream GitHub repository ([https://github.com/OpenGVLab/InternVL](https://github.com/OpenGVLab/InternVL)) specifies the MIT license in its `LICENSE` file and README.
The metadata `license` tag has been changed from `apache-2.0` to `mit`. Additionally, the "License" section in the Markdown content has been updated to clarify that the overall project is under the MIT license, while still mentioning that components like Qwen3 are licensed under Apache-2.0. A direct link to the MIT license file on GitHub is also included for full transparency.
All other elements of the model card (paper link, code link, project page, sample usage, pipeline tag, library name) were found to be accurate and complete.
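
For quick reference, the core metadata change, as it appears in the diff below, is:

```diff
-license: apache-2.0
+license: mit
```

The frontmatter keys are also re-sorted, and the "License" section prose is rewritten to match; the full diff follows.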
````diff
@@ -1,24 +1,24 @@
 ---
-license: apache-2.0
-pipeline_tag: image-text-to-text
-library_name: transformers
 base_model:
-  - OpenGVLab/InternViT-300M-448px-V2_5
-  - Qwen/Qwen3-0.6B
-base_model_relation: merge
+- OpenGVLab/InternViT-300M-448px-V2_5
+- Qwen/Qwen3-0.6B
 datasets:
-  - OpenGVLab/MMPR-v1.2
-  - OpenGVLab/MMPR-Tiny
+- OpenGVLab/MMPR-v1.2
+- OpenGVLab/MMPR-Tiny
 language:
-  - multilingual
+- multilingual
+library_name: transformers
+license: mit
+pipeline_tag: image-text-to-text
 tags:
-  - internvl
-  - custom_code
+- internvl
+- custom_code
+base_model_relation: merge
 ---
 
 # InternVL3_5-1B-Pretrained
 
-[\[📂 GitHub\]](https://github.com/OpenGVLab/InternVL) [\[📜 InternVL 1.0\]](https://huggingface.co/papers/2312.14238) [\[📜 InternVL 1.5\]](https://huggingface.co/papers/2404.16821) [\[📜 InternVL 2.5\]](https://huggingface.co/papers/2412.05271) [\[📜 InternVL2.5-MPO\]](https://huggingface.co/papers/2411.10442) [\[📜 InternVL3\]](https://huggingface.co/papers/2504.10479) [\[📜 InternVL3.5\]](https://huggingface.co/papers/2508.18265)
+[\[📂 GitHub\]](https://github.com/OpenGVLab/InternVL) [\[📜 InternVL 1.0\]](https://huggingface.co/papers/2312.14238) [\[📜 InternVL 1.5\\]](https://huggingface.co/papers/2404.16821) [\[📜 InternVL 2.5\]](https://huggingface.co/papers/2412.05271) [\[📜 InternVL2.5-MPO\]](https://huggingface.co/papers/2411.10442) [\[📜 InternVL3\]](https://huggingface.co/papers/2504.10479) [\[📜 InternVL3.5\]](https://huggingface.co/papers/2508.18265)
 
 [\[🆕 Blog\]](https://internvl.github.io/blog/) [\[🗨️ Chat Demo\]](https://chat.intern-ai.org.cn/) [\[🚀 Quick Start\]](#quick-start) [\[📖 Documents\]](https://internvl.readthedocs.io/en/latest/)
 
@@ -423,7 +423,7 @@ You are an AI assistant that rigorously follows this response protocol:
 Ensure that the thinking process is thorough but remains focused on the query. The final answer should be standalone and not reference the thinking section.
 """.strip()
 
-model.system_message =
+model.system_message = R1_SYSTEM_PROMPT
 ```
 
 ### Inference with Transformers
@@ -530,40 +530,50 @@ generation_config = dict(max_new_tokens=1024, do_sample=True)
 # pure-text conversation (纯文本对话)
 question = 'Hello, who are you?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'Can you tell me a story?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # single-image single-round conversation (单图单轮对话)
-question = '<image>\nPlease describe the image shortly.'
+question = '<image>
+Please describe the image shortly.'
 response = model.chat(tokenizer, pixel_values, question, generation_config)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # single-image multi-round conversation (单图多轮对话)
-question = '<image>\nPlease describe the image in detail.'
+question = '<image>
+Please describe the image in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'Please write a poem according to the image.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
 pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 
-question = '<image>\nDescribe the two images in detail.'
+question = '<image>
+Describe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'What are the similarities and differences between these two images.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
 pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
@@ -571,17 +581,21 @@ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 
-question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
+question = 'Image-1: <image>
+Image-2: <image>
+Describe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'What are the similarities and differences between these two images.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # batch inference, single image per sample (单图批处理)
 pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
@@ -589,13 +603,15 @@ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 
-questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
+questions = ['<image>
+Describe the image in detail.'] * len(num_patches_list)
 responses = model.batch_chat(tokenizer, pixel_values,
                              num_patches_list=num_patches_list,
                              questions=questions,
                              generation_config=generation_config)
 for question, response in zip(questions, responses):
-    print(f'User: {question}\nAssistant: {response}')
+    print(f'User: {question}
+Assistant: {response}')
 
 # video multi-round conversation (视频多轮对话)
 def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
@@ -633,17 +649,24 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
 video_path = './examples/red-panda.mp4'
 pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
 pixel_values = pixel_values.to(torch.bfloat16).cuda()
-video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
+video_prefix = ''.join([f'Frame{i+1}: <image>
+' for i in range(len(num_patches_list))])
 question = video_prefix + 'What is the red panda doing?'
-# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
+# Frame1: <image>
+Frame2: <image>
+...
+Frame8: <image>
+{question}
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list, history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'Describe this video in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list, history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 ```
 
 #### Streaming Output
@@ -727,7 +750,9 @@ image_urls=[
 
 images = [load_image(img_url) for img_url in image_urls]
 # Numbering images improves multi-image conversations
-response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images', images))
+response = pipe((f'Image-1: {IMAGE_TOKEN}
+Image-2: {IMAGE_TOKEN}
+describe these two images', images))
 print(response.text)
 ```
 
@@ -816,7 +841,7 @@ print(response)
 
 ## License
 
-This project is released under the apache-2.0 license.
+This project is released under the [MIT License](https://github.com/OpenGVLab/InternVL/blob/main/LICENSE). Parts of this project, such as the pre-trained Qwen3 component, are licensed under the Apache-2.0 License.
 
 ## Citation
 
@@ -829,4 +854,4 @@ If you find this project useful in your research, please consider citing:
   journal={arXiv preprint arXiv:2508.18265},
   year={2025}
 }
-```
+```
````