leoye committed
Commit fd01e7c · 0 parents

Initial commit

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete set of changes.
.gitattributes ADDED
@@ -0,0 +1,39 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ asset/omni_benchmarks.png filter=lfs diff=lfs merge=lfs -text
37
+ asset/omni_benchmarks2.png filter=lfs diff=lfs merge=lfs -text
38
+ llm/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ asset/performance.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,95 @@
1
+ # <span style="background: linear-gradient(45deg, #667eea 0%, #764ba2 25%, #f093fb 50%, #f5576c 75%, #4facfe 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; font-weight: bold; font-size: 1.1em;">**OmniVinci: Enhancing Architecture and Data for Omni-Modal Understanding LLM**</span> <br />
2
+
3
+ [![Paper](https://img.shields.io/badge/ArXiv-Paper-brown)](https://arxiv.org/)
4
+ [![Code](https://img.shields.io/badge/GitHub-Link-blue)](https://github.com/NVlabs)
5
+ [![Model](https://img.shields.io/badge/HuggingFace-Model-yellow)](https://huggingface.co/nvidia/omnivinci)
6
+
7
+
8
+ ## Introduction
9
+ OmniVinci is an NVIDIA research project focused on exploring omni-modal LLMs that can not only see and read but also listen, speak, and reason.
10
+
11
+ OmniVinci ranks among the best omni-modality understanding models. Check out its performance on some of the most popular omni-modality, audio, and vision benchmarks:
12
+ <p align="center">
13
+ <img src="./asset/performance.png" width="80%"/>
14
+ </p>
15
+
16
+
17
+ ## Quickstart
18
+
19
+ Below, we provide simple examples to show how to use our model with Transformers.
20
+
21
+ ### Environment Setup
22
+
23
+ 1. Download the Hugging Face repository and navigate into it:
24
+ ```bash
25
+ huggingface-cli download nvidia/omnivinci --local-dir ./omnivinci --local-dir-use-symlinks False
26
+ cd ./omnivinci
27
+ ```
28
+
29
+ 2. Set up the Python environment (based on the NVILA codebase):
30
+ ```bash
31
+ bash ./environment_setup.sh omnivinci
32
+ ```
33
+
34
+ ### 🤗 Transformers Usage
35
+
36
+ #### Video (with Audio) Inference Example
37
+ ```python
38
+ from transformers import AutoProcessor, AutoModel, AutoConfig, AutoModelForCausalLM
39
+ import torch
40
+ import os
41
+
42
+ # default: Load the model on the available device(s)
43
+ model_path = "./"
44
+ video_path = "xxx.mp4"
45
+ generation_kwargs = {"max_new_tokens": 1024, "max_length": 99999999}
46
+ load_audio_in_video = True
47
+ num_video_frames = 128
48
+ audio_length = "max_3600"
49
+
50
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
51
+
52
+ model = AutoModel.from_pretrained(model_path,
53
+ trust_remote_code=True,
54
+ torch_dtype=torch.float16,
55
+ device_map="auto")
56
+
57
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
58
+ generation_config = model.default_generation_config
59
+ generation_config.update(**generation_kwargs)
60
+
61
+ model.config.load_audio_in_video = load_audio_in_video
62
+ processor.config.load_audio_in_video = load_audio_in_video
63
+ if num_video_frames > 0:
64
+ model.config.num_video_frames = num_video_frames
65
+ processor.config.num_video_frames = num_video_frames
66
+ if audio_length != -1:
67
+ model.config.audio_chunk_length = audio_length
68
+ processor.config.audio_chunk_length = audio_length
69
+
70
+
71
+ conversation = [{
72
+ "role": "user",
73
+ "content": [
74
+ {"type": "video", "video":video_path},
75
+ {"type": "text", "text": "Assess the video, followed by a detailed description of its video and audio contents."}
76
+ ]
77
+ }]
78
+ text = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
79
+
80
+ inputs = processor([text])
81
+
82
+ output_ids = model.generate(
83
+ input_ids=inputs.input_ids,
84
+ media=getattr(inputs, 'media', None),
85
+ media_config=getattr(inputs, 'media_config', None),
86
+ generation_config=generation_config,
87
+ )
88
+ print(processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True))
89
+ ```
90
+
91
+ - **For audio and image inference examples, please refer to `example_mini_audio.py` and `example_mini_image.py`.**
92
+
93
+
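The two helper scripts are not included in this 50-file view. As a rough, unofficial sketch, an audio-only call should follow the same pattern as the video example above, replacing the video entry with an `{"type": "audio", "audio": ...}` content item (one of the content types handled in `auto_processor.py`). The audio path and generation settings below are placeholder assumptions, not the contents of `example_mini_audio.py`.

```python
# Unofficial sketch of audio-only inference; mirrors the video example above.
# "xxx.wav" and max_new_tokens are placeholder assumptions.
from transformers import AutoModel, AutoProcessor
import torch

model_path = "./"
audio_path = "xxx.wav"  # placeholder audio file

model = AutoModel.from_pretrained(model_path,
                                  trust_remote_code=True,
                                  torch_dtype=torch.float16,
                                  device_map="auto")
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

generation_config = model.default_generation_config
generation_config.update(max_new_tokens=256)

conversation = [{
    "role": "user",
    "content": [
        {"type": "audio", "audio": audio_path},
        {"type": "text", "text": "Describe the audio content in detail."},
    ],
}]
text = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
inputs = processor([text])

output_ids = model.generate(
    input_ids=inputs.input_ids,
    media=getattr(inputs, "media", None),
    media_config=getattr(inputs, "media_config", None),
    generation_config=generation_config,
)
print(processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True))
```

As in the video example, options such as `model.config.audio_chunk_length` can be adjusted before calling `generate`.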
94
+ ## License / Terms of Use
95
+ The model is released under the [NVIDIA OneWay Noncommercial License](asset/NVIDIA_OneWay_Noncommercial_License.docx).
asset/NVIDIA_OneWay_Noncommercial_License.docx ADDED
Binary file (20.6 kB).
 
asset/omni_benchmarks.png ADDED

Git LFS Details

  • SHA256: 582f1f1c454a3c775162ed469ceed6a76aeca1f3e4d57e5c7710ae0eb1310dfa
  • Pointer size: 131 Bytes
  • Size of remote file: 684 kB
asset/omni_benchmarks2.png ADDED

Git LFS Details

  • SHA256: a7d759ea879119e1b894dd16d353eb24409d9d1c7d206a3b79eee3e093cce28d
  • Pointer size: 131 Bytes
  • Size of remote file: 151 kB
asset/performance.png ADDED

Git LFS Details

  • SHA256: 33be284f6fcff5627ebb9e3597944ac5fca9f8003a78a465ece7be7eefa3d0c1
  • Pointer size: 131 Bytes
  • Size of remote file: 233 kB
audio_encoder.py ADDED
@@ -0,0 +1,70 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+
21
+
22
+ class AudioTower(nn.Module):
23
+ def __init__(self, audio_tower, args, delay_load=False):
24
+ super().__init__()
25
+
26
+ self.is_loaded = False
27
+
28
+ self.audio_tower_name = audio_tower
29
+ self.cfg_only = None
30
+
31
+ def forward(self, sounds):
32
+ if type(sounds) is list:
33
+ sound_features = []
34
+ audio_output_lengths = []
35
+ for sound in sounds:
36
+ if hasattr(sound, "input_features"):
37
+ sound = sound["input_features"]
38
+ sound_feature = self.audio_tower(sound)
39
+ sound_feature = sound_feature.last_hidden_state
40
+ sound_feature = sound_feature.to(sound.dtype)
41
+ sound_features.append(sound_feature)
42
+ audio_output_lengths.append(sound_feature.shape[1])
43
+ sound_features = torch.cat(sound_features, dim=1).squeeze(0)
44
+ else:
45
+ raise NotImplementedError("Not implemented for this encoder")
46
+
47
+ return sound_features, audio_output_lengths
48
+
49
+ @property
50
+ def dummy_feature(self):
51
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
52
+
53
+ @property
54
+ def dtype(self):
55
+ return self.audio_tower.dtype
56
+
57
+ @property
58
+ def config(self):
59
+ if self.is_loaded:
60
+ return self.audio_tower.config
61
+ else:
62
+ return self.cfg_only
63
+
64
+ @property
65
+ def device(self):
66
+ return self.audio_tower.device
67
+
68
+ @property
69
+ def hidden_size(self):
70
+ return self.config.hidden_size
auto_processor.py ADDED
@@ -0,0 +1,476 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import base64
+ import copy
17
+ import os
18
+ import os.path as osp
19
+ import warnings
20
+ from collections import defaultdict
21
+ from io import BytesIO
22
+ from typing import List, Optional, Union
23
+
24
+ import PIL.Image
25
+ import requests
26
+ import torch
27
+ from transformers import AutoConfig, AutoImageProcessor, AutoModel, AutoProcessor, AutoTokenizer
28
+ from transformers.feature_extraction_utils import BatchFeature
29
+ from transformers.image_utils import ImageInput
30
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
31
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
32
+ from transformers.utils import logging
33
+
34
+ from .constants import DEFAULT_IMAGE_TOKEN, MEDIA_TOKENS
35
+ from .media import Image, Video, extract_media, Sound
36
+ from .mm_utils import process_image, process_images
37
+ from .tokenizer_utils import tokenize_conversation
38
+
39
+
40
+ def to_rgb(pil_image: PIL.Image.Image) -> PIL.Image.Image:
41
+ """Convert PIL image to RGB format."""
42
+ if pil_image.mode == "RGBA":
43
+ white_background = PIL.Image.new("RGB", pil_image.size, (255, 255, 255))
44
+ white_background.paste(pil_image, mask=pil_image.split()[3]) # Use alpha channel as mask
45
+ return white_background
46
+ else:
47
+ return pil_image.convert("RGB")
48
+
49
+
50
+ def fetch_image(ele: dict[str, str | PIL.Image.Image], size_factor=None) -> PIL.Image.Image:
51
+ """Fetch and load image from various sources (local path, URL, base64, PIL.Image)."""
52
+ if "image" in ele:
53
+ image = ele["image"]
54
+ else:
55
+ image = ele["image_url"]
56
+ image_obj = None
57
+ if isinstance(image, PIL.Image.Image):
58
+ image_obj = image
59
+ elif image.startswith("http://") or image.startswith("https://"):
60
+ response = requests.get(image, stream=True)
61
+ image_obj = PIL.Image.open(BytesIO(response.content))
62
+ elif image.startswith("file://"):
63
+ image_obj = PIL.Image.open(image[7:])
64
+ elif image.startswith("data:image"):
65
+ if "base64," in image:
66
+ _, base64_data = image.split("base64,", 1)
67
+ data = base64.b64decode(base64_data)
68
+ image_obj = PIL.Image.open(BytesIO(data))
69
+ else:
70
+ image_obj = PIL.Image.open(image)
71
+ if image_obj is None:
72
+ raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
73
+ image = to_rgb(image_obj)
74
+
75
+ return image
76
+
77
+
78
+ def fetch_image_url_or_fpath(url_or_fpath):
79
+ """Fetch image from URL or local file path, returns local file path."""
80
+ if url_or_fpath.startswith("http") or url_or_fpath.startswith("https"):
81
+ import tempfile
82
+
83
+ import requests
84
+
85
+ # Download the image to a temporary file
86
+ temp_dir = tempfile.mkdtemp()
87
+ temp_file = os.path.join(temp_dir, os.path.basename(url_or_fpath))
88
+
89
+ response = requests.get(url_or_fpath, stream=True)
90
+ response.raise_for_status()
91
+
92
+ with open(temp_file, "wb") as f:
93
+ for chunk in response.iter_content(chunk_size=8192):
94
+ f.write(chunk)
95
+
96
+ return temp_file
97
+ elif url_or_fpath.startswith("file://"):
98
+ fpath = url_or_fpath.replace("file://", "")
99
+ assert osp.exists(fpath), f"File {fpath} does not exist"
100
+ return fpath
101
+ elif osp.exists(url_or_fpath):
102
+ assert osp.isfile(url_or_fpath), f"File {url_or_fpath} does not exist"
103
+ return url_or_fpath
104
+ else:
105
+ raise ValueError(f"Unsupported image path: {url_or_fpath}")
106
+
107
+
108
+ def pad_fn(input_ids_list: List[torch.Tensor], padding_value=0, target_len=None, padding_side="left") -> torch.Tensor:
109
+ # tensor shape is (batch_size, seq_len)
110
+ max_len = max([ids.shape[1] for ids in input_ids_list])
111
+ if target_len is not None:
112
+ assert target_len >= max_len, "target_len must be greater than or equal to max_len"
113
+ max_len = target_len
114
+
115
+ new_input_ids_list = []
116
+ for i, input_ids in enumerate(input_ids_list):
117
+ pad_tensor = torch.ones_like(input_ids) * padding_value
118
+ curr_len = input_ids.shape[1]
119
+ pad_tensor = pad_tensor[:, : max_len - curr_len]
120
+ if padding_side == "right":
121
+ input_ids = torch.cat((input_ids, pad_tensor), dim=1)
122
+ else:
123
+ input_ids = torch.cat((pad_tensor, input_ids), dim=1)
124
+ new_input_ids_list.append(input_ids)
125
+ return torch.cat(new_input_ids_list, dim=0)
126
+
127
+
128
+ def extract_value_from_conv(chat):
129
+ value = []
130
+ if isinstance(chat["content"], str):
131
+ value.append(chat["content"])
132
+ return value
133
+
134
+ # otherwise, it's a list of content
135
+ for content in chat["content"]:
136
+ if content["type"] == "image":
137
+ if "path" in content:
138
+ # VILA style, can be either filepath or http url
139
+ value.append(Image(fetch_image_url_or_fpath(content["path"])))
140
+ elif "image" in content:
141
+ # Qwen style
142
+ value.append(Image(fetch_image_url_or_fpath(content["image"])))
143
+ elif "image_pil" in content:
144
+ # Qwen style
145
+ assert isinstance(content["image_pil"], PIL.Image.Image), f"Type of image_pil must be PIL.Image.Image"
146
+ value.append(content["image_pil"])
147
+ else:
148
+ raise ValueError(f"Type = `image` , but no `path` or `image` in {chat['content']}")
149
+ elif content["type"] == "video":
150
+ if "video" in content:
151
+ # Qwen style
152
+ value.append(Video(fetch_image_url_or_fpath(content["video"])))
153
+ else:
154
+ raise ValueError(f"Type = `video` , but no `video` in {chat['content']}")
155
+ elif content["type"] == "text":
156
+ value.append(content["text"])
157
+ elif content["type"] == "audio":
158
+ value.append(Sound(fetch_image_url_or_fpath(content["audio"])))
159
+ elif content["type"] == "sound":
160
+ value.append(Sound(fetch_image_url_or_fpath(content["sound"])))
161
+ elif content["type"] == "speech":
162
+ value.append(Sound(fetch_image_url_or_fpath(content["speech"])))
163
+ else:
164
+ raise ValueError(f"Unsupported content type: {content['type']}")
165
+ return value
166
+
167
+
168
+ class VILAProcessorKwargs(ProcessingKwargs, total=False):
169
+ _defaults = {
170
+ "text_kwargs": {
171
+ "padding": False,
172
+ },
173
+ }
174
+
175
+
176
+ class VILAProcessor(ProcessorMixin):
177
+ attributes = []
178
+ valid_kwargs = []
179
+
180
+ def __init__(
181
+ self, image_processor=None, tokenizer=None, chat_template=None, config=None, padding_side="left", **kwargs
182
+ ):
183
+ self.image_token = MEDIA_TOKENS["image"]
184
+ self.video_token = MEDIA_TOKENS["video"]
185
+ self.speech_token = MEDIA_TOKENS["speech"]
186
+ self.sound_token = MEDIA_TOKENS["sound"]
187
+ self.config = config
188
+ self.image_processor = image_processor
189
+ self.tokenizer = tokenizer
190
+ self.padding_side = padding_side
191
+
192
+ # Use <|endoftext|> token as padding token for Qwen models
193
+ self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0]
194
+ self.eos_token_id = self.tokenizer.eos_token_id
195
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
196
+
197
+ @staticmethod
198
+ def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[dict]:
199
+ """
200
+ Extract vision information from conversations.
201
+ Reference: qwen_vl_utils
202
+ """
203
+ vision_infos = []
204
+ if isinstance(conversations[0], dict):
205
+ conversations = [conversations]
206
+ for conversation in conversations:
207
+ for message in conversation:
208
+ if isinstance(message["content"], list):
209
+ for ele in message["content"]:
210
+ if (
211
+ "image" in ele
212
+ or "image_url" in ele
213
+ or "video" in ele
214
+ or ele["type"] in ("image", "image_url", "video")
215
+ ):
216
+ vision_infos.append(ele)
217
+ return vision_infos
218
+
219
+ @staticmethod
220
+ def process_vision_info(
221
+ conversations: list[dict] | list[list[dict]],
222
+ return_video_kwargs: bool = False,
223
+ ) -> tuple[list[PIL.Image.Image] | None, list[torch.Tensor | list[PIL.Image.Image]] | None, Optional[dict]]:
224
+ """
225
+ Process vision information from conversations.
226
+ Reference: qwen_vl_utils
227
+
228
+ Note: NVILA does not depend on this function, but maintains the same interface.
229
+ """
230
+ vision_infos = VILAProcessor.extract_vision_info(conversations)
231
+ # Read images or videos
232
+ image_inputs = []
233
+ video_inputs = []
234
+ video_sample_fps_list = []
235
+ for vision_info in vision_infos:
236
+ if "image" in vision_info or "image_url" in vision_info:
237
+ image_inputs.append(fetch_image(vision_info))
238
+ elif "video" in vision_info:
239
+ video_input, video_sample_fps = fetch_video(vision_info, return_video_sample_fps=True)
240
+ video_sample_fps_list.append(video_sample_fps)
241
+ video_inputs.append(video_input)
242
+ else:
243
+ raise ValueError("image, image_url or video should in content.")
244
+ if len(image_inputs) == 0:
245
+ image_inputs = None
246
+ if len(video_inputs) == 0:
247
+ video_inputs = None
248
+ if return_video_kwargs:
249
+ return image_inputs, video_inputs, {"fps": video_sample_fps_list}
250
+ return image_inputs, video_inputs
251
+
252
+ @staticmethod
253
+ def move_data_to_device(cls, prompt_inputs):
254
+ def _move_data_to_device(item):
255
+ # wrap function grpo trainer _prepare_input
256
+ kwargs = {"device": cls.args.device}
257
+ if cls.is_deepspeed_enabled and (torch.is_floating_point(item) or torch.is_complex(item)):
258
+ kwargs.update({"dtype": cls.accelerator.state.deepspeed_plugin.hf_ds_config.dtype()})
259
+ return item.to(**kwargs)
260
+
261
+ prompt_inputs.input_ids = _move_data_to_device(prompt_inputs.input_ids)
262
+ prompt_inputs.attention_mask = _move_data_to_device(prompt_inputs.attention_mask)
263
+ if "image" in prompt_inputs.media:
264
+ prompt_inputs.media["image"] = [_move_data_to_device(img) for img in prompt_inputs.media["image"]]
265
+ return prompt_inputs
266
+
267
+ @classmethod
268
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
269
+ padding_side = kwargs.get("padding_side", "left")
270
+ if os.path.isdir(pretrained_model_name_or_path):
271
+ pretrained_model_name_or_path = pretrained_model_name_or_path
272
+ else:
273
+ print(f"pretrained_model_name_or_path {pretrained_model_name_or_path} is not a directory, downloading")
274
+ from huggingface_hub import snapshot_download
275
+
276
+ pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
277
+
278
+ image_processor = AutoImageProcessor.from_pretrained(
279
+ osp.join(pretrained_model_name_or_path, "vision_tower"), trust_remote_code=True
280
+ )
281
+ tokenizer = AutoTokenizer.from_pretrained(
282
+ osp.join(pretrained_model_name_or_path, "llm"), trust_remote_code=True
283
+ )
284
+ config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
285
+
286
+ return cls(image_processor=image_processor, tokenizer=tokenizer, config=config, padding_side=padding_side)
287
+
288
+ def __repr__(self):
289
+ return f"VILAProcessor(image_processor=SigLip, tokenizer={self.tokenizer}, config={self.config})"
290
+
291
+ def __call__(
292
+ self,
293
+ conversation=None,
294
+ **kwargs: Unpack[VILAProcessorKwargs],
295
+ ) -> BatchFeature:
296
+ """
297
+ The `conv` will be look like
298
+ [
299
+ {
300
+ 'from': 'human',
301
+ 'value': [
302
+ <transformers_modules.NVILA-Lite-2B-hf-preview.media.Image object at 0x154e68e4c460>,
303
+ 'What are the common elements in these pictures?'
304
+ ]
305
+ }
306
+ ]
307
+ and `conversation` will be a list of such `conv`s
308
+ """
309
+ if kwargs.get("text", None) is not None:
310
+ conversation = kwargs.get("text")
311
+ assert conversation is not None, "`conversation` or `text` is required"
312
+ padding_side = kwargs.get("padding_side", self.padding_side)
313
+
314
+ input_ids_list = []
315
+ attention_mask = []
316
+ media = defaultdict(list)
317
+ media_config = defaultdict(dict)
318
+ for conv in conversation:
319
+ feat = self.__single_call__(conv, **kwargs)
320
+ input_ids_list.append(feat.input_ids)
321
+ attention_mask.append(feat.attention_mask)
322
+ for name in feat.media:
323
+ media[name] += feat.media[name]
324
+ for name in feat.media_config:
325
+ media_config[name].update(feat.media_config[name])
326
+
327
+ # pad the input_ids to batchfy
328
+ input_ids = pad_fn(
329
+ input_ids_list,
330
+ padding_value=self.pad_token_id,
331
+ padding_side=padding_side,
332
+ )
333
+ # Ignore the pad token in the attention mask
334
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
335
+ attention_mask[input_ids == self.pad_token_id] = False
336
+ input_texts = self.tokenizer.batch_decode(input_ids)
337
+ bdata = BatchFeature(
338
+ data={
339
+ # "input_texts": input_texts,
340
+ "input_ids": input_ids,
341
+ "attention_mask": attention_mask,
342
+ "media": media,
343
+ "media_config": media_config,
344
+ }
345
+ )
346
+ return bdata
347
+
348
+ def __single_call__(
349
+ self,
350
+ conversation,
351
+ images: ImageInput = None,
352
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
353
+ videos = None,
354
+ **kwargs: Unpack[VILAProcessorKwargs],
355
+ ) -> BatchFeature:
356
+ conversation = copy.deepcopy(conversation)
357
+ media = extract_media(conversation, self.config)
358
+ # Process media
359
+ media_config = defaultdict(dict)
360
+ for name in media:
361
+ if name == "image":
362
+ if len(media["image"]) == 1 and self.config.image_aspect_ratio in ["dynamic", "dynamic_s2"]:
363
+ self.config.image_processor = self.image_processor
364
+ if self.config.image_aspect_ratio == "dynamic":
365
+ images = process_image(media["image"][0], self.config, None, enable_dynamic_res=True).half()
366
+ # Note: This assumes images appear at the first conversation position
367
+ conversation[0]["value"] = conversation[0]["value"].replace(
368
+ DEFAULT_IMAGE_TOKEN, f"{DEFAULT_IMAGE_TOKEN}\n" * images.shape[0]
369
+ )
370
+ else:
371
+ if type(self.config.s2_scales) is str:
372
+ self.config.s2_scales = list(map(int, self.config.s2_scales.split(",")))
373
+ images, block_sizes = process_image(
374
+ media["image"][0], self.config, None, enable_dynamic_s2=True
375
+ )
376
+ images = images.half()
377
+ media_config[name]["block_sizes"] = [block_sizes]
378
+ else:
379
+ images = process_images(media["image"], self.image_processor, self.config).half()
380
+ media[name] = [image for image in images]
381
+ elif name == "video":
382
+ media[name] = [
383
+ process_images(images, self.image_processor, self.config).half() for images in media[name]
384
+ ]
385
+ elif name == "speech":
386
+ speeches = media["speech"]
387
+ media[name] = [speech for speech in speeches]
388
+ elif name == "sound":
389
+ sounds = media["sound"]
390
+ for sound in sounds:
391
+ if type(sound) is dict:
392
+ for k, v in sound.items():
393
+ sound[k] = v.half()
394
+ media[name] = [sound for sound in sounds]
395
+ elif name == "video_info":
396
+ media[name] = [media["video_info"]]
397
+ elif name == "audio_info":
398
+ media[name] = [media["audio_info"]]
399
+ else:
400
+ raise ValueError(f"Unsupported media type: {name}")
401
+
402
+ inputs = tokenize_conversation(
403
+ conversation,
404
+ self.tokenizer,
405
+ mm_use_bos_eos_tokens=self.config.mm_use_bos_eos_tokens,
406
+ unified_audio_encoder=self.config.unified_audio_encoder,
407
+ add_generation_prompt=True,
408
+ )
409
+
410
+ input_ids = inputs.unsqueeze(0)
411
+
412
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
413
+ return BatchFeature(
414
+ data={
415
+ "input_ids": input_ids,
416
+ "attention_mask": attention_mask,
417
+ "media": media,
418
+ "media_config": media_config,
419
+ }
420
+ )
421
+
422
+ def batch_decode(self, *args, **kwargs):
423
+ """
424
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
425
+ refer to the docstring of this method for more information.
426
+ """
427
+ return self.tokenizer.batch_decode(*args, **kwargs)
428
+
429
+ def decode(self, *args, **kwargs):
430
+ """
431
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
432
+ the docstring of this method for more information.
433
+ """
434
+ return self.tokenizer.decode(*args, **kwargs)
435
+
436
+ def post_process_image_text_to_text(self, generated_outputs):
437
+ """
438
+ Post-process the output of the model to decode the text.
439
+
440
+ Args:
441
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
442
+ The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
443
+ or `(sequence_length,)`.
444
+
445
+ Returns:
446
+ `List[str]`: The decoded text.
447
+ """
448
+ return self.tokenizer.batch_decode(
449
+ generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
450
+ )
451
+
452
+ @property
453
+ def model_input_names(self):
454
+ tokenizer_input_names = self.tokenizer.model_input_names
455
+ image_processor_input_names = self.image_processor.model_input_names
456
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
457
+
458
+ def convert_gpt_conv_to_vila_conv(self, conversation):
459
+ vila_conv = []
460
+ for chat in conversation:
461
+ vila_chat = {"from": "", "value": []}
462
+ if chat["role"] in ("user", "system"):
463
+ # user allows to input image and text
464
+ vila_chat["from"] = "human" if chat["role"] == "user" else "system"
465
+ vila_chat["value"] = extract_value_from_conv(chat)
466
+ elif chat["role"] == "assistant":
467
+ vila_chat["from"] = "gpt"
468
+ vila_chat["value"] = extract_value_from_conv(chat)
469
+ else:
470
+ raise ValueError(f"Unsupported role: {chat['role']} in chat {chat}")
471
+ vila_conv.append(vila_chat)
472
+
473
+ return vila_conv
474
+
475
+ def apply_chat_template(self, conversation, add_generation_prompt=True, **kwargs):
476
+ return self.convert_gpt_conv_to_vila_conv(conversation)
base_projector.py ADDED
@@ -0,0 +1,236 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import re
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel
22
+
23
+
24
+ class IdentityMap(nn.Module):
25
+ """Identity mapping that returns input unchanged."""
26
+ def __init__(self):
27
+ super().__init__()
28
+
29
+ def forward(self, x, *args, **kwargs):
30
+ return x
31
+
32
+ @property
33
+ def config(self):
34
+ return {"mm_projector_type": "identity"}
35
+
36
+
37
+ class SimpleResBlock(nn.Module):
38
+ """Simple residual block with layer normalization."""
39
+
40
+ def __init__(self, channels):
41
+ super().__init__()
42
+ self.pre_norm = nn.LayerNorm(channels)
43
+ self.proj = nn.Sequential(nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels))
44
+
45
+ def forward(self, x):
46
+ x = self.pre_norm(x)
47
+ return x + self.proj(x)
48
+
49
+
50
+ class DownSampleBlock(nn.Module):
51
+ """Downsample 2D feature maps by rearranging into 2x2 blocks."""
52
+ def forward(self, x):
53
+ vit_embeds = x
54
+ h = w = int(vit_embeds.shape[1] ** 0.5)
55
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
56
+ vit_embeds = self.flat_square(vit_embeds)
57
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
58
+ return vit_embeds
59
+
60
+ def flat_square(self, x):
61
+ n, w, h, c = x.size()
62
+ if w % 2 == 1:
63
+ x = torch.concat([x, torch.zeros((n, 1, h, c), dtype=x.dtype).to(x.device)], dim=1).contiguous()
64
+ n, w, h, c = x.size()
65
+ if h % 2 == 1:
66
+ x = torch.concat([x, torch.zeros((n, w, 1, c), dtype=x.dtype).to(x.device)], dim=2).contiguous()
67
+ n, w, h, c = x.size()
68
+ x = x.contiguous()
69
+ x = x.view(n, w, int(h / 2), int(c * 2))
70
+ x = x.permute(0, 2, 1, 3).contiguous()
71
+ x = x.view(n, int(h / 2), int(w / 2), int(c * 4))
72
+ x = x.permute(0, 2, 1, 3).contiguous()
73
+ return x
74
+
75
+
76
+ class DownSample2x2BlockFix(nn.Module):
77
+ """Downsample 2D feature maps by rearranging into 2x2 blocks (fixed version)."""
78
+
79
+ def forward(self, x):
80
+ vit_embeds = x
81
+ h = w = int(vit_embeds.shape[1] ** 0.5)
82
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
83
+ vit_embeds = flat_square_2x2(vit_embeds)
84
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
85
+ return vit_embeds
86
+
87
+
88
+ def flat_square_2x2(x):
89
+ """Rearrange feature map into 2x2 blocks."""
90
+ n, w, h, c = x.size()
91
+ if w % 2 == 1:
92
+ x = torch.concat([x, torch.zeros((n, 1, h, c), dtype=x.dtype).to(x.device)], dim=1).contiguous()
93
+ n, w, h, c = x.size()
94
+ x = x.contiguous()
95
+ if h % 2 == 1:
96
+ x = torch.concat([x, torch.zeros((n, w, 1, c), dtype=x.dtype).to(x.device)], dim=2).contiguous()
97
+ n, w, h, c = x.size()
98
+ x = x.view(n, w, int(h / 2), int(c * 2))
99
+ x = x.permute(0, 2, 1, 3).contiguous()
100
+ x = x.view(n, int(h / 2), int(w / 2), int(c * 4))
101
+ x = x.permute(0, 2, 1, 3).contiguous()
102
+ return x
103
+
104
+
105
+ class DownSample3x3BlockFix(nn.Module):
106
+ """Downsample 2D feature maps by rearranging into 3x3 blocks (fixed version)."""
107
+
108
+ def forward(self, x):
109
+ vit_embeds = x
110
+ h = w = int(vit_embeds.shape[1] ** 0.5)
111
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
112
+ vit_embeds = flat_square_3x3(vit_embeds)
113
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
114
+ return vit_embeds
115
+
116
+
117
+ def flat_square_3x3(x):
118
+ """Rearrange feature map into 3x3 blocks."""
119
+ n, w, h, c = x.size()
120
+ if w % 3 != 0:
121
+ x = torch.concat([x, torch.zeros((n, 3 - (w % 3), h, c), dtype=x.dtype).to(x.device)], dim=1).contiguous()
122
+ n, w, h, c = x.size()
123
+ x = x.contiguous()
124
+ if h % 3 != 0:
125
+ x = torch.concat([x, torch.zeros((n, w, 3 - (h % 3), c), dtype=x.dtype).to(x.device)], dim=2).contiguous()
126
+ n, w, h, c = x.size()
127
+ x = x.view(n, w, int(h / 3), int(c * 3))
128
+ x = x.permute(0, 2, 1, 3).contiguous()
129
+ x = x.view(n, int(h / 3), int(w / 3), int(c * 9))
130
+ x = x.permute(0, 2, 1, 3).contiguous()
131
+ return x
132
+
133
+
134
+ class MultimodalProjectorConfig(PretrainedConfig):
135
+ """Configuration for vision-to-language projector."""
136
+
137
+ model_type = "v2l_projector"
138
+
139
+ def __init__(self, mm_projector_type: str = None, **kwargs):
140
+ super().__init__()
141
+ self.mm_projector_type = mm_projector_type
142
+
143
+
144
+ class MultimodalProjector(PreTrainedModel):
145
+ """Multimodal projector for mapping vision features to LLM space."""
146
+ config_class = MultimodalProjectorConfig
147
+
148
+ def __init__(self, mm_projector_cfg: MultimodalProjectorConfig, config: PretrainedConfig):
149
+ super().__init__(mm_projector_cfg)
150
+ mm_projector_type = mm_projector_cfg.mm_projector_type
151
+ self.downsample_rate = 1
152
+ if mm_projector_type == "identity":
153
+ self.layers = IdentityMap()
154
+ elif mm_projector_type == "linear":
155
+ self.layers = nn.Linear(config.mm_hidden_size, config.hidden_size)
156
+ elif mm_projector_type == "mlp_downsample":
157
+ self.layers = nn.Sequential(
158
+ DownSampleBlock(),
159
+ nn.LayerNorm(config.mm_hidden_size * 4),
160
+ nn.Linear(config.mm_hidden_size * 4, config.hidden_size),
161
+ nn.GELU(),
162
+ nn.Linear(config.hidden_size, config.hidden_size),
163
+ )
164
+ self.downsample_rate = 2
165
+ elif mm_projector_type == "mlp_downsample_2x2_fix":
166
+ self.layers = nn.Sequential(
167
+ DownSample2x2BlockFix(),
168
+ nn.LayerNorm(config.mm_hidden_size * 4),
169
+ nn.Linear(config.mm_hidden_size * 4, config.hidden_size),
170
+ nn.GELU(),
171
+ nn.Linear(config.hidden_size, config.hidden_size),
172
+ )
173
+ self.downsample_rate = 2
174
+ elif mm_projector_type == "mlp_downsample_3x3_fix":
175
+ self.layers = nn.Sequential(
176
+ DownSample3x3BlockFix(),
177
+ nn.LayerNorm(config.mm_hidden_size * 9),
178
+ nn.Linear(config.mm_hidden_size * 9, config.mm_hidden_size * 3),
179
+ nn.GELU(),
180
+ nn.LayerNorm(config.mm_hidden_size * 3),
181
+ nn.Linear(config.mm_hidden_size * 3, config.hidden_size),
182
+ nn.GELU(),
183
+ nn.Linear(config.hidden_size, config.hidden_size),
184
+ )
185
+ self.downsample_rate = 3
186
+ elif mm_projector_type == "mlp_downsample_3x3_s2":
187
+ self.layers = nn.Sequential(
188
+ DownSample3x3BlockFix(),
189
+ nn.LayerNorm(config.mm_hidden_size * 9),
190
+ nn.Linear(config.mm_hidden_size * 9, config.mm_hidden_size * 3),
191
+ nn.GELU(),
192
+ nn.LayerNorm(config.mm_hidden_size * 3),
193
+ nn.Linear(config.mm_hidden_size * 3, config.mm_hidden_size),
194
+ nn.GELU(),
195
+ nn.LayerNorm(config.mm_hidden_size),
196
+ nn.Linear(config.mm_hidden_size, config.mm_hidden_size // 3),
197
+ nn.GELU(),
198
+ nn.LayerNorm(config.mm_hidden_size // 3),
199
+ nn.Linear(config.mm_hidden_size // 3, config.hidden_size),
200
+ nn.GELU(),
201
+ nn.Linear(config.hidden_size, config.hidden_size),
202
+ )
203
+ elif mm_projector_type == "mlp_downsample_3x3_s2_new":
204
+ self.layers = nn.Sequential(
205
+ DownSample3x3BlockFix(),
206
+ nn.LayerNorm(config.mm_hidden_size * 9),
207
+ nn.Linear(config.mm_hidden_size * 9, config.mm_hidden_size * 4),
208
+ nn.GELU(),
209
+ nn.LayerNorm(config.mm_hidden_size * 4),
210
+ nn.Linear(config.mm_hidden_size * 4, config.mm_hidden_size * 2),
211
+ nn.GELU(),
212
+ nn.LayerNorm(config.mm_hidden_size * 2),
213
+ nn.Linear(config.mm_hidden_size * 2, config.mm_hidden_size),
214
+ nn.GELU(),
215
+ nn.LayerNorm(config.mm_hidden_size),
216
+ nn.Linear(config.mm_hidden_size, config.mm_hidden_size // 3),
217
+ nn.GELU(),
218
+ nn.LayerNorm(config.mm_hidden_size // 3),
219
+ nn.Linear(config.mm_hidden_size // 3, config.hidden_size),
220
+ nn.GELU(),
221
+ nn.Linear(config.hidden_size, config.hidden_size),
222
+ )
223
+ else:
224
+ mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", mm_projector_type)
225
+ if mlp_gelu_match:
226
+ mlp_depth = int(mlp_gelu_match.group(1))
227
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
228
+ for _ in range(1, mlp_depth):
229
+ modules.append(nn.GELU())
230
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
231
+ self.layers = nn.Sequential(*modules)
232
+ else:
233
+ raise ValueError(f"Unknown projector type: {mm_projector_type}")
234
+
235
+ def forward(self, x, *args, **kwargs):
236
+ return self.layers(x)
builder.py ADDED
@@ -0,0 +1,253 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import math
18
+ import os
19
+ import os.path as osp
20
+ import warnings
21
+ from dataclasses import asdict
22
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
23
+
24
+ import torch
25
+ import transformers
26
+ from huggingface_hub import file_exists, repo_exists
27
+ from huggingface_hub.utils import HFValidationError
28
+ from transformers import (
29
+ AutoConfig,
30
+ AutoModelForCausalLM,
31
+ AutoTokenizer,
32
+ PretrainedConfig,
33
+ PreTrainedModel,
34
+ PreTrainedTokenizer,
35
+ )
36
+ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
37
+
38
+ from .constants import MEDIA_TOKENS, SENTINEL_TOKEN
39
+ from .conversation import SeparatorStyle, default_conversation
40
+
41
+ DUMMY_CONVERSATION = [
42
+ {"from": "human", "value": "question"},
43
+ {"from": "gpt", "value": "answer"},
44
+ ] * 10
45
+
46
+
47
+ def tokenizer_image_token(prompt, tokenizer, return_tensors=None):
48
+ """Tokenize a prompt and return input IDs."""
49
+ return tokenizer(prompt, return_tensors=return_tensors).input_ids[0]
50
+
51
+
52
+ def has_tokenizer(repo_id_or_path: str) -> bool:
53
+ """Check if a tokenizer exists at the given path or repository."""
54
+ # Check if the tokenizer is in a local directory
55
+ if osp.exists(osp.join(repo_id_or_path, "tokenizer_config.json")):
56
+ return True
57
+
58
+ # Check if the tokenizer is in a Hugging Face Hub repo
59
+ try:
60
+ return repo_exists(repo_id_or_path) and file_exists(repo_id_or_path, "tokenizer_config.json")
61
+ except HFValidationError:
62
+ return False
63
+
64
+
65
+ def _maybe_add_sentinel_token(tokenizer: transformers.PreTrainedTokenizer) -> None:
66
+ """Add sentinel token to tokenizer if not already present."""
67
+ if not hasattr(tokenizer, "sentinel_token"):
68
+ tokenizer.add_tokens([SENTINEL_TOKEN], special_tokens=True)
69
+ tokenizer.sentinel_token = SENTINEL_TOKEN
70
+ tokenizer.sentinel_token_id = tokenizer.convert_tokens_to_ids(SENTINEL_TOKEN)
71
+
72
+
73
+ def tokenize_conversation_legacy(
74
+ messages: Sequence[Dict[str, str]],
75
+ tokenizer: transformers.PreTrainedTokenizer,
76
+ add_generation_prompt: bool = False,
77
+ overrides: Optional[Dict[str, str]] = None,
78
+ no_system_prompt: bool = False,
79
+ ) -> torch.Tensor:
80
+ """Tokenize conversation using legacy format."""
81
+ conv = default_conversation.copy()
82
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
83
+
84
+ if no_system_prompt:
85
+ conv.system = ""
86
+
87
+ # Skip the first message if it is not from human
88
+ if messages[0]["from"] != "human":
89
+ messages = messages[1:]
90
+
91
+ # Add a generation prompt if needed
92
+ if add_generation_prompt:
93
+ messages.append({"from": "gpt", "value": None})
94
+
95
+ conv.messages = []
96
+ for turn, message in enumerate(messages):
97
+ role = roles[message["from"]]
98
+ assert role == conv.roles[turn % 2]
99
+ if overrides is not None and message["from"] in overrides:
100
+ conv.append_message(role, overrides[message["from"]])
101
+ else:
102
+ conv.append_message(role, message["value"])
103
+
104
+ return tokenizer_image_token(conv.get_prompt(), tokenizer, return_tensors="pt")
105
+
106
+
107
+ def tokenize_conversation(
108
+ messages: Sequence[Dict[str, str]],
109
+ tokenizer: transformers.PreTrainedTokenizer,
110
+ add_generation_prompt: bool = False,
111
+ overrides: Optional[Dict[str, str]] = None,
112
+ no_system_prompt: bool = False,
113
+ ) -> torch.Tensor:
114
+ """Tokenize conversation using modern chat template format."""
115
+ # Normalize the conversation before tokenization
116
+ for message in messages:
117
+ message["value"] = message["value"].strip()
118
+
119
+ if default_conversation.sep_style != SeparatorStyle.AUTO:
120
+ return tokenize_conversation_legacy(
121
+ messages,
122
+ tokenizer,
123
+ add_generation_prompt=add_generation_prompt,
124
+ overrides=overrides,
125
+ no_system_prompt=no_system_prompt,
126
+ )
127
+
128
+ conversation = []
129
+ for m in messages:
130
+ message = {}
131
+ if m["from"] == "human":
132
+ message["role"] = "user"
133
+ elif m["from"] == "gpt":
134
+ message["role"] = "assistant"
135
+ else:
136
+ raise ValueError(f"Unexpected sender '{m['from']}' in conversation entry.")
137
+
138
+ message["content"] = m["value"]
139
+ if overrides is not None and m["from"] in overrides:
140
+ message["content"] = overrides[m["from"]]
141
+ conversation.append(message)
142
+
143
+ if no_system_prompt:
144
+ conversation = [{"role": "system", "content": ""}] + conversation
145
+
146
+ text = tokenizer.apply_chat_template(
147
+ conversation,
148
+ add_generation_prompt=add_generation_prompt,
149
+ tokenize=False,
150
+ )
151
+ return tokenizer_image_token(text, tokenizer, return_tensors="pt")
152
+
153
+
154
+ def infer_stop_tokens(tokenizer: transformers.PreTrainedTokenizer) -> List[str]:
155
+ """Infer stop tokens from tokenizer by analyzing dummy conversation."""
156
+ _maybe_add_sentinel_token(tokenizer)
157
+ template = tokenize_conversation(DUMMY_CONVERSATION, tokenizer, overrides={"gpt": SENTINEL_TOKEN})
158
+
159
+ stop_tokens = {tokenizer.eos_token}
160
+ for k in range(template.size(0) - 1):
161
+ if template[k] == tokenizer.sentinel_token_id:
162
+ stop_token = tokenizer.decode(template[k + 1])
163
+ stop_tokens.add(stop_token)
164
+ return list(stop_tokens)
165
+
166
+
167
+ def context_length_extension(config):
168
+ """Extend context length using RoPE scaling if needed."""
169
+ orig_ctx_len = getattr(config, "max_position_embeddings", None)
170
+ model_max_length = getattr(config, "model_max_length", None)
171
+ if orig_ctx_len and model_max_length > orig_ctx_len:
172
+ print(f"Scaling RoPE from {orig_ctx_len} to {model_max_length}")
173
+ scaling_factor = float(math.ceil(model_max_length / orig_ctx_len))
174
+ config.rope_scaling = {"type": "linear", "factor": scaling_factor}
175
+ return config
176
+
177
+
178
+ def build_llm_and_tokenizer(
179
+ model_name_or_path: str,
180
+ config: PretrainedConfig,
181
+ attn_implementation=None,
182
+ model_max_length=None,
183
+ *args,
184
+ **kwargs,
185
+ ) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
186
+ """Build language model and tokenizer from pretrained checkpoint."""
187
+ llm_cfg = AutoConfig.from_pretrained(model_name_or_path)
188
+ llm_cfg._attn_implementation = attn_implementation
189
+ llm_cfg.model_max_length = model_max_length
190
+ if model_max_length is not None:
191
+ context_length_extension(llm_cfg)
192
+
193
+ # Quantization related
194
+ quantization_restore_from_checkpoint = False
195
+
196
+ if type(config.model_dtype) == str:
197
+ model_dtype = eval(config.model_dtype)
198
+ else:
199
+ model_dtype = config.model_dtype
200
+
201
+ if quantization_restore_from_checkpoint:
202
+ fp8_model_name_or_path = kwargs.pop("fp8_llm_cfg", None)
203
+
204
+ llm = AutoModelForCausalLM.from_pretrained(
205
+ fp8_model_name_or_path, config=llm_cfg, torch_dtype=model_dtype, *args, **kwargs
206
+ )
207
+ else:
208
+ if is_deepspeed_zero3_enabled():
209
+ kwargs.pop("device_map")
210
+ llm = AutoModelForCausalLM.from_pretrained(
211
+ model_name_or_path, config=llm_cfg, torch_dtype=model_dtype, *args, **kwargs
212
+ )
213
+ print(f"Loaded model from {model_name_or_path} with dtype {model_dtype}")
214
+
215
+ # Locate the tokenizer.
216
+ llm_path = model_name_or_path
217
+ if not has_tokenizer(llm_path):
218
+ llm_path = osp.join(llm_path, "llm")
219
+ if not has_tokenizer(llm_path):
220
+ raise ValueError(f"Cannot find tokenizer in {llm_path}.")
221
+
222
+ tokenizer = AutoTokenizer.from_pretrained(llm_path, padding_side="right", use_fast=True, legacy=False)
223
+ if model_max_length is not None:
224
+ tokenizer.model_max_length = model_max_length
225
+
226
+ # Load chat template if specified.
227
+ if getattr(config, "chat_template", None) is not None:
228
+ print(f"Using chat template: {config.chat_template}")
229
+ fpath = os.path.join(os.path.dirname(__file__), "chat_templates", f"{config.chat_template}.jinja")
230
+ if not os.path.exists(fpath):
231
+ fpath = os.path.join(os.path.dirname(model_name_or_path), f"{config.chat_template}.jinja")
232
+ with open(fpath) as fd:
233
+ chat_template = fd.read()
234
+ tokenizer.chat_template = chat_template.replace(" ", "").replace("\n", "")
235
+
236
+ # Set stop tokens for the tokenizer
237
+ tokenizer.stop_tokens = infer_stop_tokens(tokenizer)
238
+ tokenizer.stop_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.stop_tokens)
239
+
240
+ # Add media tokens to the tokenizer
241
+ tokenizer.media_tokens = MEDIA_TOKENS
242
+ tokenizer.media_token_ids = {}
243
+ for name, token in MEDIA_TOKENS.items():
244
+ if config.speech_tower_cfg is None and name == "speech":
245
+ continue
246
+ if config.sound_tower_cfg is None and name == "sound":
247
+ continue
248
+ tokenizer.add_tokens([token], special_tokens=True)
249
+ tokenizer.media_token_ids[name] = tokenizer.convert_tokens_to_ids(token)
250
+ tokenizer.media_tokens[name] = token
251
+
252
+ config.hidden_size = llm.config.hidden_size
253
+ return llm, tokenizer
config.json ADDED
The diff for this file is too large to render. See raw diff
 
configuration_vila.py ADDED
@@ -0,0 +1,127 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import json
18
+ import math
19
+ import os
20
+ import os.path as osp
21
+ from copy import deepcopy
22
+ from threading import Thread
23
+ from typing import List, Optional
24
+
25
+ import torch
26
+ import torchvision
27
+ from PIL import Image
28
+ from transformers import (
29
+ AutoProcessor,
30
+ PretrainedConfig,
31
+ PreTrainedModel,
32
+ Qwen2Config,
33
+ Qwen2ForCausalLM,
34
+ Qwen2PreTrainedModel,
35
+ TextIteratorStreamer,
36
+ )
37
+
38
+
39
+ class VILAConfig(PretrainedConfig):
40
+ model_type = "vila"
41
+ keys_to_ignore_at_inference = ["past_key_values"]
42
+
43
+ def __init__(
44
+ self,
45
+ llm_cfg=None,
46
+ vision_tower_cfg=None,
47
+ mm_projector_cfg=None,
48
+ speech_tower_cfg=None,
49
+ sound_tower_cfg=None,
50
+ speech_mm_projector_cfg=None,
51
+ sound_mm_projector_cfg=None,
52
+ architectures=None,
53
+ resume_path=None,
54
+ hidden_size=None,
55
+ mm_hidden_size=None,
56
+ image_aspect_ratio=None,
57
+ num_video_frames=None,
58
+ fps=None,
59
+ mm_vision_select_layer=None,
60
+ mm_vision_select_feature=None,
61
+ mm_use_im_start_end=False,
62
+ mm_use_im_patch_token=False,
63
+ mm_projector_lr=None,
64
+ vision_tower_lr=None,
65
+ vision_resolution=None,
66
+ interpolate_mode=None,
67
+ s2=None,
68
+ dynamic_s2=None,
69
+ s2_scales=None,
70
+ s2_max_split_size=None,
71
+ s2_resize_output_to_scale_idx=0,
72
+ min_tiles: Optional[int] = 1,
73
+ max_tiles: Optional[int] = 12,
74
+ num_time_tokens=None,
75
+ time_token_format=None,
76
+ image_encoder: str = '{"_target_": "llava.model.encoders.BasicImageEncoder"}',
77
+ video_encoder: str = '{"_target_": "llava.model.encoders.TSPVideoEncoder"}',
78
+ sound_encoder: str = '{"_target_": "llava.model.encoders.BasicSoundEncoder"}',
79
+ speech_encoder: str = '{"_target_": "llava.model.encoders.BasicSpeechEncoder"}',
80
+ **kwargs,
81
+ ):
82
+ super().__init__(**kwargs)
83
+
84
+ self.architectures = architectures
85
+ self.llm_cfg = llm_cfg
86
+ self.vision_tower_cfg = vision_tower_cfg
87
+ self.mm_projector_cfg = mm_projector_cfg
88
+ self.speech_tower_cfg = speech_tower_cfg
89
+ self.sound_tower_cfg = sound_tower_cfg
90
+ self.speech_mm_projector_cfg = speech_mm_projector_cfg
91
+ self.sound_mm_projector_cfg = sound_mm_projector_cfg
92
+ self.resume_path = resume_path
93
+
94
+ self.hidden_size = hidden_size
95
+ self.mm_hidden_size = mm_hidden_size
96
+ self.image_aspect_ratio = image_aspect_ratio
97
+ self.num_video_frames = num_video_frames
98
+ self.fps = fps
99
+ self.mm_vision_select_layer = mm_vision_select_layer
100
+ self.mm_vision_select_feature = mm_vision_select_feature
101
+ self.mm_use_im_start_end = mm_use_im_start_end
102
+ self.mm_use_im_patch_token = mm_use_im_patch_token
103
+ self.mm_projector_lr = mm_projector_lr
104
+ self.vision_tower_lr = vision_tower_lr
105
+ self.vision_resolution = vision_resolution
106
+ self.interpolate_mode = interpolate_mode
107
+ self.s2 = s2
108
+ self.dynamic_s2 = dynamic_s2
109
+ self.s2_scales = s2_scales
110
+ self.s2_max_split_size = s2_max_split_size
111
+ self.s2_resize_output_to_scale_idx = s2_resize_output_to_scale_idx
112
+ self.min_tiles = min_tiles
113
+ self.max_tiles = max_tiles
114
+ self.num_time_tokens = num_time_tokens
115
+ self.time_token_format = time_token_format
116
+
117
+ self.image_encoder = image_encoder
118
+ self.video_encoder = video_encoder
119
+ self.sound_encoder = sound_encoder
120
+ self.speech_encoder = speech_encoder
121
+ self.audio_sampling_rate = 16000
122
+ self.audio_chunk_length = 120
123
+ self.interleaved_vis_aud_in_video = True
124
+ self.interleaved_video_segment_duration = 30
125
+ self.audio_hop_length = 60
126
+
127
+ super().__init__(**kwargs)
constants.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
18
+
19
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
20
+ WORKER_HEART_BEAT_INTERVAL = 15
21
+
22
+ LOGDIR = "."
23
+
24
+ # Model Constants
25
+ IGNORE_INDEX = -100
26
+ DEFAULT_IMAGE_TOKEN = "<image>"
27
+ DEFAULT_SOUND_TOKEN = "<sound>"
28
+ DEFAULT_SPEECH_TOKEN = "<speech>"
29
+ SENTINEL_TOKEN = "<vila/sentinel>"
30
+ DEFAULT_IM_START_TOKEN = "<im_start>"
31
+ DEFAULT_IM_END_TOKEN = "<im_end>"
32
+
33
+
34
+ SENTINEL_TOKEN = "<vila/sentinel>"
35
+
36
+ MEDIA_TOKENS = {
37
+ "image": "<image>",
38
+ "video": "<vila/video>",
39
+ "speech": "<speech>",
40
+ "sound": "<sound>",
41
+ }
42
+
43
+ # Token IDs for different model variants:
44
+ """
45
+ vila:
46
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
47
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
48
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
49
+ 151646: AddedToken("[BOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
50
+ 151647: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
51
+ 151648: AddedToken("<vila/sentinel>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
52
+ 151649: AddedToken("<image>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
53
+ 151650: AddedToken("<vila/video>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
54
+
55
+ xvila:
56
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
57
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
58
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
59
+ 151646: AddedToken("[BOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
60
+ 151647: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
61
+ 151648: AddedToken("<vila/sentinel>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
62
+ 151649: AddedToken("<image>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
63
+ 151650: AddedToken("<vila/video>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
64
+ 151651: AddedToken("<speech>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
65
+ 151652: AddedToken("<sound>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
66
+ 151653: AddedToken("<|image_bos|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
67
+ 151654: AddedToken("<|image_eos|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
68
+ 151655: AddedToken("<|video_bos|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
69
+ 151656: AddedToken("<|video_eos|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
70
+ 151657: AddedToken("<|speech_bos|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
71
+ 151658: AddedToken("<|speech_eos|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
72
+ 151659: AddedToken("<|sound_bos|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
73
+ 151660: AddedToken("<|sound_eos|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
74
+ """
75
+ MM_BOS_EOS_TOKENS = {
76
+ "image": ["<|image_bos|>", "<|image_eos|>"],
77
+ "video": ["<|video_bos|>", "<|video_eos|>"],
78
+ "speech": ["<|speech_bos|>", "<|speech_eos|>"],
79
+ "sound": ["<|sound_bos|>", "<|sound_eos|>"],
80
+ }
81
+
82
+ NUM_EXTRA_TOKENS_VILA = 8
83
+ NUM_EXTRA_TOKENS_XVILA = 10
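
A minimal sketch of how the constants above compose into a prompt fragment, assuming the repository root is on `sys.path` so that `constants` imports directly; the wrapping shown illustrates the token layout of the xvila variant rather than the exact string the processor emits:

    from constants import MEDIA_TOKENS, MM_BOS_EOS_TOKENS

    def wrap_media(kind: str) -> str:
        # e.g. kind="video" -> "<|video_bos|><vila/video><|video_eos|>"
        bos, eos = MM_BOS_EOS_TOKENS[kind]
        return f"{bos}{MEDIA_TOKENS[kind]}{eos}"

    prompt = wrap_media("video") + "\nWhat are they talking about in detail?"
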
conversation.py ADDED
@@ -0,0 +1,189 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
17
+
18
+ import dataclasses
19
+ from enum import Enum, auto
20
+ from typing import List
21
+
22
+
23
+ class SeparatorStyle(Enum):
24
+ """Different separator style."""
25
+
26
+ AUTO = auto()
27
+ TWO = auto()
28
+ MPT = auto()
29
+ PLAIN = auto()
30
+ LLAMA_3 = auto()
31
+
32
+
33
+ @dataclasses.dataclass
34
+ class Conversation:
35
+ """A class that keeps all conversation history."""
36
+
37
+ system: str
38
+ roles: List[str]
39
+ messages: List[List[str]]
40
+ sep_style: SeparatorStyle = SeparatorStyle.AUTO
41
+ sep: str = "###"
42
+ sep2: str = None
43
+ version: str = "Unknown"
44
+
45
+ def get_prompt(self):
46
+ messages = self.messages
47
+ if len(messages) > 0 and type(messages[0][1]) is tuple:
48
+ messages = self.messages.copy()
49
+ init_role, init_msg = messages[0].copy()
50
+ init_msg = init_msg[0].replace("<image>", "").strip()
51
+ messages[0] = (init_role, "<image>\n" + init_msg)
52
+
53
+ if self.sep_style == SeparatorStyle.TWO:
54
+ seps = [self.sep, self.sep2]
55
+ ret = self.system + seps[0]
56
+ for i, (role, message) in enumerate(messages):
57
+ if message:
58
+ if type(message) is tuple:
59
+ message, _, _ = message
60
+ ret += role + ": " + message + seps[i % 2]
61
+ else:
62
+ ret += role + ":"
63
+ elif self.sep_style == SeparatorStyle.LLAMA_3:
64
+ ret = self.system + self.sep
65
+ for rid, (role, message) in enumerate(messages):
66
+ if message:
67
+ if type(message) is tuple:
68
+ message = message[0]
69
+ sep = self.sep if rid < len(messages) - 1 else self.sep2
70
+ ret += role + message + sep
71
+ else:
72
+ ret += role
73
+ elif self.sep_style == SeparatorStyle.MPT:
74
+ ret = self.system + self.sep
75
+ for role, message in messages:
76
+ if message:
77
+ if type(message) is tuple:
78
+ message, _, _ = message
79
+ ret += role + message + self.sep
80
+ else:
81
+ ret += role
82
+ elif self.sep_style == SeparatorStyle.PLAIN:
83
+ seps = [self.sep, self.sep2]
84
+ ret = self.system
85
+ for i, (role, message) in enumerate(messages):
86
+ if message:
87
+ if type(message) is tuple:
88
+ message, _, _ = message
89
+ ret += message + seps[i % 2]
90
+ else:
91
+ ret += ""
92
+ else:
93
+ raise ValueError(f"Invalid style: {self.sep_style}")
94
+
95
+ return ret
96
+
97
+ def append_message(self, role, message):
98
+ self.messages.append([role, message])
99
+
100
+ def copy(self):
101
+ return Conversation(
102
+ system=self.system,
103
+ roles=self.roles,
104
+ messages=[[x, y] for x, y in self.messages],
105
+ sep_style=self.sep_style,
106
+ sep=self.sep,
107
+ sep2=self.sep2,
108
+ version=self.version,
109
+ )
110
+
111
+
112
+ conv_auto = Conversation(
113
+ system="",
114
+ roles=("", ""),
115
+ messages=(),
116
+ sep_style=SeparatorStyle.AUTO,
117
+ sep="\n",
118
+ )
119
+
120
+ conv_vicuna_v1 = Conversation(
121
+ system="A chat between a curious user and an artificial intelligence assistant. "
122
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
123
+ roles=("USER", "ASSISTANT"),
124
+ version="v1",
125
+ messages=(),
126
+ sep_style=SeparatorStyle.TWO,
127
+ sep=" ",
128
+ sep2="</s>",
129
+ )
130
+
131
+ conv_llava_plain = Conversation(
132
+ system="",
133
+ roles=("", ""),
134
+ messages=(),
135
+ sep_style=SeparatorStyle.PLAIN,
136
+ sep="\n",
137
+ )
138
+
139
+ hermes_2 = Conversation(
140
+ system="<|im_start|>system\nAnswer the questions.",
141
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
142
+ sep_style=SeparatorStyle.MPT,
143
+ sep="<|im_end|>",
144
+ messages=(),
145
+ version="hermes-2",
146
+ )
147
+
148
+ llama_3_chat = Conversation(
149
+ system="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. "
150
+ "You are able to understand the visual content that the user provides, "
151
+ "and assist the user with a variety of tasks using natural language.",
152
+ roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
153
+ version="llama_v3",
154
+ messages=(),
155
+ sep_style=SeparatorStyle.LLAMA_3,
156
+ sep="<|eot_id|>",
157
+ sep2="<|end_of_text|>",
158
+ )
159
+
160
+
161
+ default_conversation = conv_auto
162
+ conv_templates = {
163
+ "auto": conv_auto,
164
+ "hermes-2": hermes_2,
165
+ "llama_3": llama_3_chat,
166
+ "v1": conv_vicuna_v1,
167
+ "vicuna_v1": conv_vicuna_v1,
168
+ "plain": conv_llava_plain,
169
+ }
170
+
171
+
172
+ CONVERSATION_MODE_MAPPING = {
173
+ "vila1.5-3b": "vicuna_v1",
174
+ "vila1.5-8b": "llama_3",
175
+ "vila1.5-13b": "vicuna_v1",
176
+ "vila1.5-40b": "hermes-2",
177
+ "llama-3": "llama_3",
178
+ "llama3": "llama_3",
179
+ }
180
+
181
+
182
+ def auto_set_conversation_mode(model_name_or_path: str) -> None:
183
+ """Automatically set conversation mode based on model name/path."""
184
+ global default_conversation
185
+ for k, v in CONVERSATION_MODE_MAPPING.items():
186
+ if k in model_name_or_path.lower():
187
+ print(f"Setting conversation mode to `{v}` based on model name/path `{model_name_or_path}`.")
188
+ default_conversation = conv_templates[v]
189
+ return
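
The templates above can be exercised on their own; a minimal sketch, assuming `conversation.py` is importable from the checkout root (the `<vila/video>` placeholder comes from constants.py and is only illustrative here):

    import conversation as conv_lib

    conv = conv_lib.conv_templates["hermes-2"].copy()   # copy before mutating the shared template
    conv.append_message(conv.roles[0], "<vila/video>\nDescribe the clip.")
    conv.append_message(conv.roles[1], None)            # leave the assistant turn open
    print(conv.get_prompt())
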
distributed.py ADDED
@@ -0,0 +1,89 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import os
18
+ import warnings
19
+ from typing import Any, List, Optional
20
+
21
+ from torch import distributed as dist
22
+
23
+ __all__ = [
24
+ "init",
25
+ "is_initialized",
26
+ "size",
27
+ "rank",
28
+ "local_size",
29
+ "local_rank",
30
+ "is_main",
31
+ "barrier",
32
+ "gather",
33
+ "all_gather",
34
+ ]
35
+
36
+
37
+ def init() -> None:
38
+ if "RANK" not in os.environ:
39
+ warnings.warn("Environment variable `RANK` is not set. Skipping distributed initialization.")
40
+ return
41
+ dist.init_process_group(backend="nccl", init_method="env://")
42
+
43
+
44
+ def is_initialized() -> bool:
45
+ return dist.is_initialized()
46
+
47
+
48
+ def size() -> int:
49
+ return int(os.environ.get("WORLD_SIZE", 1))
50
+
51
+
52
+ def rank() -> int:
53
+ return int(os.environ.get("RANK", 0))
54
+
55
+
56
+ def local_size() -> int:
57
+ return int(os.environ.get("LOCAL_WORLD_SIZE", 1))
58
+
59
+
60
+ def local_rank() -> int:
61
+ return int(os.environ.get("LOCAL_RANK", 0))
62
+
63
+
64
+ def is_main() -> bool:
65
+ return rank() == 0
66
+
67
+
68
+ def barrier() -> None:
69
+ dist.barrier()
70
+
71
+
72
+ def gather(obj: Any, dst: int = 0) -> Optional[List[Any]]:
73
+ if not is_initialized():
74
+ return [obj]
75
+ if is_main():
76
+ objs = [None for _ in range(size())]
77
+ dist.gather_object(obj, objs, dst=dst)
78
+ return objs
79
+ else:
80
+ dist.gather_object(obj, dst=dst)
81
+ return None
82
+
83
+
84
+ def all_gather(obj: Any) -> List[Any]:
85
+ if not is_initialized():
86
+ return [obj]
87
+ objs = [None for _ in range(size())]
88
+ dist.all_gather_object(objs, obj)
89
+ return objs
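
These helpers are thin wrappers around `torch.distributed`; a minimal sketch of gathering per-rank results, assuming the module is importable as `distributed` and the script is launched with `torchrun` so that `RANK`/`WORLD_SIZE`/`LOCAL_RANK` are set (without them, `init()` only warns and the gather degenerates to a single-element list):

    import distributed as dist_utils

    dist_utils.init()
    local_result = {"rank": dist_utils.rank(), "num_samples": 100}
    merged = dist_utils.all_gather(local_result)   # one entry per rank
    if dist_utils.is_main():
        print(f"collected results from {len(merged)} ranks")
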
environment_setup.sh ADDED
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env bash
2
+ set -e
3
+
4
+ CONDA_ENV=${1:-""}
5
+ if [ -n "$CONDA_ENV" ]; then
6
+ # This is required to activate the conda environment
7
+ eval "$(conda shell.bash hook)"
8
+
9
+ conda create -n $CONDA_ENV python=3.10.14 -y
10
+ conda activate $CONDA_ENV
11
+ # This is optional if you prefer to use the built-in nvcc
12
+ conda install -c nvidia cuda-toolkit=12.2 -y
13
+ else
14
+ echo "Skipping conda environment creation. Make sure you have the correct environment activated."
15
+ fi
16
+
17
+ # Use uv to speed up installations
18
+ pip install uv
19
+ alias uvp="uv pip"
20
+
21
+ echo "[INFO] Using python $(which python)"
22
+ echo "[INFO] Using pip $(which pip)"
23
+ echo "[INFO] Using uv $(which uv)"
24
+
25
+ # This is required to enable PEP 660 support
26
+ uv pip install --upgrade pip setuptools
27
+
28
+ # Install FlashAttention2
29
+ uv pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
30
+
31
+ # Install VILA
32
+ uv pip install -e ".[train,eval]"
33
+
34
+ # numpy introduces a lot of dependency issues, so it is installed separately from pyproject.toml
35
+ pip install numpy==1.26.4
36
+
37
+ # audio
38
+ uv pip install soundfile librosa openai-whisper ftfy
39
+ conda install -c conda-forge ffmpeg
40
+ uv pip install jiwer
41
+
42
+ # Downgrade protobuf to 3.20 for backward compatibility
43
+ uv pip install protobuf==3.20.*
44
+
45
+ # Replace transformers and deepspeed files
46
+ site_pkg_path=$(python -c 'import site; print(site.getsitepackages()[0])')
47
+ cp -rv ./transformers/modeling_utils.py $site_pkg_path/transformers/modeling_utils.py # for using qwen 2.5 omni checkpoint
48
+
49
+ # for benchmark adoption
50
+ uv pip install faiss-gpu-cu12
51
+
52
+ # Quantization requires the newest Triton version and introduces dependency issues
53
+ uv pip install triton==3.1.0 # only needed when using FP8LinearQwen2Config, QLlavaLlamaConfig, etc.; note it is not compatible with mamba-ssm
54
+
55
+ uv pip install kaldiio
56
+
57
+ # for rotary embedding
58
+ uv pip install beartype
59
+
60
+ uv pip install pydantic==1.10.22
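
A small post-install check, assuming the environment created by this script is active; the module list mirrors the packages installed above (openai-whisper imports as `whisper`):

    import importlib

    for name in ["torch", "flash_attn", "soundfile", "librosa", "whisper", "jiwer", "kaldiio"]:
        module = importlib.import_module(name)
        print(name, getattr(module, "__version__", "unknown"))
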
example_infer.py ADDED
@@ -0,0 +1,335 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from transformers import AutoProcessor, AutoModel, AutoConfig, GenerationConfig
18
+ import torch
19
+ import os
20
+ import time
21
+ from pathlib import Path
22
+ from typing import List, Dict, Any, Optional, Union
23
+ import logging
24
+ import sys
25
+ os.environ["HF_HUB_OFFLINE"] = "1" # Use local cache for models
26
+
27
+ # Set up logging
28
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
29
+ logger = logging.getLogger(__name__)
30
+
31
+ def add_to_sys_path_direct(model_path):
32
+ """Add model path directly to sys.path"""
33
+ if model_path not in sys.path:
34
+ sys.path.insert(0, model_path) # Insert at beginning for priority
35
+ print(f"✓ Added to sys.path: {model_path}")
36
+ else:
37
+ print(f"Already in sys.path: {model_path}")
38
+
39
+ class NVOmniVideoInference:
40
+ """A class to handle NVOmni video model inference with improved error handling and flexibility."""
41
+
42
+ def __init__(self, model_path: str, torch_dtype=torch.float16, device_map="auto"):
43
+ """
44
+ Initialize the NVOmni model for video inference.
45
+
46
+ Args:
47
+ model_path (str): Path to the model directory
48
+ torch_dtype: PyTorch data type for model weights
49
+ device_map (str): Device mapping strategy for model loading
50
+ """
51
+ self.model_path = model_path
52
+ self.torch_dtype = torch_dtype
53
+ self.device_map = device_map
54
+ self.model = None
55
+ self.processor = None
56
+ self.config = None
57
+ self.device = None
58
+
59
+ self.load_model()
60
+
61
+ def validate_paths(self, model_path: str, video_path: str = None) -> bool:
62
+ """Validate that required paths exist."""
63
+ if not Path(model_path).exists():
64
+ logger.error(f"Model path does not exist: {model_path}")
65
+ return False
66
+
67
+ if video_path and not Path(video_path).exists():
68
+ logger.error(f"Video path does not exist: {video_path}")
69
+ return False
70
+
71
+ return True
72
+
73
+ def load_model(self) -> bool:
74
+ """Load the model, processor, and config with error handling."""
75
+ if not self.validate_paths(self.model_path):
76
+ return False
77
+
78
+ if True:
79
+ logger.info("Loading model configuration...")
80
+ self.config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True)
81
+
82
+ logger.info("Loading model...")
83
+ start_time = time.time()
84
+ self.model = AutoModel.from_pretrained(
85
+ self.model_path,
86
+ trust_remote_code=True,
87
+ torch_dtype=self.torch_dtype,
88
+ device_map=self.device_map,
89
+ low_cpu_mem_usage=True # More memory efficient loading
90
+ )
91
+ load_time = time.time() - start_time
92
+ logger.info(f"Model loaded in {load_time:.2f} seconds")
93
+
94
+ logger.info("Loading processor...")
95
+ self.processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True)
96
+
97
+ # Set device for single-device setups
98
+ if hasattr(self.model, 'device'):
99
+ self.device = self.model.device
100
+ else:
101
+ self.device = next(self.model.parameters(), torch.empty(0)).device  # falls back to CPU when the model has no parameters
102
+
103
+ logger.info(f"Model successfully loaded on device: {self.device}")
104
+ self._print_model_info()
105
+ return True
106
+
107
+ def _print_model_info(self):
108
+ """Print useful information about the loaded model."""
109
+ logger.info("=" * 50)
110
+ logger.info("MODEL INFORMATION")
111
+ logger.info("=" * 50)
112
+
113
+ if self.config:
114
+ logger.info(f"Model type: {getattr(self.config, 'model_type', 'Unknown')}")
115
+ logger.info(f"Hidden size: {getattr(self.config, 'hidden_size', 'Unknown')}")
116
+
117
+ if self.model and torch.cuda.is_available():
118
+ logger.info(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
119
+ logger.info(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
120
+
121
+ def create_conversation(self, video_path: str, text_prompt: str) -> List[Dict[str, Any]]:
122
+ """
123
+ Create a conversation format for the model.
124
+
125
+ Args:
126
+ video_path (str): Path to the video file
127
+ text_prompt (str): Text prompt for the model
128
+
129
+ Returns:
130
+ List[Dict]: Conversation in the expected format
131
+ """
132
+ return [{
133
+ "role": "user",
134
+ "content": [
135
+ {"type": "video", "video": video_path},
136
+ {"type": "text", "text": text_prompt}
137
+ ]
138
+ }]
139
+
140
+ @torch.inference_mode()
141
+ def generate_response(
142
+ self,
143
+ video_path: str,
144
+ text_prompt: str,
145
+ max_new_tokens: int = 256,
146
+ temperature: float = None,
147
+ top_p: float = None,
148
+ do_sample: bool = None,
149
+ num_video_frames: int = -1,
150
+ load_audio_in_video: bool = True,
151
+ audio_length: Union[int, str] = "max_3600",
152
+ ) -> Optional[str]:
153
+ """
154
+ Generate a response from the model given a video and text prompt.
155
+
156
+ Args:
157
+ video_path (str): Path to the video file
158
+ text_prompt (str): Text prompt for the model
159
+ max_new_tokens (int): Maximum number of new tokens to generate
160
+ temperature (float): Sampling temperature
161
+ top_p (float): Top-p sampling parameter
162
+ do_sample (bool): Whether to use sampling
163
+ num_video_frames (int): Number of video frames to sample (-1 keeps the model default)
+ load_audio_in_video (bool): Whether to also load the audio track from the video
+ audio_length (Union[int, str]): Maximum audio length setting, e.g. "max_3600"
164
+
165
+ Returns:
166
+ Optional[str]: Generated response or None if failed
167
+ """
168
+ if not self.model or not self.processor:
169
+ logger.error("Model or processor not loaded. Please initialize the model first.")
170
+ return None
171
+
172
+ if not self.validate_paths(self.model_path, video_path):
173
+ return None
174
+
175
+ # try:
176
+ if True:
177
+
178
+ logger.info(f"Processing video: {video_path}")
179
+ logger.info(f"Text prompt: {text_prompt}")
180
+
181
+ # Create conversation
182
+ conversation = self.create_conversation(video_path, text_prompt)
183
+
184
+ # Apply chat template
185
+ text = self.processor.apply_chat_template(
186
+ conversation,
187
+ tokenize=False,
188
+ add_generation_prompt=True
189
+ )
190
+ logger.info(f"Chat template applied")
191
+
192
+ # set model params
193
+ self.model.config.load_audio_in_video = load_audio_in_video
194
+ self.processor.config.load_audio_in_video = load_audio_in_video
195
+ if num_video_frames > 0:
196
+ self.model.config.num_video_frames = num_video_frames
197
+ self.processor.config.num_video_frames = num_video_frames
198
+ if audio_length != -1:
199
+ self.model.config.audio_chunk_length = audio_length
200
+ self.processor.config.audio_chunk_length = audio_length
201
+ logger.info(f"Model config - load_audio_in_video: {self.model.config.load_audio_in_video}, num_video_frames: {self.model.config.num_video_frames}, audio_chunk_length: {self.model.config.audio_chunk_length}")
202
+
203
+ # Process inputs
204
+ start_time = time.time()
205
+ inputs = self.processor([text])
206
+
207
+ # Move inputs to the correct device if needed
208
+ if hasattr(inputs, 'input_ids') and inputs.input_ids is not None:
209
+ inputs.input_ids = inputs.input_ids.to(self.device)
210
+
211
+ processing_time = time.time() - start_time
212
+ logger.info(f"Input processing completed in {processing_time:.2f} seconds")
213
+
214
+ logger.info("Generating response...")
215
+ start_time = time.time()
216
+
217
+ generation_kwargs = {"max_new_tokens": max_new_tokens, "max_length": 99999999}
218
+ if top_p is not None:
219
+ generation_kwargs["top_p"] = top_p
220
+ if do_sample is not None:
221
+ generation_kwargs["do_sample"] = do_sample
222
+ if temperature is not None:
223
+ generation_kwargs["temperature"] = temperature
224
+
225
+ generation_config = self.model.default_generation_config
226
+ generation_config.update(**generation_kwargs)
227
+
228
+ logger.info(f"Generation config: {generation_config.to_dict()}")
229
+
230
+
231
+ with torch.no_grad():
232
+ output_ids = self.model.generate(
233
+ input_ids=inputs.input_ids,
234
+ media=getattr(inputs, 'media', None),
235
+ media_config=getattr(inputs, 'media_config', None),
236
+ generation_config=generation_config,
237
+ )
238
+
239
+ generation_time = time.time() - start_time
240
+ logger.info(f"Generation completed in {generation_time:.2f} seconds")
241
+
242
+ # Decode response
243
+ response = self.processor.tokenizer.batch_decode(
244
+ output_ids,
245
+ skip_special_tokens=True
246
+ )[0]
247
+
248
+ return response
249
+
250
+ def batch_generate(
251
+ self,
252
+ video_text_pairs: List[tuple],
253
+ **generation_kwargs
254
+ ) -> List[Optional[str]]:
255
+ """
256
+ Generate responses for multiple video-text pairs.
257
+
258
+ Args:
259
+ video_text_pairs (List[tuple]): List of (video_path, text_prompt) tuples
260
+ **generation_kwargs: Arguments passed to generate_response
261
+
262
+ Returns:
263
+ List[Optional[str]]: List of generated responses
264
+ """
265
+ responses = []
266
+ for i, (video_path, text_prompt) in enumerate(video_text_pairs):
267
+ logger.info(f"Processing batch item {i+1}/{len(video_text_pairs)}")
268
+ response = self.generate_response(video_path, text_prompt, **generation_kwargs)
269
+ responses.append(response)
270
+
271
+ # Clear cache between generations to manage memory
272
+ if torch.cuda.is_available():
273
+ torch.cuda.empty_cache()
274
+
275
+ return responses
276
+
277
+ def main():
278
+ """Main function demonstrating usage of the NVOmni model."""
279
+
280
+ # Configuration
281
+ MODEL_PATH = "./"
282
+ VIDEO_PATH = "xxx.mp4"
283
+ TEXT_PROMPT = "Assess the video, followed by a detailed description of it's video and audio contents."
284
+
285
+ num_video_frames=128
286
+ audio_length="max_3600"
287
+ load_audio_in_video=True
288
+
289
+ add_to_sys_path_direct(MODEL_PATH)
290
+
291
+ # Initialize the inference class
292
+ logger.info("Initializing NVOmni Video Inference...")
293
+ inferencer = NVOmniVideoInference(MODEL_PATH, torch_dtype=torch.float16)
294
+
295
+ if inferencer.model is None:
296
+ logger.error("Failed to initialize model. Exiting.")
297
+ return
298
+
299
+ # Generate response
300
+ logger.info("Starting inference...")
301
+ response = inferencer.generate_response(
302
+ video_path=VIDEO_PATH,
303
+ text_prompt=TEXT_PROMPT,
304
+ num_video_frames=num_video_frames,
305
+ load_audio_in_video=load_audio_in_video,
306
+ audio_length=audio_length,
307
+ max_new_tokens=1024,
308
+ )
309
+
310
+ if response:
311
+ print("\n" + "="*60)
312
+ print("GENERATED RESPONSE")
313
+ print("="*60)
314
+ print(response)
315
+ print("="*60)
316
+ else:
317
+ logger.error("Failed to generate response")
318
+
319
+ # Example of batch processing
320
+ if False:
321
+ logger.info("\nExample: Batch processing")
322
+ batch_pairs = [
323
+ (VIDEO_PATH, "What is happening in this video?"),
324
+ (VIDEO_PATH, "Describe the audio content of this video."),
325
+ ]
326
+
327
+ batch_responses = inferencer.batch_generate(batch_pairs, max_new_tokens=128)
328
+
329
+ for i, (pair, response) in enumerate(zip(batch_pairs, batch_responses)):
330
+ print(f"\n--- Batch Response {i+1} ---")
331
+ print(f"Prompt: {pair[1]}")
332
+ print(f"Response: {response}")
333
+
334
+ if __name__ == "__main__":
335
+ main()
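
Beyond the `main()` demo above, the wrapper class can be driven programmatically; a minimal sketch, assuming the checkout root is the working directory and `my_clip.mp4` is replaced with a real local video:

    import torch
    from example_infer import NVOmniVideoInference

    runner = NVOmniVideoInference("./", torch_dtype=torch.float16)
    answer = runner.generate_response(
        video_path="my_clip.mp4",
        text_prompt="Summarize the spoken content.",
        max_new_tokens=256,
    )
    print(answer)
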
example_mini_audio.py ADDED
@@ -0,0 +1,89 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ Example script for audio transcription using the model.
18
+
19
+ This script demonstrates how to:
20
+ 1. Load the model and processor
21
+ 2. Configure audio processing parameters
22
+ 3. Process audio input
23
+ 4. Generate transcription output
24
+
25
+ Usage:
26
+ python example_mini_audio.py --model_path <path_to_model> --audio_path <path_to_audio>
27
+ """
28
+
29
+ from transformers import AutoProcessor, AutoModel, AutoConfig, AutoModelForCausalLM
30
+ import torch
31
+ import os
32
+ import argparse
33
+
34
+ # Configuration
35
+ parser = argparse.ArgumentParser(description="Audio transcription example")
36
+ parser.add_argument("--model_path", type=str, default="./", help="Path to the model")
37
+ parser.add_argument("--audio_path", type=str, required=True, help="Path to the audio file")
38
+ parser.add_argument("--max_new_tokens", type=int, default=1024, help="Maximum number of tokens to generate")
39
+ parser.add_argument("--num_video_frames", type=int, default=128, help="Number of video frames to process")
40
+ parser.add_argument("--audio_length", type=str, default="max_3600", help="Maximum audio length")
41
+
42
+ args = parser.parse_args()
43
+
44
+ model_path = args.model_path
45
+ audio_path = args.audio_path
46
+ generation_kwargs = {"max_new_tokens": args.max_new_tokens, "max_length": 99999999}
47
+ load_audio_in_video = True
48
+ num_video_frames = args.num_video_frames
49
+ audio_length = args.audio_length
50
+
51
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
52
+
53
+ model = AutoModel.from_pretrained(model_path,
54
+ trust_remote_code=True,
55
+ torch_dtype="torch.float16",
56
+ device_map="auto")
57
+
58
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
59
+ generation_config = model.default_generation_config
60
+ generation_config.update(**generation_kwargs)
61
+
62
+ model.config.load_audio_in_video = load_audio_in_video
63
+ processor.config.load_audio_in_video = load_audio_in_video
64
+ if num_video_frames > 0:
65
+ model.config.num_video_frames = num_video_frames
66
+ processor.config.num_video_frames = num_video_frames
67
+ if audio_length != -1:
68
+ model.config.audio_chunk_length = audio_length
69
+ processor.config.audio_chunk_length = audio_length
70
+
71
+
72
+ conversation = [{
73
+ "role": "user",
74
+ "content": [
75
+ {"type": "audio", "audio": audio_path},
76
+ {"type": "text", "text": "Transcribe the whole speech."}
77
+ ]
78
+ }]
79
+ text = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
80
+
81
+ inputs = processor([text])
82
+
83
+ output_ids = model.generate(
84
+ input_ids=inputs.input_ids,
85
+ media=getattr(inputs, 'media', None),
86
+ media_config=getattr(inputs, 'media_config', None),
87
+ generation_config=generation_config,
88
+ )
89
+ print(processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True))
example_mini_image.py ADDED
@@ -0,0 +1,76 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ Example script for image understanding using the model.
18
+
19
+ This script demonstrates how to:
20
+ 1. Load the model and processor
21
+ 2. Process image input
22
+ 3. Generate description output
23
+
24
+ Usage:
25
+ python example_mini_image.py --model_path <path_to_model> --image_path <path_to_image>
26
+ """
27
+
28
+ from transformers import AutoProcessor, AutoModel, AutoConfig, AutoModelForCausalLM
29
+ import torch
30
+ import os
31
+ import argparse
32
+
33
+ # Configuration
34
+ parser = argparse.ArgumentParser(description="Image understanding example")
35
+ parser.add_argument("--model_path", type=str, default="./", help="Path to the model")
36
+ parser.add_argument("--image_path", type=str, required=True, help="Path to the image file")
37
+ parser.add_argument("--max_new_tokens", type=int, default=1024, help="Maximum number of tokens to generate")
38
+ parser.add_argument("--prompt", type=str, default="Describe the image in detail.", help="Text prompt for the model")
39
+
40
+ args = parser.parse_args()
41
+
42
+ model_path = args.model_path
43
+ image_path = args.image_path
44
+ generation_kwargs = {"max_new_tokens": args.max_new_tokens, "max_length": 99999999}
45
+
46
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
47
+
48
+ model = AutoModel.from_pretrained(
49
+ model_path,
50
+ trust_remote_code=True,
51
+ torch_dtype=torch.float16,
52
+ device_map="auto"
53
+ )
54
+
55
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
56
+ generation_config = model.default_generation_config
57
+ generation_config.update(**generation_kwargs)
58
+
59
+ conversation = [{
60
+ "role": "user",
61
+ "content": [
62
+ {"type": "image", "image": image_path},
63
+ {"type": "text", "text": args.prompt}
64
+ ]
65
+ }]
66
+ text = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
67
+
68
+ inputs = processor([text])
69
+
70
+ output_ids = model.generate(
71
+ input_ids=inputs.input_ids,
72
+ media=getattr(inputs, 'media', None),
73
+ media_config=getattr(inputs, 'media_config', None),
74
+ generation_config=generation_config,
75
+ )
76
+ print(processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True))
example_mini_video.py ADDED
@@ -0,0 +1,101 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ Example script for video understanding using the model.
18
+
19
+ This script demonstrates how to:
20
+ 1. Load the model and processor
21
+ 2. Configure video and audio processing parameters
22
+ 3. Process video input with optional audio
23
+ 4. Generate description output
24
+
25
+ Usage:
26
+ python example_mini_video.py --model_path <path_to_model> --video_path <path_to_video>
27
+ """
28
+
29
+ from transformers import AutoProcessor, AutoModel, AutoConfig, AutoModelForCausalLM
30
+ import torch
31
+ import os
32
+ import argparse
33
+
34
+ # Configuration
35
+ parser = argparse.ArgumentParser(description="Video understanding example")
36
+ parser.add_argument("--model_path", type=str, default="./", help="Path to the model")
37
+ parser.add_argument("--video_path", type=str, required=True, help="Path to the video file")
38
+ parser.add_argument("--max_new_tokens", type=int, default=1024, help="Maximum number of tokens to generate")
39
+ parser.add_argument("--num_video_frames", type=int, default=128, help="Number of video frames to process")
40
+ parser.add_argument("--audio_length", type=str, default="max_3600", help="Maximum audio length")
41
+ parser.add_argument("--prompt", type=str, default="What are they talking about in detail?", help="Text prompt for the model")
42
+ parser.add_argument("--load_audio", action="store_true", default=True, help="Load audio from video")
43
+
44
+ args = parser.parse_args()
45
+
46
+ model_path = args.model_path
47
+ video_path = args.video_path
48
+ generation_kwargs = {"max_new_tokens": args.max_new_tokens, "max_length": 99999999}
49
+ load_audio_in_video = args.load_audio
50
+ num_video_frames = args.num_video_frames
51
+ audio_length = args.audio_length
52
+ text_prompt = args.prompt
53
+
54
+ assert os.path.exists(video_path), f"Video path {video_path} does not exist."
55
+
56
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
57
+
58
+ model = AutoModel.from_pretrained(
59
+ model_path,
60
+ trust_remote_code=True,
61
+ torch_dtype=torch.float16,
62
+ device_map="auto"
63
+ )
64
+
65
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
66
+ generation_config = model.default_generation_config
67
+ generation_config.update(**generation_kwargs)
68
+
69
+ model.config.load_audio_in_video = load_audio_in_video
70
+ processor.config.load_audio_in_video = load_audio_in_video
71
+ if num_video_frames > 0:
72
+ model.config.num_video_frames = num_video_frames
73
+ processor.config.num_video_frames = num_video_frames
74
+ if audio_length != -1:
75
+ model.config.audio_chunk_length = audio_length
76
+ processor.config.audio_chunk_length = audio_length
77
+
78
+ def forward_inference(video_path, text_prompt):
79
+ """Run inference on video with text prompt."""
80
+ print(f"Text prompt: {text_prompt}")
81
+ print(f"Video path: {video_path}")
82
+ conversation = [{
83
+ "role": "user",
84
+ "content": [
85
+ {"type": "video", "video": video_path},
86
+ {"type": "text", "text": text_prompt}
87
+ ]
88
+ }]
89
+ text = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
90
+
91
+ inputs = processor([text])
92
+
93
+ output_ids = model.generate(
94
+ input_ids=inputs.input_ids,
95
+ media=getattr(inputs, 'media', None),
96
+ media_config=getattr(inputs, 'media_config', None),
97
+ generation_config=generation_config,
98
+ )
99
+ print(processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True))
100
+
101
+ forward_inference(video_path, text_prompt)
llm/added_tokens.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "<image>": 151649,
3
+ "<sound>": 151652,
4
+ "<speech>": 151651,
5
+ "<vila/sentinel>": 151648,
6
+ "<vila/video>": 151650,
7
+ "<|endoftext|>": 151643,
8
+ "<|im_end|>": 151645,
9
+ "<|im_start|>": 151644,
10
+ "<|image_bos|>": 151653,
11
+ "<|image_eos|>": 151654,
12
+ "<|sound_bos|>": 151659,
13
+ "<|sound_eos|>": 151660,
14
+ "<|speech_bos|>": 151657,
15
+ "<|speech_eos|>": 151658,
16
+ "<|video_bos|>": 151655,
17
+ "<|video_eos|>": 151656,
18
+ "[BOS]": 151646,
19
+ "[PAD]": 151647
20
+ }
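
The IDs in this file should line up with the xvila token table documented in constants.py; a quick sanity-check sketch, assuming the tokenizer in `llm/` loads with `AutoTokenizer`:

    import json
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("./llm")
    with open("./llm/added_tokens.json") as f:
        added = json.load(f)
    for token, expected_id in added.items():
        assert tok.convert_tokens_to_ids(token) == expected_id, token
    print(f"all {len(added)} added-token ids match")
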
llm/config.json ADDED
The diff for this file is too large to render. See raw diff
 
llm/generation_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.46.0"
14
+ }
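
These sampling defaults are picked up when the `llm/` subfolder is loaded; a minimal sketch of reading and overriding them, assuming the folder is used as-is:

    from transformers import GenerationConfig

    gen_cfg = GenerationConfig.from_pretrained("./llm")
    gen_cfg.update(max_new_tokens=256)   # sets matching attributes in place, returns any unused kwargs
    print(gen_cfg.temperature, gen_cfg.top_p, gen_cfg.max_new_tokens)
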
llm/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
llm/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94f591013a08df71d152c96e5bc415bedc434bf30514fb03bd39c8d49e7161cd
3
+ size 4874772072
llm/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f8d257b5d45c7d51d3abd0b25a524ae8234cca1d6536976dfa833aa9bb06ffe
3
+ size 4932751008
llm/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:872102a5b7897af896ea518db185fda955786e8f45d70a49c48bbb8a4a45d305
3
+ size 4330865200
llm/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b82d845caeacc0f77859bece772e05aaf828b51b25946d4ab58801845130b30
3
+ size 1087106176
llm/model.safetensors.index.json ADDED
@@ -0,0 +1,346 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 15225455616
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
260
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
261
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
284
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
286
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
289
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
290
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
296
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
298
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
299
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
300
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
301
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
302
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
303
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
304
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
305
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
306
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
307
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
308
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
309
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
310
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
311
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
312
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
313
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
314
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
315
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
316
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
318
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
320
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
321
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
327
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
328
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
330
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
332
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
333
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
344
+ "model.norm.weight": "model-00003-of-00004.safetensors"
345
+ }
346
+ }
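The weight map above records, for each parameter name, which safetensors shard holds it; `transformers` resolves this automatically at load time. As a hedged illustration only, the index can also be queried directly to read a single tensor. The `llm/` paths and the parameter name below are assumptions based on this commit's layout.

```python
import json

from safetensors import safe_open

# Illustrative paths; adjust to where the checkpoint shards actually live.
index_path = "llm/model.safetensors.index.json"
param_name = "model.layers.20.mlp.down_proj.weight"

with open(index_path) as f:
    index = json.load(f)

shard_file = index["weight_map"][param_name]         # e.g. "model-00003-of-00004.safetensors"
with safe_open(f"llm/{shard_file}", framework="pt", device="cpu") as shard:
    tensor = shard.get_tensor(param_name)             # reads only this tensor from the shard
print(param_name, tuple(tensor.shape))
```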
llm/special_tokens_map.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|sound_bos|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|sound_eos|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": {
19
+ "content": "[BOS]",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "eos_token": {
26
+ "content": "<|im_end|>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "pad_token": {
33
+ "content": "[PAD]",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ }
39
+ }
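The map above is what `AutoTokenizer` uses to register `[BOS]`, `<|im_end|>`, and `[PAD]` as the BOS/EOS/PAD tokens and to add `<|sound_bos|>`/`<|sound_eos|>` as additional special tokens. A minimal sketch of inspecting them, assuming the tokenizer files are loaded from the `llm/` subfolder of the `nvidia/omnivinci` repository:

```python
from transformers import AutoTokenizer

# Assumption: the tokenizer files sit under the llm/ subfolder, as in this commit.
tok = AutoTokenizer.from_pretrained("nvidia/omnivinci", subfolder="llm")

print(tok.bos_token, tok.eos_token, tok.pad_token)   # [BOS] <|im_end|> [PAD]
print(tok.additional_special_tokens)                 # ['<|sound_bos|>', '<|sound_eos|>']
print(tok.convert_tokens_to_ids("<|sound_bos|>"))    # 151659, per tokenizer_config.json below
```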
llm/tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:491635783283196cfd9ab5d019617234a246b35a58da4761afd6ad77380f43c8
3
+ size 11415920
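The three lines above are a Git LFS pointer rather than the tokenizer itself: the roughly 11.4 MB `tokenizer.json` is stored in LFS and identified by the SHA-256 of its contents. A small sketch for checking that a fetched copy matches the pointer (the local path is an assumption about where the repository was cloned):

```python
import hashlib

# Assumption: the repo is cloned locally and the LFS object has been pulled to llm/tokenizer.json.
with open("llm/tokenizer.json", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()

expected = "491635783283196cfd9ab5d019617234a246b35a58da4761afd6ad77380f43c8"
print("tokenizer.json matches its LFS pointer:", digest == expected)
```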
llm/tokenizer_config.json ADDED
@@ -0,0 +1,165 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "[BOS]",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "[PAD]",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<vila/sentinel>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<image>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<vila/video>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<speech>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<sound>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|image_bos|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|image_eos|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|video_bos|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_eos|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "151657": {
117
+ "content": "<|speech_bos|>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "151658": {
125
+ "content": "<|speech_eos|>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "151659": {
133
+ "content": "<|sound_bos|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "151660": {
141
+ "content": "<|sound_eos|>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ }
148
+ },
149
+ "additional_special_tokens": [
150
+ "<|sound_bos|>",
151
+ "<|sound_eos|>"
152
+ ],
153
+ "bos_token": "[BOS]",
154
+ "chat_template": "{% if messages[0]['role'] != 'system' %}{{ '<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n' }}{% endif %}{% for message in messages if message['content'] is not none %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
155
+ "clean_up_tokenization_spaces": false,
156
+ "eos_token": "<|im_end|>",
157
+ "errors": "replace",
158
+ "legacy": false,
159
+ "model_max_length": 14000,
160
+ "pad_token": "[PAD]",
161
+ "padding_side": "right",
162
+ "split_special_tokens": false,
163
+ "tokenizer_class": "Qwen2Tokenizer",
164
+ "unk_token": null
165
+ }
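The `chat_template` entry above wraps every turn in `<|im_start|>role ... <|im_end|>` and prepends a default system prompt when the conversation does not start with one. A short sketch of rendering it via `apply_chat_template`, with the same `llm/` subfolder assumption as in the earlier snippet:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("nvidia/omnivinci", subfolder="llm")  # assumed layout

messages = [{"role": "user", "content": "Describe the sounds in this clip."}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>system
# You are a helpful assistant<|im_end|>
# <|im_start|>user
# Describe the sounds in this clip.<|im_end|>
# <|im_start|>assistant
```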
llm/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
media.py ADDED
@@ -0,0 +1,555 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import glob
17
+ import time
18
+ import random
19
+ import os
20
+ import tempfile
21
+ from collections import defaultdict
22
+ from io import BytesIO
23
+ from typing import Any, Dict, List, Optional, Union
24
+ import io
25
+ import cv2
26
+ import kaldiio
27
+ import librosa
28
+ import soundfile as sf
29
+ import torch
30
+ import numpy as np
31
+ import PIL
32
+ import PIL.Image
33
+ import requests
34
+ import tarfile
35
+ import whisper
36
+ import decord
37
+ from decord import AudioReader, cpu
38
+
39
+ from transformers import PretrainedConfig
40
+
41
+ MEDIA_TOKENS = {
42
+ "image": "<image>",
43
+ "video": "<vila/video>",
44
+ "speech": "<speech>",
45
+ "sound": "<sound>",
46
+ }
47
+
48
+
49
+ class Media:
50
+ """Base class for media objects."""
51
+ pass
52
+
53
+
54
+ class File(Media):
55
+ """File-based media object."""
56
+ def __init__(self, path: str) -> None:
57
+ self.path = path
58
+
59
+
60
+ class Image(File):
61
+ """Image media object."""
62
+ pass
63
+
64
+
65
+ class Video(File):
66
+ """Video media object."""
67
+ pass
68
+
69
+
70
+ class Speech(File):
71
+ """Speech audio media object."""
72
+ def __init__(self, path, extension: str = None) -> None:
73
+ self.path = path
74
+ self.extension = extension
75
+
76
+
77
+ class Sound(File):
78
+ """Sound/music audio media object."""
79
+ def __init__(self, path, extension: str = None) -> None:
80
+ self.path = path
81
+ self.extension = extension
82
+
83
+
84
+ def make_list(obj: Any) -> List:
85
+ """Convert object to list if not already a list."""
86
+ return obj if isinstance(obj, list) else [obj]
87
+
88
+
89
+ def _extract_image(image: Union[Image, PIL.Image.Image]) -> PIL.Image.Image:
90
+ """Extract PIL Image from Image object or return PIL Image as-is."""
91
+ if isinstance(image, Image):
92
+ if image.path.startswith("http://") or image.path.startswith("https://"):
93
+ image = PIL.Image.open(requests.get(image.path, stream=True).raw)
94
+ else:
95
+ image = PIL.Image.open(image.path)
96
+ return image
97
+
98
+
99
+ def _load_video_bytesio(
100
+ video_bytesio: BytesIO, *, num_frames: int, config: PretrainedConfig, load_aud: bool = False
101
+ ) -> List[PIL.Image.Image]:
102
+ """Load video from BytesIO object by writing to temporary file."""
103
+ with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as temp_video:
104
+ temp_video.write(video_bytesio.read())
105
+ temp_video_name = temp_video.name
106
+ return _load_video(temp_video_name, num_frames=num_frames, load_aud=load_aud, config=config)
107
+
108
+ def get_overlap(inp1, inp2):
109
+ """
110
+ Calculates the overlapping time frame between a video clip and an audio segment.
111
+
112
+ Args:
113
+ inp1 (list): [start_sec, end_sec]
114
+ inp2 (list): [start_sec, end_sec]
115
+
116
+ Returns:
117
+ tuple or None: (overlap_start, overlap_end) if overlap exists, else None.
118
+ """
119
+ # Calculate the maximum start time and minimum end time
120
+ overlap_start = max(inp1[0], inp2[0])
121
+ overlap_end = min(inp1[1], inp2[1])
122
+
123
+ # Check if there is an actual overlap
124
+ if overlap_start < overlap_end:
125
+ return (overlap_start, overlap_end)
126
+ else:
127
+ return None
128
+
129
+
130
+ def _load_video(
131
+ video_path: str, *, num_frames: int, config: PretrainedConfig, load_aud: bool = False
132
+ ) -> List[PIL.Image.Image]:
133
+ # Load video frames from a directory
134
+ if os.path.isdir(video_path):
135
+ frame_paths = sorted(glob.glob(os.path.join(video_path, "*")))
136
+ indices = np.round(np.linspace(0, len(frame_paths) - 1, num_frames)).astype(int)
137
+ return [PIL.Image.open(frame_paths[index]) for index in indices]
138
+
139
+ # Load video frames from a video file
140
+ vidcap = cv2.VideoCapture(video_path)
141
+
142
+ # Load audio if available and needed
143
+ audio_info = None
144
+ if load_aud:
145
+ try:
146
+ aud_feature, audio_info = _load_speech(video_path, config)
147
+ except Exception as e:
148
+ aud_feature = None
149
+ else:
150
+ aud_feature = None
151
+
152
+ # Find the last frame as frame count might not be accurate
153
+ frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
154
+ while frame_count > 0:
155
+ vidcap.set(cv2.CAP_PROP_POS_FRAMES, frame_count - 1)
156
+ if vidcap.grab():
157
+ break
158
+ frame_count -= 1
159
+ else:
160
+ raise ValueError(f"Video '{video_path}' has no frames.")
161
+
162
+ # Extract frames uniformly
163
+ indices = np.round(np.linspace(0, frame_count - 1, num_frames)).astype(int)
164
+
165
+ fps = vidcap.get(cv2.CAP_PROP_FPS)
166
+ video_duration = frame_count / fps
167
+
168
+ # When load_audio_in_video and interleaved_vis_aud_in_video is True, we need to load frames for each video segment
169
+ if config.load_audio_in_video and config.interleaved_vis_aud_in_video and aud_feature is not None:
170
+ segment_duration = config.interleaved_video_segment_duration
171
+ if segment_duration == -1:
172
+ raise ValueError("video_segment_duration is not set")
173
+
174
+ segment_vis_indices_list = []
175
+ segment_aud_indices_list = []
176
+ segment_counts = np.ceil(video_duration / segment_duration).astype(int)
177
+
178
+ if type(aud_feature) == dict:
179
+ aud_feas = aud_feature["input_features"]
180
+ else:
181
+ aud_feas = aud_feature
182
+ audio_start_sec = audio_info['audio_start_sec']
183
+ audio_end_sec = audio_info['audio_end_sample_sec']
184
+
185
+ stft_frames_per_second = config.audio_sampling_rate // config.audio_hop_length
186
+
187
+ _idx = 0
188
+ aud_sample_start_idx = 0
189
+ for i in range(segment_counts):
190
+ end_frame = min((i+1) * segment_duration * fps, frame_count)
191
+
192
+ _indices = []
193
+ while _idx < len(indices) and indices[_idx] < end_frame and _idx < len(indices):
194
+ _indices.append(indices[_idx])
195
+ _idx += 1
196
+ segment_vis_indices_list.append(_indices)
197
+ clip_start_sec = i * segment_duration
198
+ clip_end_sec = min(clip_start_sec + segment_duration, video_duration)
199
+
200
+ # get the audio indices for the current clip
201
+ overlap = get_overlap([clip_start_sec, clip_end_sec], [audio_start_sec, audio_end_sec])
202
+ if overlap is not None:
203
+ aud_sample_end_idx = round((overlap[1] - audio_start_sec) * stft_frames_per_second)
204
+ segment_aud_indices_list.append([aud_sample_start_idx, aud_sample_end_idx])
205
+ aud_sample_start_idx = aud_sample_end_idx
206
+ else:
207
+ segment_aud_indices_list.append([])
208
+ frames = {}
209
+ frame_times = {}
210
+ for index in indices:
211
+ if index in frames:
212
+ continue
213
+ vidcap.set(cv2.CAP_PROP_POS_FRAMES, index)
214
+ success, frame = vidcap.read()
215
+ if not success:
216
+ print(f"Failed to read frame {index} from video '{video_path}'. Skipped.")
217
+ continue
218
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
219
+ frames[index] = PIL.Image.fromarray(frame)
220
+ frame_times[index] = index / fps
221
+
222
+ output_frames = [frames[index] for index in indices if index in frames]
223
+ output_frame_times = [frame_times[index] for index in indices if index in frame_times]
224
+
225
+ video_info = {}
226
+ if config.load_audio_in_video and config.interleaved_vis_aud_in_video and aud_feature is not None:
227
+ new_segment_vis_indices_list = []
228
+ processed_frame_index = 0
229
+ for i, segment_indices in enumerate(segment_vis_indices_list):
230
+ new_segment_vis_indices_list.append([])
231
+ for index in segment_indices:
232
+ if index in frames:
233
+ new_segment_vis_indices_list[-1].append(processed_frame_index)
234
+ processed_frame_index += 1
235
+ segment_vis_indices_list = new_segment_vis_indices_list
236
+
237
+ video_info["segment_vis_indices_list"] = segment_vis_indices_list
238
+ video_info["segment_aud_indices_list"] = segment_aud_indices_list
239
+ video_info['expected_frame_count'] = len(indices)
240
+ video_info['video_path'] = video_path
241
+ if audio_info is not None:
242
+ audio_info['video_path'] = video_path
243
+ video_info['has_audio'] = aud_feature is not None
244
+ video_info['video_duration'] = video_duration
245
+ video_info['audio_info'] = audio_info
246
+
247
+ # calculate the time of each frame
248
+ video_info['video_frame_times'] = output_frame_times
249
+
250
+ return output_frames, aud_feature, video_info
251
+
252
+
253
+ def _extract_video(video: Video, config: PretrainedConfig) -> List[PIL.Image.Image]:
254
+ num_frames = config.num_video_frames
255
+ aud_fea = None
256
+
257
+ if getattr(config, "fps") != 0:
258
+ print("Extracting frames from video with specified FPS is not supported yet. Ignored.")
259
+
260
+ if isinstance(video.path, BytesIO):
261
+ frames, aud_fea, video_info = _load_video_bytesio(
262
+ video.path, num_frames=num_frames, config=config, load_aud=config.load_audio_in_video
263
+ )
264
+ else:
265
+ frames, aud_fea, video_info = _load_video(
266
+ video.path, num_frames=num_frames, config=config, load_aud=config.load_audio_in_video
267
+ )
268
+
269
+ if config.load_audio_in_video:
270
+ return frames, aud_fea, video_info
271
+ else:
272
+ return frames, video_info
273
+
274
+
275
+ def soundFile_read_audio(audio_file, offset=None, duration=None, dtype='float32'):
276
+ if dtype not in ['int32', 'float32']:
277
+ print("audio dtype must be int32 or float32. Default to float32")
278
+ dtype = 'float32'
279
+ # return read audio and its sample rate
280
+ if isinstance(audio_file, bytes):
281
+ audio_file = io.BytesIO(audio_file)
282
+ with sf.SoundFile(audio_file, 'r') as f:
283
+ sample_rate = f.samplerate
284
+ if offset is not None and offset > 0:
285
+ f.seek(int(offset * sample_rate))
286
+ if duration is not None and duration > 0:
287
+ samples = f.read(int(duration * sample_rate), dtype=dtype)
288
+ else:
289
+ samples = f.read(dtype=dtype)
290
+ return samples, sample_rate
291
+
292
+ def load_audio_from_tar(tar_file, audio_file):
293
+ with tarfile.open(tar_file, 'r') as tar:
294
+ audio_member = tar.getmember(audio_file)
295
+ audio_file = tar.extractfile(audio_member)
296
+ return librosa.load(audio_file)
297
+
298
+ def _load_audio_file(audio_path: str, config: PretrainedConfig):
299
+ # Load video frames from a directory
300
+ if audio_path is None:
301
+ return None
302
+
303
+ dirname = os.path.dirname(audio_path)
304
+ filename = os.path.basename(audio_path)
305
+
306
+ if dirname.endswith(".tar"):
307
+ speech, sample_rate = load_audio_from_tar(dirname, filename)
308
+ else:
309
+ sample_rate = config.audio_sampling_rate
310
+ speech = whisper.load_audio(audio_path, sr=sample_rate)
311
+
312
+ return speech, sample_rate
313
+
314
+
315
+ def _load_audio(audio: Union[str, dict], config: PretrainedConfig):
316
+ if isinstance(audio, str):
317
+ return _load_audio_file(audio, config)
318
+ elif isinstance(audio, dict):
319
+ audio_sample = audio['sample']
320
+ if isinstance(audio_sample, (bytes, io.BytesIO)):
321
+ offset = audio.get('offset', None)
322
+ duration = audio.get('duration', None)
323
+ dtype = audio.get('dtype', 'float32')
324
+ return soundFile_read_audio(
325
+ audio_sample, offset=offset, duration=duration, dtype=dtype
326
+ )
327
+ elif isinstance(audio_sample, np.ndarray):
328
+ return audio_sample, audio.get('sample_rate')
329
+ else:
330
+ raise ValueError(f"Expect the loaded audio to be a processed numpy array or raw bytes. Got {type(audio_sample)}")
331
+ else:
332
+ raise ValueError(f"Expect input to be a path string or dict. Got {type(audio)}")
333
+
334
+ def _whisper_process(audio, sample_rate, audio_chunk_length, max_chunks_per_file):
335
+ outputs = []
336
+ num_audio_chunks = 0
337
+
338
+ chunk_length = audio_chunk_length * sample_rate
339
+ for i in range(0, len(audio), chunk_length):
340
+ chunk = audio[i : i + chunk_length]
341
+ chunk = whisper.pad_or_trim(chunk)
342
+ if chunk.dtype != np.float32:
343
+ chunk = chunk.astype(np.float32)
344
+ mel = whisper.log_mel_spectrogram(chunk, n_mels=128)
345
+ num_audio_chunks+=1
346
+ outputs.append(mel)
347
+ if num_audio_chunks == max_chunks_per_file:
348
+ break
349
+
350
+ frames = torch.stack(outputs, dim=0)
351
+ return frames.numpy().tolist()
352
+
353
+ def _load_speech(speech, config: PretrainedConfig):
354
+ if type(speech) == str:
355
+ speech_path = speech
356
+ else:
357
+ speech_path = speech.path
358
+
359
+ # Load video frames from a directory
360
+ if speech_path is None:
361
+ return None
362
+ speech_outputs = []
363
+
364
+ if config.audio_chunk_length and not (type(config.audio_chunk_length) == str and "max" in config.audio_chunk_length):
365
+ try:
366
+ config.audio_chunk_length = int(config.audio_chunk_length)
367
+ except Exception as e:
368
+ print(f"Error setting audio_chunk_length: {e}")
369
+ raise e
370
+
371
+ audio_n_samples_limit = config.audio_chunk_length * config.audio_sampling_rate
372
+
373
+ def load_wav(speech_path):
374
+ speech, sr = librosa.load(speech_path, sr=config.audio_sampling_rate)
375
+ cur_max_length = speech.shape[0]
376
+ ori_audio_duration = cur_max_length / sr
377
+ return speech, ori_audio_duration
378
+
379
+ def get_audio(speech, audio_n_samples):
380
+
381
+ if type(speech) == decord.audio_reader.AudioReader:
382
+ ori_n_samples = speech.shape[1]
383
+ else:
384
+ ori_n_samples = speech.shape[0]
385
+
386
+ # random audio smaple
387
+ audio_start_sample_id = 0
388
+ audio_end_sample_id = ori_n_samples
389
+
390
+
391
+ load_max_audio = type(config.audio_chunk_length) == str and "max" in config.audio_chunk_length
392
+ if hasattr(config, 'random_audio_sample') and not load_max_audio:
393
+ if ori_n_samples > audio_n_samples:
394
+ audio_start_sample_id = random.randint(0, ori_n_samples - audio_n_samples)
395
+ audio_end_sample_id = audio_start_sample_id + audio_n_samples
396
+ else:
397
+ if load_max_audio:
398
+ if "_" in config.audio_chunk_length:
399
+ max_audio_chunk_length = int(config.audio_chunk_length.split("_")[1])
400
+ max_audio_n_samples = max_audio_chunk_length * config.audio_sampling_rate
401
+ audio_n_samples = min(ori_n_samples, max_audio_n_samples)
402
+ audio_end_sample_id = audio_n_samples
403
+ else:
404
+ audio_n_samples = ori_n_samples
405
+ audio_end_sample_id = audio_n_samples
406
+ else:
407
+ audio_end_sample_id = min(audio_n_samples, ori_n_samples)
408
+
409
+ if type(speech) == decord.audio_reader.AudioReader:
410
+ speech = speech[audio_start_sample_id:audio_end_sample_id].asnumpy()[0]
411
+ else:
412
+ speech = speech[audio_start_sample_id:audio_end_sample_id]
413
+
414
+
415
+ return speech, audio_n_samples, audio_start_sample_id, audio_end_sample_id
416
+
417
+ if isinstance(speech_path, dict):
418
+ if "offset" in speech_path:
419
+ speech, ori_sample_rate = _load_audio(speech_path, config)
420
+
421
+ else:
422
+ speech = speech_path["sample"]
423
+ ori_sample_rate = speech_path["sample_rate"]
424
+
425
+ # resample the speech based on current sample rate
426
+ speech = librosa.resample(speech, orig_sr=ori_sample_rate, target_sr=config.audio_sampling_rate)
427
+ # variable audio sequence lengths
428
+ ori_audio_duration = speech.shape[0] / config.audio_sampling_rate
429
+ speech, audio_n_samples, audio_start_sample_id, audio_end_sample_id = get_audio(speech, audio_n_samples_limit)
430
+
431
+ elif isinstance(speech_path, BytesIO):
432
+ if speech.extension == ".wav":
433
+ # speech, sr = librosa.load(speech_path, sr=config.audio_sampling_rate)
434
+ # ori_audio_duration = speech.shape[0] / sr
435
+ speech, ori_audio_duration = load_wav(speech_path)
436
+ speech, audio_n_samples, audio_start_sample_id, audio_end_sample_id = get_audio(speech, audio_n_samples_limit)
437
+ else:
438
+ raise ValueError(f"Unsupported audio extension: {speech.extension}")
439
+
440
+ elif ".mat" in speech_path or ".ark" in speech_path:
441
+ rate, speech = kaldiio.load_mat(speech_path)
442
+ speech = librosa.resample(speech, orig_sr=rate, target_sr=config.audio_sampling_rate)
443
+ speech, audio_n_samples, audio_start_sample_id, audio_end_sample_id = get_audio(speech, audio_n_samples_limit)
444
+ ori_audio_duration = speech.shape[0] / config.audio_sampling_rate
445
+ elif ".mp4" in speech_path:
446
+ # Load audio from video file
447
+ ar = AudioReader(speech_path, ctx=cpu(0), sample_rate=config.audio_sampling_rate, mono=True)
448
+ cur_max_length = ar.shape[1]
449
+ ori_audio_duration = cur_max_length / config.audio_sampling_rate
450
+ speech, audio_n_samples, audio_start_sample_id, audio_end_sample_id = get_audio(ar, audio_n_samples_limit)
451
+ else:
452
+ assert os.path.exists(speech_path), f"File {speech_path} does not exist"
453
+ speech, ori_audio_duration = load_wav(speech_path)
454
+ speech, audio_n_samples, audio_start_sample_id, audio_end_sample_id = get_audio(speech, audio_n_samples_limit)
455
+
456
+ # convert to float
457
+ speech = speech.astype(np.float32)
458
+ audio_n_samples = int(np.ceil(speech.shape[0] / (config.audio_sampling_rate * 30)) * (config.audio_sampling_rate * 30))
459
+
460
+ speech = whisper.pad_or_trim(speech, length=audio_n_samples) # we don't pad or trim here, instead, we pad based on the max length of all audio samples in the batch size later
461
+
462
+ new_audio_chunk_length = int(audio_n_samples // config.audio_sampling_rate)
463
+ audio_start_sec = audio_start_sample_id / config.audio_sampling_rate
464
+ audio_end_sample_sec = audio_end_sample_id / config.audio_sampling_rate
465
+
466
+ audio_info = {}
467
+ audio_info['new_audio_chunk_length'] = new_audio_chunk_length
468
+ audio_info['new_audio_n_samples'] = audio_n_samples
469
+ audio_info['ori_audio_duration'] = ori_audio_duration
470
+ audio_info['audio_start_sec'] = audio_start_sec
471
+ audio_info['audio_end_sample_sec'] = audio_end_sample_sec
472
+
473
+ return speech, audio_info
474
+
475
+ def _extract_speech(speech: Speech, config: PretrainedConfig):
476
+ frames, audio_info = _load_speech(speech, config)
477
+ return frames, audio_info
478
+
479
+ _extract_sound = _extract_speech
480
+ def extract_media(
481
+ messages: List[Dict[str, Any]],
482
+ config: Optional[PretrainedConfig] = None,
483
+ draft: bool = False,
484
+ ) -> Dict[str, List[Any]]:
485
+ media = defaultdict(list)
486
+
487
+ if not hasattr(config, "load_audio_in_video"):
488
+ print(f"Warning: load_audio_in_video not in config, set to False")
489
+ config.load_audio_in_video = False
490
+
491
+ for message in messages:
492
+ text = ""
493
+ for part in make_list(message["value"]):
494
+ if isinstance(part, str):
495
+ for token in MEDIA_TOKENS.values():
496
+ if token in part:
497
+ print(f"Media token '{token}' found in text: '{part}'. Removed.")
498
+ part = part.replace(token, "").strip()
499
+ text += part
500
+ elif isinstance(part, (Image, PIL.Image.Image)):
501
+ if draft:
502
+ media["image"].append(part)
503
+ else:
504
+ media["image"].append(_extract_image(part))
505
+ text += MEDIA_TOKENS["image"]
506
+ elif isinstance(part, Video):
507
+ if draft:
508
+ media["video"].append(part)
509
+ else:
510
+ if config.load_audio_in_video:
511
+ output, aud_fea, video_info = _extract_video(part, config)
512
+ media["video"].append(output)
513
+ media["video_info"].append(video_info)
514
+ if aud_fea is not None:
515
+ media["sound"].append(aud_fea)
516
+ media["audio_info"].append(video_info['audio_info'])
517
+ text += MEDIA_TOKENS["sound"]
518
+ else:
519
+ output, video_info = _extract_video(part, config)
520
+ media["video"].append(output)
521
+ media["video_info"].append(video_info)
522
+ text += MEDIA_TOKENS["video"]
523
+ elif isinstance(part, Speech):
524
+ if draft:
525
+ if config.unified_audio_encoder:
526
+ media["sound"].append(part)
527
+ text += MEDIA_TOKENS["sound"]
528
+ else:
529
+ media["speech"].append(part)
530
+ text += MEDIA_TOKENS["speech"]
531
+ else:
532
+ output, audio_info = _extract_speech(part, config)
533
+ if output is not None:
534
+ if config.unified_audio_encoder:
535
+ media["sound"].append(output)
536
+ text += MEDIA_TOKENS["sound"]
537
+ else:
538
+ media["speech"].append(output)
539
+ text += MEDIA_TOKENS["speech"]
540
+ media["audio_info"].append(audio_info)
541
+ elif isinstance(part, Sound):
542
+ if draft:
543
+ media["sound"].append(part)
544
+ text += MEDIA_TOKENS["sound"]
545
+ else:
546
+ output, audio_info = _extract_sound(part, config)
547
+ if output is not None:
548
+ media["sound"].append(output)
549
+ media["audio_info"].append(audio_info)
550
+ text += MEDIA_TOKENS["sound"]
551
+ else:
552
+ print(f"part: {part}")
553
+ raise ValueError(f"Unsupported prompt part type: {type(part)}")
554
+ message["value"] = text
555
+ return media
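`media.py` above defines the media wrappers (`Image`, `Video`, `Speech`, `Sound`) and `extract_media`, which decodes each attachment, appends the matching `MEDIA_TOKENS` placeholder to the message text, and collects the decoded frames and audio features. A hedged usage sketch follows; the video path is illustrative, and the config is assumed to already carry the fields this loader reads (`num_video_frames`, `fps`, `load_audio_in_video`, and the `audio_*` settings).

```python
from transformers import AutoConfig

from media import Video, extract_media

# Assumption: the published config defines the media-related fields used by media.py.
config = AutoConfig.from_pretrained("nvidia/omnivinci", trust_remote_code=True)

messages = [{"value": [Video("demo.mp4"), "What happens in this clip?"]}]  # demo.mp4 is illustrative
media = extract_media(messages, config)

print(messages[0]["value"])    # text now contains "<vila/video>" (plus "<sound>" if audio was loaded)
print(len(media["video"][0]))  # number of frames sampled from the video
```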
media_encoder.py ADDED
@@ -0,0 +1,955 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from functools import partial
17
+ from typing import Any, Dict, List, Optional, Tuple
18
+
19
+ import torch
20
+ from torch import nn
21
+ from torch.nn import Module, ModuleList
22
+ import numpy as np
23
+ from einops import rearrange, repeat
24
+ from torch.cuda.amp import autocast
25
+ from torch import nn, einsum, broadcast_tensors, Tensor
26
+ from beartype import beartype
27
+ from beartype.typing import Literal, Union, Optional
28
+ from math import pi, log
29
+ import math
30
+
31
+
32
+ class CacheFeatures(object):
33
+ def __init__(self, value, type):
34
+ self.value = value
35
+ self.type = type
36
+ def my_to(self, device, dtype):
37
+ self.value['features'] = self.value['features'].to(device, dtype) if 'features' in self.value and self.value['features'] is not None else None
38
+ return self
39
+ def __call__(self):
40
+ return self.value
41
+
42
+ def exists(val):
43
+ return val is not None
44
+
45
+ def default(val, d):
46
+ return val if exists(val) else d
47
+
48
+ # broadcat, as tortoise-tts was using it
49
+
50
+ def broadcat(tensors, dim = -1):
51
+ broadcasted_tensors = broadcast_tensors(*tensors)
52
+
53
+ def pool(x: torch.Tensor, size: int, dim: int) -> torch.Tensor:
54
+ # return x.view(x.shape[:dim] + (-1, size) + x.shape[dim + 1 :]).mean(dim + 1)
55
+ # Reshape x to group elements along the specified dimension into chunks of 'size', then average over those chunks.
56
+
57
+ # Check if the dimension is divisible by the pool size, if not pad with mean values
58
+ if x.shape[dim] % size != 0:
59
+ print(f"Warning: dimension {dim} with size {x.shape[dim]} is not divisible by pool size {size}, padding with mean values")
60
+ remainder = x.shape[dim] % size
61
+ pad_len = size - remainder
62
+
63
+ # Get the mean of the last few elements along the dimension to be pooled
64
+ last_elements = x.narrow(dim, x.shape[dim] - remainder, remainder)
65
+ mean_value = last_elements.mean()
66
+
67
+ # Create padding tensor with the same shape as x except for the dimension being pooled
68
+ pad_shape = list(x.shape)
69
+ pad_shape[dim] = pad_len
70
+ padding = torch.ones(pad_shape, device=x.device, dtype=x.dtype) * mean_value
71
+
72
+ # Concatenate the original tensor with the padding along the specified dimension
73
+ x = torch.cat([x, padding], dim=dim)
74
+
75
+ shape_before = x.shape[:dim]
76
+ shape_after = x.shape[dim + 1 :]
77
+ new_shape = shape_before + (-1, size) + shape_after
78
+ x_reshaped = x.view(new_shape)
79
+ return x_reshaped.mean(dim + 1)
80
+
81
+ def rotate_half(x):
82
+ x = rearrange(x, '... (d r) -> ... d r', r = 2)
83
+ x1, x2 = x.unbind(dim = -1)
84
+ x = torch.stack((-x2, x1), dim = -1)
85
+ return rearrange(x, '... d r -> ... (d r)')
86
+
87
+
88
+ def apply_rotary_emb(freqs, t, start_index = 0, scale = 1., seq_dim = -2):
89
+ with torch.amp.autocast(device_type='cuda', enabled=False):
90
+ ori_dtype = t.dtype
91
+ embed_dtype = torch.float64
92
+ t = t.to(embed_dtype)
93
+ if t.ndim == 3:
94
+ seq_len = t.shape[seq_dim]
95
+ freqs = freqs[-seq_len:].to(t)
96
+
97
+ rot_dim = freqs.shape[-1]
98
+ end_index = start_index + rot_dim
99
+
100
+ assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
101
+
102
+ t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
103
+ t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
104
+ return torch.cat((t_left, t, t_right), dim = -1).to(ori_dtype)
105
+
106
+ class MaxTimeContinuousTimeRotaryEmbedding(nn.Module):
107
+ def __init__(self, dim, max_time, period_mode="shortest", device=None):
108
+ super().__init__()
109
+ assert dim % 2 == 0, "RoPE embedding dimension must be even"
110
+
111
+ # Set max period = max_time
112
+ if period_mode == "shortest": # shortest period is max_time
113
+ base = 5
114
+ inv_freq = 2 * math.pi / (max_time * (base ** (torch.arange(0, dim // 2).float() / (dim // 2))))
115
+ elif period_mode == "longest": # longest period is max_time ** ((dim // 2) / (dim // 2 - 1))
116
+ theta = max_time ** ((dim // 2) / (dim // 2 - 1))
117
+ inv_freq = 2 * math.pi / ((theta ** (torch.arange(0, dim // 2).float() / (dim // 2))))
118
+ else:
119
+ raise ValueError(f"Invalid period mode: {period_mode}")
120
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
121
+
122
+ def forward(self, time_values: torch.Tensor):
123
+ """
124
+ time_values: [batch_size, seq_len], in seconds (or any continuous unit)
125
+ Returns:
126
+ cos, sin: [batch_size, seq_len, dim]
127
+ """
128
+ batch_size, seq_len = time_values.shape
129
+ time_values_exp = time_values[:, None, :] # [batch, 1, seq_len]
130
+ freqs = (self.inv_freq[None, :, None] @ time_values_exp).transpose(1, 2) # [batch, seq_len, dim//2]
131
+ # emb = torch.cat([freqs, freqs], dim=-1) # [batch, seq_len, dim]
132
+ # return emb.cos(), emb.sin()
133
+ return freqs
134
+
135
+ def get_axial_freqs(self, *dims):
136
+ Colon = slice(None)
137
+ all_freqs = []
138
+
139
+ for ind, dim in enumerate(dims):
140
+ pos = torch.arange(dim, device = self.device)
141
+
142
+ freqs = self.forward(pos, seq_len = dim)
143
+
144
+ all_axis = [None] * len(dims)
145
+ all_axis[ind] = Colon
146
+
147
+ new_axis_slice = (Ellipsis, *all_axis, Colon)
148
+ all_freqs.append(freqs[new_axis_slice])
149
+
150
+ all_freqs = broadcast_tensors(*all_freqs)
151
+ return torch.cat(all_freqs, dim = -1)
152
+
153
+
154
+
155
+
156
+ class RotaryEmbedding(Module):
157
+ @beartype
158
+ def __init__(
159
+ self,
160
+ dim,
161
+ custom_freqs: Optional[Tensor] = None,
162
+ freqs_for: Union[Literal['lang', 'pixel', 'constant']] = 'lang',
163
+ theta = 10000,
164
+ max_freq = 10,
165
+ num_freqs = 1,
166
+ learned_freq = False,
167
+ use_xpos = False,
168
+ xpos_scale_base = 512,
169
+ interpolate_factor = 1.,
170
+ theta_rescale_factor = 1.,
171
+ seq_before_head_dim = False,
172
+ cache_if_possible = True,
173
+ max_time = None
174
+ ):
175
+ super().__init__()
176
+
177
+ self.dim = dim
178
+ self.freqs_for = freqs_for
179
+ self.max_freq = max_freq
180
+ self.num_freqs = num_freqs
181
+ self.learned_freq = learned_freq
182
+ self.use_xpos = use_xpos
183
+ self.xpos_scale_base = xpos_scale_base
184
+ self.interpolate_factor = interpolate_factor
185
+ self.theta_rescale_factor = theta_rescale_factor
186
+ self.cache_if_possible = cache_if_possible
187
+ self.max_time = max_time
188
+
189
+ self.tmp_store('cached_freqs', None)
190
+ self.tmp_store('cached_scales', None)
191
+
192
+ # Adjust theta to avoid angle wrapping after large times
193
+ if exists(max_time) and freqs_for == 'lang':
194
+ # Make sure highest frequency completes 1 full rotation over max time
195
+ # theta = base of exponent: higher theta → lower frequency range
196
+ # max_time * (1/theta^(0)) = 2pi => theta = max_time / (2pi)
197
+ theta = max_time / (2 * pi)
198
+
199
+ theta *= theta_rescale_factor ** (dim / (dim - 2))
200
+
201
+ self.theta = theta
202
+
203
+ if exists(custom_freqs):
204
+ freqs = custom_freqs
205
+ elif freqs_for == 'lang':
206
+ freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
207
+ elif freqs_for == 'pixel':
208
+ freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
209
+ elif freqs_for == 'constant':
210
+ freqs = torch.ones(num_freqs).float()
211
+
212
+ self.freqs = nn.Parameter(freqs, requires_grad = learned_freq)
213
+
214
+ self.learned_freq = learned_freq
215
+
216
+ # dummy for device
217
+
218
+ self.tmp_store('dummy', torch.tensor(0))
219
+
220
+ # default sequence dimension
221
+
222
+ self.seq_before_head_dim = seq_before_head_dim
223
+ self.default_seq_dim = -3 if seq_before_head_dim else -2
224
+
225
+ # interpolation factors
226
+
227
+ assert interpolate_factor >= 1.
228
+ self.interpolate_factor = interpolate_factor
229
+
230
+ # xpos
231
+ if not use_xpos:
232
+ self.tmp_store('scale', None)
233
+ return
234
+
235
+ scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
236
+ self.scale_base = xpos_scale_base
237
+ self.tmp_store('scale', scale)
238
+
239
+ # add apply_rotary_emb as static method
240
+
241
+ self.apply_rotary_emb = staticmethod(apply_rotary_emb)
242
+
243
+ @property
244
+ def device(self):
245
+ return self.dummy.device
246
+
247
+ def tmp_store(self, key, value):
248
+ self.register_buffer(key, value, persistent = False)
249
+
250
+ def get_seq_pos(self, seq_len, device, dtype, offset = 0):
251
+ return (torch.arange(seq_len, device = device, dtype = dtype) + offset) / self.interpolate_factor
252
+
253
+ def rotate_queries_or_keys(self, t, seq_dim = None, offset = 0):
254
+ seq_dim = default(seq_dim, self.default_seq_dim)
255
+
256
+ assert not self.use_xpos, 'you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings'
257
+
258
+ device, dtype, seq_len = t.device, t.dtype, t.shape[seq_dim]
259
+
260
+ freqs = self.forward(self.get_seq_pos(seq_len, device = device, dtype = dtype, offset = offset), seq_len = seq_len, offset = offset)
261
+
262
+ if seq_dim == -3:
263
+ freqs = rearrange(freqs, 'n d -> n 1 d')
264
+
265
+ return apply_rotary_emb(freqs, t, seq_dim = seq_dim)
266
+
267
+ def rotate_queries_with_cached_keys(self, q, k, seq_dim = None, offset = 0):
268
+ seq_dim = default(seq_dim, self.default_seq_dim)
269
+
270
+ q_len, k_len = q.shape[seq_dim], k.shape[seq_dim]
271
+ assert q_len <= k_len
272
+
273
+ rotated_q = self.rotate_queries_or_keys(q, seq_dim = seq_dim, offset = k_len - q_len + offset)
274
+ rotated_k = self.rotate_queries_or_keys(k, seq_dim = seq_dim, offset = offset)
275
+
276
+ rotated_q = rotated_q.type(q.dtype)
277
+ rotated_k = rotated_k.type(k.dtype)
278
+
279
+ return rotated_q, rotated_k
280
+
281
+ def rotate_queries_and_keys(self, q, k, seq_dim = None):
282
+ seq_dim = default(seq_dim, self.default_seq_dim)
283
+
284
+ assert self.use_xpos
285
+ device, dtype, seq_len = q.device, q.dtype, q.shape[seq_dim]
286
+
287
+ seq = self.get_seq_pos(seq_len, dtype = dtype, device = device)
288
+
289
+ freqs = self.forward(seq, seq_len = seq_len)
290
+ scale = self.get_scale(seq, seq_len = seq_len).to(dtype)
291
+
292
+ if seq_dim == -3:
293
+ freqs = rearrange(freqs, 'n d -> n 1 d')
294
+ scale = rearrange(scale, 'n d -> n 1 d')
295
+
296
+ rotated_q = apply_rotary_emb(freqs, q, scale = scale, seq_dim = seq_dim)
297
+ rotated_k = apply_rotary_emb(freqs, k, scale = scale ** -1, seq_dim = seq_dim)
298
+
299
+ rotated_q = rotated_q.type(q.dtype)
300
+ rotated_k = rotated_k.type(k.dtype)
301
+
302
+ return rotated_q, rotated_k
303
+
304
+ @beartype
305
+ def get_scale(
306
+ self,
307
+ t: Tensor,
308
+ seq_len: Optional[int] = None,
309
+ offset = 0
310
+ ):
311
+ assert self.use_xpos
312
+
313
+ should_cache = (
314
+ self.cache_if_possible and
315
+ exists(seq_len)
316
+ )
317
+
318
+ if (
319
+ should_cache and \
320
+ exists(self.cached_scales) and \
321
+ (seq_len + offset) <= self.cached_scales.shape[0]
322
+ ):
323
+ return self.cached_scales[offset:(offset + seq_len)]
324
+
325
+ scale = 1.
326
+ if self.use_xpos:
327
+ power = (t - len(t) // 2) / self.scale_base
328
+ scale = self.scale ** rearrange(power, 'n -> n 1')
329
+ scale = torch.cat((scale, scale), dim = -1)
330
+
331
+ if should_cache:
332
+ self.tmp_store('cached_scales', scale)
333
+
334
+ return scale
335
+
336
+ def get_axial_freqs(self, *dims):
337
+ Colon = slice(None)
338
+ all_freqs = []
339
+
340
+ for ind, dim in enumerate(dims):
341
+ if self.freqs_for == 'pixel':
342
+ pos = torch.linspace(-1, 1, steps = dim, device = self.device)
343
+ else:
344
+ pos = torch.arange(dim, device = self.device)
345
+
346
+ freqs = self.forward(pos, seq_len = dim)
347
+
348
+ all_axis = [None] * len(dims)
349
+ all_axis[ind] = Colon
350
+
351
+ new_axis_slice = (Ellipsis, *all_axis, Colon)
352
+ all_freqs.append(freqs[new_axis_slice])
353
+
354
+ all_freqs = broadcast_tensors(*all_freqs)
355
+ return torch.cat(all_freqs, dim = -1)
356
+
357
+ def forward(
358
+ self,
359
+ t: Tensor,
360
+ seq_len = None,
361
+ offset = 0
362
+ ):
363
+ should_cache = (
364
+ self.cache_if_possible and \
365
+ not self.learned_freq and \
366
+ exists(seq_len) and \
367
+ self.freqs_for != 'pixel'
368
+ )
369
+
370
+ if (
371
+ should_cache and \
372
+ exists(self.cached_freqs) and \
373
+ (offset + seq_len) <= self.cached_freqs.shape[0]
374
+ ):
375
+ return self.cached_freqs[offset:(offset + seq_len)].detach()
376
+
377
+ freqs = self.freqs
378
+
379
+ # Scale time to keep t * freq <= 2pi
380
+ if hasattr(self, 'max_time') and self.max_time is not None:
381
+ t = t / self.max_time * (2 * pi)
382
+
383
+ freqs = einsum('..., f -> ... f', t.type(freqs.dtype), freqs)
384
+ freqs = repeat(freqs, '... n -> ... (n r)', r = 2)
385
+
386
+ if should_cache:
387
+ self.tmp_store('cached_freqs', freqs.detach())
388
+
389
+ return freqs
390
+
391
+ class BaseEncoder(nn.Module):
392
+ def __init__(self, parent: nn.Module) -> None:
393
+ super().__init__()
394
+ self._parent = [parent]
395
+
396
+ @property
397
+ def parent(self) -> nn.Module:
398
+ return self._parent[0]
399
+
400
+
401
+ class BasicImageEncoder(BaseEncoder):
402
+ def __init__(
403
+ self,
404
+ parent: torch.nn.Module,
405
+ start_tokens: Optional[str] = None,
406
+ end_tokens: Optional[str] = "\n",
407
+ ) -> None:
408
+ super().__init__(parent)
409
+ end_tokens = None if end_tokens == "None" else end_tokens
410
+ self.start_tokens = start_tokens
411
+ self.end_tokens = end_tokens
412
+
413
+ def embed_tokens(self, tokens: Optional[str]) -> Optional[torch.Tensor]:
414
+ if tokens is None:
415
+ return None
416
+ token_ids = self.parent.tokenizer(tokens).input_ids
417
+ token_ids = torch.tensor(token_ids, device=self.parent.device)
418
+ return self.parent.llm_model_embed_tokens(token_ids)
419
+
420
+ def _process_features(
421
+ self,
422
+ features: torch.Tensor,
423
+ start_token_embeds: Optional[torch.Tensor],
424
+ end_token_embeds: Optional[torch.Tensor],
425
+ ) -> torch.Tensor:
426
+ if start_token_embeds is not None:
427
+ features = torch.cat([start_token_embeds, features], dim=0)
428
+ if end_token_embeds is not None:
429
+ features = torch.cat([features, end_token_embeds], dim=0)
430
+ return features
431
+
432
+ def forward(self, images: List[torch.Tensor], config: Dict[str, Any], mm_info: dict) -> List[torch.Tensor]:
433
+ images = torch.stack(images, dim=0)
434
+ features = self.parent.encode_images(images, block_sizes=config.get("block_sizes"))
435
+ process_features = partial(
436
+ self._process_features,
437
+ start_token_embeds=self.embed_tokens(self.start_tokens),
438
+ end_token_embeds=self.embed_tokens(self.end_tokens),
439
+ )
440
+ return [process_features(f) for f in features]
441
+
442
+
443
+ class BasicVideoEncoder(BaseEncoder):
444
+ def __init__(
445
+ self,
446
+ parent: torch.nn.Module,
447
+ start_tokens: Optional[str] = None,
448
+ end_tokens: Optional[str] = "\n",
449
+ ) -> None:
450
+ super().__init__(parent)
451
+ end_tokens = None if end_tokens == "None" else end_tokens
452
+ self.start_tokens = start_tokens
453
+ self.end_tokens = end_tokens
454
+
455
+ def embed_tokens(self, tokens: Optional[str]) -> Optional[torch.Tensor]:
456
+ if tokens is None:
457
+ return None
458
+ token_ids = self.parent.tokenizer(tokens).input_ids
459
+ token_ids = torch.tensor(token_ids, device=self.parent.device)
460
+ return self.parent.llm_model_embed_tokens(token_ids)
461
+
462
+ def _process_features(
463
+ self,
464
+ features: torch.Tensor,
465
+ start_token_embeds: Optional[torch.Tensor],
466
+ end_token_embeds: Optional[torch.Tensor],
467
+ ) -> torch.Tensor:
468
+ if start_token_embeds is not None:
469
+ start_embeds = torch.stack([start_token_embeds] * features.shape[0], dim=0)
470
+ features = torch.cat([start_embeds, features], dim=1)
471
+ if end_token_embeds is not None:
472
+ end_embeds = torch.stack([end_token_embeds] * features.shape[0], dim=0)
473
+ features = torch.cat([features, end_embeds], dim=1)
474
+ return features.flatten(0, 1)
475
+
476
+ def forward(self, videos: List[torch.Tensor], config: Dict[str, Any]) -> List[torch.Tensor]:
477
+ num_frames = [video.shape[0] for video in videos]
478
+ images = torch.cat(videos, dim=0)
479
+ features = self.parent.encode_images(images)
480
+ features = torch.split(features, num_frames)
481
+ process_features = partial(
482
+ self._process_features,
483
+ start_token_embeds=self.embed_tokens(self.start_tokens),
484
+ end_token_embeds=self.embed_tokens(self.end_tokens),
485
+ )
486
+ return [process_features(f) for f in features]
487
+
488
+
489
+
490
+
491
+ class BasicSoundEncoder(BaseEncoder):
492
+ def __init__(
493
+ self,
494
+ parent: torch.nn.Module,
495
+ start_tokens: Optional[str] = None,
496
+ end_tokens: Optional[str] = "\n",
497
+ embed_time = "True",
498
+ trope_theta = 50000,
499
+ trope_dim = 128,
500
+ max_time = None,
501
+ time_embed_type = "pixel",
502
+ period_fix = False,
503
+ ) -> None:
504
+ super().__init__(parent)
505
+ end_tokens = None if end_tokens == "None" else end_tokens
506
+ if embed_time == "True":
507
+ embed_time = True
508
+ elif embed_time == "False":
509
+ embed_time = False
510
+ self.start_tokens = start_tokens
511
+ self.end_tokens = end_tokens
512
+
513
+ if embed_time == "False" or embed_time == False:
514
+ self.embed_time = False
515
+ else:
516
+ self.embed_time = True
517
+ self.time_embed_type = time_embed_type
518
+
519
+ period_mode = None
520
+ if type(period_fix) == str:
521
+ if period_fix == "shortest":
522
+ period_fix = "MTCT"
523
+ period_mode = "shortest"
524
+ elif period_fix == "longest":
525
+ period_fix = "MTCT"
526
+ period_mode = "longest"
527
+
528
+ self.period_fix = period_fix
529
+ self.max_time = max_time
530
+
531
+ if period_fix == "MTCT":
532
+ if period_mode is None:
533
+ self.pos_emb = MaxTimeContinuousTimeRotaryEmbedding(
534
+ dim = trope_dim,
535
+ max_time = max_time,
536
+ )
537
+ else:
538
+ self.pos_emb = MaxTimeContinuousTimeRotaryEmbedding(
539
+ dim = trope_dim,
540
+ max_time = max_time,
541
+ period_mode = period_mode,
542
+ )
543
+
544
+ elif time_embed_type in ["pixel", "lang"]:
545
+ if trope_dim is None and max_time is None:
546
+ raise ValueError("trope_dim or max_time is required when embed_time is True")
547
+ self.pos_emb = RotaryEmbedding(
548
+ dim = trope_dim,
549
+ freqs_for = time_embed_type,
550
+ max_freq = 256,
551
+ max_time = max_time,
552
+ )
553
+ elif time_embed_type == "learned_embed":
554
+ self.time_embed = parent.sound_mm_projector.time_embed
555
+ else:
556
+ raise ValueError(f"Invalid time_embed_type: {time_embed_type}")
557
+
558
+ def embed_tokens(self, tokens: Optional[str]) -> Optional[torch.Tensor]:
559
+ if tokens is None:
560
+ return None
561
+ token_ids = self.parent.tokenizer(tokens).input_ids
562
+ token_ids = torch.tensor(token_ids, device=self.parent.device)
563
+ # return self.parent.llm.model.embed_tokens(token_ids)
564
+ return self.parent.llm_model_embed_tokens(token_ids)
565
+
566
+ def _process_features(
567
+ self,
568
+ features: torch.Tensor,
569
+ start_token_embeds: Optional[torch.Tensor],
570
+ end_token_embeds: Optional[torch.Tensor],
571
+ times: Optional[torch.Tensor] = None,
572
+ time_embed: Optional[torch.Tensor] = None,
573
+ ) -> torch.Tensor:
574
+
575
+ features = features.to(self.parent.device)
576
+ device = features.device
577
+ dtype = features.dtype
578
+
579
+ if self.embed_time:
580
+ device = features.device
581
+ dtype = features.dtype
582
+
583
+ # Handle different embedding types
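+ # "pixel"/"lang": rotary time embedding driven by the absolute timestamp of each audio embedding.
+ # "MTCT": MaxTimeContinuousTimeRotaryEmbedding applied directly to the raw timestamps.
+ # "learned_embed": a learned time embedding from the sound projector is added to the features.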
584
+ if self.time_embed_type in ["pixel", "lang"]:
585
+ times = times.unsqueeze(0)
586
+ new_times = times
587
+ pos_emb = self.pos_emb.to(device)
588
+ if self.period_fix == "True":
589
+ if self.max_time is not None:
590
+ angle = new_times.to(device) / self.max_time * 2 * np.pi
591
+ else:
592
+ angle = new_times.to(device)
593
+ elif self.period_fix == "MTCT":
594
+ freqs = self.pos_emb(new_times.float())
595
+ freqs = freqs.squeeze(0)
596
+ features = apply_rotary_emb(freqs, features)
597
+ else:
598
+ angle = (-new_times * 2 * np.pi).to(device)
599
+
600
+ if not self.period_fix == "MTCT":
601
+ freqs = pos_emb.get_axial_freqs(new_times.shape[0], features.shape[-2]).to(device)
602
+ angle_expanded = angle.unsqueeze(2)
603
+ angle_expanded = angle_expanded.expand(new_times.shape[0], features.shape[-2], freqs.shape[-1])
604
+ freqs = freqs * angle_expanded
605
+ freqs = freqs.squeeze(0)
606
+ # ori_dtype = features.dtype
607
+ # embed_dtype = torch.float32
608
+ # features = features.to(embed_dtype)
609
+ features = apply_rotary_emb(freqs, features)
610
+ # features = features.to(ori_dtype)
611
+ elif self.time_embed_type == "learned_embed": # Learned embedding
612
+ # Add time embeddings to features
613
+ features = features + time_embed
614
+ else:
615
+ raise ValueError(f"Invalid time_embed_type: {self.time_embed_type}")
616
+
617
+ if start_token_embeds is not None:
618
+ features = torch.cat([start_token_embeds, features], dim=0)
619
+ if end_token_embeds is not None:
620
+ features = torch.cat([features, end_token_embeds], dim=0)
621
+ return features
622
+
623
+ def forward(self, sounds: List[torch.Tensor], config: Dict[str, Any], mm_info: dict) -> List[torch.Tensor]:
624
+ # sounds = torch.stack(sounds, dim=0)
625
+ features = self.parent.encode_sound(sounds, mm_info=mm_info)
626
+ process_features = partial(
627
+ self._process_features,
628
+ start_token_embeds=self.embed_tokens(self.start_tokens),
629
+ end_token_embeds=self.embed_tokens(self.end_tokens),
630
+ )
631
+
632
+
633
+ if self.embed_time:
634
+ new_features = []
635
+ device = features[0].device
636
+ fea_count = len(features)
637
+ aud_idx = 0
638
+ bs = len(mm_info["audio_info"])
639
+
640
+ if self.time_embed_type == "learned_embed": # Learned embedding, we need to first collect all times and only do time embedding once
641
+ times_list = []
642
+ for i in range(bs):
643
+ _audio_info = mm_info["audio_info"][i]
644
+ if _audio_info is not None:
645
+ for j in range(len(_audio_info)):
646
+ _feature = features[aud_idx]
647
+ if _audio_info[j] == "dummy":
648
+ times = torch.zeros(_feature.shape[0], device=device, dtype=_feature.dtype)
649
+ else:
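+ # Each audio embedding is assigned the mid-point timestamp of the audio span it covers.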
650
+ audio_chunk_length = _audio_info[j]["new_audio_chunk_length"]
651
+ sec_per_embed = audio_chunk_length / _feature.shape[0]
652
+ audio_start_sec = _audio_info[j]["audio_start_sec"]
653
+ times = [audio_start_sec + i * sec_per_embed + sec_per_embed / 2 for i in range(_feature.shape[0])]
654
+ times = torch.tensor(times).to(device)
655
+ times_list.append(times)
656
+ aud_idx += 1
657
+
658
+ times = torch.stack(times_list, dim=0)
659
+ time_embeds = self.time_embed(times, dtype=features[0].dtype)
660
+
661
+ aud_idx = 0
662
+ for i in range(bs):
663
+ _audio_info = mm_info["audio_info"][i]
664
+ if _audio_info is not None:
665
+ for j in range(len(_audio_info)):
666
+ try:
667
+ _feature = features[aud_idx]
668
+ except Exception as e:
669
+ print(f"Error: {e}. Length of features: {len(features)}. Length of _audio_info: {len(_audio_info)}. Length of _feature: {_feature.shape[0]}")
670
+ raise e
671
+ if _audio_info[j] == "dummy":
672
+ times = torch.zeros(_feature.shape[0], device=device, dtype=_feature.dtype)
673
+ else:
674
+ audio_chunk_length = _audio_info[j]["new_audio_chunk_length"]
675
+ sec_per_embed = audio_chunk_length / _feature.shape[0]
676
+ audio_start_sec = _audio_info[j]["audio_start_sec"]
677
+ times = [audio_start_sec + i * sec_per_embed + sec_per_embed / 2 for i in range(_feature.shape[0])]
678
+ times = torch.tensor(times).to(device)
679
+ if self.time_embed_type == "learned_embed":
680
+ _feature = process_features(_feature, time_embed=time_embeds[aud_idx])
681
+ else:
682
+ _feature = process_features(_feature, times=times)
683
+ new_features.append(_feature)
684
+ aud_idx += 1
685
+
686
+ assert aud_idx == fea_count, "aud_idx: {}, fea_count: {}".format(aud_idx, fea_count)
687
+ features = new_features
688
+ else:
689
+ features = [process_features(f) for f in features]
690
+ return features
691
+
+
694
+ class TSPVideoEncoder(BasicVideoEncoder):
695
+ def __init__(
696
+ self,
697
+ parent: torch.nn.Module,
698
+ pool_sizes: List[Tuple[int, int, int]],
699
+ start_tokens: Optional[str] = None,
700
+ end_tokens: Optional[str] = "\n",
701
+ sep_tokens: Optional[str] = None,
702
+ embed_time: str = "False",
703
+ trope_theta = 50000,
704
+ trope_dim = 128,
705
+ max_time = None,
706
+ time_embed_type = "pixel",
707
+ period_fix = False,
708
+ ) -> None:
709
+ super().__init__(parent, start_tokens=start_tokens, end_tokens=end_tokens)
710
+ self.pool_sizes = pool_sizes
711
+ self.sep_tokens = sep_tokens
712
+
713
+ if embed_time == "False":
714
+ self.embed_time = False
715
+ else:
716
+ self.embed_time = True
717
+ self.time_embed_type = time_embed_type
718
+
719
+ period_mode = None
720
+ if type(period_fix) == str:
721
+ if period_fix == "shortest":
722
+ period_fix = "MTCT"
723
+ period_mode = "shortest"
724
+ elif period_fix == "longest":
725
+ period_fix = "MTCT"
726
+ period_mode = "longest"
727
+
728
+ self.period_fix = period_fix
729
+ self.max_time = max_time
730
+
731
+ if period_fix == "MTCT":
732
+ if period_mode is None:
733
+ self.pos_emb = MaxTimeContinuousTimeRotaryEmbedding(
734
+ dim = trope_dim,
735
+ max_time = max_time,
736
+ )
737
+ else:
738
+ self.pos_emb = MaxTimeContinuousTimeRotaryEmbedding(
739
+ dim = trope_dim,
740
+ max_time = max_time,
741
+ period_mode = period_mode,
742
+ )
743
+
744
+ elif time_embed_type in ["pixel", "lang"]:
745
+ if trope_dim is None and max_time is None:
746
+ raise ValueError("trope_dim or max_time is required when embed_time is True")
747
+
748
+ if time_embed_type == "lang":
749
+ self.pos_emb = RotaryEmbedding(
750
+ dim = trope_dim,
751
+ freqs_for = 'lang',
752
+ theta = trope_theta,
753
+ max_time = max_time,
754
+ )
755
+ elif time_embed_type == "pixel":
756
+ self.pos_emb = RotaryEmbedding(
757
+ dim = trope_dim,
758
+ freqs_for = time_embed_type,
759
+ max_freq = 256
760
+ )
761
+ elif time_embed_type == "learned_embed":
762
+ self.time_embed = parent.mm_projector.time_embed
763
+ else:
764
+ raise ValueError(f"Invalid time_embed_type: {time_embed_type}")
765
+
766
+ def _process_features(
767
+ self,
768
+ inputs: torch.Tensor,
769
+ start_token_embeds: Optional[torch.Tensor],
770
+ end_token_embeds: Optional[torch.Tensor],
771
+ sep_token_embeds: Optional[torch.Tensor],
772
+ times: Optional[torch.Tensor] = None,
773
+ time_embed: Optional[torch.Tensor] = None,
774
+ ) -> torch.Tensor:
775
+ nt, ns = inputs.shape[:2]
776
+ nl = int(ns**0.5)
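+ # ns is assumed to be a perfect square: each frame contributes an nl x nl grid of spatial tokens.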
777
+ outputs = []
778
+ for pool_size in self.pool_sizes:
779
+ features = inputs.view(nt, nl, nl, -1)
780
+ for dim, p in enumerate(pool_size):
781
+ try:
782
+ features = pool(features, p, dim=dim)
783
+ except Exception as e:
784
+ print(f"Error: Pooling failed: {e}")
785
+ print(f"inputs.shape: {inputs.shape}, features.shape: {features.shape}, pool_size: {p}, dim: {dim}")
786
+ raise e
787
+ features = features.flatten(1, 2)
788
+
789
+ if self.embed_time:
790
+ device = features.device
791
+ dtype = features.dtype
792
+ if self.time_embed_type in ["pixel", "lang"]:
793
+ # consider the pooling in self.pool_sizes
794
+ temporal_pool_size = pool_size[0]
795
+ if temporal_pool_size != 1:
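+ # Pool the frame timestamps with the same temporal window as the features so each pooled token keeps an aligned timestamp.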
796
+ if len(times) % temporal_pool_size != 0:
797
+ # pad
798
+ print(f"Warning: length of times: {len(times)} is not a multiple of temporal_pool_size: {temporal_pool_size}")
799
+ remainder = len(times) % temporal_pool_size
800
+ pad_len = temporal_pool_size - remainder
801
+ last_window_mean_times = times[-remainder:].mean()
802
+ times = torch.cat([times, torch.ones(pad_len).to(times.device) * last_window_mean_times])
803
+ new_times = pool(times, temporal_pool_size, 0)
804
+ else:
805
+ new_times = times
806
+
807
+ pos_emb = self.pos_emb.to(device)
808
+ if self.period_fix == "True":
809
+ if self.max_time is not None:
810
+ angle = new_times.to(device) / self.max_time * 2 * np.pi
811
+ else:
812
+ angle = new_times.to(device)
813
+ elif self.period_fix == "MTCT":
814
+ if new_times.ndim == 1:
815
+ new_times = new_times.unsqueeze(0)
816
+ freqs = self.pos_emb(new_times.float())
817
+ freqs = freqs.squeeze(0)
818
+ freqs = freqs.unsqueeze(1)
819
+ features = apply_rotary_emb(freqs, features, seq_dim=0)
820
+ else:
821
+ angle = (-new_times * 2 * np.pi).to(device)
822
+
823
+ if not self.period_fix == "MTCT":
824
+ freqs = pos_emb.get_axial_freqs(new_times.shape[0], features.shape[-2]).to(device)
825
+ angle_expanded = angle.unsqueeze(1).unsqueeze(2)
826
+ angle_expanded = angle_expanded.expand(new_times.shape[0], features.shape[-2], freqs.shape[-1])
827
+ freqs = freqs * angle_expanded
828
+ # ori_dtype = features.dtype
829
+ # embed_dtype = torch.float32
830
+ # features = features.to(embed_dtype)
831
+ features = apply_rotary_emb(freqs, features)
832
+ # features = features.to(ori_dtype)
833
+ elif self.time_embed_type == "learned_embed": # Learned embedding
834
+ # Add time embeddings to features
835
+ features = features + time_embed
836
+ else:
837
+ raise ValueError(f"Invalid time_embed_type: {self.time_embed_type}")
838
+
839
+ features = super()._process_features(
840
+ features,
841
+ start_token_embeds=start_token_embeds,
842
+ end_token_embeds=end_token_embeds,
843
+ )
844
+ if sep_token_embeds is not None:
845
+ features = torch.cat([features, sep_token_embeds], dim=0)
846
+ outputs.append(features)
847
+ return torch.cat(outputs, dim=0)
848
+
849
+ def forward(self, videos: List[torch.Tensor], config: Dict[str, Any], mm_info: dict) -> List[torch.Tensor]:
850
+ cache_feas = []
851
+ cache_feas_index = []
852
+ for _idx in range(len(videos)):
853
+ if type(videos[_idx]) == CacheFeatures:
854
+ cache_feas.append(videos[_idx])
855
+ cache_feas_index.append(_idx)
856
+
857
+ num_frames = [
858
+ _.value['features'].shape[0] if isinstance(_, CacheFeatures) else _.shape[0]
859
+ for _ in videos
860
+ ]
861
+
862
+ features = self.parent.encode_video(videos, mm_info=mm_info, num_frames=num_frames)
863
+ features = torch.split(features, num_frames)
864
+
865
+ process_features = partial(
866
+ self._process_features,
867
+ start_token_embeds=self.embed_tokens(self.start_tokens),
868
+ end_token_embeds=self.embed_tokens(self.end_tokens),
869
+ sep_token_embeds=self.embed_tokens(self.sep_tokens),
870
+ )
871
+
872
+
873
+ if self.embed_time:
874
+ bs = len(mm_info["video_info"])
875
+ vid_idx = 0
876
+ device = features[0].device
877
+
878
+ if self.time_embed_type == "learned_embed":
879
+ # Learned embedding, we need to first collect all times from all videos and only do time embedding once
880
+ times_list = []
881
+ for i in range(bs):
882
+ _video_info = mm_info["video_info"][i]
883
+ if _video_info is not None:
884
+ for j in range(len(_video_info)):
885
+ _feature = features[vid_idx]
886
+ if _video_info[j] == "dummy":
887
+ times = torch.zeros(_feature.shape[0], device=device, dtype=_feature.dtype)
888
+ else:
889
+ times = _video_info[j]["video_frame_times"]
890
+ times = torch.tensor(times).to(device)
891
+
892
+ for pool_size in self.pool_sizes:
893
+ temporal_pool_size = pool_size[0]
894
+ if temporal_pool_size != 1:
895
+ if len(times) % temporal_pool_size != 0:
896
+ # pad
897
+ print(f"Warning: length of times: {len(times)} is not a multiple of temporal_pool_size: {temporal_pool_size}")
898
+ remainder = len(times) % temporal_pool_size
899
+ pad_len = temporal_pool_size - remainder
900
+ last_window_mean_times = times[-remainder:].mean()
901
+ times = torch.cat([times, torch.ones(pad_len).to(times.device) * last_window_mean_times])
902
+ times = pool(times, temporal_pool_size, 0)
903
+
904
+ times_list.append(times)
905
+ vid_idx += 1
906
+
907
+ # pad the times to the same length
908
+ ori_lens = [len(times) for times in times_list]
909
+ max_len = max(ori_lens)
910
+ for i in range(len(times_list)):
911
+ if len(times_list[i]) < max_len:
912
+ times_list[i] = torch.cat([times_list[i], torch.zeros(max_len - len(times_list[i])).to(times_list[i].device)])
913
+ times = torch.stack(times_list, dim=0)
914
+ time_embeds = self.time_embed(times, dtype=features[0].dtype)
915
+
916
+ # remove the padding for each embed
917
+ new_time_embeds = []
918
+ for i in range(len(times_list)):
919
+ new_time_embeds.append(time_embeds[i][:ori_lens[i]].unsqueeze(1).expand(-1, features[0].shape[1], -1))
920
+
921
+ # Touch every time embedding with a zero-weighted term so all of them stay in the autograd graph (e.g. avoids unused-parameter issues in distributed training).
922
+ new_time_embeds[0] = new_time_embeds[0] + 0 * time_embeds.mean()
923
+
924
+ new_features = []
925
+ fea_count = len(features)
926
+ vid_idx = 0
927
+ for i in range(bs):
928
+ _video_info = mm_info["video_info"][i]
929
+ if _video_info is not None:
930
+ for j in range(len(_video_info)):
931
+ _feature = features[vid_idx]
932
+ if _video_info[j] == "dummy":
933
+ times = torch.zeros(_feature.shape[0], device=device, dtype=_feature.dtype)
934
+ else:
935
+ times = _video_info[j]["video_frame_times"]
936
+ times = torch.tensor(times).to(device)
937
+ if self.time_embed_type == "learned_embed":
938
+ _feature = process_features(_feature, time_embed=new_time_embeds[vid_idx])
939
+ else:
940
+ _feature = process_features(_feature, times=times)
941
+ new_features.append(_feature)
942
+ vid_idx += 1
943
+
944
+ assert vid_idx == fea_count, "vid_idx: {}, fea_count: {}".format(vid_idx, fea_count)
945
+ features = new_features
946
+ else:
947
+ features = [process_features(f) for f in features]
948
+ return features
949
+
950
+ def _encode_video_frames(self, video_frames: torch.Tensor) -> torch.Tensor:
951
+ """Helper method to encode video frames when cached features are not available."""
952
+ features = self.parent.encode_images(video_frames.unsqueeze(0))
953
+ return features.squeeze(0)
954
+
955
+
mm_projector/config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "_name_or_path": "outputs/model/mm_projector",
3
+ "architectures": [
4
+ "MultimodalProjector"
5
+ ],
6
+ "mm_projector_type": "mlp_downsample",
7
+ "model_type": "v2l_projector",
8
+ "torch_dtype": "bfloat16",
9
+ "transformers_version": "4.46.0"
10
+ }
mm_projector/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e80a60453d3c104445816f05e1926f676e89a2f99ceb950e4876e99a1e391913
3
+ size 124850712
mm_utils.py ADDED
@@ -0,0 +1,567 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ # Note: dynamic_preprocess and find_closest_aspect_ratio are referenced from https://github.com/OpenGVLab/InternVL
18
+
19
+ import base64
20
+ import os
21
+ import tempfile
22
+ from io import BytesIO
23
+
24
+ import numpy as np
25
+ import torch
26
+ from PIL import Image
27
+ from transformers import StoppingCriteria
28
+
29
+ from .constants import DEFAULT_IMAGE_TOKEN
30
+
31
+
32
+ def get_frame_from_vcap(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
33
+ """Extract frames from video capture object."""
34
+ import cv2
35
+
36
+ if fps is None or frame_count is None:
37
+ # Recompute if either fps or frame_count is None
38
+ fps = vidcap.get(cv2.CAP_PROP_FPS)
39
+ frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
40
+ if fps == 0 or frame_count == 0:
41
+ print(f"Video file not found. return empty images. {video_file_name}")
42
+ return [
43
+ Image.new("RGB", (720, 720)),
44
+ ] * num_frames, 0
45
+
46
+ duration = frame_count / fps
47
+ frame_interval = frame_count // num_frames
48
+ if frame_interval == 0 and frame_count <= 1:
49
+ print(f"frame_interval is equal to 0. return empty image. {video_file_name}")
50
+ return [
51
+ Image.new("RGB", (720, 720)),
52
+ ] * num_frames, 0
53
+ # print("duration:", duration, "frames:", frame_count, "intervals:", frame_interval)
54
+
55
+ images = []
56
+ count = 0
57
+ success = True
58
+ frame_indices = np.linspace(0, frame_count - 1, num_frames, dtype=int)
59
+ while success:
60
+ # print("frame_count:", frame_count, "count:", count, "num_frames:", num_frames, "frame_interval:", frame_interval)
61
+ if frame_count >= num_frames:
62
+ success, frame = vidcap.read()
63
+ if count in frame_indices:
64
+ try:
65
+ img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
66
+ im_pil = Image.fromarray(img)
67
+ images.append(im_pil)
68
+ except BaseException:
69
+ continue
70
+ if len(images) >= num_frames:
71
+ return images, num_frames
72
+ count += 1
73
+ else:
74
+ # Left padding frames if the video is not long enough
75
+ success, frame = vidcap.read()
76
+ if success:
77
+ try:
78
+ img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
79
+ im_pil = Image.fromarray(img)
80
+ images.append(im_pil)
81
+ except BaseException:
82
+ continue
83
+ count += 1
84
+ else:
85
+ break
86
+ if len(images) == 0:
87
+ raise ValueError("Did not find enough frames in the video. return empty image.")
88
+
89
+ return images, len(images)
90
+
91
+
92
+ def get_frame_from_vcap_with_fps(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
93
+ """
94
+ Extract frames from video capture with FPS consideration.
95
+
96
+ Args:
97
+ vidcap: OpenCV video capture object
98
+ num_frames: Maximum number of frames the model can support
99
+ max_fps: Maximum FPS the model can support
100
+ fps: FPS of the input video
101
+ frame_count: Number of frames in the input video
102
+ video_file_name: Name of the video file for logging
103
+ """
104
+ import random
105
+ import cv2
106
+
107
+ if fps is None or frame_count is None:
108
+ # Recompute if either fps or frame_count is None
109
+ fps = vidcap.get(cv2.CAP_PROP_FPS)
110
+ frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
111
+
112
+ if fps == 0 or frame_count == 0:
113
+ print(f"Video file not found. return empty images. {video_file_name}")
114
+ empty_video_frames = int(random.uniform(2, 8 * max_fps))
115
+ return [
116
+ Image.new("RGB", (720, 720)),
117
+ ] * empty_video_frames, 0
118
+
119
+ duration = frame_count / fps
120
+ # print("duration:", duration, "frames:", frame_count, "fps:", fps, "num_frames:", num_frames, "max_fps:", max_fps)
121
+ # If the video is too long (longer than max_fps and num_frames can support),
122
+ # we will use lower fps to sample frames.
123
+ if duration >= num_frames / max_fps:
124
+ frame_interval = frame_count // num_frames
125
+
126
+ # If the video is too short, we will skip the video if there is only one frame.
127
+ if frame_interval == 0 and frame_count <= 1:
128
+ print(f"frame_interval is equal to 0. return empty image. {video_file_name}")
129
+ empty_video_frames = int(random.uniform(2, 8 * max_fps))
130
+ return [
131
+ Image.new("RGB", (720, 720)),
132
+ ] * empty_video_frames, 0
133
+
134
+ images = []
135
+ count = 0
136
+ success = True
137
+ frame_indices = np.linspace(0, frame_count - 1, num_frames, dtype=int)
138
+
139
+ while success:
140
+ if frame_count >= num_frames:
141
+ if count in frame_indices:
142
+ success, frame = vidcap.read()
143
+ try:
144
+ img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
145
+ im_pil = Image.fromarray(img)
146
+ images.append(im_pil)
147
+ except:
148
+ continue
149
+ if len(images) >= num_frames:
150
+ return images, num_frames
151
+ else:
152
+ success = vidcap.grab()
153
+ count += 1
154
+ else:
155
+ # Left padding frames if the video is not long enough
156
+ success, frame = vidcap.read()
157
+ if success:
158
+ try:
159
+ img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
160
+ im_pil = Image.fromarray(img)
161
+ images.append(im_pil)
162
+ except:
163
+ continue
164
+ count += 1
165
+ else:
166
+ break
167
+ else:
168
+ frames_required = int(duration * max_fps)
169
+ frame_indices = np.linspace(0, frame_count - 1, frames_required, dtype=int)
170
+ if frames_required == 0:
171
+ print(f"frames_required is fewer than 2. Duration {duration}, return empty image.")
172
+ empty_video_frames = int(random.uniform(2, 8 * max_fps))
173
+ return [
174
+ Image.new("RGB", (720, 720)),
175
+ ] * empty_video_frames, 0
176
+ elif frames_required == 1:
177
+ frame_indices = np.linspace(0, frame_count - 1, 2, dtype=int)
178
+ images = []
179
+ count = 0
180
+ looked = 0
181
+ success = True
182
+
183
+ while success:
184
+ success, frame = vidcap.read()
185
+ if success and (looked in frame_indices):
186
+ try:
187
+ img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
188
+ im_pil = Image.fromarray(img)
189
+ images.append(im_pil)
190
+ except:
191
+ continue
192
+ count += 1
193
+ looked += 1
194
+
195
+ if len(images) == 0:
196
+ empty_video_frames = int(random.uniform(2, 8 * max_fps))
197
+ return [
198
+ Image.new("RGB", (720, 720)),
199
+ ] * empty_video_frames, 0
200
+ else:
201
+ return images, len(images)
202
+
203
+
204
+ def opencv_extract_frames(vpath_or_bytesio, frames=6, max_fps=0.0, fps=None, frame_count=None):
205
+ """
206
+ Extract frames from a video using OpenCV.
207
+
208
+ Args:
209
+ vpath_or_bytesio (str or BytesIO): Path to the video file or BytesIO object containing the video.
210
+ frames (int): Number of frames to extract from the video.
211
+ max_fps (float): Maximum sampling FPS. If 0.0, frames are sampled at equal intervals over the whole video.
+ fps (float): Frames per second of the video, if already known; recomputed from the capture otherwise.
+ frame_count (int): Total number of frames in the video, if already known; recomputed from the capture otherwise.
212
+
213
+ Returns:
214
+ list: List of PIL Images extracted from the video.
215
+
216
+ Raises:
217
+ NotImplementedError: If the type of `vpath_or_bytesio` is not supported.
218
+ """
219
+ import cv2
220
+
221
+ if isinstance(vpath_or_bytesio, str):
222
+ vidcap = cv2.VideoCapture(vpath_or_bytesio)
223
+ if max_fps > 0.0:
224
+ return get_frame_from_vcap_with_fps(
225
+ vidcap, frames, max_fps, fps=fps, frame_count=frame_count, video_file_name=vpath_or_bytesio
226
+ )
227
+ return get_frame_from_vcap(
228
+ vidcap, frames, max_fps, fps=fps, frame_count=frame_count, video_file_name=vpath_or_bytesio
229
+ )
230
+ elif isinstance(vpath_or_bytesio, (BytesIO,)):
231
+ # assuming mp4
232
+ with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as temp_video:
233
+ temp_video.write(vpath_or_bytesio.read())
234
+ temp_video_name = temp_video.name
235
+ vidcap = cv2.VideoCapture(temp_video_name)
236
+ if max_fps > 0.0:
237
+ return get_frame_from_vcap_with_fps(
238
+ vidcap, frames, max_fps, fps=fps, frame_count=frame_count, video_file_name=temp_video_name
239
+ )
240
+ return get_frame_from_vcap(
241
+ vidcap, frames, max_fps, fps=fps, frame_count=frame_count, video_file_name=temp_video_name
242
+ )
243
+ else:
244
+ raise NotImplementedError(type(vpath_or_bytesio))
245
+
246
+
247
+ def load_image_from_base64(image):
248
+ """Load PIL Image from base64 encoded string."""
249
+ return Image.open(BytesIO(base64.b64decode(image)))
250
+
251
+
252
+ def expand2square(pil_img, background_color):
253
+ """
254
+ Expand the given PIL image to a square shape by adding padding.
255
+
256
+ Parameters:
257
+ - pil_img: The PIL image to be expanded.
258
+ - background_color: The color of the padding to be added.
259
+
260
+ Returns:
261
+ - The expanded PIL image.
262
+
263
+ If the image is already square, it is returned as is.
264
+ If the image is wider than it is tall, padding is added to the top and bottom.
265
+ If the image is taller than it is wide, padding is added to the left and right.
266
+ """
267
+ width, height = pil_img.size
268
+ if pil_img.mode == "L":
269
+ background_color = background_color[0]
270
+ if width == height:
271
+ return pil_img
272
+ elif width > height:
273
+ result = Image.new(pil_img.mode, (width, width), background_color)
274
+ result.paste(pil_img, (0, (width - height) // 2))
275
+ return result
276
+ else:
277
+ result = Image.new(pil_img.mode, (height, height), background_color)
278
+ result.paste(pil_img, ((height - width) // 2, 0))
279
+ return result
280
+
281
+
282
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
283
+ """Find the closest aspect ratio from target ratios."""
284
+ best_ratio_diff = float("inf")
285
+ best_ratio = (1, 1)
286
+ area = width * height
287
+ for ratio in target_ratios:
288
+ target_aspect_ratio = ratio[0] / ratio[1]
289
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
290
+ if ratio_diff < best_ratio_diff:
291
+ best_ratio_diff = ratio_diff
292
+ best_ratio = ratio
293
+ elif ratio_diff == best_ratio_diff:
294
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
295
+ best_ratio = ratio
296
+ return best_ratio
297
+
298
+
299
+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=384, use_thumbnail=True):
300
+ """Dynamically preprocess image into multiple tiles based on aspect ratio."""
301
+ orig_width, orig_height = image.size
302
+ aspect_ratio = orig_width / orig_height
303
+
304
+ # Calculate the existing image aspect ratio
305
+ target_ratios = {
306
+ (i, j)
307
+ for n in range(min_num, max_num + 1)
308
+ for i in range(1, n + 1)
309
+ for j in range(1, n + 1)
310
+ if i * j <= max_num and i * j >= min_num
311
+ }
312
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
313
+
314
+ # find the closest aspect ratio to the target
315
+ target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
316
+
317
+ # calculate the target width and height
318
+ target_width = image_size * target_aspect_ratio[0]
319
+ target_height = image_size * target_aspect_ratio[1]
320
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
321
+
322
+ # resize the image
323
+ resized_img = image.resize((target_width, target_height))
324
+ processed_images = []
325
+ for i in range(blocks):
326
+ box = (
327
+ (i % (target_width // image_size)) * image_size,
328
+ (i // (target_width // image_size)) * image_size,
329
+ ((i % (target_width // image_size)) + 1) * image_size,
330
+ ((i // (target_width // image_size)) + 1) * image_size,
331
+ )
332
+ # split the image
333
+ split_img = resized_img.crop(box)
334
+ processed_images.append(split_img)
335
+ assert len(processed_images) == blocks
336
+ if use_thumbnail and len(processed_images) != 1:
337
+ thumbnail_img = image.resize((image_size, image_size))
338
+ processed_images.append(thumbnail_img)
339
+ return processed_images
340
+
341
+
342
+ def dynamic_s2_preprocess(image, s2_scales=[384, 768, 1152], max_num=12, image_size=384):
343
+ """Dynamically preprocess image with multi-scale S2 strategy."""
344
+ orig_width, orig_height = image.size
345
+ aspect_ratio = orig_width / orig_height
346
+ min_num = (s2_scales[-1] // s2_scales[0]) ** 2
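+ # e.g. with the default scales [384, 768, 1152], min_num = (1152 // 384) ** 2 = 9.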
347
+
348
+ processed_images = []
349
+
350
+ # Add tiles for all but the last scale using fixed square ratio
351
+
352
+ for scale in s2_scales[:-1]:
353
+ target_width = image_size * (scale // s2_scales[0])
354
+ target_height = image_size * (scale // s2_scales[0])
355
+ blocks = (scale // s2_scales[0]) ** 2
356
+
357
+ # resize the image
358
+ resized_img = image.resize((target_width, target_height))
359
+ for i in range(blocks):
360
+ box = (
361
+ (i % (target_width // image_size)) * image_size,
362
+ (i // (target_width // image_size)) * image_size,
363
+ ((i % (target_width // image_size)) + 1) * image_size,
364
+ ((i // (target_width // image_size)) + 1) * image_size,
365
+ )
366
+ # split the image
367
+ split_img = resized_img.crop(box)
368
+ processed_images.append(split_img)
369
+
370
+ # Add tiles for the last scale using dynamic aspect ratio
371
+
372
+ # Calculate the existing image aspect ratio
373
+ target_ratios = {
374
+ (i, j)
375
+ for n in range(min_num, max_num + 1)
376
+ for i in range(1, n + 1)
377
+ for j in range(1, n + 1)
378
+ if i * j <= max_num and i * j >= min_num
379
+ }
380
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
381
+
382
+ # find the closest aspect ratio to the target
383
+ target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
384
+
385
+ # calculate the target width and height
386
+ target_width = image_size * target_aspect_ratio[0]
387
+ target_height = image_size * target_aspect_ratio[1]
388
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
389
+
390
+ # resize the image
391
+ resized_img = image.resize((target_width, target_height))
392
+ for i in range(blocks):
393
+ box = (
394
+ (i % (target_width // image_size)) * image_size,
395
+ (i // (target_width // image_size)) * image_size,
396
+ ((i % (target_width // image_size)) + 1) * image_size,
397
+ ((i // (target_width // image_size)) + 1) * image_size,
398
+ )
399
+ # split the image
400
+ split_img = resized_img.crop(box)
401
+ processed_images.append(split_img)
402
+
403
+ return processed_images, (target_aspect_ratio[1], target_aspect_ratio[0])
404
+
405
+
406
+ def dynamic_process_images_and_prompt(images, prompt, data_args, image_folder=None, max_tiles=None):
407
+ prompt = prompt.split(DEFAULT_IMAGE_TOKEN)
408
+ idx = 0
409
+ all_images = []
410
+ for img in images:
411
+ processed_images = process_image(img, data_args, image_folder, enable_dynamic_res=True, max_tiles=max_tiles)
412
+ all_images.append(processed_images)
413
+ prompt.insert(idx + 1, f"{DEFAULT_IMAGE_TOKEN}\n" * processed_images.shape[0])
414
+ idx += 2
415
+ prompt = "".join(prompt)
416
+ if all_images:
417
+ all_images = torch.cat(all_images)
418
+ else:
419
+ all_images = None
420
+ prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, "")
421
+ return all_images, prompt
422
+
423
+
424
+ def dynamic_s2_process_images_and_prompt(images, prompt, data_args, image_folder=None):
425
+ idx = 0
426
+ all_images = []
427
+ all_block_size = []
428
+ for img in images:
429
+ processed_images, block_size = process_image(img, data_args, image_folder, enable_dynamic_s2=True)
430
+ all_images.append(processed_images)
431
+ all_block_size.append(block_size)
432
+ idx += 2
433
+ if all_images:
434
+ all_images = torch.cat(all_images)
435
+ else:
436
+ all_images = None
437
+ return all_images, all_block_size
438
+
439
+
440
+ def process_image(
441
+ image_file, data_args, image_folder, enable_dynamic_res=False, enable_dynamic_s2=False, max_tiles=None
442
+ ):
443
+ processor = data_args.image_processor
444
+ if isinstance(image_file, str):
445
+ if image_folder is not None:
446
+ image = Image.open(os.path.join(image_folder, image_file)).convert("RGB")
447
+ else:
448
+ image = Image.open(image_file).convert("RGB")
449
+ else:
450
+ # image is stored in bytearray
451
+ image = image_file
452
+ image = image.convert("RGB")
453
+ if hasattr(data_args.image_processor, "crop_size"):
454
+ # CLIP vision tower
455
+ crop_size = data_args.image_processor.crop_size
456
+ else:
457
+ # SIGLIP vision tower
458
+ assert hasattr(data_args.image_processor, "size")
459
+ crop_size = data_args.image_processor.size
460
+ if "dynamic_s2" in data_args.image_aspect_ratio and enable_dynamic_s2:
461
+ assert crop_size["height"] == crop_size["width"]
462
+ images, block_size = dynamic_s2_preprocess(
463
+ image, s2_scales=data_args.s2_scales, max_num=data_args.max_tiles, image_size=crop_size["height"]
464
+ )
465
+ images = [processor.preprocess(image, return_tensors="pt")["pixel_values"][0] for image in images]
466
+ return torch.stack(images), block_size
467
+ if "dynamic" in data_args.image_aspect_ratio and enable_dynamic_res:
468
+ assert crop_size["height"] == crop_size["width"]
469
+ if max_tiles is not None:
470
+ max_num = max_tiles
471
+ else:
472
+ max_num = data_args.max_tiles
473
+ images = dynamic_preprocess(image, min_num=data_args.min_tiles, max_num=max_num, image_size=crop_size["height"])
474
+ images = [processor.preprocess(image, return_tensors="pt")["pixel_values"][0] for image in images]
475
+ return torch.stack(images)
476
+
477
+ if data_args.image_aspect_ratio == "resize":
478
+ image = image.resize((crop_size["width"], crop_size["height"]))
479
+ elif data_args.image_aspect_ratio == "pad":
480
+ image = expand2square(image, tuple(int(x * 255) for x in processor.image_mean))
481
+ image = processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
482
+ else:
483
+ # Using default behavior of the vision encoder
484
+ # For CLIP, default is central crop
485
+ # For Radio, default is central crop
486
+ # For Siglip, default is resize
487
+ # For InternVIT, default is resize
488
+ image = processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
489
+ return image
490
+
491
+
492
+ def process_images(images, image_processor, model_cfg, enable_dynamic_res=False, max_tiles=None):
493
+ """Process a batch of images using the model's image processor."""
494
+ model_cfg.image_processor = image_processor
495
+ new_images = [
496
+ process_image(image, model_cfg, None, enable_dynamic_res=enable_dynamic_res, max_tiles=max_tiles)
497
+ for image in images
498
+ ]
499
+
500
+ if all(x.shape == new_images[0].shape for x in new_images):
501
+ if len(new_images[0].shape) == 4:
502
+ new_images = torch.cat(new_images, dim=0)
503
+ elif len(new_images[0].shape) == 3:
504
+ new_images = torch.stack(new_images, dim=0)
505
+ else:
506
+ raise ValueError(f"new_images rank does not equal to 4, rank: {len(new_images[0].shape)}")
507
+ else:
508
+ raise ValueError("The shape of images in new_images is different!")
509
+ return new_images
510
+
511
+
512
+ def tokenizer_image_token(prompt, tokenizer, return_tensors=None, return_ids=True):
513
+ """Tokenize prompt with media tokens."""
514
+ if return_ids:
515
+ return tokenizer(prompt, return_tensors=return_tensors).input_ids[0]
516
+ else:
517
+ return tokenizer(prompt, return_tensors=return_tensors)
518
+
519
+
520
+ def is_gemma_tokenizer(tokenizer):
521
+ """Check if the tokenizer is a Gemma tokenizer."""
522
+ return "gemma" in tokenizer.__class__.__name__.lower()
523
+
524
+
525
+ def get_model_name_from_path(model_path):
526
+ """Extract model name from file path."""
527
+ model_path = model_path.strip("/")
528
+ model_paths = model_path.split("/")
529
+ if model_paths[-1].startswith("checkpoint-"):
530
+ return model_paths[-2] + "_" + model_paths[-1]
531
+ else:
532
+ return model_paths[-1]
533
+
534
+
535
+ class KeywordsStoppingCriteria(StoppingCriteria):
536
+ """Stopping criteria based on keyword tokens."""
537
+ def __init__(self, keywords, tokenizer, input_ids):
538
+ self.keywords = keywords
539
+ self.keyword_ids = []
540
+ self.max_keyword_len = 0
541
+ for keyword in keywords:
542
+ cur_keyword_ids = tokenizer(keyword).input_ids
543
+ if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
544
+ cur_keyword_ids = cur_keyword_ids[1:]
545
+ if len(cur_keyword_ids) > self.max_keyword_len:
546
+ self.max_keyword_len = len(cur_keyword_ids)
547
+ self.keyword_ids.append(torch.tensor(cur_keyword_ids))
548
+ self.tokenizer = tokenizer
549
+ self.start_len = input_ids.shape[1]
550
+
551
+ def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
552
+ offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
553
+ self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
554
+ for keyword_id in self.keyword_ids:
555
+ if (output_ids[0, -keyword_id.shape[0] :] == keyword_id).all():
556
+ return True
557
+ outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
558
+ for keyword in self.keywords:
559
+ if keyword in outputs:
560
+ return True
561
+ return False
562
+
563
+ def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
564
+ outputs = []
565
+ for i in range(output_ids.shape[0]):
566
+ outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
567
+ return all(outputs)
model_utils_packing.py ADDED
@@ -0,0 +1,50 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from importlib import import_module
17
+ from typing import Tuple
18
+
19
+ import torch
20
+ import transformers
21
+ from torch import nn
22
+ from torch.nn import functional as F
23
+
24
+ __all__ = ["patch"]
25
+
26
+
27
+ def _get_unpad_data(attention_mask: torch.Tensor, *args, **kwargs) -> Tuple[torch.Tensor, torch.Tensor, int]:
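+ # Drop-in replacement for the flash-attention unpad helper: use sequence lengths registered via set_seqlens_in_batch (sequence packing) when available, otherwise fall back to the attention mask.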
28
+ if hasattr(_get_unpad_data, "seqlens_in_batch"):
29
+ seqlens_in_batch = _get_unpad_data.seqlens_in_batch
30
+ else:
31
+ seqlens_in_batch = torch.sum(attention_mask, dim=1)
32
+
33
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
34
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
35
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
36
+ return indices, cu_seqlens, max_seqlen_in_batch
37
+
38
+
39
+ def set_seqlens_in_batch(seqlens_in_batch: torch.Tensor) -> None:
40
+ _get_unpad_data.seqlens_in_batch = seqlens_in_batch
41
+
42
+
43
+ def patch(model: nn.Module) -> None:
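+ # Redirect the library's _get_unpad_data to the packing-aware version above; the patch target differs depending on the installed transformers version.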
44
+ if transformers.__version__ < "4.43.0":
45
+ m = import_module(model.__module__)
46
+ if not hasattr(m, "_get_unpad_data"):
47
+ raise ValueError(f"Module {m} does not have function '_get_unpad_data' for packing")
48
+ m._get_unpad_data = _get_unpad_data
49
+ else:
50
+ transformers.modeling_flash_attention_utils._get_unpad_data = _get_unpad_data
modeling_vila.py ADDED
@@ -0,0 +1,1834 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import copy
17
+ import json
18
+ import logging
19
+ import numpy as np
20
+ import os
21
+ import os.path
22
+ import os.path as osp
23
+ import shutil
24
+ import warnings
25
+ from abc import ABC
26
+ from collections import OrderedDict, defaultdict, deque
27
+ from copy import deepcopy
28
+ from itertools import chain
29
+ from threading import Thread
30
+ from typing import Any, Dict, List, Optional, Tuple, Union
31
+
32
+ import torch
33
+ import torch.distributed as dist
34
+ import torch.nn as nn
35
+ import torch.nn.functional as F
36
+ import torchvision
37
+ from einops import rearrange
38
+ from PIL import Image
39
+ from transformers import (
40
+ AutoConfig,
41
+ AutoModel,
42
+ AutoProcessor,
43
+ AutoTokenizer,
44
+ GenerationConfig,
45
+ LogitsProcessor,
46
+ PretrainedConfig,
47
+ PreTrainedModel,
48
+ Qwen2Config,
49
+ Qwen2ForCausalLM,
50
+ Qwen2PreTrainedModel,
51
+ TextIteratorStreamer,
52
+ WhisperFeatureExtractor,
53
+ )
54
+ from transformers.modeling_outputs import CausalLMOutputWithPast
55
+ from transformers.modeling_utils import ContextManagers, no_init_weights
56
+
57
+ from .auto_processor import VILAProcessor
58
+ from .base_projector import MultimodalProjector, MultimodalProjectorConfig
59
+ from .sound_base_projector import SoundMultimodalProjector, SoundMultimodalProjectorConfig
60
+ from .speech_base_projector import SpeechMultimodalProjector, SpeechMultimodalProjectorConfig
61
+
62
+ from .builder import build_llm_and_tokenizer
63
+ from .configuration_vila import VILAConfig
64
+ from .constants import *
65
+ from .conversation import SeparatorStyle, default_conversation
66
+ from .distributed import all_gather as vila_all_gather
67
+ from .media import extract_media
68
+ from .media_encoder import BasicImageEncoder, BasicVideoEncoder, TSPVideoEncoder, BasicSoundEncoder, CacheFeatures
69
+ from .mm_utils import process_image, process_images
70
+ from .model_utils_packing import set_seqlens_in_batch
71
+ from .siglip_encoder import SiglipVisionTower, SiglipVisionTowerDynamicS2, SiglipVisionTowerS2
72
+ from .tokenizer_utils import tokenize_conversation
73
+ from .utils import get_model_config, load_tokenizer_then_handle_media_tokens_and_chat_template
74
+
75
+ from .constants import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, NUM_EXTRA_TOKENS_VILA, NUM_EXTRA_TOKENS_XVILA
76
+ from .qwen_audio_encoder import Qwen2AudioTower
77
+ import whisper
78
+
79
+ from .audio_encoder import AudioTower
80
+
81
+
82
+ def build_mm_projector(model_type_or_path: str, config: PretrainedConfig) -> PreTrainedModel:
83
+ """Build multimodal projector from path or configuration."""
84
+ if model_type_or_path is None:
85
+ return None
86
+ if config.resume_path:
87
+ assert os.path.exists(model_type_or_path), f"Resume mm projector path {model_type_or_path} does not exist!"
88
+ return MultimodalProjector.from_pretrained(model_type_or_path, config)
89
+ else:
90
+ mm_projector_cfg = MultimodalProjectorConfig(model_type_or_path)
91
+ mm_projector = MultimodalProjector(mm_projector_cfg, config)
92
+ return mm_projector
93
+
94
+ def build_speech_mm_projector(model_type_or_path: str, config: PretrainedConfig) -> PreTrainedModel:
95
+ """Build speech multimodal projector from path or configuration."""
96
+ if model_type_or_path is None:
97
+ return None
98
+ if config.resume_path:
99
+ assert os.path.exists(model_type_or_path), f"Resume speech mm projector path {model_type_or_path} does not exist!"
100
+ _model = SpeechMultimodalProjector.from_pretrained(
101
+ model_type_or_path, config, torch_dtype=eval(config.model_dtype)
102
+ )
103
+ return _model
104
+ else:
105
+ speech_mm_projector_cfg = SpeechMultimodalProjectorConfig(model_type_or_path)
106
+ speech_mm_projector = SpeechMultimodalProjector(speech_mm_projector_cfg, config).to(eval(config.model_dtype))
107
+ return speech_mm_projector
108
+
109
+
110
+ def build_sound_mm_projector(model_type_or_path: str, config: PretrainedConfig) -> PreTrainedModel:
111
+ """Build sound multimodal projector from path or configuration."""
112
+ if model_type_or_path is None:
113
+ return None
114
+
115
+ if type(config.model_dtype) == str:
116
+ model_dtype = eval(config.model_dtype)
117
+ else:
118
+ model_dtype = config.model_dtype
119
+ if config.resume_path:
120
+ assert os.path.exists(model_type_or_path), f"Resume sound mm projector path {model_type_or_path} does not exist!"
121
+ _model = SoundMultimodalProjector.from_pretrained(
122
+ model_type_or_path, config, torch_dtype=model_dtype
123
+ )
124
+ return _model
125
+ else:
126
+ sound_mm_projector_cfg = SoundMultimodalProjectorConfig(model_type_or_path)
127
+ sound_mm_projector = SoundMultimodalProjector(sound_mm_projector_cfg, config).to(model_dtype)
128
+ return sound_mm_projector
129
+
130
+
131
+ def check_dot_in_model_path(model_path: str):
132
+ """Check if the model path contains a dot, which may affect model loading."""
133
+ if osp.isdir(model_path):
134
+ if "." in osp.abspath(model_path):
135
+ return True
136
+ else:
137
+ if "." in model_path:
138
+ return True
139
+ return False
140
+
141
+
142
+ def get_vila_version(model_path: str) -> str:
143
+ VERSIONS = ["vila1.5", "vila-u", "longvila", "nvila", "vila-m3"]
144
+ for version in VERSIONS:
145
+ if version in model_path.lower():
146
+ return version
147
+ return None
148
+
149
+
150
+ def generate_jinja_template(conv_mode: str) -> str:
151
+ if conv_mode == "vicuna_v1":
152
+ return """{% set system_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. " %}
153
+ {% set roles = ["user", "assistant"] %}
154
+ {% set sep = " " %}
155
+
156
+ {{ system_prompt }}
157
+
158
+ {% for message in messages %}
159
+ {% if message['role'] == roles[0] %}
160
+ {{ "USER: " }}{{ sep }}{{ message['content'] }}{{ sep }}
161
+ {% else %}
162
+ {{ "ASSISTANT: " }}{{ sep }}{{ message['content'] }}{{ sep }}
163
+ {% endif %}
164
+ {% endfor %}
165
+ {% if messages[-1]['role'] == 'user' %}
166
+ {{ "ASSISTANT:" }}
167
+ {% endif %}
168
+ """
169
+ elif conv_mode == "llama_3":
170
+ return """{% set system_prompt = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n\\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|>" %}
171
+ {% set roles = ["<|start_header_id|>user<|end_header_id|>\\n\\n", "<|start_header_id|>assistant<|end_header_id|>\\n\\n"]%}
172
+ {% set sep = "<|eot_id|>" %}
173
+
174
+ {{ system_prompt }}
175
+ {% for message in messages %}
176
+ {% if message['role'] == 'user' %}
177
+ {{ roles[0] }}{{ message['content'] }}{{ sep }}
178
+ {% else %}
179
+ {{ roles[1] }}{{ message['content'] }}{{ sep }}
180
+ {% endif %}
181
+ {% endfor %}
182
+ {% if messages[-1]['role'] == 'user' %}
183
+ {{ roles[1] }}
184
+ {% endif %}
185
+ """
186
+ elif conv_mode == "hermes_2":
187
+ return """{% set system_prompt = "<|im_start|>system\nAnswer the questions." %}
188
+ {% set roles = ["<|im_start|>user\n", "<|im_start|>assistant\n"] %}
189
+ {% set sep = "<|im_end|>" %}
190
+
191
+ {{ system_prompt }}{{ sep }}
192
+
193
+ {% for message in messages %}
194
+ {% if message['role'] == 'user' %}
195
+ {{ roles[0] }}{{ message['content'] }}{{ sep }}
196
+ {% else %}
197
+ {{ roles[1] }}{{ message['content'] }}{{ sep }}
198
+ {% endif %}
199
+ {% endfor %}"""
200
+ else:
201
+ raise NotImplementedError(f"Jinja template generation is not implemented for {conv_mode}.")
202
+
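+ # Editor's sketch (not part of the original commit): one way to exercise the
+ # template returned above directly with jinja2; the conv_mode and toy message
+ # are placeholders.
+ def _example_render_chat_template() -> str:
+     import jinja2
+     template = jinja2.Template(generate_jinja_template("hermes_2"))
+     return template.render(messages=[{"role": "user", "content": "Describe the sound."}])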
203
+
204
+ def build_vision_tower(model_name_or_path: str, config: PretrainedConfig) -> PreTrainedModel:
205
+ """Build vision tower from path or configuration."""
206
+ # Skip vision tower instantiation if path is None
207
+ if model_name_or_path is None:
208
+ return None
209
+
210
+ vision_tower_arch = None
211
+ if config.resume_path and "radio" not in model_name_or_path:
212
+ assert os.path.exists(model_name_or_path), f"Resume vision tower path {model_name_or_path} does not exist!"
213
+ vision_tower_cfg = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
214
+ vision_tower_arch = vision_tower_cfg.architectures[0].lower()
215
+ vision_tower_name = vision_tower_arch if vision_tower_arch is not None else model_name_or_path
216
+
217
+ use_s2 = getattr(config, "s2", False)
218
+ use_dynamic_s2 = getattr(config, "dynamic_s2", False)
219
+
220
+ if "siglip" in vision_tower_name:
221
+ if use_dynamic_s2:
222
+ vision_tower = SiglipVisionTowerDynamicS2(model_name_or_path, config)
223
+ elif use_s2:
224
+ vision_tower = SiglipVisionTowerS2(model_name_or_path, config)
225
+ else:
226
+ vision_tower = SiglipVisionTower(model_name_or_path, config)
227
+ else:
228
+ raise NotImplementedError(f"Unknown vision tower: {model_name_or_path}")
229
+
230
+ config.mm_hidden_size = (
231
+ vision_tower.config.hidden_size if not (use_s2 or use_dynamic_s2) else vision_tower.hidden_size
232
+ )
233
+ return vision_tower
234
+
235
+
236
+ def build_audio_tower(model_name_or_path: str, config: PretrainedConfig, encoder_type: str) -> PreTrainedModel:
237
+ """Build audio tower for sound or speech processing."""
238
+ assert encoder_type in ["sound", "speech"]
239
+
240
+ # Skip tower instantiation if path is None
241
+ if model_name_or_path is None:
242
+ return None
243
+
244
+ model_type = "af3"
245
+
246
+ if model_type == "af3":
247
+ model = Qwen2AudioTower(model_name_or_path, config)
248
+ output_dim = 1280
249
+ else:
250
+ raise NotImplementedError(f"Not implemented for this encoder: {model_name_or_path}")
251
+
252
+ if encoder_type == "sound":
253
+ config.sound_hidden_size = output_dim
254
+ elif encoder_type == "speech":
255
+ config.speech_hidden_size = output_dim
256
+ else:
257
+ raise NotImplementedError(f"Not implemented for this encoder: {model_name_or_path}")
258
+
259
+ return model
260
+
261
+
262
+ class VILAPretrainedModel(PreTrainedModel):
263
+ config_class = VILAConfig
264
+ main_input_name = "input_embeds"
265
+ supports_gradient_checkpointing = True
266
+ _supports_flash_attn_2 = True
267
+ _no_split_modules = ["Qwen2DecoderLayer", "SiglipEncoderLayer"]
268
+
269
+ def __init__(self, config: VILAConfig, *args, **kwargs):
270
+ super().__init__(config)
271
+ self.config = config
272
+ cfgs = get_model_config(config)
273
+
274
+ if len(cfgs) == 7:
275
+ (
276
+ llm_cfg,
277
+ vision_tower_cfg,
278
+ speech_tower_cfg,
279
+ sound_tower_cfg,
280
+ mm_projector_cfg,
281
+ speech_mm_projector_cfg,
282
+ sound_mm_projector_cfg,
283
+ ) = cfgs
284
+ else:
285
+ raise ValueError(
286
+ "`llm_cfg` `mm_projector_cfg` `speech_mm_projector_cfg` `sound_mm_projector_cfg` `vision_tower_cfg` `speech_tower_cfg` `sound_tower_cfg` not found in the config."
287
+ )
288
+
289
+ # default to device_map="auto" when loading
290
+ device_map = kwargs.get("device_map", "auto")
291
+ self.mm_projector = build_mm_projector(mm_projector_cfg, config)
292
+ self.vision_tower = build_vision_tower(vision_tower_cfg, config)
293
+
294
+ if speech_tower_cfg:
295
+ self.speech_tower = build_audio_tower(speech_tower_cfg, config, encoder_type="speech")
296
+ self.speech_mm_projector = build_speech_mm_projector(speech_mm_projector_cfg, config)
297
+ if sound_tower_cfg:
298
+ self.sound_tower = build_audio_tower(sound_tower_cfg, config, encoder_type="sound")
299
+ self.sound_mm_projector = build_sound_mm_projector(sound_mm_projector_cfg, config)
300
+
301
+
302
+ if device_map in ["auto", "cuda"]:
303
+ self.mm_projector = self.mm_projector.cuda()
304
+ self.vision_tower = self.vision_tower.cuda()
305
+ self.speech_tower = self.speech_tower.cuda() if hasattr(self, "speech_tower") else None
306
+ self.sound_tower = self.sound_tower.cuda() if hasattr(self, "sound_tower") else None
307
+ self.speech_mm_projector = self.speech_mm_projector.cuda() if hasattr(self, "speech_mm_projector") else None
308
+ self.sound_mm_projector = self.sound_mm_projector.cuda() if hasattr(self, "sound_mm_projector") else None
309
+ # setting device_map to "auto" automatically shards the llm across devices
310
+ self.llm, self.tokenizer = self.init_llm(llm_cfg, config, device_map=device_map)
311
+
312
+ self.llm_model_embed_tokens = self.llm.model.embed_tokens
313
+
314
+ self.tokenizer.padding_side = "left"
315
+
316
+ self.vocab_size = len(self.tokenizer)
317
+ self.update_vocab_size = lambda: setattr(self, "vocab_size", len(self.tokenizer))
318
+
319
+ self.encoders = {}
320
+ for name in ["image", "video", "speech", "sound"]:
321
+ encoder_config = getattr(self.config, f"{name}_encoder")
322
+ if isinstance(encoder_config, str):
323
+ encoder_config = json.loads(encoder_config)
324
+ if encoder_config.get("embed_time", False) == "True":
325
+ if "trope_dim" not in encoder_config and encoder_config.get("time_embed_type", "") in ["pixel", "lang"]:
326
+ encoder_config["trope_dim"] = self.config.hidden_size // 2
327
+ print(f"Warning: trope_dim not found in config, defaulting to hidden_size // 2: {encoder_config['trope_dim']}")
328
+
329
+ encoder_config.pop('_target_')
330
+ if name == "video":
331
+ self.encoders[name] = TSPVideoEncoder(parent=self, **encoder_config)
332
+ elif name == "image":
333
+ self.encoders[name] = BasicImageEncoder(self)
334
+ else:
335
+ self.encoders[name] = BasicSoundEncoder(parent=self, **encoder_config)
336
+
337
+ self.post_config()
338
+ self.is_loaded = True
339
+
340
+ self.llm_only_need_embed = kwargs.get("llm_only_need_embed", False)
341
+ if self.llm_only_need_embed:
342
+ print("We only need the embed_tokens in llm.")
343
+ del self.llm
344
+ self.llm = None
345
+ torch.cuda.empty_cache()
346
+
347
+ assert (
348
+ self.llm is not None
349
+ or self.vision_tower is not None
350
+ or self.speech_tower is not None
351
+ or self.mm_projector is not None
352
+ or self.speech_mm_projector is not None
353
+ ), "At least one of the components must be instantiated."
354
+
355
+
356
+ @classmethod
357
+ def copy_or_symlink_directory(cls, model_path, output_dir, copy=True):
358
+ # Create output directory if it doesn't exist
359
+ os.makedirs(output_dir, exist_ok=True)
360
+ # Copy or symlink all files in model_path to output_dir
361
+ for item in os.listdir(model_path):
362
+ src_path = os.path.join(model_path, item)
363
+ dst_path = os.path.join(output_dir, item)
364
+
365
+ # Remove existing file/directory at destination if it exists
366
+ if os.path.exists(dst_path):
367
+ if os.path.islink(dst_path):
368
+ os.unlink(dst_path)
369
+ elif os.path.isdir(dst_path):
370
+ shutil.rmtree(dst_path)
371
+ else:
372
+ os.remove(dst_path)
373
+
374
+ # Copy or create symlink
375
+ if copy:
376
+ if os.path.isdir(src_path):
377
+ shutil.copytree(src_path, dst_path)
378
+ else:
379
+ shutil.copy2(src_path, dst_path)
380
+ print(f"Copied {src_path} to {dst_path}")
381
+ else:
382
+ os.symlink(src_path, dst_path)
383
+ print(f"Created symlink from {src_path} to {dst_path}")
384
+
385
+ @classmethod
386
+ def copy_remote_py_files(cls, output_dir, copy=True):
387
+ # copy .py and README for next loading
388
+ current_file_path = os.path.abspath(__file__)
389
+ current_folder = os.path.dirname(current_file_path)
390
+ for file_name in os.listdir(current_folder):
391
+ if file_name == "INSTRUCTIONS.md":
392
+ src_fname = os.path.join(current_folder, file_name)
393
+ dst_fname = os.path.join(output_dir, "README.md")
394
+ if os.path.exists(dst_fname):
395
+ old_readme = open(dst_fname).read()
396
+ else:
397
+ old_readme = ""
398
+ with open(src_fname) as src, open(dst_fname, "w") as dst:
399
+ dst.write(src.read())
400
+ dst.write(old_readme)
401
+ print("[HF] README", src_fname, "to", dst_fname)
402
+ if file_name.endswith(".py") or file_name.endswith(".jinja"):
403
+ full_file_name = os.path.join(current_folder, file_name)
404
+ if os.path.isfile(full_file_name):
405
+ if copy:
406
+ shutil.copy(full_file_name, output_dir)
407
+ print("[HF] copying", full_file_name, "to", output_dir)
408
+ else:
409
+ # symlink to ease development
410
+ if os.path.exists(os.path.join(output_dir, file_name)):
411
+ os.remove(os.path.join(output_dir, file_name))
412
+ os.symlink(full_file_name, os.path.join(output_dir, file_name))
413
+ print("[HF] linking", full_file_name, "to", output_dir)
414
+
415
+ def save_pretrained(self, output_dir, state_dict=None, **kwargs):
416
+ if state_dict is None:
417
+ # otherwise fetch from deepspeed
418
+ # state_dict = accelerator.get_state_dict(is_deepspeed_enabled)
419
+ state_dict = self.state_dict()
420
+
421
+ if getattr(self, "tokenizer", None):
422
+ self.tokenizer.save_pretrained(osp.join(output_dir, "llm"))
423
+
424
+ if self.get_llm():
425
+ print(f"saving llm to {osp.join(output_dir, 'llm')}")
426
+ self.llm.config._name_or_path = osp.join(output_dir, "llm")
427
+ llm_state_dict = OrderedDict({k.split("llm.")[-1]: v for k, v in state_dict.items() if "llm" in k})
428
+ self.llm.save_pretrained(os.path.join(output_dir, "llm"), state_dict=llm_state_dict)
429
+ self.config.llm_cfg = self.llm.config
430
+
431
+ if self.get_vision_tower():
432
+ print(f"saving vision_tower to {osp.join(output_dir, 'vision_tower')}")
433
+ self.vision_tower.config._name_or_path = osp.join(output_dir, "vision_tower")
434
+ vision_tower_state_dict = OrderedDict(
435
+ {k.split("vision_tower.vision_tower.")[-1]: v for k, v in state_dict.items() if "vision_tower" in k}
436
+ )
437
+ self.vision_tower.vision_tower.save_pretrained(
438
+ os.path.join(output_dir, "vision_tower"),
439
+ state_dict=vision_tower_state_dict,
440
+ )
441
+ self.vision_tower.image_processor.save_pretrained(os.path.join(output_dir, "vision_tower"))
442
+ self.config.vision_tower_cfg = self.vision_tower.config
443
+ if hasattr(self.config.vision_tower_cfg, "auto_map"):
444
+ if "radio" not in self.get_vision_tower().__class__.__name__.lower():
445
+ delattr(self.config.vision_tower_cfg, "auto_map")
446
+ if self.get_speech_tower():
447
+ print(f"saving speech_tower to {osp.join(output_dir, 'speech_tower')}")
448
+ self.speech_tower.config._name_or_path = osp.join(output_dir, "speech_tower").replace(
449
+ "tmp-checkpoint", "checkpoint"
450
+ )
451
+
452
+ speech_tower_state_dict = OrderedDict(
453
+ {k.split("speech_tower.audio_tower.")[-1]: v for k, v in state_dict.items() if "speech_tower" in k}
454
+ )
455
+
456
+ self.speech_tower.audio_tower.save_pretrained(
457
+ os.path.join(output_dir, "speech_tower"),
458
+ state_dict=speech_tower_state_dict,
459
+ )
460
+ self.config.speech_tower_cfg = self.speech_tower.config
461
+
462
+ if self.get_sound_tower():
463
+ print(f"saving sound_tower to {osp.join(output_dir, 'sound_tower')}")
464
+ self.sound_tower.config._name_or_path = osp.join(output_dir, "sound_tower").replace(
465
+ "tmp-checkpoint", "checkpoint"
466
+ )
467
+
468
+ sound_tower_state_dict = OrderedDict(
469
+ {k.split("sound_tower.audio_tower.")[-1]: v for k, v in state_dict.items() if "sound_tower" in k}
470
+ )
471
+
472
+ self.sound_tower.audio_tower.save_pretrained(
473
+ os.path.join(output_dir, "sound_tower"),
474
+ state_dict=sound_tower_state_dict,
475
+ )
476
+ self.config.sound_tower_cfg = self.sound_tower.config
477
+
478
+
479
+
480
+ if self.get_mm_projector():
481
+ print(f"saving mm_projector to {osp.join(output_dir, 'mm_projector')}")
482
+ self.mm_projector.config._name_or_path = osp.join(output_dir, "mm_projector")
483
+ mm_projector_state_dict = OrderedDict(
484
+ {k.split("mm_projector.")[-1]: v for k, v in state_dict.items() if "mm_projector" in k}
485
+ )
486
+ self.mm_projector.save_pretrained(
487
+ os.path.join(output_dir, "mm_projector"),
488
+ state_dict=mm_projector_state_dict,
489
+ )
490
+ self.config.mm_projector_cfg = self.mm_projector.config
491
+
492
+ if self.get_speech_mm_projector():
493
+ print(f"saving speech_mm_projector to {osp.join(output_dir, 'speech_mm_projector')}")
494
+ self.speech_mm_projector.config._name_or_path = osp.join(output_dir, "speech_mm_projector").replace(
495
+ "tmp-checkpoint", "checkpoint"
496
+ )
497
+ speech_mm_projector_state_dict = OrderedDict(
498
+ {k.split("speech_mm_projector.")[-1]: v for k, v in state_dict.items() if "speech_mm_projector" in k}
499
+ )
500
+ self.speech_mm_projector.save_pretrained(
501
+ os.path.join(output_dir, "speech_mm_projector"),
502
+ state_dict=speech_mm_projector_state_dict,
503
+ )
504
+ self.config.speech_mm_projector_cfg = self.speech_mm_projector.config
505
+
506
+ if self.get_sound_mm_projector():
507
+ print(f"saving sound_mm_projector to {osp.join(output_dir, 'sound_mm_projector')}")
508
+ self.sound_mm_projector.config._name_or_path = osp.join(output_dir, "sound_mm_projector").replace(
509
+ "tmp-checkpoint", "checkpoint"
510
+ )
511
+
512
+ sound_mm_projector_state_dict = OrderedDict(
513
+ {k.split("sound_mm_projector.")[-1]: v for k, v in state_dict.items() if "sound_mm_projector" in k}
514
+ )
515
+ self.sound_mm_projector.save_pretrained(
516
+ os.path.join(output_dir, "sound_mm_projector"),
517
+ state_dict=sound_mm_projector_state_dict,
518
+ )
519
+ self.config.sound_mm_projector_cfg = self.sound_mm_projector.config
520
+
521
+ # update and save top-level config
522
+ self.config._name_or_path = output_dir
523
+ self.config.architectures = [self.__class__.__name__]
524
+ self.config.save_pretrained(output_dir)
525
+
526
+ # copy .py and README for next loading
527
+ self.copy_remote_py_files(output_dir)
528
+
529
+ @classmethod
530
+ def from_pretrained(
531
+ cls,
532
+ pretrained_model_name_or_path: Optional[str] = None,
533
+ *model_args,
534
+ config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
535
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
536
+ ignore_mismatched_sizes: bool = False,
537
+ force_download: bool = False,
538
+ local_files_only: bool = False,
539
+ token: Optional[Union[str, bool]] = None,
540
+ revision: str = "main",
541
+ use_safetensors: Optional[bool] = None,
542
+ weights_only: bool = True,
543
+ **kwargs,
544
+ ):
545
+ # print("DEBUG2", kwargs); input()
546
+ config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
547
+ if kwargs.get("torch_dtype", None) is not None:
548
+ config.torch_dtype = kwargs.get("torch_dtype", None)
549
+ config.model_dtype = kwargs.get("torch_dtype", None)
550
+ if type(kwargs.get("torch_dtype", None)) == str:
551
+ kwargs["torch_dtype"] = eval(kwargs.get("torch_dtype", None))
552
+ else:
553
+ kwargs["torch_dtype"] = kwargs.get("torch_dtype", None)
554
+ return cls._from_config(config, **kwargs)
555
+
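+ # Editor's sketch (not verified against the released checkpoint): a typical
+ # remote-code load path for this class; the repo id and dtype string are
+ # placeholders (string dtypes are eval'ed into torch dtypes above).
+ #   model = AutoModel.from_pretrained("nvidia/omnivinci",
+ #                                     trust_remote_code=True,
+ #                                     torch_dtype="torch.bfloat16")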
556
+ def init_llm(self, llm_config, config, *args, **kwargs):
557
+ """Initialize language model and tokenizer."""
558
+ self.llm, self.tokenizer = build_llm_and_tokenizer(llm_config, config, *args, **kwargs)
559
+
560
+ self.pad_token_list = (
561
+ self.tokenizer.pad_token_id,
562
+ self.tokenizer.eos_token_id,
563
+ self.tokenizer.tokenize("<|endoftext|>")[0], # for Qwen
564
+ )
565
+
566
+ self.vocab_size = len(self.tokenizer)
567
+ self.update_vocab_size = lambda: setattr(self, "vocab_size", len(self.tokenizer))
568
+ # XGrammar tokenizer and grammar compiler
569
+ # lazy init only when specified json output during inference
570
+ self.grammar_compiler = None
571
+ # self.llm.resize_token_embeddings(len(self.tokenizer))
572
+ return self.llm, self.tokenizer
573
+
574
+ def post_config(self):
575
+ self.training = self.llm.training
576
+ if self.training:
577
+ self.train()
578
+ else:
579
+ self.eval()
580
+
581
+ # configuration
582
+ if getattr(self.config, "llm_cfg", None) is None:
583
+ self.config.llm_cfg = self.llm.config
584
+ if getattr(self.config, "vision_tower_cfg", None) is None:
585
+ self.config.vision_tower_cfg = self.vision_tower.config
586
+ if getattr(self.config, "mm_projector_cfg", None) is None:
587
+ self.config.mm_projector_cfg = self.mm_projector.config
588
+ if getattr(self.config, "speech_tower_cfg", None) is None and hasattr(self, "speech_tower"):
589
+ self.config.speech_tower_cfg = self.speech_tower.config
590
+ if getattr(self.config, "sound_tower_cfg", None) is None and hasattr(self, "sound_tower"):
591
+ self.config.sound_tower_cfg = self.sound_tower.config
592
+ if getattr(self.config, "speech_mm_projector_cfg", None) is None and hasattr(self, "speech_mm_projector"):
593
+ self.config.speech_mm_projector_cfg = self.speech_mm_projector.config
594
+ if getattr(self.config, "sound_mm_projector_cfg", None) is None and hasattr(self, "sound_mm_projector"):
595
+ self.config.sound_mm_projector_cfg = self.sound_mm_projector.config
596
+
597
+ def get_llm(self):
598
+ llm = getattr(self, "llm", None)
599
+ if type(llm) is list:
600
+ llm = llm[0]
601
+ return llm
602
+
603
+ def get_lm_head(self):
604
+ lm_head = getattr(self.get_llm(), "lm_head", None)
605
+ return lm_head
606
+
607
+ def get_vision_tower(self):
608
+ vision_tower = getattr(self, "vision_tower", None)
609
+ if type(vision_tower) is list:
610
+ vision_tower = vision_tower[0]
611
+ return vision_tower
612
+
613
+ def get_speech_tower(self):
614
+ speech_tower = getattr(self, "speech_tower", None)
615
+ if type(speech_tower) is list:
616
+ speech_tower = speech_tower[0]
617
+ return speech_tower
618
+
619
+ def get_sound_tower(self):
620
+ sound_tower = getattr(self, "sound_tower", None)
621
+ if type(sound_tower) is list:
622
+ sound_tower = sound_tower[0]
623
+ return sound_tower
624
+
625
+ def get_mm_projector(self):
626
+ mm_projector = getattr(self, "mm_projector", None)
627
+ if type(mm_projector) is list:
628
+ mm_projector = mm_projector[0]
629
+ return mm_projector
630
+
631
+ def get_sound_mm_projector(self):
632
+ sound_mm_projector = getattr(self, "sound_mm_projector", None)
633
+ if type(sound_mm_projector) is list:
634
+ sound_mm_projector = sound_mm_projector[0]
635
+ return sound_mm_projector
636
+
637
+ def get_speech_mm_projector(self):
644
+ speech_mm_projector = getattr(self, "speech_mm_projector", None)
645
+ if type(speech_mm_projector) is list:
646
+ speech_mm_projector = speech_mm_projector[0]
647
+ return speech_mm_projector
648
+
649
+ def freezed_module_patch(self):
650
+ """
651
+ Hugging Face calls model.train() at every training step. To keep the expected behavior of modules such as dropout and batchnorm, we call model.eval() on the frozen modules.
652
+ """
653
+ if self.training:
654
+ if self.get_llm() and not getattr(self.config, "tune_language_model", False):
655
+ pass
656
+ if self.get_vision_tower() and not getattr(self.config, "tune_vision_tower", False):
657
+ self.get_vision_tower().eval()
658
+ if self.get_speech_tower() and not getattr(self.config, "tune_speech_tower", False):
659
+ self.get_speech_tower().eval()
660
+ if self.get_sound_tower() and not getattr(self.config, "tune_sound_tower", False):
661
+ self.get_sound_tower().eval()
662
+ if self.get_mm_projector() and not getattr(self.config, "tune_mm_projector", False):
663
+ self.get_mm_projector().eval()
664
+ if self.get_speech_mm_projector() and not getattr(self.config, "tune_speech_mm_projector", False):
665
+ self.get_speech_mm_projector().eval()
666
+ if self.get_sound_mm_projector() and not getattr(self.config, "tune_sound_mm_projector", False):
667
+ self.get_sound_mm_projector().eval()
668
+
669
+
670
+ class VILAForCausalLM(VILAPretrainedModel):
671
+ def __init__(self, config: VILAConfig, *args, **kwargs):
672
+ super().__init__(config, *args, **kwargs)
673
+
674
+ def merge_features_for_dynamic_s2(self, image_features, block_sizes):
675
+ scales = self.get_vision_tower().scales
676
+ resize_output_to_scale_idx = self.get_vision_tower().resize_output_to_scale_idx
677
+
678
+ image_features_each_image = []
679
+ new_block_sizes = []
680
+ block_cnt = 0
681
+ for block_size_each_image in block_sizes:
682
+ if block_size_each_image is None:
683
+ cur_features = image_features[block_cnt : block_cnt + 1]
684
+ cur_features = rearrange(cur_features, "1 (h w) c -> 1 c h w", h=int(cur_features.shape[1] ** 0.5))
685
+ cur_features = cur_features.repeat(1, len(scales), 1, 1)
686
+ image_features_each_image.append(cur_features)
687
+ new_block_sizes.append((1, 1))
688
+ block_cnt += 1
689
+ else:
690
+ cur_features_each_scale = []
691
+ for scale in scales[:-1]:
692
+ num_blocks_this_scale = (scale // scales[0]) ** 2
693
+ cur_features_each_scale.append(
694
+ self.merge_chessboard(
695
+ image_features[block_cnt : block_cnt + num_blocks_this_scale],
696
+ num_split_h=scale // scales[0],
697
+ num_split_w=scale // scales[0],
698
+ )
699
+ ) # 1 * C * H * W
700
+ block_cnt += num_blocks_this_scale
701
+ num_blocks_last_scale = block_size_each_image[0] * block_size_each_image[1]
702
+ cur_features_each_scale.append(
703
+ self.merge_chessboard(
704
+ image_features[block_cnt : block_cnt + num_blocks_last_scale],
705
+ num_split_h=block_size_each_image[0],
706
+ num_split_w=block_size_each_image[1],
707
+ )
708
+ ) # 1 * C * H * W
709
+ block_cnt += num_blocks_last_scale
710
+
711
+ # resize and concat features from different scales
712
+ output_size = cur_features_each_scale[resize_output_to_scale_idx].shape[-2:]
713
+ cur_features = torch.cat(
714
+ [
715
+ F.interpolate(cur_features_each_scale[i].to(torch.float32), size=output_size, mode="area").to(
716
+ cur_features_each_scale[i].dtype
717
+ )
718
+ for i in range(len(cur_features_each_scale))
719
+ ],
720
+ dim=1,
721
+ )
722
+
723
+ image_features_each_image.append(cur_features)
724
+
725
+ if resize_output_to_scale_idx == len(scales) - 1 or resize_output_to_scale_idx == -1:
726
+ new_block_sizes.append(block_size_each_image)
727
+ else:
728
+ new_block_sizes.append(
729
+ (
730
+ scales[resize_output_to_scale_idx] // scales[0],
731
+ scales[resize_output_to_scale_idx] // scales[0],
732
+ )
733
+ )
734
+
735
+ assert block_cnt == len(image_features)
736
+
737
+ return image_features_each_image, new_block_sizes
738
+
739
+ @staticmethod
740
+ def split_chessboard(x, num_split_h, num_split_w):
741
+ """
742
+ x: b * c * h * w
743
+ out: (b * num_split_h * num_split_w) * c * (h // num_split_h) * (w // num_split_w)
744
+ Divide x into num_split_h * num_split_w sub-squares and concatenate them along the batch dimension.
745
+ """
746
+ B, C, H, W = x.shape
747
+ assert H % num_split_h == 0 and W % num_split_w == 0
748
+ h, w = H // num_split_h, W // num_split_w
749
+ x_split = torch.cat(
750
+ [x[:, :, i * h : (i + 1) * h, j * w : (j + 1) * w] for i in range(num_split_h) for j in range(num_split_w)],
751
+ dim=0,
752
+ )
753
+ return x_split
754
+
755
+ @staticmethod
756
+ def merge_chessboard(x, num_split_h, num_split_w):
757
+ """
758
+ x: b * n * c or b * c * h * w
759
+ out: b * c * h * w
760
+ Assuming x contains num_split_h * num_split_w sub-squares concatenated along the batch dimension, merge the sub-squares back into the original whole square.
761
+ """
762
+ B = x.shape[0]
763
+ if x.dim() == 3:
764
+ N = x.shape[1]
765
+ x = rearrange(x, "b (h w) c -> b c h w", h=int(N**0.5), w=int(N**0.5))
766
+
767
+ assert B % (num_split_h * num_split_w) == 0
768
+ b = B // (num_split_h * num_split_w)
769
+
770
+ x_merge = torch.cat(
771
+ [
772
+ torch.cat(
773
+ [x[(i * num_split_w + j) * b : (i * num_split_w + j + 1) * b] for j in range(num_split_w)], dim=-1
774
+ )
775
+ for i in range(num_split_h)
776
+ ],
777
+ dim=-2,
778
+ )
779
+
780
+ return x_merge
781
+
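+ # Editor's note: the demo below is an illustrative sketch added for this write-up,
+ # not part of the original commit. It checks that split_chessboard and
+ # merge_chessboard are inverses for a hypothetical 2x2 split.
+ @staticmethod
+ def _demo_chessboard_roundtrip():
+     x = torch.randn(1, 4, 8, 8)                                # B * C * H * W
+     tiles = VILAForCausalLM.split_chessboard(x, 2, 2)          # -> (4, 4, 4, 4)
+     merged = VILAForCausalLM.merge_chessboard(tiles, 2, 2)     # -> (1, 4, 8, 8)
+     assert torch.equal(x, merged)
+     return merged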
782
+ def encode_video(self, inp, block_sizes: Optional[Tuple[int, ...]] = None, mm_info: Optional[dict] = None, num_frames: Optional[List[int]] = None):
783
+ bs = len(inp)
784
+ cache_feas = []
785
+ cache_feas_index = []
786
+ inp_block_sizes = block_sizes
787
+
788
+ # handle cache features
789
+ for _idx in range(len(inp)):
790
+ if type(inp[_idx]) == CacheFeatures:
791
+ cache_feas.append(inp[_idx])
792
+ cache_feas_index.append(_idx)
793
+ raw_images = [_ for _ in inp if type(_) != CacheFeatures]
794
+
795
+ raw_videos_num_frames = [_.shape[0] for _ in raw_images]
796
+ if len(raw_images) > 0:
797
+ images = torch.cat(raw_images, dim=0)
798
+ else:
799
+ images = []
800
+
801
+ if block_sizes is None:
802
+ block_sizes = [None] * len(images)
803
+
804
+ def _load_video_features(image_features, cache_feas, cache_feas_index, raw_videos_num_frames):
805
+ # load cache features
806
+ if len(cache_feas) > 0:
807
+ if len(image_features) > 0:
808
+ image_features = torch.split(image_features, raw_videos_num_frames)
809
+ new_image_features = []
810
+ cache_feas_idx = 0
811
+ raw_fea_idx = 0
812
+ for _idx in range(bs):
813
+ if _idx in cache_feas_index:
814
+ new_image_features.append(cache_feas[cache_feas_idx].value['features'].to(self.device, self.dtype))
815
+ cache_feas_idx += 1
816
+ else:
817
+ new_image_features.append(image_features[raw_fea_idx])
818
+ raw_fea_idx += 1
819
+
820
+ assert len(new_image_features) == bs
821
+ image_features = new_image_features
822
+ image_features = torch.cat(image_features, dim=0)
823
+ return image_features
824
+
825
+ if getattr(self.config, "dynamic_s2", False):
826
+
827
+ if len(images) > 0:
828
+ image_features = self.get_vision_tower()(images)
829
+
830
+ image_features, new_block_sizes = self.merge_features_for_dynamic_s2(image_features, block_sizes)
831
+
832
+ image_features = [
833
+ self.split_chessboard(x, block_size[0], block_size[1])
834
+ for x, block_size in zip(image_features, new_block_sizes)
835
+ ] # list of B * C * H * W tensors
836
+ image_features = torch.cat(
837
+ [rearrange(x, "b c h w -> b (h w) c") for x in image_features], dim=0
838
+ ) # B * N * C
839
+ else:
840
+ image_features = []
841
+
842
+ # load cache features
843
+ image_features = _load_video_features(image_features, cache_feas, cache_feas_index, raw_videos_num_frames)
844
+
845
+ # if hasattr(self.config, "save_data") and self.config.save_data and num_frames is not None: # video
846
+ # _save_video_features(image_features, mm_info, inp)
847
+
848
+ if inp_block_sizes is None:
849
+ new_block_sizes = [(1, 1)] * len(image_features)
850
+ else:
851
+ raise ValueError(f"inp_block_sizes is not None: {inp_block_sizes}")
852
+ image_features = image_features.to(self.device, self.dtype)
853
+ image_features = self.get_mm_projector()(image_features)
854
+ image_features = list(
855
+ image_features.split([block_size[0] * block_size[1] for block_size in new_block_sizes], dim=0)
856
+ )
857
+ image_features = [
858
+ self.merge_chessboard(x, block_size[0], block_size[1])
859
+ for x, block_size in zip(image_features, new_block_sizes)
860
+ ] # list of 1 * C * H * W tensors
861
+ image_features = [rearrange(x, "1 c h w -> (h w) c") for x in image_features] # list of N * C tensors
862
+ if all([feature.shape[0] == image_features[0].shape[0] for feature in image_features]):
863
+ image_features = torch.stack(image_features, dim=0)
864
+ else:
865
+ if len(images) > 0:
866
+ image_features = self.get_vision_tower()(images)
867
+ else:
868
+ image_features = []
869
+
870
+ # load cache features
871
+ image_features = _load_video_features(image_features, cache_feas, cache_feas_index, raw_videos_num_frames)
872
+
873
+ image_features = self.get_mm_projector()(image_features)
874
+ return image_features
875
+
876
+ def encode_images(self, images, block_sizes: Optional[Tuple[int, ...]] = None, mm_info: Optional[dict] = None, num_frames: Optional[List[int]] = None):
877
+ if block_sizes is None:
878
+ block_sizes = [None] * len(images)
879
+
880
+ if getattr(self.config, "dynamic_s2", False):
881
+ image_features = self.get_vision_tower()(images)
882
+
883
+ image_features, new_block_sizes = self.merge_features_for_dynamic_s2(image_features, block_sizes)
884
+
885
+ image_features = [
886
+ self.split_chessboard(x, block_size[0], block_size[1])
887
+ for x, block_size in zip(image_features, new_block_sizes)
888
+ ] # list of B * C * H * W tensors
889
+ image_features = torch.cat(
890
+ [rearrange(x, "b c h w -> b (h w) c") for x in image_features], dim=0
891
+ ) # B * N * C
892
+
893
+ image_features = self.get_mm_projector()(image_features)
894
+ image_features = list(
895
+ image_features.split([block_size[0] * block_size[1] for block_size in new_block_sizes], dim=0)
896
+ )
897
+ image_features = [
898
+ self.merge_chessboard(x, block_size[0], block_size[1])
899
+ for x, block_size in zip(image_features, new_block_sizes)
900
+ ] # list of 1 * C * H * W tensors
901
+ image_features = [rearrange(x, "1 c h w -> (h w) c") for x in image_features] # list of N * C tensors
902
+ if all([feature.shape[0] == image_features[0].shape[0] for feature in image_features]):
903
+ image_features = torch.stack(image_features, dim=0)
904
+ else:
905
+ image_features = self.get_vision_tower()(images)
906
+
907
+ image_features = self.get_mm_projector()(image_features)
908
+ return image_features
909
+
910
+ def encode_sound(self, sounds, mm_info: Optional[dict] = None):
911
+
912
+ audio_features, audio_output_lengths = self.get_sound_tower()(sounds)
913
+
914
+ use_fea_downsample = False
915
+ if getattr(self.config, "sound_mm_projector", "") != "":
916
+ if "mlp_downsample" in getattr(self.config, "sound_mm_projector", ""):
917
+ use_fea_downsample = True
918
+ else:
919
+ sound_mm_projector_cfg = getattr(self.config, "sound_mm_projector_cfg", None)
920
+ if sound_mm_projector_cfg is not None:
921
+ if type(sound_mm_projector_cfg) == dict:
922
+ if "mlp_downsample" in sound_mm_projector_cfg["sound_mm_projector_type"]:
923
+ use_fea_downsample = True
924
+ elif type(sound_mm_projector_cfg) == SoundMultimodalProjectorConfig:
925
+ if "mlp_downsample" in sound_mm_projector_cfg.sound_mm_projector_type:
926
+ use_fea_downsample = True
927
+
928
+ if not use_fea_downsample:
929
+ audio_features = self.get_sound_mm_projector()(audio_features)
930
+
931
+ if audio_output_lengths is not None:
932
+ # split the batch
933
+ new_audio_features = []
934
+ start = 0
935
+ for length in audio_output_lengths:
936
+ new_audio_features.append(audio_features[start : start + length])
937
+ start += length
938
+ audio_features = new_audio_features
939
+
940
+ if use_fea_downsample:
941
+ audio_features = torch.stack(audio_features, dim=0)
942
+ audio_features = self.get_sound_mm_projector()(audio_features)
943
+
944
+ return audio_features
945
+
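+ # Editor's sketch (illustrative values, not part of the original commit): the
+ # per-clip split performed above is equivalent to torch.split by output length:
+ #   audio_features = torch.randn(10, 8)          # 10 frames, hidden size 8
+ #   audio_output_lengths = [4, 6]                # two clips in the batch
+ #   per_clip = list(torch.split(audio_features, audio_output_lengths, dim=0))
+ #   # per_clip[0].shape == (4, 8); per_clip[1].shape == (6, 8)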
946
+ def train(self, mode: bool = True):
947
+ super().train(mode)
948
+ return self
949
+
950
+ def _embed(
951
+ self,
952
+ input_ids: torch.Tensor,
953
+ media: Dict[str, List[torch.Tensor]],
954
+ media_config: Dict[str, Dict[str, Any]],
955
+ labels: Optional[torch.Tensor],
956
+ attention_mask: Optional[torch.Tensor],
957
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
958
+ media = copy.deepcopy(media)
959
+ media_config = copy.deepcopy(media_config)
960
+
961
+ labels = labels if labels is not None else torch.full_like(input_ids, IGNORE_INDEX)
962
+ attention_mask = attention_mask if attention_mask is not None else torch.ones_like(input_ids, dtype=torch.bool)
963
+
964
+ PROCESS_GROUP_MANAGER = None
965
+ if PROCESS_GROUP_MANAGER is not None:
966
+ for name in media:
967
+ self.encoders[name].end_tokens = None
968
+
969
+ # Extract text and media embeddings
970
+ text_embeds = self.llm_model_embed_tokens(input_ids)
971
+
972
+ mm_info = {}
973
+ if "video_info" in media:
974
+ video_info = media["video_info"]
975
+ del media["video_info"]
976
+ mm_info['video_info'] = video_info
977
+ else:
978
+ video_info = None
979
+
980
+ if "audio_info" in media:
981
+ audio_info = media["audio_info"]
982
+ del media["audio_info"]
983
+ mm_info['audio_info'] = audio_info
984
+ else:
985
+ audio_info = None
986
+
987
+ if media is not None:
988
+ media_embeds = self.__embed_media_tokens(media, media_config, mm_info)
989
+ else:
990
+ # no media was provided, so we just return an empty dict
991
+ media_embeds = {}
992
+
993
+ if PROCESS_GROUP_MANAGER is not None:
994
+ media_embeds_video = []
995
+ for i, images in enumerate(media_embeds["video"]):
996
+ num_video_frame = media["video"][i].shape[0]
997
+ media_embeds_video += torch.unbind(images.reshape(num_video_frame, -1, images.shape[-1]))
998
+ media_embeds["video"] = deque(media_embeds_video)
999
+
1000
+ # This is a workaround to make sure the dummy embeddings are consumed
1001
+ while media_embeds.get("dummy"):
1002
+ dummy_embed = media_embeds["dummy"].popleft()
1003
+ text_embeds += torch.sum(dummy_embed) * 0
1004
+
1005
+ # Based on segment_aud_indices_list and segment_vis_indices_list, get interleaved vis-aud embeddings for video
1006
+ video_sound_embeds_idx = 0
1007
+ sep_embed = self.encoders["video"].embed_tokens("\n")
1008
+ text_embeds = text_embeds.to(self.dtype)
1009
+ sep_embed = sep_embed.to(text_embeds.dtype)
1010
+
1011
+ if video_info is not None and self.config.load_audio_in_video and self.config.interleaved_vis_aud_in_video:
1012
+ assert self.encoders["video"].end_tokens is None, "end_tokens must be None for interleaved vis-aud in video"
1013
+ new_video_embeds = deque()
1014
+ video_embeds_idx = 0
1015
+ for k in range(len(video_info)):
1016
+ if video_info[k] is None:
1017
+ continue
1018
+ for i in range(len(video_info[k])):
1019
+ has_audio = video_info[k][i]["has_audio"]
1020
+ if not has_audio:
1021
+ new_video_embeds.append(media_embeds["video"][video_embeds_idx])
1022
+ video_embeds_idx += 1
1023
+ continue
1024
+
1025
+ # Check bounds for sound embeddings
1026
+ if video_sound_embeds_idx >= len(media_embeds["sound"]):
1027
+ raise ValueError(f"Sound embeddings index {video_sound_embeds_idx} out of bounds for video_info[{k}][{i}]")
1028
+
1029
+ segment_aud_indices_list = video_info[k][i]["segment_aud_indices_list"]
1030
+ segment_vis_indices_list = video_info[k][i]["segment_vis_indices_list"]
1031
+
1032
+ vis_fea_len_per_frame = media_embeds["video"][video_embeds_idx].shape[0] / video_info[k][i]["expected_frame_count"]
1033
+ aud_fea_len_per_stft_frame = media_embeds["sound"][video_sound_embeds_idx].shape[0] / audio_info[k][i]["new_audio_n_stft_frames"]
1034
+ vis_end = 0
1035
+ aud_end = 0
1036
+ _new_video_embed = []
1037
+ for j in range(len(segment_vis_indices_list)):
1038
+ _vis_aud_fea = []
1039
+ if len(segment_vis_indices_list[j]) > 0:
1040
+ _new_frames = [int(np.ceil((_frame+1) * vis_fea_len_per_frame)) for _frame in segment_vis_indices_list[j]]
1041
+ _vis_fea_end = _new_frames[-1]
1042
+ # Ensure we don't exceed the available features
1043
+ _vis_fea_end = min(_vis_fea_end, media_embeds["video"][video_embeds_idx].shape[0])
1044
+ if j == len(segment_vis_indices_list) - 1 and i == len(video_info) - 1 and k == len(video_info[i]) - 1 and not _vis_fea_end == media_embeds["video"][video_embeds_idx].shape[0]:
1045
+ print(f"Warning: The number of last interleaved video features does not match the video feature length. Expected: {media_embeds['video'][video_embeds_idx].shape[0]}, Got: {_vis_fea_end}")
1046
+ _vis_fea_end = media_embeds["video"][video_embeds_idx].shape[0]
1047
+ _vis_fea = media_embeds["video"][video_embeds_idx][vis_end:_vis_fea_end]
1048
+ vis_end = _vis_fea_end
1049
+ _vis_aud_fea.append(_vis_fea)
1050
+ _vis_aud_fea.append(sep_embed)
1051
+ if len(segment_aud_indices_list[j]) > 0:
1052
+ _new_audio_indices = [int(np.ceil(_fea * aud_fea_len_per_stft_frame)) for _fea in segment_aud_indices_list[j]]
1053
+ _aud_fea_end = _new_audio_indices[-1]
1054
+ # Ensure we don't exceed the available features
1055
+ _aud_fea_end = min(_aud_fea_end, media_embeds["sound"][video_sound_embeds_idx].shape[0])
1056
+ _aud_fea = media_embeds["sound"][video_sound_embeds_idx][aud_end:_aud_fea_end]
1057
+ _vis_aud_fea.append(_aud_fea)
1058
+ aud_end = _aud_fea_end
1059
+ _vis_aud_fea.append(sep_embed)
1060
+ _new_video_embed.append(torch.cat(_vis_aud_fea, dim=0))
1061
+ video_sound_embeds_idx += 1
1062
+ new_video_embeds.append(torch.cat(_new_video_embed, dim=0))
1063
+ video_embeds_idx += 1
1064
+
1065
+ assert len(new_video_embeds) == len(media_embeds["video"]), "The number of new video embeddings does not match the number of original video embeddings."
1066
+ media_embeds["video"] = new_video_embeds
1067
+ # Remove padding
1068
+ batch_size = labels.shape[0]
1069
+ text_embeds = [text_embeds[k][attention_mask[k]] for k in range(batch_size)]
1070
+ labels = [labels[k][attention_mask[k]] for k in range(batch_size)]
1071
+ # Build inverse mapping from token ID to media name
1072
+ media_tokens = {}
1073
+ for name, token_id in self.tokenizer.media_token_ids.items():
1074
+ media_tokens[token_id] = name
1075
+
1076
+ # Fuse text and media embeddings
1077
+ inputs_m, labels_m = [], []
1078
+ sound_embeds_idx = 0
1079
+ for k in range(batch_size):
1080
+ inputs_mk, labels_mk = [], []
1081
+ pos = 0
1082
+ while pos < len(labels[k]):
1083
+ if input_ids[k][pos].item() in media_tokens:
1084
+ name = media_tokens[input_ids[k][pos].item()] if PROCESS_GROUP_MANAGER is None else "video"
1085
+ if input_ids[k][pos].item() == self.tokenizer.media_token_ids["sound"]:
1086
+ if self.config.interleaved_vis_aud_in_video:
1087
+ if sound_embeds_idx < video_sound_embeds_idx:
1088
+ media_embeds[name].popleft()
1089
+ sound_embeds_idx += 1
1090
+ pos += 1
1091
+ continue
1092
+ sound_embeds_idx += 1
1093
+
1094
+ end = pos + 1
1095
+ input = media_embeds[name].popleft()
1096
+ label = torch.full([input.shape[0]], IGNORE_INDEX, device=labels[k].device, dtype=labels[k].dtype)
1097
+ else:
1098
+ end = pos
1099
+ while end < len(labels[k]) and input_ids[k][end].item() not in media_tokens:
1100
+ end += 1
1101
+ input = text_embeds[k][pos:end]
1102
+ label = labels[k][pos:end]
1103
+
1104
+ inputs_mk.append(input)
1105
+ labels_mk.append(label)
1106
+ pos = end
1107
+ inputs_m.append(torch.cat(inputs_mk, dim=0))
1108
+ labels_m.append(torch.cat(labels_mk, dim=0))
1109
+ inputs, labels = inputs_m, labels_m
1110
+
1111
+ inputs[0] += sep_embed.mean() * 0 # dummy embedding
1112
+ # Check if all media embeddings are consumed
1113
+
1114
+ for name in media_embeds:
1115
+ if media_embeds[name]:
1116
+ raise ValueError(f"Not all {name} embeddings are consumed! Still {len(media_embeds[name])} left.")
1117
+
1118
+ # Truncate sequences to `model_max_length` as media embeddings are inserted
1119
+ inputs, labels = self.__truncate_sequence(inputs, labels)
1120
+
1121
+ # Pad sequences to the longest one in the batch
1122
+ return self.__batchify_sequence(inputs, labels)
1123
+
1124
+ def __embed_media_tokens(
1125
+ self,
1126
+ media: Dict[str, List[torch.Tensor]],
1127
+ media_config: Dict[str, Dict[str, Any]],
1128
+ mm_info,
1129
+ ) -> Dict[str, List[torch.Tensor]]:
1130
+ embeds = defaultdict(deque)
1131
+
1132
+ if self.config.unified_audio_encoder:
1133
+ assert len(media["speech"]) == 0
1134
+
1135
+ for name in media:
1136
+ _encoder = self.encoders[name]
1137
+ if name in ["speech", "sound"] and self.config.unified_audio_encoder:
1138
+ _encoder = self.encoders["sound"]
1139
+
1140
+ if self.training:
1141
+ # Gather metainfo of media objects from all ranks
1142
+ if name in ["speech", "sound"]:
1143
+
1144
+ info = []
1145
+ if type(media.get(name, {})) is dict:
1146
+ for _dict in media.get(name, {}):
1147
+ info.append({k: {"shape": v.shape, "dtype": v.dtype} for k, v in _dict.items()})
1148
+ elif type(media.get(name, {})) is list:
1149
+ info = [{"shape": tensor.shape, "dtype": tensor.dtype} for tensor in media.get(name, [])]
1150
+ else:
1151
+ raise ValueError(f"Unsupported media type: {type(media.get(name, {}))}")
1152
+
1153
+ infos_list = vila_all_gather(info)
1154
+ infos = list(chain(*infos_list))
1155
+
1156
+ # The entire batch does not contain any media objects of this type.
1157
+ if not infos:
1158
+ continue
1159
+
1160
+ # For audio encoding, the batch size must match across all ranks; if it does not, pad the batch with dummy tensors up to the max batch size.
1161
+ max_batch_size = max(len(_info) for _info in infos_list)
1162
+ missing_batch_size = max_batch_size - len(info)
1163
+
1164
+ _media = media.get(name, [])
1165
+
1166
+ _medias = list(chain(vila_all_gather(_media)))
1167
+ if missing_batch_size > 0:
1168
+ for i in range(missing_batch_size):
1169
+ # use one of the media tensors to create a dummy tensor
1170
+ if type(media.get(name, {})) is dict:
1171
+ _dummy = {k: v.clone().to(device=self.device) for k, v in _medias[0].items()}
1172
+ elif type(media.get(name, {})) is list:
1173
+ if type(_medias[0]) is torch.Tensor:
1174
+ _dummy = _medias[0].clone().to(device=self.device)
1175
+ elif type(_medias[0]) is np.ndarray:
1176
+ _dummy = _medias[0].copy()
1177
+ else:
1178
+ raise ValueError(f"Unsupported media type: {type(_medias[0])}")
1179
+ else:
1180
+ raise ValueError(f"Unsupported media type: {type(media.get(name, {}))}")
1181
+ _media.append(_dummy)
1182
+ mm_info["audio_info"].append(["dummy"])
1183
+
1184
+ # we also need to align the length of all audio samples in the batch
1185
+ cur_batch_max_audio_samples = max(len(_audio) for _audio in _medias)
1186
+ cur_batch_max_audio_samples = int(np.ceil(cur_batch_max_audio_samples / (self.config.audio_sampling_rate * 30)) * (self.config.audio_sampling_rate * 30)) # should be multiple of 30 seconds
1187
+ cur_batch_max_audio_samples = min(cur_batch_max_audio_samples, self.config.audio_chunk_length * self.config.audio_sampling_rate)
1188
+ cur_batch_max_audio_duration = cur_batch_max_audio_samples // self.config.audio_sampling_rate
1189
+
1190
+
1191
+ whisper_feature_extractor = WhisperFeatureExtractor.from_pretrained(
1192
+ self.config._name_or_path, chunk_length=cur_batch_max_audio_duration, sampling_rate=self.config.audio_sampling_rate, hop_length=self.config.audio_hop_length
1193
+ )
1194
+
1195
+ # use WhisperFeatureExtractor from transformers to compute the input features
1196
+ new_media = []
1197
+
1198
+ aud_idx = 0
1199
+ for _batch_idx in range(len(mm_info["audio_info"])):
1200
+ _audio_info = mm_info["audio_info"][_batch_idx]
1201
+ if _audio_info is not None:
1202
+ for _mm_idx in range(len(_audio_info)):
1203
+ _audio = _media[aud_idx]
1204
+ if type(_audio) is torch.Tensor:
1205
+ device = _audio.device
1206
+ dtype = _audio.dtype
1207
+ _audio = _audio.cpu().float()
1208
+ else:
1209
+ # logger.warning(f"The audio type is not a tensor, which is unexpected. Using the device and dtype of the model. media: {media}, mm_info: {mm_info}")
1210
+ device = self.device
1211
+ dtype = self.dtype
1212
+ _audio = whisper.pad_or_trim(_audio, length=cur_batch_max_audio_samples)
1213
+ aud_idx += 1
1214
+ stft_features = whisper_feature_extractor(
1215
+ _audio,
1216
+ sampling_rate=self.config.audio_sampling_rate,
1217
+ return_attention_mask=True,
1218
+ padding="max_length",
1219
+ return_tensors="pt",
1220
+ ).to(device, dtype)
1221
+ new_media.append(stft_features)
1222
+ if _audio_info[_mm_idx] != "dummy":
1223
+ _audio_info[_mm_idx]["new_audio_chunk_length"] = cur_batch_max_audio_duration
1224
+ _audio_info[_mm_idx]["new_audio_n_samples"] = cur_batch_max_audio_samples
1225
+ _audio_info[_mm_idx]["audio_end_sample_sec"] = _audio_info[_mm_idx]["audio_start_sec"] + cur_batch_max_audio_duration
1226
+ _audio_info[_mm_idx]["new_audio_n_stft_frames"] = stft_features["input_features"].shape[-1]
1227
+
1228
+ assert aud_idx == len(_media), "The number of audio info does not match the number of audio samples."
1229
+ _media = new_media
1230
+
1231
+ _fea = _encoder(_media, media_config[name], mm_info)
1232
+ # [751, 1536]
1233
+ # consume dummy features later
1234
+ _dummy_fea = _fea[len(info) :]
1235
+ embeds["dummy"].extend(_dummy_fea)
1236
+
1237
+ # remove the dummy features
1238
+ _real_fea = _fea[: len(info)]
1239
+ if len(info) > 0:
1240
+ embeds[name] = deque(_real_fea)
1241
+
1242
+ else:
1243
+ # Gather metainfo of media objects from all ranks
1244
+ info = [{"shape": tensor.shape, "dtype": tensor.dtype} for tensor in media.get(name, [])]
1245
+ infos = list(chain(vila_all_gather(info)))
1246
+
1247
+ # The entire batch does not contain any media objects of this type.
1248
+ if not infos:
1249
+ continue
1250
+
1251
+ # Create a dummy tensor to ensure the encoder is called, otherwise the training will hang.
1252
+ if media.get(name) is None or len(media[name]) == 0:
1253
+ dummy = torch.zeros(infos[0]["shape"], dtype=infos[0]["dtype"], device=self.device)
1254
+ embeds["dummy"].extend(self.encoders[name]([dummy], media_config[name]))
1255
+ continue
1256
+ embeds[name] = deque(self.encoders[name](media[name], media_config[name]))
1257
+
1258
+ else:
1259
+ if name == "sound":
1260
+ all_audio_chunk_lengths = []
1261
+ for _sample_idx in range(len(media[name])):
1262
+ for _mm_idx in range(len(mm_info["audio_info"][_sample_idx])):
1263
+ _new_audio_chunk_length = mm_info["audio_info"][_sample_idx][_mm_idx]["new_audio_chunk_length"]
1264
+ all_audio_chunk_lengths.append(_new_audio_chunk_length)
1265
+ cur_batch_max_audio_duration = max(all_audio_chunk_lengths)
1266
+ cur_batch_max_audio_samples = cur_batch_max_audio_duration * self.config.audio_sampling_rate
1267
+ # for qwen omni audio
1268
+ # cur_batch_max_audio_samples = 960000
1269
+
1270
+ whisper_feature_extractor = WhisperFeatureExtractor.from_pretrained(
1271
+ self.config._name_or_path, chunk_length=cur_batch_max_audio_duration, sampling_rate=self.config.audio_sampling_rate, hop_length=self.config.audio_hop_length
1272
+ )
1273
+
1274
+ new_media = []
1275
+ _idx = 0
1276
+ assert len(all_audio_chunk_lengths) == len(media[name]), "The number of audio chunk lengths does not match the number of audio samples."
1277
+
1278
+ _media = media.get(name, [])
1279
+ aud_idx = 0
1280
+ for _batch_idx in range(len(mm_info["audio_info"])):
1281
+ _audio_info = mm_info["audio_info"][_batch_idx]
1282
+ if _audio_info is not None:
1283
+ for _mm_idx in range(len(_audio_info)):
1284
+ _audio = _media[aud_idx]
1285
+ if type(_audio) is torch.Tensor:
1286
+ device = _audio.device
1287
+ dtype = _audio.dtype
1288
+ _audio = _audio.cpu().float()
1289
+ else:
1290
+ device = self.device
1291
+ dtype = self.dtype
1292
+ _audio = whisper.pad_or_trim(_audio, length=cur_batch_max_audio_samples)
1293
+ aud_idx += 1
1294
+ stft_features = whisper_feature_extractor(
1295
+ _audio,
1296
+ sampling_rate=self.config.audio_sampling_rate,
1297
+ return_attention_mask=True,
1298
+ padding="max_length",
1299
+ return_tensors="pt",
1300
+ ).to(device, dtype)
1301
+
1302
+ new_media.append(stft_features)
1303
+ if _audio_info[_mm_idx] != "dummy":
1304
+ _audio_info[_mm_idx]["new_audio_chunk_length"] = cur_batch_max_audio_duration
1305
+ _audio_info[_mm_idx]["new_audio_n_samples"] = cur_batch_max_audio_samples
1306
+ _audio_info[_mm_idx]["audio_end_sample_sec"] = _audio_info[_mm_idx]["audio_start_sec"] + cur_batch_max_audio_duration
1307
+ _audio_info[_mm_idx]["new_audio_n_stft_frames"] = stft_features["input_features"].shape[-1]
1308
+ media[name] = new_media
1309
+
1310
+ if len(media[name]) > 0:
1311
+ embeds[name] = deque(_encoder(media[name], media_config[name], mm_info))
1312
+ return embeds
1313
+
1314
+ def __truncate_sequence(
1315
+ self, inputs: List[torch.Tensor], labels: List[torch.Tensor]
1316
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1317
+ if self.training and any(len(input) > self.tokenizer.model_max_length for input in inputs):
1318
+ warnings.warn(f"Truncating sequences to `model_max_length` ({self.tokenizer.model_max_length}).")
1319
+ inputs = [input[: self.tokenizer.model_max_length] for input in inputs]
1320
+ labels = [label[: self.tokenizer.model_max_length] for label in labels]
1321
+ return inputs, labels
1322
+
1323
+ def __batchify_sequence(
1324
+ self, inputs: List[torch.Tensor], labels: List[torch.Tensor]
1325
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
1326
+ batch_size = len(inputs)
1327
+ device = inputs[0].device
1328
+ hidden_size = inputs[0].shape[1]
1329
+ max_length = max(inputs[k].shape[0] for k in range(batch_size))
1330
+ attention_mask = torch.ones((batch_size, max_length), dtype=torch.bool, device=device)
1331
+
1332
+ inputs_p, labels_p = [], []
1333
+ for k in range(batch_size):
1334
+ size_pk = max_length - inputs[k].shape[0]
1335
+ inputs_pk = torch.zeros((size_pk, hidden_size), dtype=inputs[k].dtype, device=device)
1336
+ labels_pk = torch.full((size_pk,), IGNORE_INDEX, dtype=labels[k].dtype, device=device)
1337
+ if self.tokenizer.padding_side == "right":
1338
+ attention_mask[k, inputs[k].shape[0] :] = False
1339
+ inputs_pk = torch.cat([inputs[k], inputs_pk], dim=0)
1340
+ labels_pk = torch.cat([labels[k], labels_pk], dim=0)
1341
+ else:
1342
+ labels[k] = labels[k].to(device)
1343
+ attention_mask[k, : -inputs[k].shape[0]] = False
1344
+ inputs_pk = torch.cat([inputs_pk, inputs[k]], dim=0)
1345
+ labels_pk = torch.cat([labels_pk, labels[k]], dim=0)
1346
+ inputs_p.append(inputs_pk)
1347
+ labels_p.append(labels_pk)
1348
+
1349
+ inputs = torch.stack(inputs_p, dim=0)
1350
+ labels = torch.stack(labels_p, dim=0)
1351
+ return inputs, labels, attention_mask
1352
+
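+ # Editor's sketch (hypothetical shapes, not part of the original commit): with the
+ # default padding_side="left" set in __init__, a batch of embedded sequences of
+ # lengths 3 and 1 is padded on the left, so the mask and labels come out as:
+ #   attention_mask -> [[True, True, True], [False, False, True]]
+ #   labels         -> [[a0,   a1,   a2  ], [IGNORE_INDEX, IGNORE_INDEX, b0]]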
1353
+ def repack_multimodal_data(self, inputs_embeds, attention_mask, position_ids, labels):
1354
+ # Handle sequence parallelism
1355
+ PROCESS_GROUP_MANAGER = None
1356
+
1357
+ # We do re-sharding instead of packing here to ensure the sequence length is the same across all ranks.
1358
+ if PROCESS_GROUP_MANAGER is not None:
1359
+ sp_degree = PROCESS_GROUP_MANAGER.sp_degree
1360
+ sp_rank = PROCESS_GROUP_MANAGER.sp_rank
1361
+ sp_group = PROCESS_GROUP_MANAGER.sp_pg
1362
+ ring_degree = PROCESS_GROUP_MANAGER.ring_degree
1363
+ ring_rank = PROCESS_GROUP_MANAGER.ring_rank
1364
+ ring_type = PROCESS_GROUP_MANAGER.ring_type
1365
+ ulysses_degree = PROCESS_GROUP_MANAGER.ulysses_degree
1366
+ ulysses_rank = PROCESS_GROUP_MANAGER.ulysses_rank
1367
+
1368
+ bs, shard_seqlen = position_ids.shape
1369
+ sp_seq_len = [torch.zeros(1, dtype=torch.int64, device=position_ids.device) for _ in range(sp_degree)]
1370
+ dist.all_gather(sp_seq_len, torch.tensor(shard_seqlen, device=position_ids.device), group=sp_group)
1371
+ sp_seq_len_cat = torch.cat(sp_seq_len, dim=0)
1372
+
1373
+ if sp_rank == 0:
1374
+ original_start_id = 0
1375
+ else:
1376
+ original_start_id = torch.sum(sp_seq_len_cat[:sp_rank]).item()
1377
+ original_end_id = torch.sum(sp_seq_len_cat[: sp_rank + 1]).item()
1378
+
1379
+ # Gather attention_mask, position_ids, labels and input_embeds
1380
+ all_inputs_embeds = torch.zeros(
1381
+ bs,
1382
+ torch.sum(sp_seq_len_cat),
1383
+ inputs_embeds.shape[-1],
1384
+ dtype=inputs_embeds.dtype,
1385
+ device=inputs_embeds.device,
1386
+ ).contiguous()
1387
+ all_inputs_embeds[:, original_start_id:original_end_id, :] += inputs_embeds
1388
+ dist.barrier(group=sp_group)
1389
+ dist.all_reduce(all_inputs_embeds, group=sp_group)
1390
+ dist.barrier(group=sp_group)
1391
+
1392
+ attention_mask_list = [
1393
+ torch.zeros((bs, sp_seq_len[i]), dtype=attention_mask.dtype, device=attention_mask.device)
1394
+ for i in range(sp_degree)
1395
+ ]
1396
+ position_ids_list = [
1397
+ torch.zeros((bs, sp_seq_len[i]), dtype=position_ids.dtype, device=position_ids.device)
1398
+ for i in range(sp_degree)
1399
+ ]
1400
+ labels_list = [
1401
+ torch.zeros((bs, sp_seq_len[i]), dtype=labels.dtype, device=labels.device) for i in range(sp_degree)
1402
+ ]
1403
+
1404
+ dist.all_gather(attention_mask_list, attention_mask, group=sp_group)
1405
+ dist.all_gather(position_ids_list, position_ids, group=sp_group)
1406
+ dist.all_gather(labels_list, labels, group=sp_group)
1407
+
1408
+ effective_seqlen_list = [attention_mask_list[i].sum(dim=-1) for i in range(sp_degree)]
1409
+ effective_seqlen = torch.stack(effective_seqlen_list, dim=-1)
1410
+ effective_seqlen_batch_list = torch.unbind(effective_seqlen, dim=0)
1411
+
1412
+ global_attention_mask_list = []
1413
+ global_position_ids_list = []
1414
+ global_labels_list = []
1415
+ global_inputs_embeds_list = []
1416
+ for i in range(bs):
1417
+ global_attention_mask_batch_list = []
1418
+ global_position_ids_batch_list = []
1419
+ global_labels_batch_list = []
1420
+ global_inputs_embeds_batch_list = []
1421
+ for j in range(sp_degree):
1422
+ eff_len = effective_seqlen_batch_list[i][j]
1423
+ prev_len = torch.sum(sp_seq_len_cat[:j]).item() if j > 0 else 0
1424
+
1425
+ global_attention_mask_batch_list.append(attention_mask_list[j][i, :eff_len])
1426
+ global_position_ids_batch_list.append(position_ids_list[j][i, :eff_len])
1427
+ global_labels_batch_list.append(labels_list[j][i, :eff_len])
1428
+ global_inputs_embeds_batch_list.append(all_inputs_embeds[i, prev_len : prev_len + eff_len, :])
1429
+ global_attention_mask_list.append(torch.cat(global_attention_mask_batch_list, dim=0))
1430
+ global_position_ids_list.append(torch.cat(global_position_ids_batch_list, dim=0))
1431
+ global_labels_list.append(torch.cat(global_labels_batch_list, dim=0))
1432
+ global_inputs_embeds_list.append(torch.cat(global_inputs_embeds_batch_list, dim=0))
1433
+
1434
+ global_attention_mask = torch.nn.utils.rnn.pad_sequence(
1435
+ global_attention_mask_list, batch_first=True, padding_value=False
1436
+ )
1437
+ global_position_ids = torch.nn.utils.rnn.pad_sequence(
1438
+ global_position_ids_list, batch_first=True, padding_value=-1
1439
+ )
1440
+ global_labels = torch.nn.utils.rnn.pad_sequence(
1441
+ global_labels_list, batch_first=True, padding_value=IGNORE_INDEX
1442
+ )
1443
+ global_inputs_embeds = torch.nn.utils.rnn.pad_sequence(
1444
+ global_inputs_embeds_list, batch_first=True, padding_value=0
1445
+ )
1446
+
1447
+ # Re-shard the inputs
1448
+ if ring_degree > 1:
1449
+ total_effective_seqlen = torch.sum(effective_seqlen, dim=1)
1450
+ new_seqlen_per_rank = total_effective_seqlen // sp_degree
1451
+ assert torch.all(
1452
+ total_effective_seqlen % sp_degree == 0
1453
+ ), "total_effective_seqlen must be divisible by sp_degree"
1454
+
1455
+ max_new_seqlen = torch.max(new_seqlen_per_rank).item()
1456
+
1457
+ new_attention_mask = torch.zeros(
1458
+ (bs, max_new_seqlen), dtype=global_attention_mask.dtype, device=global_attention_mask.device
1459
+ )
1460
+ new_position_ids = torch.zeros(
1461
+ (bs, max_new_seqlen), dtype=global_position_ids.dtype, device=global_position_ids.device
1462
+ )
1463
+ new_labels = torch.full(
1464
+ (bs, max_new_seqlen), IGNORE_INDEX, dtype=global_labels.dtype, device=global_labels.device
1465
+ )
1466
+ new_inputs_embeds = torch.zeros(
1467
+ (bs, max_new_seqlen, global_inputs_embeds.shape[-1]),
1468
+ dtype=global_inputs_embeds.dtype,
1469
+ device=global_inputs_embeds.device,
1470
+ )
1471
+
1472
+ if ring_type == "ring_varlen":
1473
+ for i in range(bs):
1474
+ start_idx = new_seqlen_per_rank[i] * sp_rank
1475
+ end_idx = start_idx + new_seqlen_per_rank[i]
1476
+ new_attention_mask[i, : new_seqlen_per_rank[i]] = global_attention_mask[i, start_idx:end_idx]
1477
+ new_position_ids[i, : new_seqlen_per_rank[i]] = global_position_ids[i, start_idx:end_idx]
1478
+ new_labels[i, : new_seqlen_per_rank[i]] = global_labels[i, start_idx:end_idx]
1479
+ new_inputs_embeds[i, : new_seqlen_per_rank[i], :] = global_inputs_embeds[
1480
+ i, start_idx:end_idx, :
1481
+ ]
1482
+ elif ring_type == "zigzag_ring_varlen":
1483
+ chunk_size = total_effective_seqlen // (2 * sp_degree)
1484
+ for i in range(bs):
1485
+ # Zigzag pattern indices
1486
+ if sp_degree == ring_degree:
1487
+ forward_rank_idx = sp_rank
1488
+ backward_rank_idx = 2 * sp_degree - sp_rank - 1
1489
+ else:
1490
+ ulysses_offset = ulysses_rank * ring_degree * 2
1491
+ forward_rank_idx = ring_rank + ulysses_offset
1492
+ backward_rank_idx = sp_degree - ring_rank - 1 + ulysses_offset
1493
+
1494
+ # Calculate start and end indices for the forward and backward zigzag
1495
+ start_idx_fwd = forward_rank_idx * chunk_size[i]
1496
+ end_idx_fwd = start_idx_fwd + chunk_size[i]
1497
+
1498
+ start_idx_bwd = backward_rank_idx * chunk_size[i]
1499
+ end_idx_bwd = start_idx_bwd + chunk_size[i]
1500
+
1501
+ # Fill new tensors with zigzag data
1502
+ new_attention_mask[i, : chunk_size[i]] = global_attention_mask[i, start_idx_fwd:end_idx_fwd]
1503
+ new_attention_mask[i, chunk_size[i] : 2 * chunk_size[i]] = global_attention_mask[
1504
+ i, start_idx_bwd:end_idx_bwd
1505
+ ]
1506
+
1507
+ new_position_ids[i, : chunk_size[i]] = global_position_ids[i, start_idx_fwd:end_idx_fwd]
1508
+ new_position_ids[i, chunk_size[i] : 2 * chunk_size[i]] = global_position_ids[
1509
+ i, start_idx_bwd:end_idx_bwd
1510
+ ]
1511
+
1512
+ new_labels[i, : chunk_size[i]] = global_labels[i, start_idx_fwd:end_idx_fwd]
1513
+ new_labels[i, chunk_size[i] : 2 * chunk_size[i]] = global_labels[i, start_idx_bwd:end_idx_bwd]
1514
+
1515
+ new_inputs_embeds[i, : chunk_size[i], :] = global_inputs_embeds[i, start_idx_fwd:end_idx_fwd, :]
1516
+ new_inputs_embeds[i, chunk_size[i] : 2 * chunk_size[i], :] = global_inputs_embeds[
1517
+ i, start_idx_bwd:end_idx_bwd, :
1518
+ ]
1519
+ else:
1520
+ raise ValueError(f"Invalid ring_type: {ring_type}")
1521
+ else:
1522
+ global_seq_len = global_attention_mask.shape[-1]
1523
+ seq_len_sharded = global_seq_len // sp_degree
1524
+ start_idx_reshard = seq_len_sharded * sp_rank
1525
+ end_idx_reshard = start_idx_reshard + seq_len_sharded if sp_rank < sp_degree - 1 else global_seq_len
1526
+
1527
+ new_attention_mask = torch.narrow(
1528
+ global_attention_mask, 1, start_idx_reshard, end_idx_reshard - start_idx_reshard
1529
+ )
1530
+ new_position_ids = torch.narrow(
1531
+ global_position_ids, 1, start_idx_reshard, end_idx_reshard - start_idx_reshard
1532
+ )
1533
+ new_labels = torch.narrow(global_labels, 1, start_idx_reshard, end_idx_reshard - start_idx_reshard)
1534
+ new_inputs_embeds = torch.narrow(
1535
+ global_inputs_embeds, 1, start_idx_reshard, end_idx_reshard - start_idx_reshard
1536
+ )
1537
+
1538
+ return new_inputs_embeds, new_attention_mask, new_position_ids, new_labels
1539
+
1540
+ device = inputs_embeds.device
1541
+ batch_size = inputs_embeds.shape[0]
1542
+ seqlens = [attention_mask[k].sum().item() for k in range(batch_size)]
1543
+
1544
+ # Pack all sequences together
1545
+ inputs_embeds_p = [inputs_embeds[k][attention_mask[k]] for k in range(batch_size)]
1546
+ attention_mask_p = [torch.ones(seqlens[k], dtype=torch.int, device=device) for k in range(batch_size)]
1547
+ position_ids_p = [torch.arange(seqlens[k], dtype=torch.int, device=device) for k in range(batch_size)]
1548
+ labels_p = [labels[k][attention_mask[k]] for k in range(batch_size)]
1549
+
1550
+ # Add one dummy token at the end of the packed sequence to ensure that `_get_unpacked_data` will be called
1551
+ inputs_embeds_p.append(torch.zeros(1, inputs_embeds.shape[-1], dtype=inputs_embeds.dtype, device=device))
1552
+ attention_mask_p.append(torch.tensor([0], dtype=torch.int, device=device))
1553
+ position_ids_p.append(torch.tensor([0], dtype=torch.int, device=device))
1554
+ labels_p.append(torch.tensor([IGNORE_INDEX], dtype=torch.int, device=device))
1555
+
1556
+ # Mask the first token of each sequence to avoid contamination
1557
+ for label in labels_p:
1558
+ label[0] = IGNORE_INDEX
1559
+
1560
+ # Batch the data
1561
+ inputs_embeds_p = torch.cat(inputs_embeds_p, dim=0).unsqueeze(0)
1562
+ attention_mask_p = torch.cat(attention_mask_p, dim=0).unsqueeze(0)
1563
+ position_ids_p = torch.cat(position_ids_p, dim=0).unsqueeze(0)
1564
+ labels_p = torch.cat(labels_p, dim=0).unsqueeze(0)
1565
+
1566
+ if hasattr(
1567
+ self, "pad_to_multiple_of"
1568
+ ): # related to quantization, please refer to ModelArguments for more information.
1569
+ assert len(labels_p.shape) == 2
1570
+ batch_size, max_length, cur_length = labels_p.shape[0], labels_p.shape[1], labels_p.shape[1]
1571
+ hidden_size = inputs_embeds_p.shape[-1]
1572
+
1573
+ if max_length % self.pad_to_multiple_of != 0:
1574
+ max_length = ((max_length // self.pad_to_multiple_of) + 1) * self.pad_to_multiple_of
1575
+ difference = max_length - cur_length
1576
+
1577
+ inputs_embeds_p = torch.cat(
1578
+ (
1579
+ inputs_embeds_p,
1580
+ torch.full((batch_size, difference, hidden_size), self.llm.pad_token_id).to(inputs_embeds_p),
1581
+ ),
1582
+ dim=1,
1583
+ )
1584
+ labels_p = torch.cat((labels_p, torch.full((batch_size, difference), IGNORE_INDEX).to(labels_p)), dim=1)
1585
+ attention_mask_p = torch.cat(
1586
+ (
1587
+ attention_mask_p,
1588
+ torch.zeros((batch_size, difference), dtype=torch.bool).to(attention_mask_p),
1589
+ ),
1590
+ dim=1,
1591
+ )
1592
+ position_ids_p = torch.cat(
1593
+ (position_ids_p, torch.full((batch_size, difference), -1).to(position_ids_p)), dim=1
1594
+ )
1595
+
1596
+ return inputs_embeds_p, attention_mask_p, position_ids_p, labels_p
1597
+
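The packing path above drops padding, concatenates every sequence in the batch into a single row, restarts `position_ids` at zero for each original sequence, and sets the first label of each sequence to `IGNORE_INDEX` so no loss leaks across sequence boundaries. Below is a minimal, self-contained sketch of that packing step on dummy tensors; the function name and shapes are illustrative, not part of the repository's API.

```python
import torch

IGNORE_INDEX = -100  # same convention as the code above

def pack_sequences(inputs_embeds, attention_mask, labels):
    """Pack a padded batch (B, T, H) into a single row (1, sum_of_lengths, H)."""
    device = inputs_embeds.device
    batch_size = inputs_embeds.shape[0]
    seqlens = [attention_mask[k].sum().item() for k in range(batch_size)]

    embeds, pos_ids, labs = [], [], []
    for k in range(batch_size):
        embeds.append(inputs_embeds[k][attention_mask[k]])                             # drop padding
        pos_ids.append(torch.arange(seqlens[k], dtype=torch.int, device=device))       # restart positions
        lab = labels[k][attention_mask[k]].clone()
        lab[0] = IGNORE_INDEX                                                           # no loss across the boundary
        labs.append(lab)

    return (torch.cat(embeds).unsqueeze(0),
            torch.cat(pos_ids).unsqueeze(0),
            torch.cat(labs).unsqueeze(0))

# Example: two sequences of lengths 3 and 2 packed into one row of length 5.
emb = torch.randn(2, 4, 8)
mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=torch.bool)
lbl = torch.randint(0, 10, (2, 4))
packed_emb, packed_pos, packed_lbl = pack_sequences(emb, mask, lbl)
print(packed_emb.shape, packed_pos.tolist())  # torch.Size([1, 5, 8]) [[0, 1, 2, 0, 1]]
```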
1598
+ def forward(
1599
+ self,
1600
+ input_ids: torch.LongTensor = None,
1601
+ media: Optional[Dict[str, List[torch.Tensor]]] = None,
1602
+ images: Optional[torch.FloatTensor] = None,
1603
+ media_config: Optional[List] = None,
1604
+ pixel_values: Optional[torch.FloatTensor] = None,
1605
+ attention_mask: Optional[torch.Tensor] = None,
1606
+ position_ids: Optional[torch.LongTensor] = None,
1607
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1608
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1609
+ labels: Optional[torch.LongTensor] = None,
1610
+ packing: bool = True,
1611
+ force_packing: bool = False,
1612
+ seqlens_in_batch: Optional[torch.LongTensor] = None,
1613
+ dpo_forward: bool = False,
1614
+ **kwargs,
1615
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1616
+ self.freezed_module_patch()
1617
+
1618
+ if images is not None:
1619
+ if media is not None:
1620
+ raise ValueError("Both 'media' and 'images' are provided. Please provide only one.")
1621
+ print("The 'images' argument is deprecated. Please use 'media' instead.")
1622
+ media = {"image": images}
1623
+
1624
+ if media_config is None:
1625
+ media_config = defaultdict(dict)
1626
+
1627
+ if inputs_embeds is None:
1628
+ inputs_embeds, labels, attention_mask = self._embed(input_ids, media, media_config, labels, attention_mask)
1629
+
1630
+ if force_packing or (packing and self.training and not dpo_forward):
1631
+ if seqlens_in_batch is None:
1632
+ seqlens_in_batch = torch.sum(attention_mask, dim=1)
1633
+ set_seqlens_in_batch(seqlens_in_batch)
1634
+
1635
+ (inputs_embeds, attention_mask, position_ids, labels) = self.repack_multimodal_data(
1636
+ inputs_embeds, attention_mask, position_ids, labels
1637
+ )
1638
+
1639
+ outputs = self.llm(
1640
+ inputs_embeds=inputs_embeds,
1641
+ attention_mask=attention_mask,
1642
+ position_ids=position_ids,
1643
+ past_key_values=past_key_values,
1644
+ labels=labels,
1645
+ **kwargs,
1646
+ )
1647
+
1648
+ if self.training and getattr(self.config, "time_token_ids", []):
1649
+ outputs.loss = soft_cross_entropy(
1650
+ outputs.logits,
1651
+ labels,
1652
+ soft_tokens=self.config.time_token_ids,
1653
+ std=self.config.soft_ce_std,
1654
+ )
1655
+
1656
+ if dpo_forward:
1657
+ return outputs.logits, labels
1658
+
1659
+ return outputs
1660
+
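When `time_token_ids` are configured, the standard loss is replaced by `soft_cross_entropy`, whose `std` argument suggests the target mass is smoothed over neighbouring time tokens rather than placed on a single one-hot label. The repository's `soft_cross_entropy` is defined elsewhere; the sketch below only illustrates the general idea of a Gaussian-smoothed target over an ordered set of time bins and should not be read as the actual implementation.

```python
import torch
import torch.nn.functional as F

def gaussian_soft_targets(label_pos, num_bins, std=1.0):
    """Soft target over `num_bins` ordered time bins, centred on `label_pos` (illustrative only)."""
    bins = torch.arange(num_bins, dtype=torch.float)
    weights = torch.exp(-0.5 * ((bins - label_pos) / std) ** 2)
    return weights / weights.sum()

# Example: 10 time bins, ground-truth bin 4 -> most target mass around bin 4.
target = gaussian_soft_targets(label_pos=4, num_bins=10, std=1.0)
logits = torch.randn(10)
loss = -(target * F.log_softmax(logits, dim=-1)).sum()
print(target, loss.item())
```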
1661
+ @torch.inference_mode()
1662
+ def generate(
1663
+ self,
1664
+ input_ids: Optional[torch.FloatTensor] = None,
1665
+ media: Optional[Dict[str, List[torch.Tensor]]] = None,
1666
+ media_config: Dict[str, Dict[str, Any]] = None,
1667
+ attention_mask: Optional[torch.LongTensor] = None,
1668
+ return_output_ids_only: bool = True,
1669
+ **generation_kwargs,
1670
+ ) -> torch.LongTensor:
1671
+ """
1672
+ input_tokens: <image> describe the image
1673
+ media: [Tensor(1, 3, 384, 384), ]
1674
+ ----------->
1675
+ input_tokens: 36000 001 002 003 004
1676
+ input_embeds: <media embed> 001 002 003 004
1677
+ """
1678
+ inputs_embeds, _, attention_mask = self._embed(input_ids, media, media_config, None, attention_mask)
1679
+
1680
+ output_ids = self.llm.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
1681
+
1682
+ if return_output_ids_only:
1683
+ return_value = output_ids
1684
+ else:
1685
+ # by default, return the input_ids and output_ids concatenated to keep consistency with the community VLMs like qwen
1686
+ generation_config = generation_kwargs.get("generation_config", None)
1687
+ if generation_config is not None:
1688
+ num_generations = generation_config.num_return_sequences
1689
+ repeat_input_ids = input_ids.repeat_interleave(num_generations, dim=0)
1690
+ return_value = torch.cat([repeat_input_ids, output_ids], dim=-1)
1691
+ else:
1692
+ return_value = torch.cat([input_ids, output_ids], dim=-1)
1693
+
1694
+ return return_value
1695
+
1696
+ @torch.inference_mode()
1697
+ def generate_content(
1698
+ self,
1699
+ prompt: Union[str, List],
1700
+ generation_config: Optional[GenerationConfig] = None,
1701
+ response_format=None,
1702
+ ) -> str:
1703
+ conversation = [{"from": "human", "value": prompt}]
1704
+
1705
+ # Convert response format to logits processor
1706
+ xgr_logits_processor = None
1707
+
1708
+ # Extract media from the conversation
1709
+
1710
+ media = extract_media(conversation, self.config)
1711
+
1712
+ # Process media
1713
+ media_config = defaultdict(dict)
1714
+ for name in media:
1715
+ if name == "image":
1716
+ if len(media["image"]) == 1 and self.config.image_aspect_ratio in ["dynamic", "dynamic_s2"]:
1717
+ self.config.image_processor = self.vision_tower.image_processor
1718
+ if self.config.image_aspect_ratio == "dynamic":
1719
+ images = process_image(media["image"][0], self.config, None, enable_dynamic_res=True).half()
1720
+ conversation[0]["value"] = conversation[0]["value"].replace(
1721
+ DEFAULT_IMAGE_TOKEN, f"{DEFAULT_IMAGE_TOKEN}\n" * images.shape[0]
1722
+ )
1723
+ else:
1724
+ if type(self.config.s2_scales) is str:
1725
+ self.config.s2_scales = list(map(int, self.config.s2_scales.split(",")))
1726
+ images, block_sizes = process_image(
1727
+ media["image"][0], self.config, None, enable_dynamic_s2=True
1728
+ )
1729
+ images = images.half()
1730
+ media_config[name]["block_sizes"] = [block_sizes]
1731
+ else:
1732
+ images = process_images(media["image"], self.vision_tower.image_processor, self.config).half()
1733
+ media[name] = [image for image in images]
1734
+ elif name == "video":
1735
+ if self.config.image_aspect_ratio == "dynamic" and self.config.video_max_tiles > 1:
1736
+ media[name] = [
1737
+ process_images(
1738
+ images,
1739
+ self.vision_tower.image_processor,
1740
+ self.config,
1741
+ enable_dynamic_res=True,
1742
+ max_tiles=self.config.video_max_tiles,
1743
+ ).half()
1744
+ for images in media[name]
1745
+ ]
1746
+ elif self.config.image_aspect_ratio == "dynamic_s2" and self.config.video_max_tiles > 1:
1747
+ self.config.image_processor = self.vision_tower.image_processor
1748
+ if type(self.config.s2_scales) is str:
1749
+ self.config.s2_scales = list(map(int, self.config.s2_scales.split(",")))
1750
+ media[name] = [
1751
+ torch.cat(
1752
+ [
1753
+ process_image(
1754
+ image,
1755
+ self.config,
1756
+ None,
1757
+ enable_dynamic_s2=True,
1758
+ max_tiles=self.config.video_max_tiles,
1759
+ )[0].half()
1760
+ for image in images
1761
+ ]
1762
+ )
1763
+ for images in media[name]
1764
+ ]
1765
+ else:
1766
+ media[name] = [
1767
+ process_images(images, self.vision_tower.image_processor, self.config)
1768
+ for images in media[name]
1769
+ ]
1770
+ elif name == "speech":
1771
+ speeches = media["speech"]
1772
+ media[name] = [speech for speech in speeches]
1773
+ elif name == "sound":
1774
+ # sounds = process_sounds(media["sound"]).half()
1775
+ sounds = media["sound"]
1776
+ # media[name] = [{k: v.half() for sound in sounds for k, v in sound.items()]
1777
+ for sound in sounds:
1778
+ if type(sound) is dict:
1779
+ for k, v in sound.items():
1780
+ sound[k] = v.half()
1781
+ media[name] = [sound for sound in sounds]
1782
+ elif name == "video_info":
1783
+ media[name] = [media["video_info"]]
1784
+ elif name == "audio_info":
1785
+ media[name] = [media["audio_info"]]
1786
+ else:
1787
+ raise ValueError(f"Unsupported media type: {name}")
1788
+
1789
+ # Tokenize the conversation
1790
+ input_ids = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True).unsqueeze(0).cuda()
1791
+
1792
+ # Set up the generation config
1793
+ generation_config = generation_config or self.default_generation_config
1794
+
1795
+ # Generate the response
1796
+ try:
1797
+ output_ids = self.generate(
1798
+ input_ids=input_ids,
1799
+ media=media,
1800
+ media_config=media_config,
1801
+ generation_config=generation_config,
1802
+ logits_processor=xgr_logits_processor, # structured generation
1803
+ )
1804
+ except ValueError:
1805
+ if not generation_config.do_sample:
1806
+ raise
1807
+ logging.warning("Generation failed with sampling, retrying with greedy decoding.")
1808
+ generation_config.do_sample = False
1809
+ output_ids = self.generate(
1810
+ input_ids=input_ids,
1811
+ media=media,
1812
+ media_config=media_config,
1813
+ generation_config=generation_config,
1814
+ logits_processor=xgr_logits_processor,
1815
+ )
1816
+
1817
+ # Decode the response
1818
+ response = self.tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
1819
+ return response
1820
+
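`generate_content` is the high-level entry point: it builds a single-turn conversation, extracts the media, tokenizes with the chat template, and decodes the response. A hedged end-to-end sketch is shown below; the `AutoModel.from_pretrained(..., trust_remote_code=True)` loading path and the list-of-parts prompt format (a PIL image followed by text) are assumptions based on the model card and the `Union[str, List]` signature above, so the exact loading call may differ.

```python
import torch
from PIL import Image
from transformers import AutoModel, GenerationConfig

# Assumption: the checkpoint exposes this model via trust_remote_code; loading details may differ.
model = AutoModel.from_pretrained(
    "nvidia/omnivinci", torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto"
).eval()

prompt = [Image.open("example.jpg"), "Describe the image in one sentence."]
response = model.generate_content(
    prompt,
    generation_config=GenerationConfig(max_new_tokens=128, do_sample=False),
)
print(response)
```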
1821
+ @property
1822
+ def default_generation_config(self) -> GenerationConfig:
1823
+ generation_config = copy.deepcopy(self.generation_config or GenerationConfig())
1824
+ if self.tokenizer.eos_token_id is None:
1825
+ raise ValueError("Tokenizer must have an EOS token")
1826
+ if generation_config.max_length == GenerationConfig().max_length:
1827
+ generation_config.max_length = self.tokenizer.model_max_length
1828
+ if generation_config.pad_token_id is None:
1829
+ generation_config.pad_token_id = self.tokenizer.pad_token_id or self.tokenizer.eos_token_id
1830
+ if generation_config.bos_token_id is None:
1831
+ generation_config.bos_token_id = self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
1832
+ if generation_config.eos_token_id is None:
1833
+ generation_config.eos_token_id = self.tokenizer.eos_token_id
1834
+ return generation_config
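`default_generation_config` only fills in token ids that are missing, falling back to the tokenizer's EOS token where necessary, so callers normally pass just the decoding parameters. The same fallback logic in isolation, using a standalone tokenizer (the checkpoint name below is an illustrative stand-in):

```python
from transformers import AutoTokenizer, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")  # illustrative tokenizer

cfg = GenerationConfig(max_new_tokens=64, do_sample=False)
# Fall back to the tokenizer's ids, and ultimately to EOS, exactly as the property above does.
cfg.pad_token_id = cfg.pad_token_id if cfg.pad_token_id is not None else (tokenizer.pad_token_id or tokenizer.eos_token_id)
cfg.bos_token_id = cfg.bos_token_id if cfg.bos_token_id is not None else (tokenizer.bos_token_id or tokenizer.eos_token_id)
cfg.eos_token_id = cfg.eos_token_id if cfg.eos_token_id is not None else tokenizer.eos_token_id
print(cfg.pad_token_id, cfg.bos_token_id, cfg.eos_token_id)
```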
preprocessor_config.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "chunk_length": 30,
3
+ "feature_extractor_type": "WhisperFeatureExtractor",
4
+ "feature_size": 128,
5
+ "hop_length": 160,
6
+ "n_fft": 400,
7
+ "n_samples": 480000,
8
+ "nb_max_frames": 3000,
9
+ "padding_side": "right",
10
+ "padding_value": 0.0,
11
+ "return_attention_mask": true,
12
+ "sampling_rate": 16000
13
+ }
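This preprocessor config describes a Whisper-style feature extractor: 128 mel bins over 16 kHz audio with a 160-sample hop (10 ms per frame), so one 30 s chunk of 480 000 samples becomes 3000 frames. A short sketch constructing the same extractor directly in Transformers and featurizing a clip (the silent waveform is just a placeholder):

```python
import numpy as np
from transformers import WhisperFeatureExtractor

# Same values as preprocessor_config.json.
extractor = WhisperFeatureExtractor(
    feature_size=128, sampling_rate=16000, hop_length=160, chunk_length=30, n_fft=400
)

waveform = np.zeros(16000 * 5, dtype=np.float32)  # 5 s of silence at 16 kHz
features = extractor(waveform, sampling_rate=16000, return_attention_mask=True, return_tensors="pt")
print(features["input_features"].shape)  # (1, 128, 3000): padded to a full 30 s chunk
```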
pyproject.toml ADDED
@@ -0,0 +1,59 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "omnivinci"
7
+ version = "1.0.0"
8
+ description = "omnivinci"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "License :: OSI Approved :: Apache Software License",
14
+ ]
15
+ dependencies = [
16
+ "torch==2.3.0", "torchvision==0.18.0",
17
+ "transformers==4.46.0", "tokenizers>=0.15.2", "sentencepiece==0.1.99", "shortuuid",
18
+ "accelerate==0.34.2", "peft>=0.9.0", "bitsandbytes==0.43.2",
19
+ "pydantic<2,>=1", "markdown2[all]", "numpy==1.26.4", "scikit-learn==1.2.2",
20
+ "gradio==3.35.2", "gradio_client==0.2.9",
21
+ "requests", "httpx", "uvicorn", "fastapi", "fire", "seaborn", "ring_flash_attn==0.1.1",
22
+ "einops==0.6.1", "einops-exts==0.0.4", "timm==0.9.12",
23
+ "openpyxl==3.1.2", "pytorchvideo==0.1.5", "decord==0.6.0",
24
+ "datasets==2.16.1", "openai==1.8.0", "webdataset==0.2.86",
25
+ "nltk==3.3", "pywsd==1.2.4", "opencv-python-headless==4.8.0.76",
26
+ "s2wrapper@git+https://github.com/bfshi/scaling_on_scales",
27
+ "tyro", "pytest", "pre-commit", "loguru", "hydra-core", "xgrammar"
28
+ ]
29
+
30
+ [project.scripts]
31
+
32
+ [project.optional-dependencies]
33
+ train = ["deepspeed==0.9.5", "ninja", "wandb"]
34
+ eval = ["word2number", "Levenshtein", "nltk", "pywsd"]
35
+
36
+ [project.urls]
37
+ "Homepage" = "https://github.com/NVlabs/OmniVinci"
38
+ "Bug Tracker" = "https://github.com/NVlabs/OmniVinci"
39
+
40
+ [tool.triton]
41
+ triton = {version = "3.0.0.post20240610003544", file = "https://aiinfra.pkgs.visualstudio.com/2692857e-05ef-43b4-ba9c-ccf1c22c437c/_packaging/07c94329-d4c3-4ad4-9e6b-f904a60032ec/pypi/download/triton-nightly/3.post20240610003544/triton_nightly-3.0.0.post20240610003544-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", sha256 = "ac2c36a49bf9c2bb780909b38096fb718f17efd78b88a1ca1d649f6d063cdc2c"}
42
+
43
+ [tool.black]
44
+ line-length = 120
45
+
46
+ [tool.isort]
47
+ profile = "black"
48
+ multi_line_output = 3
49
+ include_trailing_comma = true
50
+ force_grid_wrap = 0
51
+ use_parentheses = true
52
+ ensure_newline_before_comments = true
53
+ line_length = 120
54
+
55
+ [tool.setuptools.packages.find]
56
+ exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
57
+
58
+ [tool.wheel]
59
+ exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
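The dependency pins above are exact (for example `torch==2.3.0` and `transformers==4.46.0`), so mismatched environments are a common source of import-time or runtime errors. A small sketch that checks a few of the pins at runtime; the subset of packages verified here is just an illustration:

```python
from importlib.metadata import version, PackageNotFoundError

# Subset of the pins from pyproject.toml, chosen for illustration.
expected = {"torch": "2.3.0", "transformers": "4.46.0", "accelerate": "0.34.2"}

for name, want in expected.items():
    try:
        have = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed (expected {want})")
        continue
    status = "OK" if have == want else f"MISMATCH (expected {want})"
    print(f"{name}: {have} {status}")
```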
qwen2.jinja ADDED
@@ -0,0 +1,11 @@
1
+ {% if messages[0]['role'] != 'system' %}
2
+ {{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}
3
+ {% endif %}
4
+
5
+ {% for message in messages if message['content'] is not none %}
6
+ {{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}
7
+ {% endfor %}
8
+
9
+ {% if add_generation_prompt %}
10
+ {{ '<|im_start|>assistant\n' }}
11
+ {% endif %}
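qwen2.jinja is a Qwen2-style chat template: it injects a default system message when none is given, wraps each turn in `<|im_start|>role ... <|im_end|>` markers, and appends the assistant header when `add_generation_prompt` is set. A sketch of rendering it with a Transformers tokenizer; the tokenizer checkpoint here is an illustrative stand-in for any Qwen2-family tokenizer:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")  # illustrative tokenizer
chat_template = open("qwen2.jinja").read()

messages = [{"role": "user", "content": "Describe the image <image>"}]
text = tokenizer.apply_chat_template(
    messages, chat_template=chat_template, tokenize=False, add_generation_prompt=True
)
print(text)
# Prints the prompt wrapped in <|im_start|>/<|im_end|> markers, ending with the assistant header.
```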
qwen_audio_encoder.py ADDED
@@ -0,0 +1,89 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import torch
17
+ from transformers import PretrainedConfig, Qwen2AudioEncoder, Qwen2AudioForConditionalGeneration
18
+
19
+ from .audio_encoder import AudioTower
20
+
21
+ class Qwen2AudioTower(AudioTower):
22
+ def __init__(self, model_name_or_path: str, config: PretrainedConfig):
23
+ super().__init__(model_name_or_path, config)
24
+ self.audio_tower = Qwen2AudioEncoder.from_pretrained(model_name_or_path, attn_implementation="flash_attention_2")
25
+ self.is_loaded = True
26
+ self.audio_chunk_unit_duration = 30
27
+ self.audio_chunk_unit_length = 3000
28
+
29
+ def forward(self, sounds):
30
+ if type(sounds) is list:
31
+ sound_features = []
32
+ audio_output_lengths = []
33
+ for sound in sounds:
34
+ if hasattr(sound, "input_features") or (type(sound) is dict and "input_features" in sound):
35
+ sound = sound["input_features"]
36
+
37
+ sound_feature = self.forward_audio_tower_batch(sound)
38
+ sound_feature = sound_feature.to(sound.dtype)
39
+ sound_features.append(sound_feature)
40
+ audio_output_lengths.append(sound_feature.shape[1])
41
+ if len(sound_features) > 0:
42
+ sound_features = torch.cat(sound_features, dim=1).squeeze(0)
43
+ else:
44
+ raise NotImplementedError("Not implemented for this encoder")
45
+
46
+ return sound_features, audio_output_lengths
47
+
48
+
49
+ def forward_audio_tower_batch(self, inp):
50
+ """
51
+ Process long audio input by splitting into fixed-size chunks (30 seconds),
52
+ padding if needed, batching them together, and processing through the audio tower.
53
+
54
+ Args:
55
+ inp: Tensor of shape (batch_size, n_mels, seq_len)
56
+
57
+ Returns:
58
+ Tensor of shape (batch_size, num_chunks * chunk_seq_len, hidden_size)
59
+ """
60
+ batch_size, n_mels, seq_len = inp.shape
61
+ chunk_length = self.audio_chunk_unit_length
62
+ num_chunks = (seq_len + chunk_length - 1) // chunk_length # Ceiling division
63
+
64
+ padded_chunks = []
65
+
66
+ for i in range(num_chunks):
67
+ start_idx = i * chunk_length
68
+ end_idx = min(start_idx + chunk_length, seq_len)
69
+
70
+ # Extract and pad chunk if necessary
71
+ chunk = inp[:, :, start_idx:end_idx]
72
+ if chunk.shape[2] < chunk_length:
73
+ pad_len = chunk_length - chunk.shape[2]
74
+ chunk = torch.nn.functional.pad(chunk, (0, pad_len), mode='constant', value=0)
75
+
76
+ padded_chunks.append(chunk)
77
+
78
+ # Stack chunks along batch dimension
79
+ all_chunks = torch.cat(padded_chunks, dim=0).reshape(batch_size * num_chunks, n_mels, chunk_length)
80
+
81
+ # Forward pass through the audio tower
82
+ chunk_outputs = self.audio_tower(all_chunks)
83
+ hidden_states = chunk_outputs.last_hidden_state
84
+
85
+ # Reshape back to (batch_size, num_chunks * seq_len', hidden_size)
86
+ _, chunk_seq_len, hidden_size = hidden_states.shape
87
+ hidden_states = hidden_states.reshape(batch_size, num_chunks * chunk_seq_len, hidden_size)
88
+
89
+ return hidden_states
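`forward_audio_tower_batch` slices a long mel spectrogram into fixed 3000-frame (30 s) chunks, zero-pads the last chunk, and folds the chunks into the batch dimension so a single encoder pass covers the whole clip. The chunking arithmetic in isolation, with no encoder involved:

```python
import torch

chunk_length = 3000                      # 30 s at 100 mel frames per second
inp = torch.randn(1, 128, 7000)          # e.g. a 70 s clip -> 3 chunks
batch_size, n_mels, seq_len = inp.shape

num_chunks = (seq_len + chunk_length - 1) // chunk_length   # ceiling division -> 3
chunks = []
for i in range(num_chunks):
    chunk = inp[:, :, i * chunk_length : (i + 1) * chunk_length]
    if chunk.shape[2] < chunk_length:                        # zero-pad the tail chunk
        chunk = torch.nn.functional.pad(chunk, (0, chunk_length - chunk.shape[2]))
    chunks.append(chunk)

batched = torch.cat(chunks, dim=0)       # (batch_size * num_chunks, n_mels, chunk_length)
print(num_chunks, batched.shape)         # 3 torch.Size([3, 128, 3000])
```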
siglip_encoder.py ADDED
@@ -0,0 +1,293 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ from accelerate.hooks import add_hook_to_module
21
+ from einops import rearrange
22
+ from s2wrapper import forward as multiscale_forward
23
+ from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, SiglipImageProcessor
24
+ from transformers.image_processing_utils import BaseImageProcessor
25
+ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
26
+ from transformers.models.siglip import SiglipVisionModel
27
+
28
+
29
+ class VisionTower(nn.Module):
30
+ def __init__(self, vision_tower, args, delay_load=False):
31
+ super().__init__()
32
+
33
+ self.is_loaded = False
34
+
35
+ self.vision_tower_name = vision_tower
36
+ self.select_layer = getattr(args, "mm_vision_select_layer", -2)
37
+ self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
38
+
39
+ self.cfg_only = None
40
+
41
+ def feature_select(self, image_forward_outs):
42
+ image_features = image_forward_outs.hidden_states[self.select_layer]
43
+ if self.select_feature == "patch":
44
+ image_features = image_features[:, 1:]
45
+ elif self.select_feature == "cls_patch":
46
+ image_features = image_features
47
+ else:
48
+ raise ValueError(f"Unexpected select feature: {self.select_feature}")
49
+ return image_features
50
+
51
+ def _maybe_resize_pos_embeds(
52
+ self,
53
+ model: PreTrainedModel,
54
+ image_processor: BaseImageProcessor,
55
+ resolution: int = -1,
56
+ interpolate_mode: str = "linear",
57
+ ):
58
+ if resolution in [model.config.image_size, -1]:
59
+ return
60
+ print(
61
+ f"Resizing vision model's position embeddings to support higher vision resolution: from {model.config.image_size} to {resolution} ..."
62
+ )
63
+ embeddings = model.vision_model.embeddings
64
+ patch_size = embeddings.patch_size
65
+ num_new_tokens = int((resolution // patch_size) ** 2)
66
+
67
+ old_embeddings = embeddings.position_embedding
68
+ match interpolate_mode:
69
+ case "linear":
70
+ # Step 1: Calculate the corresponding patch ID (pid) in the current resolution (M patches) based on the target resolution (N patches). Formula: pid = pid / N * M
71
+ # Step 2: Obtain new embeddings by interpolating between the embeddings of the two nearest calculated patch IDs. Formula: new_embeds = (pid - floor(pid)) * embeds[ceil(pid)] + (ceil(pid) - pid) * embeds[floor(pid)]
72
+ import torch
73
+ import torch.nn as nn
74
+
75
+ if is_deepspeed_zero3_enabled():
76
+ try:
77
+ import deepspeed
78
+ except ImportError:
79
+ raise ImportError("DeepSpeed is not installed. Please install it with `pip install deepspeed`.")
80
+ with deepspeed.zero.GatheredParameters([old_embeddings.weight], modifier_rank=None):
81
+ old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
82
+ else:
83
+ old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
84
+ new_embeddings = nn.Embedding(
85
+ num_new_tokens,
86
+ old_embedding_dim,
87
+ dtype=old_embeddings.weight.dtype,
88
+ device=old_embeddings.weight.device,
89
+ )
90
+ mapped_indices = (
91
+ torch.arange(num_new_tokens).to(old_embeddings.weight.device)
92
+ / (num_new_tokens - 1)
93
+ * (old_num_tokens - 1)
94
+ )
95
+ floor_indices = torch.clamp(mapped_indices.floor().long(), min=0, max=old_num_tokens - 1)
96
+ ceil_indices = torch.clamp(mapped_indices.ceil().long(), min=0, max=old_num_tokens - 1)
97
+ if is_deepspeed_zero3_enabled():
98
+ params = [old_embeddings.weight, new_embeddings.weight]
99
+ with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
100
+ interpolated_embeds = (mapped_indices - floor_indices)[:, None] * old_embeddings.weight.data[
101
+ ceil_indices, :
102
+ ] + (1 - (mapped_indices - floor_indices))[:, None] * old_embeddings.weight.data[floor_indices, :]  # weight 1 - frac keeps rows where ceil == floor
103
+ else:
104
+ interpolated_embeds = (mapped_indices - floor_indices)[:, None] * old_embeddings.weight.data[
105
+ ceil_indices, :
106
+ ] + (1 - (mapped_indices - floor_indices))[:, None] * old_embeddings.weight.data[floor_indices, :]  # weight 1 - frac keeps rows where ceil == floor
107
+ new_embeddings.weight.data = interpolated_embeds
108
+ case _:
109
+ raise NotImplementedError
110
+
111
+ if hasattr(old_embeddings, "_hf_hook"):
112
+ hook = old_embeddings._hf_hook
113
+ add_hook_to_module(new_embeddings, hook)
114
+ new_embeddings.requires_grad_(old_embeddings.weight.requires_grad)
115
+
116
+ # Update vision encoder's configurations
117
+ model.config.image_size = resolution
118
+ if hasattr(image_processor, "crop_size"):
119
+ # CLIP vision tower
120
+ image_processor.crop_size = resolution
121
+ else:
122
+ # SIGLIP vision tower
123
+ assert hasattr(image_processor, "size")
124
+ image_processor.size = {"height": resolution, "width": resolution}
125
+
126
+ embeddings.position_embedding = new_embeddings
127
+ embeddings.image_size = resolution
128
+ embeddings.num_patches = embeddings.num_positions = num_new_tokens
129
+ embeddings.position_ids = (
130
+ torch.arange(embeddings.num_positions).expand((1, -1)).to(old_embeddings.weight.device)
131
+ )
132
+
133
+ def forward(self, images):
134
+ if type(images) is list:
135
+ image_features = []
136
+ for image in images:
137
+ image_forward_out = self.vision_tower(
138
+ image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
139
+ output_hidden_states=True,
140
+ )
141
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
142
+ image_features.append(image_feature)
143
+ else:
144
+ image_forward_outs = self.vision_tower(
145
+ images.to(device=self.device, dtype=self.dtype),
146
+ output_hidden_states=True,
147
+ )
148
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
149
+
150
+ return image_features
151
+
152
+ @property
153
+ def dummy_feature(self):
154
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
155
+
156
+ @property
157
+ def dtype(self):
158
+ return self.vision_tower.dtype
159
+
160
+ @property
161
+ def device(self):
162
+ return self.vision_tower.device
163
+
164
+ @property
165
+ def config(self):
166
+ if self.is_loaded:
167
+ return self.vision_tower.config
168
+ else:
169
+ return self.cfg_only
170
+
171
+ @property
172
+ def hidden_size(self):
173
+ return self.config.hidden_size
174
+
175
+ @property
176
+ def num_patches(self):
177
+ return (self.config.image_size // self.config.patch_size) ** 2
178
+
179
+
180
+ class VisionTowerS2(VisionTower):
181
+ def __init__(self, vision_tower, args, delay_load=False):
182
+ super().__init__(vision_tower, args, delay_load)
183
+
184
+ self.scales = list(map(int, args.s2_scales.split(",")))
185
+ self.scales.sort()
186
+ self.max_split_size = args.s2_max_split_size
187
+ self.resize_output_to_scale_idx = getattr(args, "s2_resize_output_to_scale_idx", 0)
188
+
189
+ def forward_feature(self, images):
190
+ image_forward_outs = self.vision_tower(
191
+ images.to(device=self.device, dtype=self.dtype), output_hidden_states=True
192
+ )
193
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
194
+ return image_features
195
+
196
+ def forward(self, images):
197
+ if type(images) is list:
198
+ image_features = []
199
+ for image in images:
200
+ image_feature = multiscale_forward(
201
+ self.forward_feature,
202
+ image.unsqueeze(0),
203
+ img_sizes=self.scales,
204
+ max_split_size=self.max_split_size,
205
+ resize_output_to_idx=self.resize_output_to_scale_idx,
206
+ )
207
+ image_features.append(image_feature)
208
+ else:
209
+ image_features = multiscale_forward(
210
+ self.forward_feature,
211
+ images,
212
+ img_sizes=self.scales,
213
+ max_split_size=self.max_split_size,
214
+ resize_output_to_idx=self.resize_output_to_scale_idx,
215
+ )
216
+
217
+ return image_features
218
+
219
+ @property
220
+ def hidden_size(self):
221
+ return self.config.hidden_size * len(self.scales)
222
+
223
+
224
+ class VisionTowerDynamicS2(VisionTower):
225
+ def __init__(self, vision_tower, args, delay_load=False):
226
+ super().__init__(vision_tower, args, delay_load)
227
+
228
+ self.scales = list(map(int, args.s2_scales.split(",")))
229
+ self.scales.sort()
230
+ self.max_split_size = args.s2_max_split_size
231
+ self.resize_output_to_scale_idx = getattr(args, "s2_resize_output_to_scale_idx", 0)
232
+
233
+ def forward_feature(self, images):
234
+ image_forward_outs = self.vision_tower(
235
+ images.to(device=self.device, dtype=self.dtype), output_hidden_states=True
236
+ )
237
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
238
+ return image_features
239
+
240
+ def forward(self, images):
241
+ assert type(images) is not list
242
+ image_features = self.forward_feature(images)
243
+
244
+ return image_features
245
+
246
+ @property
247
+ def hidden_size(self):
248
+ return self.config.hidden_size * len(self.scales)
249
+
250
+
251
+ class SiglipVisionTower(VisionTower):
252
+ def __init__(self, model_name_or_path: str, config: PretrainedConfig) -> None:
253
+ super().__init__(model_name_or_path, config)
254
+ self.vision_tower = SiglipVisionModel.from_pretrained(
255
+ model_name_or_path,
256
+ attn_implementation=config._attn_implementation,
257
+ torch_dtype=eval(config.model_dtype),
258
+ )
259
+ self.image_processor = SiglipImageProcessor.from_pretrained(model_name_or_path)
260
+ self.is_loaded = True
261
+
262
+
263
+ class SiglipVisionTowerS2(VisionTowerS2):
264
+ def __init__(self, model_name_or_path: str, config: PretrainedConfig) -> None:
265
+ super().__init__(model_name_or_path, config)
266
+ self.vision_tower = SiglipVisionModel.from_pretrained(
267
+ model_name_or_path,
268
+ attn_implementation=config._attn_implementation,
269
+ torch_dtype=eval(config.model_dtype),
270
+ )
271
+ self.image_processor = SiglipImageProcessor.from_pretrained(model_name_or_path)
272
+ # Make sure it crops/resizes the image to the largest scale in self.scales to maintain high-res information
273
+ self.image_processor.size["height"] = self.image_processor.size["width"] = self.scales[-1]
274
+ self.is_loaded = True
275
+
276
+
277
+ class SiglipVisionTowerDynamicS2(VisionTowerDynamicS2):
278
+ def __init__(self, model_name_or_path: str, config: PretrainedConfig) -> None:
279
+ super().__init__(model_name_or_path, config)
280
+ if type(config.model_dtype) == str:
281
+ model_dtype = eval(config.model_dtype)
282
+ else:
283
+ model_dtype = config.model_dtype
284
+
285
+ self.vision_tower = SiglipVisionModel.from_pretrained(
286
+ model_name_or_path,
287
+ attn_implementation="flash_attention_2",
288
+ torch_dtype=model_dtype,
289
+ )
290
+ self.image_processor = SiglipImageProcessor.from_pretrained(model_name_or_path)
291
+ # Set the processor's crop/resize size to the base scale in self.scales; dynamic S2 constructs the higher-resolution views by tiling
292
+ self.image_processor.size["height"] = self.image_processor.size["width"] = self.scales[0]
293
+ self.is_loaded = True
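`_maybe_resize_pos_embeds` maps each new patch position into the old position range and mixes the two nearest old embedding rows. The core interpolation on a toy table, with `1 - frac` as the weight on the floor row so positions that map exactly onto an old index copy that row:

```python
import torch

old = torch.arange(5, dtype=torch.float).unsqueeze(1).repeat(1, 4)   # 5 old positions, dim 4
old_num, new_num = old.shape[0], 9

mapped = torch.arange(new_num, dtype=torch.float) / (new_num - 1) * (old_num - 1)
floor_idx = mapped.floor().long().clamp(0, old_num - 1)
ceil_idx = mapped.ceil().long().clamp(0, old_num - 1)

frac = (mapped - floor_idx)[:, None]
new = frac * old[ceil_idx] + (1 - frac) * old[floor_idx]
print(new[:, 0])  # tensor([0.0, 0.5, 1.0, ..., 4.0]): positions spread evenly over the old range
```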
sound_base_projector.py ADDED
@@ -0,0 +1,126 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ from einops import rearrange
20
+ from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel
21
+
22
+ class SoundMultimodalProjectorConfig(PretrainedConfig):
23
+ """Configuration for sound multimodal projector."""
24
+
25
+ model_type = "sound_mm_projector"
26
+
27
+ def __init__(self, sound_mm_projector_type: str = None, **kwargs):
28
+ super().__init__()
29
+ self.sound_mm_projector_type = sound_mm_projector_type
30
+
31
+
32
+ class AudioDownSampleBlock(nn.Module):
33
+ """Downsample audio features using 1D convolution."""
34
+ def __init__(self, embed_dim):
35
+ super().__init__()
36
+ self.conv1 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
37
+
38
+ def forward(self, x):
39
+ x = rearrange(x, "b t c -> b c t")
40
+ x = self.conv1(x)
41
+ x = rearrange(x, "b c t -> b t c")
42
+ return x
43
+
44
+ class AudioDownSamplePoolBlock(nn.Module):
45
+ """Downsample audio features using average pooling."""
46
+
47
+ def __init__(self, embed_dim):
48
+ super().__init__()
49
+ self.pool = nn.AvgPool1d(kernel_size=2)
50
+
51
+ def forward(self, x):
52
+ x = rearrange(x, "b t c -> b c t")
53
+ x = self.pool(x)
54
+ x = rearrange(x, "b c t -> b t c")
55
+ return x
56
+
57
+
58
+ class AudioDownSampleMaxPoolBlock(nn.Module):
59
+ """Downsample audio features using max pooling."""
60
+
61
+ def __init__(self, embed_dim):
62
+ super().__init__()
63
+ self.pool = nn.MaxPool1d(kernel_size=2)
64
+
65
+ def forward(self, x):
66
+ x = rearrange(x, "b t c -> b c t")
67
+ x = self.pool(x)
68
+ x = rearrange(x, "b c t -> b t c")
69
+ return x
70
+
71
+
72
+ class SoundMultimodalProjector(PreTrainedModel):
73
+ """Sound multimodal projector for mapping audio features to LLM space."""
74
+ config_class = SoundMultimodalProjectorConfig
75
+
76
+ def __init__(self, sound_mm_projector_cfg: SoundMultimodalProjectorConfig, config: PretrainedConfig):
77
+ super().__init__(sound_mm_projector_cfg)
78
+ if hasattr(config, "sound_mm_projector"):
79
+ sound_mm_projector_type = config.sound_mm_projector
80
+ else:
81
+ sound_mm_projector_type = sound_mm_projector_cfg.sound_mm_projector_type
82
+ self.sound_mm_projector_type = sound_mm_projector_type
83
+ self.config.sound_mm_projector_type = sound_mm_projector_type
84
+
85
+ if hasattr(config, "sound_mm_projector_cfg") and type(config.sound_mm_projector_cfg) == dict:
86
+ config.sound_mm_projector_cfg["sound_mm_projector_type"] = sound_mm_projector_type
87
+
88
+ if sound_mm_projector_type == "mlp":
89
+ self.layers = nn.Sequential(
90
+ nn.Linear(config.sound_hidden_size, config.hidden_size),
91
+ nn.GELU(),
92
+ nn.Linear(config.hidden_size, config.hidden_size),
93
+ )
94
+ elif sound_mm_projector_type == "mlp_downsample":
95
+ self.downsample_block = AudioDownSampleBlock(config.sound_hidden_size)
96
+ self.layers = nn.Sequential(
97
+ nn.Linear(config.sound_hidden_size, config.hidden_size),
98
+ nn.GELU(),
99
+ nn.Linear(config.hidden_size, config.hidden_size),
100
+ )
101
+ elif sound_mm_projector_type == "mlp_downsample_pool":
102
+ self.downsample_block = AudioDownSamplePoolBlock(config.sound_hidden_size)
103
+ self.layers = nn.Sequential(
104
+ nn.Linear(config.sound_hidden_size, config.hidden_size),
105
+ nn.GELU(),
106
+ nn.Linear(config.hidden_size, config.hidden_size),
107
+ )
108
+ elif sound_mm_projector_type == "mlp_downsample_pool_max":
109
+ self.downsample_block = AudioDownSampleMaxPoolBlock(config.sound_hidden_size)
110
+ self.layers = nn.Sequential(
111
+ nn.Linear(config.sound_hidden_size, config.hidden_size),
112
+ nn.GELU(),
113
+ nn.Linear(config.hidden_size, config.hidden_size),
114
+ )
115
+ else:
116
+ raise ValueError(f"Unknown projector type: {sound_mm_projector_type}")
117
+
118
+
119
+ def forward(self, x, *args, **kwargs):
120
+ if self.sound_mm_projector_type in ["mlp_downsample", "mlp_downsample_pool", "mlp_downsample_pool_max"]:
121
+ x = self.downsample_block(x)
122
+ return self.layers(x)
123
+
124
+
125
+ AutoConfig.register("sound_mm_projector", SoundMultimodalProjectorConfig)
126
+ AutoModel.register(SoundMultimodalProjectorConfig, SoundMultimodalProjector)
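The `mlp_downsample*` projector variants first halve the temporal length of the audio features (a stride-2 Conv1d, or average/max pooling with window 2) and then map from `sound_hidden_size` to the LLM's `hidden_size` with a two-layer MLP. A standalone sketch of the shape bookkeeping; the hidden sizes below are illustrative (1280 matches the sound tower's `d_model`, the LLM width is a placeholder):

```python
import torch
import torch.nn as nn
from einops import rearrange

sound_hidden_size, llm_hidden_size = 1280, 3584   # illustrative sizes

downsample = nn.Conv1d(sound_hidden_size, sound_hidden_size, kernel_size=3, stride=2, padding=1)
mlp = nn.Sequential(
    nn.Linear(sound_hidden_size, llm_hidden_size),
    nn.GELU(),
    nn.Linear(llm_hidden_size, llm_hidden_size),
)

x = torch.randn(1, 750, sound_hidden_size)   # (batch, time, channels) audio features
x = rearrange(x, "b t c -> b c t")           # Conv1d expects channels-first
x = downsample(x)                            # stride 2 halves the time axis: 750 -> 375
x = rearrange(x, "b c t -> b t c")
x = mlp(x)
print(x.shape)                               # torch.Size([1, 375, 3584])
```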
sound_mm_projector/config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "_name_or_path": "/lustre/fs12/portfolios/llmservice/projects/llmservice_fm_vision/users/hanrongy/project/vila/VILA-Internal/../exp_log/nvomni-8b-video-0d1-trope128_omniT_ras_n16_bs2048_ga8_mstep-1_j20250718/outputs/model/sound_mm_projector",
3
+ "architectures": [
4
+ "SoundMultimodalProjector"
5
+ ],
6
+ "model_type": "sound_mm_projector",
7
+ "sound_mm_projector_type": "mlp",
8
+ "torch_dtype": "bfloat16",
9
+ "transformers_version": "4.46.0"
10
+ }
sound_mm_projector/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb57ebfdeb51af4a1c0de931fd43e6a4b93277552ad02ad01b1d9ba720bcb9a4
3
+ size 34879856
sound_tower/config.json ADDED
@@ -0,0 +1,50 @@
1
+ {
2
+ "_name_or_path": "outputs/model/sound_tower",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "architectures": [
6
+ "Qwen2AudioEncoder"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "audio_config": {
10
+ "activation_function": "gelu",
11
+ "d_model": 1280,
12
+ "encoder_attention_heads": 20,
13
+ "encoder_ffn_dim": 5120,
14
+ "encoder_layers": 32,
15
+ "max_source_positions": 1500,
16
+ "model_type": "qwen2_audio_encoder",
17
+ "num_mel_bins": 128,
18
+ "scale_embedding": false
19
+ },
20
+ "audio_token_index": 151646,
21
+ "d_model": 1280,
22
+ "dropout": 0.0,
23
+ "encoder_attention_heads": 20,
24
+ "encoder_ffn_dim": 5120,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 32,
27
+ "ignore_index": -100,
28
+ "init_std": 0.02,
29
+ "max_source_positions": 1500,
30
+ "model_type": "qwen2_audio_encoder",
31
+ "num_hidden_layers": 32,
32
+ "num_mel_bins": 128,
33
+ "scale_embedding": false,
34
+ "text_config": {
35
+ "bos_token_id": 151643,
36
+ "eos_token_id": 151645,
37
+ "intermediate_size": 11008,
38
+ "max_position_embeddings": 8192,
39
+ "model_type": "qwen2",
40
+ "rms_norm_eps": 1e-05,
41
+ "rope_theta": 10000,
42
+ "sliding_window": 32768,
43
+ "torch_dtype": "bfloat16",
44
+ "use_mrope": false,
45
+ "vocab_size": 156032
46
+ },
47
+ "torch_dtype": "bfloat16",
48
+ "transformers_version": "4.46.0",
49
+ "vocab_size": 156032
50
+ }
sound_tower/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01fcbfa8ac3d3bc4c5ab97c439dfecfea2a9c2e061031280efed292fc37b4a44
3
+ size 1273988176