tongww commited on
Commit
4cffcdc
·
verified ·
1 Parent(s): 3512073

upload initial model

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ taozi.wav filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,889 @@
1
- ---
2
- license: mit
3
- ---
1
+ ---
2
+ license: mit
3
+ pipeline_tag: any-to-any
4
+ language:
5
+ - zh
6
+ - en
7
+ ---
8
+
9
+ # InteractiveOmni
10
+
11
+ <p align="center">
12
+ InteractiveOmni-4B <a href="https://huggingface.co/sensenova/InteractiveOmni-4B">🤗</a>&nbsp; | InteractiveOmni-8B <a href="https://huggingface.co/sensenova/InteractiveOmni-8B">🤗</a>&nbsp; | 📑 <a href="https://arxiv.org/abs/2510.13747">Paper</a> &nbsp;&nbsp;
13
+ </p>
14
+
15
+
16
+ ## Introduction
17
+ InteractiveOmni is a unified omni-modal model that can simultaneously accept image, audio, text, and video inputs and
18
+ directly generate coherent text and speech streams, enabling truly integrated interaction.
19
+
20
+ The diagram below illustrates multi-turn audio-visual interaction.
21
+ <p align="center">
22
+ <img src="https://raw.github.com/SenseTime-FVG/InteractiveOmni/main/assets/demo_interaction.png" width="99%"/>
23
+ </p>
24
+
25
+ ### Key Features
26
+ * **Strong Performance Across Modalities:** Exhibits omni-modal understanding and speech generation capabilities, outperforming similarly sized vision-language, audio-language, and omni-modal models.
27
+ * **State-of-the-Art Performance:** Achieves SOTA results on various open-source benchmarks for image, audio, and video understanding, as well as speech conversation.
28
+ * **Excellent Interactive Performance:** Delivers a more intelligent audio-visual experience with multi-turn and long-term memory capabilities.
29
+ * **Multi-turn Interactive Benchmarks:** Proposes a multi-modal, multi-turn benchmark to evaluate the multi-turn memory and speech interaction of leading MLLMs.
30
+ * **On-device Model:** The 4B model achieves 97% of the 8B model's performance with just 50% of its size.
31
+ ### Model Architecture
32
+ <p align="center">
33
+ <img src="https://raw.github.com/SenseTime-FVG/InteractiveOmni/main/assets/model_architecture.png" width="80%"/>
34
+ </p>
35
+
36
+
37
+ ## Quickstart
38
+ ### Get the Code
39
+ ```bash
40
+ git clone https://github.com/SenseTime-FVG/InteractiveOmni.git
41
+ cd InteractiveOmni
42
+ pip install -r requirements.txt
43
+ ```
44
+
45
+ We provide example code to run `InteractiveOmni` using 🤗 `Transformers`.
46
+
47
+ > Please use transformers>=4.51.0 and FlashAttention2 to ensure the model works as expected.
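+
+ As a quick sanity check, here is a minimal sketch for verifying the requirements above before loading the model (it assumes `packaging` is available, which it is as a `transformers` dependency):
+ ```python
+ import importlib.util
+
+ import transformers
+ from packaging import version
+
+ # Verify the transformers version required by this README.
+ assert version.parse(transformers.__version__) >= version.parse("4.51.0"), \
+     "Please upgrade: pip install -U 'transformers>=4.51.0'"
+
+ # FlashAttention2 must be installed (pip install flash-attn) so the model
+ # can run with attn_implementation="flash_attention_2".
+ assert importlib.util.find_spec("flash_attn") is not None, \
+     "Please install FlashAttention2: pip install flash-attn"
+ ```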
48
+ ### Model Loading
49
+ ```python
50
+ import torch
51
+ from transformers import AutoTokenizer, AutoModel
52
+ path = "sensefvg/InteractiveOmni-8B"
53
+ model = AutoModel.from_pretrained(
54
+ path,
55
+ torch_dtype=torch.bfloat16,
56
+ trust_remote_code=True).eval().cuda()
57
+ ```
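+
+ If the model does not fit on a single GPU, `from_pretrained` also accepts `device_map="auto"` (requires `accelerate`). Whether this remote-code model shards cleanly across devices has not been verified here, so treat the following as an optional variant:
+ ```python
+ import torch
+ from transformers import AutoModel
+
+ path = "sensefvg/InteractiveOmni-8B"
+ # Let accelerate place layers across the available GPUs automatically.
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+     trust_remote_code=True).eval()
+ ```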
58
+
59
+ ### Inference with Transformers
60
+
61
+ ```python
62
+ import torch
63
+ from transformers import AutoModel, AutoTokenizer
64
+ import torchaudio
65
+
66
+ path = "sensefvg/InteractiveOmni-8B"
67
+ model = AutoModel.from_pretrained(
68
+ path,
69
+ torch_dtype=torch.bfloat16,
70
+ trust_remote_code=True).eval().cuda()
71
+ tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=True)
72
+
73
+ # set the max number of tiles in `max_num`
74
+ max_num = 12
75
+ frame = 8
76
+ generation_config = dict(max_new_tokens=1024, do_sample=True)
77
+
78
+ # pure-text conversation (纯文本对话)
79
+ messages = [
80
+ {
81
+ 'role': "user",
82
+ 'content': 'Hello, who are you?',
83
+ }
84
+ ]
85
+ response = model.chat(tokenizer, generation_config, messages)
86
+
87
+ # audio conversation (音频对话)
88
+ messages = [
89
+ {
90
+ 'role': "user",
91
+ 'content': [
92
+ {
93
+ "type": "audio",
94
+ "audio": "assets/hello_en.wav"
95
+ }
96
+ ]
97
+ }
98
+ ]
99
+ response = model.chat(tokenizer, generation_config, messages)
100
+
101
+ ## Generate both audio and text output
102
+ messages = [
103
+ {
104
+ 'role': "user",
105
+ 'content': [
106
+ {
107
+ "type": "audio",
108
+ "audio": "assets/hello_zh.wav"
109
+ }
110
+ ]
111
+ }
112
+ ]
113
+ response, wav_response = model.chat(tokenizer, generation_config, messages, generate_audio=True)
114
+ torchaudio.save("result.wav", wav_response.cpu(), 24000, format="wav")
115
+
116
+ # image-text conversation (图文对话)
117
+ messages = [
118
+ {
119
+ 'role': "user",
120
+ 'content': [
121
+ {
122
+ "type": "image",
123
+ "image": 'assets/cat_cup.jpeg'
124
+ },
125
+ {
126
+ "type": "text",
127
+ "text": "Please describe the image shortly."
128
+ }
129
+ ]
130
+ }
131
+ ]
132
+ response = model.chat(tokenizer, generation_config, messages, max_num)
133
+
134
+ # image-audio conversation (图音对话)
135
+ messages = [
136
+ {
137
+ 'role': "user",
138
+ 'content': [
139
+ {
140
+ "type": "image",
141
+ "image": 'assets/cat_cup.jpeg'
142
+ },
143
+ {
144
+ "type": "audio",
145
+ "audio": "assets/describe_img_en.wav"
146
+ }
147
+ ]
148
+ }
149
+ ]
150
+ response = model.chat(tokenizer, generation_config, messages, max_num)
151
+
152
+ ## image-audio conversation, generate both audio and text output
153
+ messages = [
154
+ {
155
+ 'role': "user",
156
+ 'content': [
157
+ {
158
+ "type": "image",
159
+ "image": 'assets/cat_cup.jpeg'
160
+ },
161
+ {
162
+ "type": "audio",
163
+ "audio": "assets/describe_img_en.wav"
164
+ }
165
+ ]
166
+ }
167
+ ]
168
+ response, wav_response = model.chat(tokenizer, generation_config, messages, generate_audio=True)
169
+ torchaudio.save("result.wav", wav_response.cpu(), 24000, format="wav")
170
+
171
+ # video conversation (视频对话)
172
+ messages = [
173
+ {
174
+ 'role': "user",
175
+ 'content': [
176
+ {
177
+ "type": "video",
178
+ "video": 'video_path'
179
+ },
180
+ {
181
+ "type": "text",
182
+ "text": "Describe this video in detail."
183
+ }
184
+ ]
185
+ }
186
+ ]
187
+ response = model.chat(tokenizer, generation_config, messages, max_num, frame)
188
+ ```
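+
+ Since multi-turn memory is a core capability, here is a minimal multi-turn sketch. It assumes `model.chat` accepts previous assistant replies as plain-text `content` in the message history; the single-turn examples above do not show this explicitly, so treat the history format as an assumption:
+ ```python
+ # Turn 1: ask about the image.
+ messages = [
+     {
+         'role': "user",
+         'content': [
+             {"type": "image", "image": 'assets/cat_cup.jpeg'},
+             {"type": "text", "text": "Please describe the image shortly."}
+         ]
+     }
+ ]
+ response = model.chat(tokenizer, generation_config, messages, max_num)
+
+ # Turn 2: append the assistant reply and a follow-up question that relies on
+ # the earlier context, then call chat again with the full history.
+ messages.append({'role': "assistant", 'content': response})
+ messages.append({
+     'role': "user",
+     'content': [{"type": "text", "text": "What color is the cup you just mentioned?"}]
+ })
+ response = model.chat(tokenizer, generation_config, messages, max_num)
+ ```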
189
+
190
+ ### Use audio output
191
+ * If you need audio output, the system prompt must be set as follows; otherwise, audio output may not work as expected.
192
+ ```
193
+ You are a highly advanced multimodal conversational AI designed for human-like interaction. You can perceive auditory, visual, speech, and textual inputs, and generate text and speech.
194
+ ```
195
+ ```python
196
+ messages = [
197
+ {
198
+ "role": "system",
199
+ "content": "You are a highly advanced multimodal conversational AI designed for human-like interaction. You can perceive auditory, visual, speech, and textual inputs, and generate text and speech."
200
+ },
201
+ {
202
+ 'role': "user",
203
+ 'content': [
204
+ {
205
+ "type": "audio",
206
+ "audio": "assets/hello_zh.wav",
207
+ }
208
+ ]
209
+ }
210
+ ]
211
+ response, wav_response = model.chat(tokenizer, generation_config, messages, generate_audio=True)
212
+ torchaudio.save("result_none_speaker.wav", wav_response.cpu(), 24000, format="wav")
213
+ ```
214
+ * Use the default speaker to generate the output audio.
215
+ ```python
216
+ messages = [
217
+ {
218
+ "role": "system",
219
+ "content": "You are a highly advanced multimodal conversational AI designed for human-like interaction. You can perceive auditory, visual, speech, and textual inputs, and generate text and speech."
220
+ },
221
+ {
222
+ 'role': "user",
223
+ 'content': [
224
+ {
225
+ "type": "audio",
226
+ "audio": "assets/hello_zh.wav",
227
+ }
228
+ ]
229
+ }
230
+ ]
231
+ response, wav_response = model.chat(tokenizer, generation_config, messages, generate_audio=True, speaker_embedding=model.default_speaker_embedding)
232
+ torchaudio.save("result_default_speaker.wav", wav_response.cpu(), 24000, format="wav")
233
+ ```
234
+ * Use a custom speaker to generate the output audio, similar to voice cloning.
235
+ ```python
236
+ messages = [
237
+ {
238
+ "role": "system",
239
+ "content": "You are a highly advanced multimodal conversational AI designed for human-like interaction. You can perceive auditory, visual, speech, and textual inputs, and generate text and speech."
240
+ },
241
+ {
242
+ 'role': "user",
243
+ 'content': [
244
+ {
245
+ "type": "audio",
246
+ "audio": "assets/hello_zh.wav",
247
+ }
248
+ ]
249
+ }
250
+ ]
251
+ speaker_embedding = model.extract_speaker_embedding("assets/hello_zh.wav")
252
+ response, wav_response = model.chat(tokenizer, generation_config, messages, generate_audio=True, speaker_embedding=speaker_embedding)
253
+ torchaudio.save("result_custom_speaker.wav", wav_response.cpu(), 24000, format="wav")
254
+ ```
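+
+ The bundled audio front-end is configured for 16 kHz input (see `audio_preprocessor_config` in `config.json`), while generated audio is saved at 24 kHz. Below is a minimal sketch for preparing an arbitrary reference recording before extracting a speaker embedding; the input file name is hypothetical, and whether `extract_speaker_embedding` resamples internally is not documented, so resampling beforehand is a conservative assumption:
+ ```python
+ import torchaudio
+
+ def prepare_reference(path, target_sr=16000):
+     """Down-mix to mono and resample a reference recording to 16 kHz."""
+     wav, sr = torchaudio.load(path)              # (channels, samples)
+     if wav.size(0) > 1:
+         wav = wav.mean(dim=0, keepdim=True)      # down-mix to mono
+     if sr != target_sr:
+         wav = torchaudio.transforms.Resample(sr, target_sr)(wav)
+     out_path = "reference_16k.wav"
+     torchaudio.save(out_path, wav, target_sr, format="wav")
+     return out_path
+
+ ref_path = prepare_reference("my_voice_sample.wav")   # hypothetical recording
+ speaker_embedding = model.extract_speaker_embedding(ref_path)
+ ```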
255
+
256
+ ## Evaluation
257
+ InteractiveOmni achieves state-of-the-art performance across a wide range of multi-modal understanding and speech generation benchmarks.
258
+ <p align="center">
259
+ <img src="https://raw.github.com/SenseTime-FVG/InteractiveOmni/main/assets/radar_chart.png" width="70%"/>
260
+ </p>
261
+
262
+ <details>
263
+ <summary>Image Understanding</summary>
264
+
265
+ <table style="width:100%; border-collapse: collapse;">
266
+ <thead>
267
+ <tr style="border-bottom: 1px solid black;">
268
+ <th align="left" style="padding: 8px;">Model</th>
269
+ <th align="center" style="padding: 8px;">MMBench</th>
270
+ <th align="center" style="padding: 8px;">MMStar</th>
271
+ <th align="center" style="padding: 8px;">MMMU</th>
272
+ <th align="center" style="padding: 8px;">MathVista</th>
273
+ <th align="center" style="padding: 8px;">HallusionBench</th>
274
+ <th align="center" style="padding: 8px;">AI2D</th>
275
+ <th align="center" style="padding: 8px;">OCRBench</th>
276
+ <th align="center" style="padding: 8px;">Avg</th>
277
+ </tr>
278
+ </thead>
279
+ <tbody>
280
+ <tr>
281
+ <td colspan="9" align="center" style="font-weight: bold; border-top: 1px solid #ddd; border-bottom: 1px solid black;">Vision-Language Model</td>
282
+ </tr>
283
+ <tr>
284
+ <td align="left" style="padding: 8px;">InternVL3-8B</td>
285
+ <td align="center" style="padding: 8px;">82.1</td>
286
+ <td align="center" style="padding: 8px;">68.7</td>
287
+ <td align="center" style="padding: 8px;">62.2</td>
288
+ <td align="center" style="padding: 8px;">70.5</td>
289
+ <td align="center" style="padding: 8px;">49.0</td>
290
+ <td align="center" style="padding: 8px;">85.1</td>
291
+ <td align="center" style="padding: 8px;">88.4</td>
292
+ <td align="center" style="padding: 8px;">72.3</td>
293
+ </tr>
294
+ <tr>
295
+ <td align="left" style="padding: 8px;">InternVL3.5-8B</td>
296
+ <td align="center" style="padding: 8px;">79.5</td>
297
+ <td align="center" style="padding: 8px;">69.3</td>
298
+ <td align="center" style="padding: 8px;">73.4</td>
299
+ <td align="center" style="padding: 8px;">78.4</td>
300
+ <td align="center" style="padding: 8px;">54.5</td>
301
+ <td align="center" style="padding: 8px;">84.0</td>
302
+ <td align="center" style="padding: 8px;">84.0</td>
303
+ <td align="center" style="padding: 8px;">74.7</td>
304
+ </tr>
305
+ <tr>
306
+ <td align="left" style="padding: 8px;">Qwen2.5-VL-7B</td>
307
+ <td align="center" style="padding: 8px;">82.2</td>
308
+ <td align="center" style="padding: 8px;">64.1</td>
309
+ <td align="center" style="padding: 8px;">58.0</td>
310
+ <td align="center" style="padding: 8px;">68.1</td>
311
+ <td align="center" style="padding: 8px;">51.9</td>
312
+ <td align="center" style="padding: 8px;">84.3</td>
313
+ <td align="center" style="padding: 8px;">88.8</td>
314
+ <td align="center" style="padding: 8px;">71.1</td>
315
+ </tr>
316
+ <tr>
317
+ <td colspan="9" align="center" style="font-weight: bold; border-top: 1px solid black; border-bottom: 1px solid black;">Omni Model</td>
318
+ </tr>
319
+ <tr>
320
+ <td align="left" style="padding: 8px;">GPT-4o-mini</td>
321
+ <td align="center" style="padding: 8px;">76.0</td>
322
+ <td align="center" style="padding: 8px;">54.8</td>
323
+ <td align="center" style="padding: 8px;">60.0</td>
324
+ <td align="center" style="padding: 8px;">52.5</td>
325
+ <td align="center" style="padding: 8px;">46.1</td>
326
+ <td align="center" style="padding: 8px;">77.8</td>
327
+ <td align="center" style="padding: 8px;">78.5</td>
328
+ <td align="center" style="padding: 8px;">63.7</td>
329
+ </tr>
330
+ <tr>
331
+ <td align="left" style="padding: 8px;">VITA-1.5</td>
332
+ <td align="center" style="padding: 8px;">76.8</td>
333
+ <td align="center" style="padding: 8px;">60.2</td>
334
+ <td align="center" style="padding: 8px;">52.6</td>
335
+ <td align="center" style="padding: 8px;">66.2</td>
336
+ <td align="center" style="padding: 8px;">44.6</td>
337
+ <td align="center" style="padding: 8px;">79.2</td>
338
+ <td align="center" style="padding: 8px;">74.1</td>
339
+ <td align="center" style="padding: 8px;">64.8</td>
340
+ </tr>
341
+ <tr>
342
+ <td align="left" style="padding: 8px;">Ming-Lite-Omni</td>
343
+ <td align="center" style="padding: 8px;">80.8</td>
344
+ <td align="center" style="padding: 8px;">64.7</td>
345
+ <td align="center" style="padding: 8px;">56.3</td>
346
+ <td align="center" style="padding: 8px;">71.6</td>
347
+ <td align="center" style="padding: 8px;">55.0</td>
348
+ <td align="center" style="padding: 8px;">83.1</td>
349
+ <td align="center" style="padding: 8px;">88.4</td>
350
+ <td align="center" style="padding: 8px;">71.4</td>
351
+ </tr>
352
+ <tr>
353
+ <td align="left" style="padding: 8px;">Qwen2.5-Omni-7B</td>
354
+ <td align="center" style="padding: 8px;">81.3</td>
355
+ <td align="center" style="padding: 8px;">64.0</td>
356
+ <td align="center" style="padding: 8px;">59.2</td>
357
+ <td align="center" style="padding: 8px;">67.9</td>
358
+ <td align="center" style="padding: 8px;">47.4</td>
359
+ <td align="center" style="padding: 8px;">83.2</td>
360
+ <td align="center" style="padding: 8px;">83.4</td>
361
+ <td align="center" style="padding: 8px;">69.5</td>
362
+ </tr>
363
+ <tr>
364
+ <td align="left" style="padding: 8px;">InteractiveOmni-4B</td>
365
+ <td align="center" style="padding: 8px;">78.9</td>
366
+ <td align="center" style="padding: 8px;">62.6</td>
367
+ <td align="center" style="padding: 8px;">61.1</td>
368
+ <td align="center" style="padding: 8px;">61.7</td>
369
+ <td align="center" style="padding: 8px;">52.2</td>
370
+ <td align="center" style="padding: 8px;">83.8</td>
371
+ <td align="center" style="padding: 8px;">80.0</td>
372
+ <td align="center" style="padding: 8px;">68.6</td>
373
+ </tr>
374
+ <tr>
375
+ <td align="left" style="padding: 8px;">InteractiveOmni-8B</td>
376
+ <td align="center" style="padding: 8px;"><strong>81.4</strong></td>
377
+ <td align="center" style="padding: 8px;"><strong>66.8</strong></td>
378
+ <td align="center" style="padding: 8px;"><strong>66.9</strong></td>
379
+ <td align="center" style="padding: 8px;">68.0</td>
380
+ <td align="center" style="padding: 8px;"><strong>61.3</strong></td>
381
+ <td align="center" style="padding: 8px;"><strong>84.3</strong></td>
382
+ <td align="center" style="padding: 8px;">83.7</td>
383
+ <td align="center" style="padding: 8px;"><strong>73.2</strong></td>
384
+ </tr>
385
+ </tbody>
386
+ </table>
387
+
388
+ </details>
389
+
390
+ <details>
391
+ <summary>Video Understanding</summary>
392
+
393
+ <table style="width:100%; border-collapse: collapse;">
394
+ <thead>
395
+ <tr style="border-bottom: 1px solid black;">
396
+ <th align="left" style="padding: 8px;">Model</th>
397
+ <th align="center" style="padding: 8px;">Video-MME<br>(wo sub)</th>
398
+ <th align="center" style="padding: 8px;">Video-MME<br>(w sub)</th>
399
+ <th align="center" style="padding: 8px;">MLVU<br>(M-Avg)</th>
400
+ <th align="center" style="padding: 8px;">LongVideoBench<br>(val total)</th>
401
+ <th align="center" style="padding: 8px;">Avg</th>
402
+ </tr>
403
+ </thead>
404
+ <tbody>
405
+ <tr>
406
+ <td colspan="6" align="center" style="font-weight: bold; border-top: 1px solid #ddd; border-bottom: 1px solid black;">Vision-Language Model</td>
407
+ </tr>
408
+ <tr>
409
+ <td align="left" style="padding: 8px;">InternVL3-8B</td>
410
+ <td align="center" style="padding: 8px;"><strong>66.3</strong></td>
411
+ <td align="center" style="padding: 8px;">68.9</td>
412
+ <td align="center" style="padding: 8px;">71.4</td>
413
+ <td align="center" style="padding: 8px;">58.8</td>
414
+ <td align="center" style="padding: 8px;">66.4</td>
415
+ </tr>
416
+ <tr>
417
+ <td align="left" style="padding: 8px;">InternVL3.5-8B</td>
418
+ <td align="center" style="padding: 8px;">66.0</td>
419
+ <td align="center" style="padding: 8px;">68.6</td>
420
+ <td align="center" style="padding: 8px;">70.2</td>
421
+ <td align="center" style="padding: 8px;">62.1</td>
422
+ <td align="center" style="padding: 8px;">66.7</td>
423
+ </tr>
424
+ <tr>
425
+ <td align="left" style="padding: 8px;">Qwen2.5-VL-7B</td>
426
+ <td align="center" style="padding: 8px;">65.1</td>
427
+ <td align="center" style="padding: 8px;">71.6</td>
428
+ <td align="center" style="padding: 8px;">70.2</td>
429
+ <td align="center" style="padding: 8px;">56.0</td>
430
+ <td align="center" style="padding: 8px;">64.5</td>
431
+ </tr>
432
+ <tr>
433
+ <td colspan="6" align="center" style="font-weight: bold; border-top: 1px solid black; border-bottom: 1px solid black;">Omni Model</td>
434
+ </tr>
435
+ <tr>
436
+ <td align="left" style="padding: 8px;">GPT-4o-mini</td>
437
+ <td align="center" style="padding: 8px;">64.8</td>
438
+ <td align="center" style="padding: 8px;">-</td>
439
+ <td align="center" style="padding: 8px;">-</td>
440
+ <td align="center" style="padding: 8px;">-</td>
441
+ <td align="center" style="padding: 8px;">-</td>
442
+ </tr>
443
+ <tr>
444
+ <td align="left" style="padding: 8px;">Qwen2.5-Omni-7B</td>
445
+ <td align="center" style="padding: 8px;">64.3</td>
446
+ <td align="center" style="padding: 8px;"><strong>72.4</strong></td>
447
+ <td align="center" style="padding: 8px;">-</td>
448
+ <td align="center" style="padding: 8px;">-</td>
449
+ <td align="center" style="padding: 8px;">-</td>
450
+ </tr>
451
+ <tr>
452
+ <td align="left" style="padding: 8px;">InteractiveOmni-4B</td>
453
+ <td align="center" style="padding: 8px;">63.3</td>
454
+ <td align="center" style="padding: 8px;">69.3</td>
455
+ <td align="center" style="padding: 8px;">68.0</td>
456
+ <td align="center" style="padding: 8px;">57.0</td>
457
+ <td align="center" style="padding: 8px;">64.4</td>
458
+ </tr>
459
+ <tr>
460
+ <td align="left" style="padding: 8px;">InteractiveOmni-8B</td>
461
+ <td align="center" style="padding: 8px;">66.0</td>
462
+ <td align="center" style="padding: 8px;">71.8</td>
463
+ <td align="center" style="padding: 8px;"><strong>71.6</strong></td>
464
+ <td align="center" style="padding: 8px;">59.1</td>
465
+ <td align="center" style="padding: 8px;"><strong>67.1</strong></td>
466
+ </tr>
467
+ </tbody>
468
+ </table>
469
+
470
+ </details>
471
+
472
+ <details>
473
+ <summary>Audio Understanding</summary>
474
+
475
+ <table style="width:100%; border-collapse: collapse;">
476
+ <thead>
477
+ <tr>
478
+ <th align="left" style="padding: 8px;">Model</th>
479
+ <th align="center" style="padding: 8px;">Qwen2-Audio</th>
480
+ <th align="center" style="padding: 8px;">Step-Audio-Chat</th>
481
+ <th align="center" style="padding: 8px;">Kimi-Audio</th>
482
+ <th align="center" style="padding: 8px;">Qwen2.5-Omni-7B</th>
483
+ <th align="center" style="padding: 8px;">InteractiveOmni-4B</th>
484
+ <th align="center" style="padding: 8px;">InteractiveOmni-8B</th>
485
+ </tr>
486
+ </thead>
487
+ <tbody>
488
+ <tr>
489
+ <td colspan="9" align="center" style="font-weight: bold; border-top: 1px solid #ddd; border-bottom: 1px solid black;">ASR (WER)</td>
490
+ </tr>
491
+ <tr>
492
+ <td align="left" style="padding: 8px;">Wenetspeech<br><em>test-net</em></td>
493
+ <td align="center" style="padding: 8px;">10.60</td>
494
+ <td align="center" style="padding: 8px;">8.75</td>
495
+ <td align="center" style="padding: 8px;">5.37</td>
496
+ <td align="center" style="padding: 8px;">5.90</td>
497
+ <td align="center" style="padding: 8px;">5.40</td>
498
+ <td align="center" style="padding: 8px;"><strong>5.04</strong></td>
499
+ </tr>
500
+ <tr>
501
+ <td align="left" style="padding: 8px;">Wenetspeech<br><em>test-meeting</em></td>
502
+ <td align="center" style="padding: 8px;">10.68</td>
503
+ <td align="center" style="padding: 8px;">9.52</td>
504
+ <td align="center" style="padding: 8px;">6.28</td>
505
+ <td align="center" style="padding: 8px;">7.70</td>
506
+ <td align="center" style="padding: 8px;">6.95</td>
507
+ <td align="center" style="padding: 8px;"><strong>5.55</strong></td>
508
+ </tr>
509
+ <tr>
510
+ <td align="left" style="padding: 8px;">LibriSpeech<br><em>test-clean</em></td>
511
+ <td align="center" style="padding: 8px;">1.60</td>
512
+ <td align="center" style="padding: 8px;">3.19</td>
513
+ <td align="center" style="padding: 8px;"><strong>1.28</strong></td>
514
+ <td align="center" style="padding: 8px;">1.80</td>
515
+ <td align="center" style="padding: 8px;">1.73</td>
516
+ <td align="center" style="padding: 8px;">1.64</td>
517
+ </tr>
518
+ <tr>
519
+ <td align="left" style="padding: 8px;">LibriSpeech<br><em>test-other</em></td>
520
+ <td align="center" style="padding: 8px;">3.60</td>
521
+ <td align="center" style="padding: 8px;">10.67</td>
522
+ <td align="center" style="padding: 8px;"><strong>2.42</strong></td>
523
+ <td align="center" style="padding: 8px;">3.40</td>
524
+ <td align="center" style="padding: 8px;">3.69</td>
525
+ <td align="center" style="padding: 8px;">3.41</td>
526
+ </tr>
527
+ <tr>
528
+ <td align="left" style="padding: 8px;">Aishell-2 IOS</td>
529
+ <td align="center" style="padding: 8px;">4.48</td>
530
+ <td align="center" style="padding: 8px;">3.57</td>
531
+ <td align="center" style="padding: 8px;">2.56</td>
532
+ <td align="center" style="padding: 8px;">2.56</td>
533
+ <td align="center" style="padding: 8px;">2.85</td>
534
+ <td align="center" style="padding: 8px;"><strong>2.18</strong></td>
535
+ </tr>
536
+ <tr>
537
+ <td align="left" style="padding: 8px;">ChildMandarin</td>
538
+ <td align="center" style="padding: 8px;">14.62</td>
539
+ <td align="center" style="padding: 8px;">-</td>
540
+ <td align="center" style="padding: 8px;">-</td>
541
+ <td align="center" style="padding: 8px;">19.34</td>
542
+ <td align="center" style="padding: 8px;">17.21</td>
543
+ <td align="center" style="padding: 8px;"><strong>14.03</strong></td>
544
+ </tr>
545
+ <tr>
546
+ <td colspan="9" align="center" style="font-weight: bold; border-top: 1px solid #ddd; border-bottom: 1px solid black;">Audio Understanding</td>
547
+ </tr>
548
+ <tr>
549
+ <td align="left" style="padding: 8px;">MMAU</td>
550
+ <td align="center" style="padding: 8px;">56.60</td>
551
+ <td align="center" style="padding: 8px;">-</td>
552
+ <td align="center" style="padding: 8px;">65.20</td>
553
+ <td align="center" style="padding: 8px;">65.60</td>
554
+ <td align="center" style="padding: 8px;"><strong>72.00</strong></td>
555
+ <td align="center" style="padding: 8px;">67.39</td>
556
+ </tr>
557
+ <tr>
558
+ <td align="left" style="padding: 8px;">MELD</td>
559
+ <td align="center" style="padding: 8px;">55.30</td>
560
+ <td align="center" style="padding: 8px;">33.54</td>
561
+ <td align="center" style="padding: 8px;"><strong>59.13</strong></td>
562
+ <td align="center" style="padding: 8px;">57.00</td>
563
+ <td align="center" style="padding: 8px;">57.16</td>
564
+ <td align="center" style="padding: 8px;">57.55</td>
565
+ </tr>
566
+ <tr>
567
+ <td align="left" style="padding: 8px;">ClothoAQA<br><em>dev</em></td>
568
+ <td align="center" style="padding: 8px;">72.63</td>
569
+ <td align="center" style="padding: 8px;">44.98</td>
570
+ <td align="center" style="padding: 8px;"><strong>73.18</strong></td>
571
+ <td align="center" style="padding: 8px;">73.12</td>
572
+ <td align="center" style="padding: 8px;">71.91</td>
573
+ <td align="center" style="padding: 8px;">72.98</td>
574
+ </tr>
575
+ <tr>
576
+ <td align="left" style="padding: 8px;">ClothoAQA<br><em>test</em></td>
577
+ <td align="center" style="padding: 8px;">71.73</td>
578
+ <td align="center" style="padding: 8px;">45.84</td>
579
+ <td align="center" style="padding: 8px;">71.24</td>
580
+ <td align="center" style="padding: 8px;">72.86</td>
581
+ <td align="center" style="padding: 8px;">71.28</td>
582
+ <td align="center" style="padding: 8px;"><strong>74.49</strong></td>
583
+ </tr>
584
+ </tbody>
585
+ </table>
586
+
587
+
588
+ </details>
589
+
590
+ <details>
591
+ <summary>Omni-modal Understanding</summary>
592
+
593
+ <table>
594
+ <thead>
595
+ <tr>
596
+ <th>Model</th>
597
+ <th>Speech</th>
598
+ <th>Sound Event</th>
599
+ <th>Music</th>
600
+ <th>Avg</th>
601
+ </tr>
602
+ </thead>
603
+ <tbody>
604
+ <tr>
605
+ <td colspan="9" align="center" style="font-weight: bold; border-top: 1px solid #ddd; border-bottom: 1px solid black;">OmniBench</td>
606
+ </tr>
607
+ <tr>
608
+ <td align="left" style="padding: 8px;">MiniCPM-o-2.6</td>
609
+ <td align="center" style="padding: 8px;">-</td>
610
+ <td align="center" style="padding: 8px;">-</td>
611
+ <td align="center" style="padding: 8px;">-</td>
612
+ <td align="center" style="padding: 8px;">40.50</td>
613
+ </tr>
614
+ <tr>
615
+ <td align="left" style="padding: 8px;">Baichuan-Omni-1.5</td>
616
+ <td align="center" style="padding: 8px;">-</td>
617
+ <td align="center" style="padding: 8px;">-</td>
618
+ <td align="center" style="padding: 8px;">-</td>
619
+ <td align="center" style="padding: 8px;">42.90</td>
620
+ </tr>
621
+ <tr>
622
+ <td align="left" style="padding: 8px;">Qwen2.5-Omni-7B</td>
623
+ <td align="center" style="padding: 8px;">55.25</td>
624
+ <td align="center" style="padding: 8px;">60.00</td>
625
+ <td align="center" style="padding: 8px;">52.83</td>
626
+ <td align="center" style="padding: 8px;">56.13</td>
627
+ </tr>
628
+ <tr>
629
+ <td align="left" style="padding: 8px;">InteractiveOmni-4B</td>
630
+ <td align="center" style="padding: 8px;"><strong>60.70</strong></td>
631
+ <td align="center" style="padding: 8px;">61.51</td>
632
+ <td align="center" style="padding: 8px;">42.45</td>
633
+ <td align="center" style="padding: 8px;">59.19</td>
634
+ </tr>
635
+ <tr>
636
+ <td align="left" style="padding: 8px;">InteractiveOmni-8B</td>
637
+ <td align="center" style="padding: 8px;">60.18</td>
638
+ <td align="center" style="padding: 8px;"><strong>62.64</strong></td>
639
+ <td align="center" style="padding: 8px;"><strong>55.66</strong></td>
640
+ <td align="center" style="padding: 8px;"><strong>60.33</strong></td>
641
+ </tr>
642
+ </tbody>
643
+ </table>
644
+
645
+ </details>
646
+
647
+
648
+ <details>
649
+
650
+ <summary>Speech-to-text</summary>
651
+
652
+ <table>
653
+ <thead>
654
+ <tr>
655
+ <th align="left">Datasets</th>
656
+ <th align="left">Model</th>
657
+ <th align="left">Performance</th>
658
+ </tr>
659
+ </thead>
660
+ <tbody>
661
+ <tr>
662
+ <td rowspan="11" align="center" valign="middle"><strong>OpenAudioBench</strong><br><em>Reasoning QA</em> | <em>Llama Questions</em> <br>| <em>Web Questions</em> | <em>TriviaQA</em><br> | <em>AlpacaEval</em> | <em>Avg</em></td>
663
+ <td align="left">Qwen2-Audio</td>
664
+ <td align="left">42.77 | 69.67 | 45.20 | 40.30 | 57.19 | 51.03</td>
665
+ </tr>
666
+ <tr>
667
+ <td align="left">GLM-4-Voice</td>
668
+ <td align="left">47.43 | 76.00 | 55.40 | 51.80 | 57.89 | 57.70</td>
669
+ </tr>
670
+ <tr>
671
+ <td align="left">VITA-1.5</td>
672
+ <td align="left">41.00 | 74.20 | 57.30 | 46.80 | 68.20 | 57.50</td>
673
+ </tr>
674
+ <tr>
675
+ <td align="left">Step-Audio-chat</td>
676
+ <td align="left">60.00 | 72.33 | <strong>73.00</strong> | 56.80 | 56.53 | 63.73</td>
677
+ </tr>
678
+ <tr>
679
+ <td align="left">Baichuan-Audio</td>
680
+ <td align="left">41.90 | 78.40 | 64.50 | 61.70 | 77.40 | 64.78</td>
681
+ </tr>
682
+ <tr>
683
+ <td align="left">Kimi-Audio</td>
684
+ <td align="left">58.02 | 79.33 | 70.20 | 62.10 | 75.73 | 69.08</td>
685
+ </tr>
686
+ <tr>
687
+ <td align="left">MiniCPM-o-2.6</td>
688
+ <td align="left">38.60 | 77.80 | 68.60 | 61.90 | 51.80 | 59.74</td>
689
+ </tr>
690
+ <tr>
691
+ <td align="left">Baichuan-Omni-1.5</td>
692
+ <td align="left">50.00 | 78.50 | 59.10 | 57.20 | <strong>77.90</strong> | 64.54</td>
693
+ </tr>
694
+ <tr>
695
+ <td align="left">Qwen2.5-Omni-7B</td>
696
+ <td align="left">63.76 | 75.33 | 62.80 | 57.06 | 72.76 | 66.34</td>
697
+ </tr>
698
+ <tr>
699
+ <td align="left">InteractiveOmni-4B</td>
700
+ <td align="left">69.11 | 79.33 | 65.80 | 56.40 | 74.87 | 69.10</td>
701
+ </tr>
702
+ <tr>
703
+ <td align="left">InteractiveOmni-8B</td>
704
+ <td align="left"><strong>71.68</strong> | <strong>80.67</strong> | 70.30 | <strong>66.50</strong> | 74.57 | <strong>72.74</strong></td>
705
+ </tr>
706
+ <tr style="border-top: 1px solid #333;">
707
+ </tr>
708
+ <tr>
709
+ <td rowspan="11" align="center" valign="middle"><strong>VoiceBench</strong><br><em>AlpacaEval</em> | <em>CommonEval</em> <br>| <em>WildVoice</em> | <em>SD-QA</em> | <em>MMSU</em></td>
710
+ <td align="left">Qwen2-Audio</td>
711
+ <td align="left">3.69 | 3.40 | 3.01 | 35.35 | 35.43</td>
712
+ </tr>
713
+ <tr>
714
+ <td align="left">GLM-4-Voice</td>
715
+ <td align="left">4.06 | 3.48 | 3.18 | 43.31 | 40.11</td>
716
+ </tr>
717
+ <tr>
718
+ <td align="left">VITA-1.5</td>
719
+ <td align="left">4.21 | 3.66 | 3.48 | 38.88 | 52.15</td>
720
+ </tr>
721
+ <tr>
722
+ <td align="left">Step-Audio-chat</td>
723
+ <td align="left">3.99 | 2.99 | 2.93 | 46.84 | 28.72</td>
724
+ </tr>
725
+ <tr>
726
+ <td align="left">Baichuan-Audio</td>
727
+ <td align="left">4.41 | 4.08 | 3.92 | 45.84 | 53.19</td>
728
+ </tr>
729
+ <tr>
730
+ <td align="left">Kimi-Audio</td>
731
+ <td align="left">4.46 | 3.97 | 4.20 | <strong>63.12</strong> | 62.17</td>
732
+ </tr>
733
+ <tr>
734
+ <td align="left">MiniCPM-o-2.6</td>
735
+ <td align="left">4.42 | 4.15 | 3.94 | 50.72 | 54.78</td>
736
+ </tr>
737
+ <tr>
738
+ <td align="left">Baichuan-Omni-1.5</td>
739
+ <td align="left">4.50 | 4.05 | 4.06 | 43.40 | 57.25</td>
740
+ </tr>
741
+ <tr>
742
+ <td align="left">Qwen2.5-Omni-7B</td>
743
+ <td align="left">4.50 | 3.84 | 3.89 | 56.40 | 61.32</td>
744
+ </tr>
745
+ <tr>
746
+ <td align="left">InteractiveOmni-4B</td>
747
+ <td align="left">4.27 | 4.20 | 3.94 | 41.41 | 63.24</td>
748
+ </tr>
749
+ <tr>
750
+ <td align="left">InteractiveOmni-8B</td>
751
+ <td align="left"><strong>4.61</strong> | <strong>4.34</strong> | <strong>4.21</strong> | 44.67 | <strong>65.26</strong></td>
752
+ </tr>
753
+ <tr style="border-top: 1px solid #333;">
754
+ </tr>
755
+ <tr>
756
+ <td rowspan="11" align="center" valign="middle"><strong>VoiceBench</strong><br><em>OpenBookQA</em> | <em>IFEval</em> <br>| <em>BBH</em> | <em>AdvBench</em> | <em>Avg</em></td>
757
+ <td align="left">Qwen2-Audio</td>
758
+ <td align="left">49.01 | 54.70 | 22.57 | 98.85 | 55.32</td>
759
+ </tr>
760
+ <tr>
761
+ <td align="left">GLM-4-Voice</td>
762
+ <td align="left">52.97 | 52.80 | 24.91 | 88.08 | 57.40</td>
763
+ </tr>
764
+ <tr>
765
+ <td align="left">VITA-1.5</td>
766
+ <td align="left">71.65 | 55.30 | 38.14 | 97.69 | 64.53</td>
767
+ </tr>
768
+ <tr>
769
+ <td align="left">Step-Audio-chat</td>
770
+ <td align="left">31.87 | 50.60 | 29.19 | 65.77 | 50.13</td>
771
+ </tr>
772
+ <tr>
773
+ <td align="left">Baichuan-Audio</td>
774
+ <td align="left">71.65 | 54.80 | 50.31 | 99.42 | 69.27</td>
775
+ </tr>
776
+ <tr>
777
+ <td align="left">Kimi-Audio</td>
778
+ <td align="left">83.52 | 69.70 | <strong>61.10</strong> | <strong>100.0</strong> | <strong>76.91</strong></td>
779
+ </tr>
780
+ <tr>
781
+ <td align="left">MiniCPM-o-2.6</td>
782
+ <td align="left">78.02 | 60.40 | 49.25 | 97.69 | 71.23</td>
783
+ </tr>
784
+ <tr>
785
+ <td align="left">Baichuan-Omni-1.5</td>
786
+ <td align="left">74.51 | 62.70 | 54.54 | 97.31 | 71.32</td>
787
+ </tr>
788
+ <tr>
789
+ <td align="left">Qwen2.5-Omni-7B</td>
790
+ <td align="left">80.90 | 66.70 | 53.50 | 99.20 | 73.60</td>
791
+ </tr>
792
+ <tr>
793
+ <td align="left">InteractiveOmni-4B</td>
794
+ <td align="left">82.64 | 55.90 | 60.90 | 99.62 | 73.10</td>
795
+ </tr>
796
+ <tr>
797
+ <td align="left">InteractiveOmni-8B</td>
798
+ <td align="left"><strong>86.37</strong> | <strong>73.30</strong> | 57.99 | 99.42 | 76.69</td>
799
+ </tr>
800
+ </tbody>
801
+ </table>
802
+
803
+ </details>
804
+
805
+ <details>
806
+ <summary>Speech Generation</summary>
807
+
808
+ <table>
809
+ <thead>
810
+ <tr>
811
+ <th>Model</th>
812
+ <th>test-zh</th>
813
+ <th>test-en</th>
814
+ <th>test-zh-hard</th>
815
+ </tr>
816
+ </thead>
817
+ <tbody>
818
+ <tr>
819
+ <td colspan="9" align="center" style="font-weight: bold; border-top: 1px solid #ddd; border-bottom: 1px solid black;">TTS Model</td>
820
+ </tr>
821
+ <tr>
822
+ <td align="left" style="padding: 8px;">MaskGCT</td>
823
+ <td align="center" style="padding: 8px;">2.27</td>
824
+ <td align="center" style="padding: 8px;">2.62</td>
825
+ <td align="center" style="padding: 8px;">10.27</td>
826
+ </tr>
827
+ <tr>
828
+ <td align="left" style="padding: 8px;">SeedTTS</td>
829
+ <td align="center" style="padding: 8px;">1.12</td>
830
+ <td align="center" style="padding: 8px;">2.25</td>
831
+ <td align="center" style="padding: 8px;">7.59</td>
832
+ </tr>
833
+ <tr>
834
+ <td align="left" style="padding: 8px;">CosyVoice 2</td>
835
+ <td align="center" style="padding: 8px;">1.45</td>
836
+ <td align="center" style="padding: 8px;">2.57</td>
837
+ <td align="center" style="padding: 8px;">6.83</td>
838
+ </tr>
839
+ <tr>
840
+ <td colspan="9" align="center" style="font-weight: bold; border-top: 1px solid #ddd; border-bottom: 1px solid black;">MLLM</td>
841
+ </tr>
842
+ <tr>
843
+ <td align="left" style="padding: 8px;">MinMo</td>
844
+ <td align="center" style="padding: 8px;">2.48</td>
845
+ <td align="center" style="padding: 8px;">2.90</td>
846
+ <td align="center" style="padding: 8px;">-</td>
847
+ </tr>
848
+ <tr>
849
+ <td align="left" style="padding: 8px;">Ming-Lite-Omni</td>
850
+ <td align="center" style="padding: 8px;">1.69</td>
851
+ <td align="center" style="padding: 8px;">4.31</td>
852
+ <td align="center" style="padding: 8px;">-</td>
853
+ </tr>
854
+ <tr>
855
+ <td align="left" style="padding: 8px;">Qwen2.5-Omni-7B</td>
856
+ <td align="center" style="padding: 8px;">1.70</td>
857
+ <td align="center" style="padding: 8px;">2.72</td>
858
+ <td align="center" style="padding: 8px;">7.97</td>
859
+ </tr>
860
+ <tr>
861
+ <td align="left" style="padding: 8px;">InteractiveOmni-4B</td>
862
+ <td align="center" style="padding: 8px;"><strong>1.37</strong></td>
863
+ <td align="center" style="padding: 8px;">3.73</td>
864
+ <td align="center" style="padding: 8px;">8.02</td>
865
+ </tr>
866
+ <tr>
867
+ <td align="left" style="padding: 8px;">InteractiveOmni-8B</td>
868
+ <td align="center" style="padding: 8px;">1.56</td>
869
+ <td align="center" style="padding: 8px;"><strong>2.33</strong></td>
870
+ <td align="center" style="padding: 8px;"><strong>7.92</strong></td>
871
+ </tr>
872
+ </tbody>
873
+ </table>
874
+
875
+ </details>
876
+
877
+
878
+ ## Citation
879
+ If you find our paper and code useful in your research, please cite our technical report.
880
+ ```bibtex
881
+ @misc{tong2025interactiveomniunifiedomnimodalmodel,
882
+ title={InteractiveOmni: A Unified Omni-modal Model for Audio-Visual Multi-turn Dialogue},
883
+ author={Wenwen Tong and Hewei Guo and Dongchuan Ran and Jiangnan Chen and Jiefan Lu and Kaibin Wang and Keqiang Li and Xiaoxu Zhu and Jiakui Li and Kehan Li and Xueheng Li and Lumin Li and Chenxu Guo and Jiasheng Zhou and Jiandong Chen and Xianye Wu and Jiahao Wang and Silei Wu and Lei Chen and Hanming Deng and Yuxuan Song and Dinghao Zhou and Guiping Zhong and Ken Zheng and Shiyin Kang and Lewei Lu},
884
+ year={2025},
885
+ eprint={2510.13747},
886
+ archivePrefix={arXiv},
887
+ primaryClass={cs.CV},
888
+ url={https://arxiv.org/abs/2510.13747},
889
+ }
+ ```
added_tokens.json ADDED
@@ -0,0 +1,327 @@
1
+ {
2
+ "</audio>": 151937,
3
+ "</box>": 151677,
4
+ "</img>": 151671,
5
+ "</quad>": 151673,
6
+ "</ref>": 151675,
7
+ "</think>": 151668,
8
+ "</tool_call>": 151658,
9
+ "</tool_response>": 151666,
10
+ "<AUDIO_CONTEXT>": 151938,
11
+ "<FAKE_PAD_0>": 151682,
12
+ "<FAKE_PAD_100>": 151782,
13
+ "<FAKE_PAD_101>": 151783,
14
+ "<FAKE_PAD_102>": 151784,
15
+ "<FAKE_PAD_103>": 151785,
16
+ "<FAKE_PAD_104>": 151786,
17
+ "<FAKE_PAD_105>": 151787,
18
+ "<FAKE_PAD_106>": 151788,
19
+ "<FAKE_PAD_107>": 151789,
20
+ "<FAKE_PAD_108>": 151790,
21
+ "<FAKE_PAD_109>": 151791,
22
+ "<FAKE_PAD_10>": 151692,
23
+ "<FAKE_PAD_110>": 151792,
24
+ "<FAKE_PAD_111>": 151793,
25
+ "<FAKE_PAD_112>": 151794,
26
+ "<FAKE_PAD_113>": 151795,
27
+ "<FAKE_PAD_114>": 151796,
28
+ "<FAKE_PAD_115>": 151797,
29
+ "<FAKE_PAD_116>": 151798,
30
+ "<FAKE_PAD_117>": 151799,
31
+ "<FAKE_PAD_118>": 151800,
32
+ "<FAKE_PAD_119>": 151801,
33
+ "<FAKE_PAD_11>": 151693,
34
+ "<FAKE_PAD_120>": 151802,
35
+ "<FAKE_PAD_121>": 151803,
36
+ "<FAKE_PAD_122>": 151804,
37
+ "<FAKE_PAD_123>": 151805,
38
+ "<FAKE_PAD_124>": 151806,
39
+ "<FAKE_PAD_125>": 151807,
40
+ "<FAKE_PAD_126>": 151808,
41
+ "<FAKE_PAD_127>": 151809,
42
+ "<FAKE_PAD_128>": 151810,
43
+ "<FAKE_PAD_129>": 151811,
44
+ "<FAKE_PAD_12>": 151694,
45
+ "<FAKE_PAD_130>": 151812,
46
+ "<FAKE_PAD_131>": 151813,
47
+ "<FAKE_PAD_132>": 151814,
48
+ "<FAKE_PAD_133>": 151815,
49
+ "<FAKE_PAD_134>": 151816,
50
+ "<FAKE_PAD_135>": 151817,
51
+ "<FAKE_PAD_136>": 151818,
52
+ "<FAKE_PAD_137>": 151819,
53
+ "<FAKE_PAD_138>": 151820,
54
+ "<FAKE_PAD_139>": 151821,
55
+ "<FAKE_PAD_13>": 151695,
56
+ "<FAKE_PAD_140>": 151822,
57
+ "<FAKE_PAD_141>": 151823,
58
+ "<FAKE_PAD_142>": 151824,
59
+ "<FAKE_PAD_143>": 151825,
60
+ "<FAKE_PAD_144>": 151826,
61
+ "<FAKE_PAD_145>": 151827,
62
+ "<FAKE_PAD_146>": 151828,
63
+ "<FAKE_PAD_147>": 151829,
64
+ "<FAKE_PAD_148>": 151830,
65
+ "<FAKE_PAD_149>": 151831,
66
+ "<FAKE_PAD_14>": 151696,
67
+ "<FAKE_PAD_150>": 151832,
68
+ "<FAKE_PAD_151>": 151833,
69
+ "<FAKE_PAD_152>": 151834,
70
+ "<FAKE_PAD_153>": 151835,
71
+ "<FAKE_PAD_154>": 151836,
72
+ "<FAKE_PAD_155>": 151837,
73
+ "<FAKE_PAD_156>": 151838,
74
+ "<FAKE_PAD_157>": 151839,
75
+ "<FAKE_PAD_158>": 151840,
76
+ "<FAKE_PAD_159>": 151841,
77
+ "<FAKE_PAD_15>": 151697,
78
+ "<FAKE_PAD_160>": 151842,
79
+ "<FAKE_PAD_161>": 151843,
80
+ "<FAKE_PAD_162>": 151844,
81
+ "<FAKE_PAD_163>": 151845,
82
+ "<FAKE_PAD_164>": 151846,
83
+ "<FAKE_PAD_165>": 151847,
84
+ "<FAKE_PAD_166>": 151848,
85
+ "<FAKE_PAD_167>": 151849,
86
+ "<FAKE_PAD_168>": 151850,
87
+ "<FAKE_PAD_169>": 151851,
88
+ "<FAKE_PAD_16>": 151698,
89
+ "<FAKE_PAD_170>": 151852,
90
+ "<FAKE_PAD_171>": 151853,
91
+ "<FAKE_PAD_172>": 151854,
92
+ "<FAKE_PAD_173>": 151855,
93
+ "<FAKE_PAD_174>": 151856,
94
+ "<FAKE_PAD_175>": 151857,
95
+ "<FAKE_PAD_176>": 151858,
96
+ "<FAKE_PAD_177>": 151859,
97
+ "<FAKE_PAD_178>": 151860,
98
+ "<FAKE_PAD_179>": 151861,
99
+ "<FAKE_PAD_17>": 151699,
100
+ "<FAKE_PAD_180>": 151862,
101
+ "<FAKE_PAD_181>": 151863,
102
+ "<FAKE_PAD_182>": 151864,
103
+ "<FAKE_PAD_183>": 151865,
104
+ "<FAKE_PAD_184>": 151866,
105
+ "<FAKE_PAD_185>": 151867,
106
+ "<FAKE_PAD_186>": 151868,
107
+ "<FAKE_PAD_187>": 151869,
108
+ "<FAKE_PAD_188>": 151870,
109
+ "<FAKE_PAD_189>": 151871,
110
+ "<FAKE_PAD_18>": 151700,
111
+ "<FAKE_PAD_190>": 151872,
112
+ "<FAKE_PAD_191>": 151873,
113
+ "<FAKE_PAD_192>": 151874,
114
+ "<FAKE_PAD_193>": 151875,
115
+ "<FAKE_PAD_194>": 151876,
116
+ "<FAKE_PAD_195>": 151877,
117
+ "<FAKE_PAD_196>": 151878,
118
+ "<FAKE_PAD_197>": 151879,
119
+ "<FAKE_PAD_198>": 151880,
120
+ "<FAKE_PAD_199>": 151881,
121
+ "<FAKE_PAD_19>": 151701,
122
+ "<FAKE_PAD_1>": 151683,
123
+ "<FAKE_PAD_200>": 151882,
124
+ "<FAKE_PAD_201>": 151883,
125
+ "<FAKE_PAD_202>": 151884,
126
+ "<FAKE_PAD_203>": 151885,
127
+ "<FAKE_PAD_204>": 151886,
128
+ "<FAKE_PAD_205>": 151887,
129
+ "<FAKE_PAD_206>": 151888,
130
+ "<FAKE_PAD_207>": 151889,
131
+ "<FAKE_PAD_208>": 151890,
132
+ "<FAKE_PAD_209>": 151891,
133
+ "<FAKE_PAD_20>": 151702,
134
+ "<FAKE_PAD_210>": 151892,
135
+ "<FAKE_PAD_211>": 151893,
136
+ "<FAKE_PAD_212>": 151894,
137
+ "<FAKE_PAD_213>": 151895,
138
+ "<FAKE_PAD_214>": 151896,
139
+ "<FAKE_PAD_215>": 151897,
140
+ "<FAKE_PAD_216>": 151898,
141
+ "<FAKE_PAD_217>": 151899,
142
+ "<FAKE_PAD_218>": 151900,
143
+ "<FAKE_PAD_219>": 151901,
144
+ "<FAKE_PAD_21>": 151703,
145
+ "<FAKE_PAD_220>": 151902,
146
+ "<FAKE_PAD_221>": 151903,
147
+ "<FAKE_PAD_222>": 151904,
148
+ "<FAKE_PAD_223>": 151905,
149
+ "<FAKE_PAD_224>": 151906,
150
+ "<FAKE_PAD_225>": 151907,
151
+ "<FAKE_PAD_226>": 151908,
152
+ "<FAKE_PAD_227>": 151909,
153
+ "<FAKE_PAD_228>": 151910,
154
+ "<FAKE_PAD_229>": 151911,
155
+ "<FAKE_PAD_22>": 151704,
156
+ "<FAKE_PAD_230>": 151912,
157
+ "<FAKE_PAD_231>": 151913,
158
+ "<FAKE_PAD_232>": 151914,
159
+ "<FAKE_PAD_233>": 151915,
160
+ "<FAKE_PAD_234>": 151916,
161
+ "<FAKE_PAD_235>": 151917,
162
+ "<FAKE_PAD_236>": 151918,
163
+ "<FAKE_PAD_237>": 151919,
164
+ "<FAKE_PAD_238>": 151920,
165
+ "<FAKE_PAD_239>": 151921,
166
+ "<FAKE_PAD_23>": 151705,
167
+ "<FAKE_PAD_240>": 151922,
168
+ "<FAKE_PAD_241>": 151923,
169
+ "<FAKE_PAD_242>": 151924,
170
+ "<FAKE_PAD_243>": 151925,
171
+ "<FAKE_PAD_244>": 151926,
172
+ "<FAKE_PAD_245>": 151927,
173
+ "<FAKE_PAD_246>": 151928,
174
+ "<FAKE_PAD_247>": 151929,
175
+ "<FAKE_PAD_248>": 151930,
176
+ "<FAKE_PAD_249>": 151931,
177
+ "<FAKE_PAD_24>": 151706,
178
+ "<FAKE_PAD_250>": 151932,
179
+ "<FAKE_PAD_251>": 151933,
180
+ "<FAKE_PAD_252>": 151934,
181
+ "<FAKE_PAD_253>": 151935,
182
+ "<FAKE_PAD_25>": 151707,
183
+ "<FAKE_PAD_26>": 151708,
184
+ "<FAKE_PAD_27>": 151709,
185
+ "<FAKE_PAD_28>": 151710,
186
+ "<FAKE_PAD_29>": 151711,
187
+ "<FAKE_PAD_2>": 151684,
188
+ "<FAKE_PAD_30>": 151712,
189
+ "<FAKE_PAD_31>": 151713,
190
+ "<FAKE_PAD_32>": 151714,
191
+ "<FAKE_PAD_33>": 151715,
192
+ "<FAKE_PAD_34>": 151716,
193
+ "<FAKE_PAD_35>": 151717,
194
+ "<FAKE_PAD_36>": 151718,
195
+ "<FAKE_PAD_37>": 151719,
196
+ "<FAKE_PAD_38>": 151720,
197
+ "<FAKE_PAD_39>": 151721,
198
+ "<FAKE_PAD_3>": 151685,
199
+ "<FAKE_PAD_40>": 151722,
200
+ "<FAKE_PAD_41>": 151723,
201
+ "<FAKE_PAD_42>": 151724,
202
+ "<FAKE_PAD_43>": 151725,
203
+ "<FAKE_PAD_44>": 151726,
204
+ "<FAKE_PAD_45>": 151727,
205
+ "<FAKE_PAD_46>": 151728,
206
+ "<FAKE_PAD_47>": 151729,
207
+ "<FAKE_PAD_48>": 151730,
208
+ "<FAKE_PAD_49>": 151731,
209
+ "<FAKE_PAD_4>": 151686,
210
+ "<FAKE_PAD_50>": 151732,
211
+ "<FAKE_PAD_51>": 151733,
212
+ "<FAKE_PAD_52>": 151734,
213
+ "<FAKE_PAD_53>": 151735,
214
+ "<FAKE_PAD_54>": 151736,
215
+ "<FAKE_PAD_55>": 151737,
216
+ "<FAKE_PAD_56>": 151738,
217
+ "<FAKE_PAD_57>": 151739,
218
+ "<FAKE_PAD_58>": 151740,
219
+ "<FAKE_PAD_59>": 151741,
220
+ "<FAKE_PAD_5>": 151687,
221
+ "<FAKE_PAD_60>": 151742,
222
+ "<FAKE_PAD_61>": 151743,
223
+ "<FAKE_PAD_62>": 151744,
224
+ "<FAKE_PAD_63>": 151745,
225
+ "<FAKE_PAD_64>": 151746,
226
+ "<FAKE_PAD_65>": 151747,
227
+ "<FAKE_PAD_66>": 151748,
228
+ "<FAKE_PAD_67>": 151749,
229
+ "<FAKE_PAD_68>": 151750,
230
+ "<FAKE_PAD_69>": 151751,
231
+ "<FAKE_PAD_6>": 151688,
232
+ "<FAKE_PAD_70>": 151752,
233
+ "<FAKE_PAD_71>": 151753,
234
+ "<FAKE_PAD_72>": 151754,
235
+ "<FAKE_PAD_73>": 151755,
236
+ "<FAKE_PAD_74>": 151756,
237
+ "<FAKE_PAD_75>": 151757,
238
+ "<FAKE_PAD_76>": 151758,
239
+ "<FAKE_PAD_77>": 151759,
240
+ "<FAKE_PAD_78>": 151760,
241
+ "<FAKE_PAD_79>": 151761,
242
+ "<FAKE_PAD_7>": 151689,
243
+ "<FAKE_PAD_80>": 151762,
244
+ "<FAKE_PAD_81>": 151763,
245
+ "<FAKE_PAD_82>": 151764,
246
+ "<FAKE_PAD_83>": 151765,
247
+ "<FAKE_PAD_84>": 151766,
248
+ "<FAKE_PAD_85>": 151767,
249
+ "<FAKE_PAD_86>": 151768,
250
+ "<FAKE_PAD_87>": 151769,
251
+ "<FAKE_PAD_88>": 151770,
252
+ "<FAKE_PAD_89>": 151771,
253
+ "<FAKE_PAD_8>": 151690,
254
+ "<FAKE_PAD_90>": 151772,
255
+ "<FAKE_PAD_91>": 151773,
256
+ "<FAKE_PAD_92>": 151774,
257
+ "<FAKE_PAD_93>": 151775,
258
+ "<FAKE_PAD_94>": 151776,
259
+ "<FAKE_PAD_95>": 151777,
260
+ "<FAKE_PAD_96>": 151778,
261
+ "<FAKE_PAD_97>": 151779,
262
+ "<FAKE_PAD_98>": 151780,
263
+ "<FAKE_PAD_99>": 151781,
264
+ "<FAKE_PAD_9>": 151691,
265
+ "<FAKE_PAD_PAD_0>": 151940,
266
+ "<FAKE_PAD_PAD_10>": 151950,
267
+ "<FAKE_PAD_PAD_11>": 151951,
268
+ "<FAKE_PAD_PAD_12>": 151952,
269
+ "<FAKE_PAD_PAD_13>": 151953,
270
+ "<FAKE_PAD_PAD_14>": 151954,
271
+ "<FAKE_PAD_PAD_15>": 151955,
272
+ "<FAKE_PAD_PAD_16>": 151956,
273
+ "<FAKE_PAD_PAD_17>": 151957,
274
+ "<FAKE_PAD_PAD_18>": 151958,
275
+ "<FAKE_PAD_PAD_19>": 151959,
276
+ "<FAKE_PAD_PAD_1>": 151941,
277
+ "<FAKE_PAD_PAD_20>": 151960,
278
+ "<FAKE_PAD_PAD_21>": 151961,
279
+ "<FAKE_PAD_PAD_22>": 151962,
280
+ "<FAKE_PAD_PAD_23>": 151963,
281
+ "<FAKE_PAD_PAD_24>": 151964,
282
+ "<FAKE_PAD_PAD_25>": 151965,
283
+ "<FAKE_PAD_PAD_26>": 151966,
284
+ "<FAKE_PAD_PAD_27>": 151967,
285
+ "<FAKE_PAD_PAD_2>": 151942,
286
+ "<FAKE_PAD_PAD_3>": 151943,
287
+ "<FAKE_PAD_PAD_4>": 151944,
288
+ "<FAKE_PAD_PAD_5>": 151945,
289
+ "<FAKE_PAD_PAD_6>": 151946,
290
+ "<FAKE_PAD_PAD_7>": 151947,
291
+ "<FAKE_PAD_PAD_8>": 151948,
292
+ "<FAKE_PAD_PAD_9>": 151949,
293
+ "<IMG_CONTEXT>": 151669,
294
+ "<audio>": 151936,
295
+ "<box>": 151676,
296
+ "<img>": 151670,
297
+ "<interrupt>": 151939,
298
+ "<quad>": 151672,
299
+ "<ref>": 151674,
300
+ "<think>": 151667,
301
+ "<tool_call>": 151657,
302
+ "<tool_response>": 151665,
303
+ "<|action_end|>": 151679,
304
+ "<|action_start|>": 151678,
305
+ "<|box_end|>": 151649,
306
+ "<|box_start|>": 151648,
307
+ "<|endoftext|>": 151643,
308
+ "<|file_sep|>": 151664,
309
+ "<|fim_middle|>": 151660,
310
+ "<|fim_pad|>": 151662,
311
+ "<|fim_prefix|>": 151659,
312
+ "<|fim_suffix|>": 151661,
313
+ "<|im_end|>": 151645,
314
+ "<|im_start|>": 151644,
315
+ "<|image_pad|>": 151655,
316
+ "<|interpreter|>": 151681,
317
+ "<|object_ref_end|>": 151647,
318
+ "<|object_ref_start|>": 151646,
319
+ "<|plugin|>": 151680,
320
+ "<|quad_end|>": 151651,
321
+ "<|quad_start|>": 151650,
322
+ "<|repo_name|>": 151663,
323
+ "<|video_pad|>": 151656,
324
+ "<|vision_end|>": 151653,
325
+ "<|vision_pad|>": 151654,
326
+ "<|vision_start|>": 151652
327
+ }
campplus.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
+ size 28303423
config.json ADDED
@@ -0,0 +1,628 @@
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "InteractiveOmni",
4
+ "architectures": [
5
+ "InteractiveOmniModel"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_interactiveomni.InteractiveOmniConfig",
9
+ "AutoModel": "modeling_interactiveomni.InteractiveOmniModel",
10
+ "AutoModelForCausalLM": "modeling_interactiveomni.InteractiveOmniModel"
11
+ },
12
+ "audio_config": {
13
+ "_name_or_path": "openai/whisper-large-v3",
14
+ "activation_dropout": 0.0,
15
+ "activation_function": "gelu",
16
+ "apply_spec_augment": false,
17
+ "architectures": [
18
+ "WhisperForConditionalGeneration"
19
+ ],
20
+ "attention_dropout": 0.0,
21
+ "begin_suppress_tokens": [
22
+ 220,
23
+ 50257
24
+ ],
25
+ "bos_token_id": 50257,
26
+ "classifier_proj_size": 256,
27
+ "d_model": 1280,
28
+ "decoder_attention_heads": 20,
29
+ "decoder_ffn_dim": 5120,
30
+ "decoder_layerdrop": 0.0,
31
+ "decoder_layers": 32,
32
+ "decoder_start_token_id": 50258,
33
+ "dropout": 0.0,
34
+ "encoder_attention_heads": 20,
35
+ "encoder_ffn_dim": 5120,
36
+ "encoder_layerdrop": 0.0,
37
+ "encoder_layers": 32,
38
+ "eos_token_id": 50257,
39
+ "init_std": 0.02,
40
+ "is_encoder_decoder": true,
41
+ "mask_feature_length": 10,
42
+ "mask_feature_min_masks": 0,
43
+ "mask_feature_prob": 0.0,
44
+ "mask_time_length": 10,
45
+ "mask_time_min_masks": 2,
46
+ "mask_time_prob": 0.05,
47
+ "max_length": 448,
48
+ "max_source_positions": 1500,
49
+ "max_target_positions": 448,
50
+ "median_filter_width": 7,
51
+ "model_type": "whisper",
52
+ "num_hidden_layers": 32,
53
+ "num_mel_bins": 128,
54
+ "pad_token_id": 50256,
55
+ "scale_embedding": false,
56
+ "torch_dtype": "float16",
57
+ "transformers_version": "4.36.0.dev0",
58
+ "use_cache": true,
59
+ "use_weighted_layer_sum": false,
60
+ "vocab_size": 51866
61
+ },
62
+ "audio_preprocessor_config": {
63
+ "chunk_length": 30,
64
+ "feature_extractor_type": "WhisperFeatureExtractor",
65
+ "feature_size": 128,
66
+ "hop_length": 160,
67
+ "n_fft": 400,
68
+ "n_samples": 480000,
69
+ "nb_max_frames": 3000,
70
+ "padding_side": "right",
71
+ "padding_value": 0.0,
72
+ "processor_class": "WhisperProcessor",
73
+ "return_attention_mask": false,
74
+ "sampling_rate": 16000
75
+ },
76
+ "downsample_ratio": 0.25,
77
+ "dynamic_image_size": true,
78
+ "force_image_size": 448,
79
+ "llm_config": {
80
+ "_name_or_path": "Qwen/Qwen3-8B",
81
+ "add_cross_attention": false,
82
+ "architectures": [
83
+ "Qwen3ForCausalLM"
84
+ ],
85
+ "attention_bias": false,
86
+ "attention_dropout": 0.0,
87
+ "attn_implementation": "flash_attention_2",
88
+ "bad_words_ids": null,
89
+ "begin_suppress_tokens": null,
90
+ "bias": false,
91
+ "bos_token_id": 151643,
92
+ "chunk_size_feed_forward": 0,
93
+ "cross_attention_hidden_size": null,
94
+ "decoder_start_token_id": null,
95
+ "diversity_penalty": 0.0,
96
+ "do_sample": false,
97
+ "early_stopping": false,
98
+ "encoder_no_repeat_ngram_size": 0,
99
+ "eos_token_id": 151645,
100
+ "exponential_decay_length_penalty": null,
101
+ "finetuning_task": null,
102
+ "forced_bos_token_id": null,
103
+ "forced_eos_token_id": null,
104
+ "head_dim": 128,
105
+ "hidden_act": "silu",
106
+ "hidden_size": 4096,
107
+ "id2label": {
108
+ "0": "LABEL_0",
109
+ "1": "LABEL_1"
110
+ },
111
+ "initializer_range": 0.02,
112
+ "intermediate_size": 12288,
113
+ "is_decoder": false,
114
+ "is_encoder_decoder": false,
115
+ "label2id": {
116
+ "LABEL_0": 0,
117
+ "LABEL_1": 1
118
+ },
119
+ "length_penalty": 1.0,
120
+ "max_length": 20,
121
+ "max_position_embeddings": 40960,
122
+ "max_window_layers": 40,
123
+ "min_length": 0,
124
+ "model_type": "qwen3",
125
+ "no_repeat_ngram_size": 0,
126
+ "num_attention_heads": 32,
127
+ "num_beam_groups": 1,
128
+ "num_beams": 1,
129
+ "num_hidden_layers": 36,
130
+ "num_key_value_heads": 8,
131
+ "num_return_sequences": 1,
132
+ "output_attentions": false,
133
+ "output_hidden_states": false,
134
+ "output_scores": false,
135
+ "pad_token_id": null,
136
+ "prefix": null,
137
+ "problem_type": null,
138
+ "pruned_heads": {},
139
+ "remove_invalid_values": false,
140
+ "repetition_penalty": 1.0,
141
+ "return_dict": true,
142
+ "return_dict_in_generate": false,
143
+ "rms_norm_eps": 1e-06,
144
+ "rope_scaling": {
145
+ "factor": 2.0,
146
+ "type": "dynamic"
147
+ },
148
+ "rope_theta": 1000000.0,
149
+ "sep_token_id": null,
150
+ "sliding_window": null,
151
+ "suppress_tokens": null,
152
+ "task_specific_params": null,
153
+ "temperature": 1.0,
154
+ "tf_legacy_loss": false,
155
+ "tie_encoder_decoder": false,
156
+ "tie_word_embeddings": false,
157
+ "tokenizer_class": null,
158
+ "top_k": 50,
159
+ "top_p": 1.0,
160
+ "torch_dtype": "bfloat16",
161
+ "torchscript": false,
162
+ "transformers_version": "4.51.0",
163
+ "typical_p": 1.0,
164
+ "use_bfloat16": false,
165
+ "use_cache": false,
166
+ "use_sliding_window": false,
167
+ "vocab_size": 151968
168
+ },
169
+ "max_dynamic_patch": 12,
170
+ "min_dynamic_patch": 1,
171
+ "model_type": "interactiveomni",
172
+ "pad2square": false,
173
+ "ps_version": "v2",
174
+ "select_layer": -1,
175
+ "template": "interactiveomni_template",
176
+ "torch_dtype": "bfloat16",
177
+ "transformers_version": null,
178
+ "use_backbone_lora": 0,
179
+ "use_llm_lora": 0,
180
+ "use_thumbnail": true,
181
+ "vision_config": {
182
+ "_name_or_path": "OpenGVLab/InternViT-300M-448px",
183
+ "add_cross_attention": false,
184
+ "architectures": [
185
+ "InternVisionModel"
186
+ ],
187
+ "auto_map": {
188
+ "AutoConfig": "configuration_intern_vit.InternVisionConfig",
189
+ "AutoModel": "modeling_intern_vit.InternVisionModel"
190
+ },
191
+ "attention_dropout": 0.0,
192
+ "drop_path_rate": 0.1,
193
+ "dropout": 0.0,
194
+ "hidden_act": "gelu",
195
+ "hidden_size": 1024,
196
+ "image_size": 448,
197
+ "initializer_factor": 1.0,
198
+ "initializer_range": 0.02,
199
+ "intermediate_size": 4096,
200
+ "layer_norm_eps": 1e-06,
201
+ "model_type": "intern_vit_6b",
202
+ "norm_type": "layer_norm",
203
+ "num_attention_heads": 16,
204
+ "num_channels": 3,
205
+ "num_hidden_layers": 24,
206
+ "qk_normalization": false,
207
+ "qkv_bias": true,
208
+ "torch_dtype": "bfloat16",
209
+ "transformers_version": "4.37.2",
210
+ "use_flash_attn": true
211
+ },
212
+ "flow_config": {
213
+ "_attn_implementation_internal": null,
214
+ "_commit_hash": null,
215
+ "_name_or_path": "",
216
+ "add_cross_attention": false,
217
+ "architectures": [
218
+ "CausalMaskedDiffWithXvec"
219
+ ],
220
+ "bad_words_ids": null,
221
+ "begin_suppress_tokens": null,
222
+ "bos_token_id": null,
223
+ "chunk_size_feed_forward": 0,
224
+ "cross_attention_hidden_size": null,
225
+ "decoder_config": {
226
+ "cfm_params": {
227
+ "inference_cfg_rate": 0.7,
228
+ "reg_loss_type": "l1",
229
+ "sigma_min": 1e-06,
230
+ "solver": "euler",
231
+ "t_scheduler": "cosine",
232
+ "training_cfg_rate": 0.2
233
+ },
234
+ "estimator_config": {
235
+ "act_fn": "gelu",
236
+ "attention_head_dim": 64,
237
+ "causal": true,
238
+ "channels": [
239
+ 256
240
+ ],
241
+ "dropout": 0.0,
242
+ "in_channels": 320,
243
+ "n_blocks": 4,
244
+ "num_heads": 8,
245
+ "num_mid_blocks": 12,
246
+ "out_channels": 80
247
+ },
248
+ "in_channels": 240,
249
+ "n_spks": 1,
250
+ "spk_emb_dim": 80
251
+ },
252
+ "decoder_start_token_id": null,
253
+ "diversity_penalty": 0.0,
254
+ "do_sample": false,
255
+ "early_stopping": false,
256
+ "encoder_config": {
257
+ "attention_dropout_rate": 0.1,
258
+ "attention_heads": 8,
259
+ "dropout_rate": 0.1,
260
+ "input_layer": "linear",
261
+ "input_size": 512,
262
+ "linear_units": 2048,
263
+ "macaron_style": false,
264
+ "normalize_before": true,
265
+ "num_blocks": 6,
266
+ "output_size": 512,
267
+ "pos_enc_layer_type": "rel_pos_espnet",
268
+ "positional_dropout_rate": 0.1,
269
+ "selfattention_layer_type": "rel_selfattn",
270
+ "use_cnn_module": false
271
+ },
272
+ "encoder_no_repeat_ngram_size": 0,
273
+ "eos_token_id": null,
274
+ "exponential_decay_length_penalty": null,
275
+ "finetuning_task": null,
276
+ "forced_bos_token_id": null,
277
+ "forced_eos_token_id": null,
278
+ "id2label": {
279
+ "0": "LABEL_0",
280
+ "1": "LABEL_1"
281
+ },
282
+ "input_frame_rate": 25,
283
+ "input_size": 512,
284
+ "is_decoder": false,
285
+ "is_encoder_decoder": false,
286
+ "label2id": {
287
+ "LABEL_0": 0,
288
+ "LABEL_1": 1
289
+ },
290
+ "length_penalty": 1.0,
291
+ "max_length": 20,
292
+ "min_length": 0,
293
+ "no_repeat_ngram_size": 0,
294
+ "num_beam_groups": 1,
295
+ "num_beams": 1,
296
+ "num_return_sequences": 1,
297
+ "only_mask_loss": true,
298
+ "output_attentions": false,
299
+ "output_hidden_states": false,
300
+ "output_scores": false,
301
+ "output_size": 80,
302
+ "output_type": "mel",
303
+ "pad_token_id": null,
304
+ "pre_lookahead_len": 3,
305
+ "prefix": null,
306
+ "problem_type": null,
307
+ "pruned_heads": {},
308
+ "remove_invalid_values": false,
309
+ "repetition_penalty": 1.0,
310
+ "return_dict": true,
311
+ "return_dict_in_generate": false,
312
+ "sep_token_id": null,
313
+ "spk_embed_dim": 192,
314
+ "suppress_tokens": null,
315
+ "task_specific_params": null,
316
+ "temperature": 1.0,
317
+ "tf_legacy_loss": false,
318
+ "tie_encoder_decoder": false,
319
+ "tie_word_embeddings": true,
320
+ "token_mel_ratio": 2,
321
+ "tokenizer_class": null,
322
+ "top_k": 50,
323
+ "top_p": 1.0,
324
+ "torch_dtype": "float32",
325
+ "torchscript": false,
326
+ "transformers_version": null,
327
+ "typical_p": 1.0,
328
+ "use_bfloat16": false,
329
+ "vocab_size": 6561
330
+ },
331
+ "hifigan_config": {
332
+ "_attn_implementation_internal": null,
333
+ "_commit_hash": null,
334
+ "_name_or_path": "",
335
+ "add_cross_attention": false,
336
+ "architectures": [
337
+ "HiFTGenerator"
338
+ ],
339
+ "audio_limit": 0.99,
340
+ "bad_words_ids": null,
341
+ "base_channels": 512,
342
+ "begin_suppress_tokens": null,
343
+ "bos_token_id": null,
344
+ "chunk_size_feed_forward": 0,
345
+ "cross_attention_hidden_size": null,
346
+ "decoder_start_token_id": null,
347
+ "diversity_penalty": 0.0,
348
+ "do_sample": false,
349
+ "early_stopping": false,
350
+ "encoder_no_repeat_ngram_size": 0,
351
+ "eos_token_id": null,
352
+ "exponential_decay_length_penalty": null,
353
+ "f0_predictor_config": {
354
+ "cond_channels": 512,
355
+ "in_channels": 80,
356
+ "num_class": 1
357
+ },
358
+ "finetuning_task": null,
359
+ "forced_bos_token_id": null,
360
+ "forced_eos_token_id": null,
361
+ "id2label": {
362
+ "0": "LABEL_0",
363
+ "1": "LABEL_1"
364
+ },
365
+ "in_channels": 80,
366
+ "is_decoder": false,
367
+ "is_encoder_decoder": false,
368
+ "istft_params": {
369
+ "hop_len": 4,
370
+ "n_fft": 16
371
+ },
372
+ "label2id": {
373
+ "LABEL_0": 0,
374
+ "LABEL_1": 1
375
+ },
376
+ "length_penalty": 1.0,
377
+ "lrelu_slope": 0.1,
378
+ "max_length": 20,
379
+ "min_length": 0,
380
+ "nb_harmonics": 8,
381
+ "no_repeat_ngram_size": 0,
382
+ "nsf_alpha": 0.1,
383
+ "nsf_sigma": 0.003,
384
+ "nsf_voiced_threshold": 10,
385
+ "num_beam_groups": 1,
386
+ "num_beams": 1,
387
+ "num_return_sequences": 1,
388
+ "output_attentions": false,
389
+ "output_hidden_states": false,
390
+ "output_scores": false,
391
+ "pad_token_id": null,
392
+ "prefix": null,
393
+ "problem_type": null,
394
+ "pruned_heads": {},
395
+ "remove_invalid_values": false,
396
+ "repetition_penalty": 1.0,
397
+ "resblock_dilation_sizes": [
398
+ [
399
+ 1,
400
+ 3,
401
+ 5
402
+ ],
403
+ [
404
+ 1,
405
+ 3,
406
+ 5
407
+ ],
408
+ [
409
+ 1,
410
+ 3,
411
+ 5
412
+ ]
413
+ ],
414
+ "resblock_kernel_sizes": [
415
+ 3,
416
+ 7,
417
+ 11
418
+ ],
419
+ "return_dict": true,
420
+ "return_dict_in_generate": false,
421
+ "sampling_rate": 24000,
422
+ "sep_token_id": null,
423
+ "source_resblock_dilation_sizes": [
424
+ [
425
+ 1,
426
+ 3,
427
+ 5
428
+ ],
429
+ [
430
+ 1,
431
+ 3,
432
+ 5
433
+ ],
434
+ [
435
+ 1,
436
+ 3,
437
+ 5
438
+ ]
439
+ ],
440
+ "source_resblock_kernel_sizes": [
441
+ 7,
442
+ 7,
443
+ 11
444
+ ],
445
+ "suppress_tokens": null,
446
+ "task_specific_params": null,
447
+ "temperature": 1.0,
448
+ "tf_legacy_loss": false,
449
+ "tie_encoder_decoder": false,
450
+ "tie_word_embeddings": true,
451
+ "tokenizer_class": null,
452
+ "top_k": 50,
453
+ "top_p": 1.0,
454
+ "torch_dtype": "float32",
455
+ "torchscript": false,
456
+ "transformers_version": null,
457
+ "typical_p": 1.0,
458
+ "upsample_kernel_sizes": [
459
+ 16,
460
+ 11,
461
+ 7
462
+ ],
463
+ "upsample_rates": [
464
+ 8,
465
+ 5,
466
+ 3
467
+ ],
468
+ "use_bfloat16": false
469
+ },
470
+ "voicelm_config": {
471
+ "_attn_implementation_internal": null,
472
+ "_commit_hash": null,
473
+ "_name_or_path": "",
474
+ "add_cross_attention": false,
475
+ "architectures": null,
476
+ "bad_words_ids": null,
477
+ "begin_suppress_tokens": null,
478
+ "bos_token_id": null,
479
+ "chunk_size_feed_forward": 0,
480
+ "cross_attention_hidden_size": null,
481
+ "decoder_start_token_id": null,
482
+ "diversity_penalty": 0.0,
483
+ "do_sample": false,
484
+ "early_stopping": false,
485
+ "encoder_no_repeat_ngram_size": 0,
486
+ "eos_token_id": null,
487
+ "exponential_decay_length_penalty": null,
488
+ "finetuning_task": null,
489
+ "forced_bos_token_id": null,
490
+ "forced_eos_token_id": null,
491
+ "id2label": {
492
+ "0": "LABEL_0",
493
+ "1": "LABEL_1"
494
+ },
495
+ "is_decoder": false,
496
+ "is_encoder_decoder": false,
497
+ "label2id": {
498
+ "LABEL_0": 0,
499
+ "LABEL_1": 1
500
+ },
501
+ "length_normalized_loss": true,
502
+ "length_penalty": 1.0,
503
+ "llm_config": {
504
+ "add_cross_attention": false,
505
+ "architectures": [
506
+ "Qwen2ForCausalLM"
507
+ ],
508
+ "attention_dropout": 0.0,
509
+ "bad_words_ids": null,
510
+ "begin_suppress_tokens": null,
511
+ "bos_token_id": 151643,
512
+ "chunk_size_feed_forward": 0,
513
+ "cross_attention_hidden_size": null,
514
+ "decoder_start_token_id": null,
515
+ "diversity_penalty": 0.0,
516
+ "do_sample": false,
517
+ "early_stopping": false,
518
+ "encoder_no_repeat_ngram_size": 0,
519
+ "eos_token_id": 151643,
520
+ "exponential_decay_length_penalty": null,
521
+ "finetuning_task": null,
522
+ "forced_bos_token_id": null,
523
+ "forced_eos_token_id": null,
524
+ "hidden_act": "silu",
525
+ "hidden_size": 896,
526
+ "id2label": {
527
+ "0": "LABEL_0",
528
+ "1": "LABEL_1"
529
+ },
530
+ "initializer_range": 0.02,
531
+ "intermediate_size": 4864,
532
+ "is_decoder": false,
533
+ "is_encoder_decoder": false,
534
+ "label2id": {
535
+ "LABEL_0": 0,
536
+ "LABEL_1": 1
537
+ },
538
+ "length_penalty": 1.0,
539
+ "max_length": 20,
540
+ "max_position_embeddings": 32768,
541
+ "max_window_layers": 24,
542
+ "min_length": 0,
543
+ "model_type": "qwen2",
544
+ "no_repeat_ngram_size": 0,
545
+ "num_attention_heads": 14,
546
+ "num_beam_groups": 1,
547
+ "num_beams": 1,
548
+ "num_hidden_layers": 24,
549
+ "num_key_value_heads": 2,
550
+ "num_return_sequences": 1,
551
+ "output_attentions": false,
552
+ "output_hidden_states": false,
553
+ "output_scores": false,
554
+ "pad_token_id": null,
555
+ "prefix": null,
556
+ "problem_type": null,
557
+ "pruned_heads": {},
558
+ "remove_invalid_values": false,
559
+ "repetition_penalty": 1.0,
560
+ "return_dict": true,
561
+ "return_dict_in_generate": false,
562
+ "rms_norm_eps": 1e-06,
563
+ "rope_theta": 1000000.0,
564
+ "sep_token_id": null,
565
+ "sliding_window": 32768,
566
+ "suppress_tokens": null,
567
+ "task_specific_params": null,
568
+ "temperature": 1.0,
569
+ "tf_legacy_loss": false,
570
+ "tie_encoder_decoder": false,
571
+ "tie_word_embeddings": true,
572
+ "tokenizer_class": null,
573
+ "top_k": 50,
574
+ "top_p": 1.0,
575
+ "torch_dtype": "bfloat16",
576
+ "torchscript": false,
577
+ "transformers_version": "4.37.2",
578
+ "typical_p": 1.0,
579
+ "use_bfloat16": false,
580
+ "use_cache": false,
581
+ "use_mrope": false,
582
+ "use_sliding_window": false,
583
+ "vocab_size": 151936
584
+ },
585
+ "llm_input_size": 896,
586
+ "llm_output_size": 896,
587
+ "lsm_weight": 0,
588
+ "max_length": 20,
589
+ "min_length": 0,
590
+ "no_repeat_ngram_size": 0,
591
+ "num_beam_groups": 1,
592
+ "num_beams": 1,
593
+ "num_return_sequences": 1,
594
+ "output_attentions": false,
595
+ "output_hidden_states": false,
596
+ "output_scores": false,
597
+ "pad_token_id": null,
598
+ "prefix": null,
599
+ "problem_type": null,
600
+ "pruned_heads": {},
601
+ "remove_invalid_values": false,
602
+ "repetition_penalty": 1.0,
603
+ "return_dict": true,
604
+ "return_dict_in_generate": false,
605
+ "sampling_config": {
606
+ "tau_r": 0.1,
607
+ "top_k": 15,
608
+ "top_p": 0.7,
609
+ "win_size": 10
610
+ },
611
+ "sep_token_id": null,
612
+ "speech_token_size": 6561,
613
+ "suppress_tokens": null,
614
+ "task_specific_params": null,
615
+ "temperature": 1.0,
616
+ "tf_legacy_loss": false,
617
+ "tie_encoder_decoder": false,
618
+ "tie_word_embeddings": true,
619
+ "tokenizer_class": null,
620
+ "top_k": 50,
621
+ "top_p": 1.0,
622
+ "torch_dtype": null,
623
+ "torchscript": false,
624
+ "transformers_version": null,
625
+ "typical_p": 1.0,
626
+ "use_bfloat16": false
627
+ }
628
+ }
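The composite config above nests one sub-config per module: a Qwen3 language backbone (`llm_config`), an InternViT-300M-448px vision tower (`vision_config`), and the speech-generation stack (`voicelm_config`, `flow_config`, `hifigan_config`). A minimal sketch of inspecting those blocks with the standard library; the local path `./InteractiveOmni` is a hypothetical clone location, not something defined by this repo:

```python
# Minimal sketch: inspect the nested config.json of this repository.
# Assumes the repo has been cloned to ./InteractiveOmni (hypothetical path).
import json

with open("./InteractiveOmni/config.json") as f:
    cfg = json.load(f)

# One sub-config per module of the omni-modal stack.
print(cfg["llm_config"]["model_type"], cfg["llm_config"]["num_hidden_layers"])    # qwen3 36
print(cfg["vision_config"]["_name_or_path"], cfg["vision_config"]["image_size"])  # OpenGVLab/InternViT-300M-448px 448
print(cfg["flow_config"]["output_type"], cfg["flow_config"]["token_mel_ratio"])   # mel 2
print(cfg["hifigan_config"]["sampling_rate"])                                     # 24000
print(cfg["voicelm_config"]["speech_token_size"])                                 # 6561
```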
configuration_flow.py ADDED
@@ -0,0 +1,102 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import copy
7
+
8
+ from transformers.configuration_utils import PretrainedConfig
9
+ from transformers.utils import logging
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+ class FlowConfig(PretrainedConfig):
14
+ def __init__(
15
+ self,
16
+ input_size = 512,
17
+ output_size= 80,
18
+ spk_embed_dim = 192,
19
+ output_type = 'mel',
20
+ vocab_size = 6561,
21
+ input_frame_rate = 25,
22
+ only_mask_loss = True,
23
+ token_mel_ratio=2,
24
+ pre_lookahead_len=3,
25
+ encoder_config={'output_size': 512,
26
+ 'attention_heads': 8,
27
+ 'linear_units': 2048,
28
+ 'num_blocks': 6,
29
+ 'dropout_rate': 0.1,
30
+ 'positional_dropout_rate': 0.1,
31
+ 'attention_dropout_rate': 0.1,
32
+ 'normalize_before': True,
33
+ 'input_layer': 'linear',
34
+ 'pos_enc_layer_type': 'rel_pos_espnet',
35
+ 'selfattention_layer_type': 'rel_selfattn',
36
+ 'input_size': 512,
37
+ 'use_cnn_module': False,
38
+ 'macaron_style': False,
39
+ },
40
+ decoder_config={'in_channels': 240,
41
+ 'n_spks': 1,
42
+ 'spk_emb_dim': 80,
43
+ 'cfm_params': {
44
+ 'sigma_min': 1e-06,
45
+ 'solver': 'euler',
46
+ 't_scheduler': 'cosine',
47
+ 'training_cfg_rate': 0.2,
48
+ 'inference_cfg_rate': 0.7,
49
+ 'reg_loss_type': 'l1',
50
+ },
51
+ 'estimator_config':{
52
+ 'in_channels': 320,
53
+ 'out_channels': 80,
54
+ 'causal': True,
55
+ 'channels': [256],
56
+ 'dropout': 0.0,
57
+ 'attention_head_dim': 64,
58
+ 'n_blocks': 4,
59
+ 'num_mid_blocks': 12,
60
+ 'num_heads': 8,
61
+ 'act_fn': 'gelu'
62
+ }
63
+ },
64
+ **kwargs):
65
+ super().__init__(**kwargs)
66
+
67
+ self.encoder_config = encoder_config
68
+ self.decoder_config = decoder_config
69
+
70
+ self.input_size = input_size
71
+ self.output_size = output_size
72
+ self.spk_embed_dim = spk_embed_dim
73
+ self.output_type = output_type
74
+ self.vocab_size = vocab_size
75
+ self.input_frame_rate = input_frame_rate
76
+ self.only_mask_loss = only_mask_loss
77
+ self.token_mel_ratio = token_mel_ratio
78
+ self.pre_lookahead_len = pre_lookahead_len
79
+ pass
80
+
81
+ def to_dict(self):
82
+ """
83
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
84
+
85
+ Returns:
86
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
87
+ """
88
+ output = copy.deepcopy(self.__dict__)
89
+ output['encoder_config'] = self.encoder_config
90
+ output['decoder_config'] = self.decoder_config
91
+
92
+ output['input_size'] = self.input_size
93
+ output['output_size'] = self.output_size
94
+ output['spk_embed_dim'] = self.spk_embed_dim
95
+ output['output_type'] = self.output_type
96
+ output['vocab_size'] = self.vocab_size
97
+ output['input_frame_rate'] = self.input_frame_rate
98
+ output['only_mask_loss'] = self.only_mask_loss
99
+ output['token_mel_ratio'] = self.token_mel_ratio
100
+ output['pre_lookahead_len'] = self.pre_lookahead_len
101
+
102
+ return output
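`FlowConfig` describes the token-to-mel module (the `CausalMaskedDiffWithXvec` architecture named in config.json): a 6-block transformer encoder with relative-position self-attention and a causal conditional flow-matching decoder that turns 25 Hz speech tokens into 80-bin mel frames at a 1:2 token-to-mel ratio. A minimal usage sketch, assuming `configuration_flow.py` from this repo is importable (e.g. the repo directory is on `sys.path`):

```python
# Minimal sketch, assuming configuration_flow.py from this repo is importable.
from configuration_flow import FlowConfig

flow_cfg = FlowConfig()                        # defaults mirror the flow_config block above
assert flow_cfg.token_mel_ratio == 2           # two mel frames per speech token
assert flow_cfg.encoder_config["num_blocks"] == 6

# Fields can be overridden like any PretrainedConfig and round-tripped to a dict.
flow_cfg = FlowConfig(input_frame_rate=25, pre_lookahead_len=3)
d = flow_cfg.to_dict()
print(d["output_size"], d["vocab_size"])       # 80 6561
```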
configuration_hifigan.py ADDED
@@ -0,0 +1,87 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import copy
7
+
8
+ from transformers.configuration_utils import PretrainedConfig
9
+ from transformers.utils import logging
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+ class HiFiGanConfig(PretrainedConfig):
14
+ def __init__(
15
+ self,
16
+ in_channels = 80,
17
+ base_channels = 512,
18
+ nb_harmonics = 8,
19
+ sampling_rate =24000,
20
+ nsf_alpha= 0.1,
21
+ nsf_sigma= 0.003,
22
+ nsf_voiced_threshold = 10,
23
+ upsample_rates = [8, 5, 3],
24
+ upsample_kernel_sizes = [16, 11, 7],
25
+ istft_params ={'n_fft': 16,
26
+ 'hop_len': 4,
27
+ },
28
+ resblock_kernel_sizes = [3, 7, 11],
29
+ resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
30
+ source_resblock_kernel_sizes = [7, 7, 11],
31
+ source_resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
32
+ lrelu_slope = 0.1,
33
+ audio_limit =0.99,
34
+ f0_predictor_config={
35
+ 'num_class': 1,
36
+ 'in_channels': 80,
37
+ 'cond_channels': 512
38
+ },
39
+ **kwargs):
40
+ super().__init__(**kwargs)
41
+
42
+ self.in_channels = in_channels
43
+ self.base_channels = base_channels
44
+ self.nb_harmonics = nb_harmonics
45
+ self.sampling_rate = sampling_rate
46
+ self.nsf_alpha = nsf_alpha
47
+ self.nsf_sigma = nsf_sigma
48
+ self.nsf_voiced_threshold = nsf_voiced_threshold
49
+ self.upsample_rates = upsample_rates
50
+ self.upsample_kernel_sizes = upsample_kernel_sizes
51
+ self.istft_params = istft_params
52
+ self.resblock_kernel_sizes = resblock_kernel_sizes
53
+ self.resblock_dilation_sizes= resblock_dilation_sizes
54
+ self.source_resblock_kernel_sizes = source_resblock_kernel_sizes
55
+ self.source_resblock_dilation_sizes = source_resblock_dilation_sizes
56
+ self.lrelu_slope = lrelu_slope
57
+ self.audio_limit = audio_limit
58
+ self.f0_predictor_config = f0_predictor_config
59
+ pass
60
+
61
+
62
+ def to_dict(self):
63
+ """
64
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
65
+
66
+ Returns:
67
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
68
+ """
69
+ output = copy.deepcopy(self.__dict__)
70
+ output['in_channels'] = self.in_channels
71
+ output['base_channels'] = self.base_channels
72
+ output['nb_harmonics'] = self.nb_harmonics
73
+ output['sampling_rate'] = self.sampling_rate
74
+ output['nsf_alpha'] = self.nsf_alpha
75
+ output['nsf_sigma'] = self.nsf_sigma
76
+ output['nsf_voiced_threshold'] = self.nsf_voiced_threshold
77
+ output['upsample_rates'] = self.upsample_rates
78
+ output['upsample_kernel_sizes'] = self.upsample_kernel_sizes
79
+ output['istft_params'] = self.istft_params
80
+ output['resblock_kernel_sizes'] = self.resblock_kernel_sizes
81
+ output['resblock_dilation_sizes'] = self.resblock_dilation_sizes
82
+ output['source_resblock_dilation_sizes'] = self.source_resblock_dilation_sizes
83
+ output['lrelu_slope'] = self.lrelu_slope
84
+ output['audio_limit'] = self.audio_limit
85
+ output['f0_predictor_config'] = self.f0_predictor_config
86
+
87
+ return output
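`HiFiGanConfig` parameterizes the HiFTGenerator vocoder that renders 80-bin mel spectrograms as 24 kHz waveforms. The three upsample stages (8, 5, 3) combined with the inverse-STFT hop of 4 imply 480 waveform samples per mel frame, i.e. 50 mel frames per second, which lines up with the flow module's 25 Hz tokens at a 1:2 token-to-mel ratio. A minimal sketch under the same local-import assumption:

```python
# Minimal sketch, assuming configuration_hifigan.py from this repo is importable.
from math import prod

from configuration_hifigan import HiFiGanConfig

voc_cfg = HiFiGanConfig()                                   # defaults match the hifigan_config block
hop = prod(voc_cfg.upsample_rates) * voc_cfg.istft_params["hop_len"]
print(hop)                                                  # 480 waveform samples per mel frame
print(voc_cfg.sampling_rate / hop)                          # 50.0 mel frames per second
print(voc_cfg.sampling_rate, voc_cfg.nb_harmonics)          # 24000 8
```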
configuration_interactiveomni.py ADDED
@@ -0,0 +1,125 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import copy
8
+
9
+ from transformers.configuration_utils import PretrainedConfig
10
+ from transformers.utils import logging
11
+ from transformers import LlamaConfig, Qwen2Config, Qwen3Config
12
+
13
+ from .configuration_intern_vit import InternVisionConfig
14
+ from .configuration_whisper import WhisperConfig
15
+ from .configuration_voicelm import VoiceLMConfig
16
+ from .configuration_flow import FlowConfig
17
+ from .configuration_hifigan import HiFiGanConfig
18
+
19
+ logger = logging.get_logger(__name__)
20
+
21
+ class InteractiveOmniConfig(PretrainedConfig):
22
+ model_type = 'interactiveomni'
23
+ is_composition = True
24
+
25
+ def __init__(
26
+ self,
27
+ vision_config=None,
28
+ llm_config=None,
29
+ audio_config=None,
30
+ voicelm_config=None,
31
+ flow_config=None,
32
+ hifigan_config=None,
33
+ use_backbone_lora=0,
34
+ use_llm_lora=0,
35
+ pad2square=False,
36
+ select_layer=-4,
37
+ force_image_size=None,
38
+ downsample_ratio=0.5,
39
+ template=None,
40
+ dynamic_image_size=False,
41
+ use_thumbnail=False,
42
+ ps_version='v1',
43
+ min_dynamic_patch=1,
44
+ max_dynamic_patch=6,
45
+ **kwargs):
46
+ super().__init__(**kwargs)
47
+
48
+ if vision_config is None:
49
+ vision_config = {}
50
+ logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
51
+
52
+ if llm_config is None:
53
+ llm_config = {}
54
+ logger.info('llm_config is None. Initializing the Qwen3Config as default values.')
55
+
56
+ if audio_config is None:
57
+ audio_config = {}
58
+ logger.info('audio_config is None. Initializing the WhisperConfig as default values.')
59
+
60
+ if voicelm_config is None:
61
+ voicelm_config = {}
62
+ logger.info('voicelm_config is None. Initializing the VoiceLMConfig as default values')
63
+
64
+ if flow_config is None:
65
+ flow_config = {}
66
+ logger.info('flow_config is None. Initializing the FlowConfig as default values')
67
+
68
+ if hifigan_config is None:
69
+ hifigan_config = {}
70
+ logger.info('hifigan_config is None. Initializing the HiFiGanConfig as default values')
71
+
72
+ self.vision_config = InternVisionConfig(**vision_config)
73
+ self.audio_config = WhisperConfig(**audio_config)
74
+ self.llm_config = Qwen3Config(**llm_config)
75
+ self.voicelm_config = VoiceLMConfig(**voicelm_config)
76
+ self.flow_config = FlowConfig(**flow_config)
77
+ self.hifigan_config = HiFiGanConfig(**hifigan_config)
78
+ self.use_backbone_lora = use_backbone_lora
79
+ self.use_llm_lora = use_llm_lora
80
+ self.pad2square = pad2square
81
+ self.select_layer = select_layer
82
+ self.force_image_size = force_image_size
83
+ self.downsample_ratio = downsample_ratio
84
+ self.template = template
85
+ self.dynamic_image_size = dynamic_image_size
86
+ self.use_thumbnail = use_thumbnail
87
+ self.ps_version = ps_version # pixel shuffle version
88
+ self.min_dynamic_patch = min_dynamic_patch
89
+ self.max_dynamic_patch = max_dynamic_patch
90
+
91
+ logger.info(f'vision_select_layer: {self.select_layer}')
92
+ logger.info(f'ps_version: {self.ps_version}')
93
+ logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
94
+ logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
95
+ pass
96
+
97
+ def to_dict(self):
98
+ """
99
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
100
+
101
+ Returns:
102
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
103
+ """
104
+ output = copy.deepcopy(self.__dict__)
105
+ output['vision_config'] = self.vision_config.to_dict()
106
+ output['audio_config'] = self.audio_config.to_dict()
107
+ output['llm_config'] = self.llm_config.to_dict()
108
+ output['voicelm_config'] = self.voicelm_config.to_dict()
109
+ output['flow_config'] = self.flow_config.to_dict()
110
+ output['hifigan_config'] = self.hifigan_config.to_dict()
111
+ output['model_type'] = self.__class__.model_type
112
+ output['use_backbone_lora'] = self.use_backbone_lora
113
+ output['use_llm_lora'] = self.use_llm_lora
114
+ output['pad2square'] = self.pad2square
115
+ output['select_layer'] = self.select_layer
116
+ output['force_image_size'] = self.force_image_size
117
+ output['downsample_ratio'] = self.downsample_ratio
118
+ output['template'] = self.template
119
+ output['dynamic_image_size'] = self.dynamic_image_size
120
+ output['use_thumbnail'] = self.use_thumbnail
121
+ output['ps_version'] = self.ps_version
122
+ output['min_dynamic_patch'] = self.min_dynamic_patch
123
+ output['max_dynamic_patch'] = self.max_dynamic_patch
124
+
125
+ return output
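`InteractiveOmniConfig` is the composition root: it wraps the vision, audio, LLM, voice-LM, flow and vocoder sub-configs plus the image-tiling options (dynamic patching, thumbnail, pixel-shuffle version). Note that several constructor defaults (`select_layer=-4`, `max_dynamic_patch=6`, `use_thumbnail=False`) are overridden by the values stored in config.json (`-1`, `12`, `true`). A minimal loading sketch; it assumes config.json exposes this class through an `auto_map` entry, which is the usual `trust_remote_code` pattern but is not visible in the excerpt above:

```python
# Minimal sketch. Assumes this repo's config.json registers InteractiveOmniConfig in an
# auto_map entry (standard trust_remote_code layout; not shown in the excerpt above) and
# that the repo is cloned to ./InteractiveOmni (hypothetical path).
from transformers import AutoConfig

config = AutoConfig.from_pretrained("./InteractiveOmni", trust_remote_code=True)
print(config.model_type)                               # interactiveomni
print(config.llm_config.num_hidden_layers)             # 36  (Qwen3 backbone)
print(config.vision_config.image_size)                 # 448 (InternViT-300M-448px)
print(config.select_layer, config.max_dynamic_patch)   # -1 12 (config.json overrides the class defaults)
```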
configuration_intern_vit.py ADDED
@@ -0,0 +1,119 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2023 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import os
7
+ from typing import Union
8
+
9
+ from transformers.configuration_utils import PretrainedConfig
10
+ from transformers.utils import logging
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+
15
+ class InternVisionConfig(PretrainedConfig):
16
+ r"""
17
+ This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
18
+ instantiate a vision encoder according to the specified arguments, defining the model architecture.
19
+
20
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
21
+ documentation from [`PretrainedConfig`] for more information.
22
+
23
+ Args:
24
+ num_channels (`int`, *optional*, defaults to 3):
25
+ Number of color channels in the input images (e.g., 3 for RGB).
26
+ patch_size (`int`, *optional*, defaults to 14):
27
+ The size (resolution) of each patch.
28
+ image_size (`int`, *optional*, defaults to 224):
29
+ The size (resolution) of each image.
30
+ qkv_bias (`bool`, *optional*, defaults to `False`):
31
+ Whether to add a bias to the queries and values in the self-attention layers.
32
+ hidden_size (`int`, *optional*, defaults to 3200):
33
+ Dimensionality of the encoder layers and the pooler layer.
34
+ num_attention_heads (`int`, *optional*, defaults to 25):
35
+ Number of attention heads for each attention layer in the Transformer encoder.
36
+ intermediate_size (`int`, *optional*, defaults to 12800):
37
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
38
+ qk_normalization (`bool`, *optional*, defaults to `True`):
39
+ Whether to normalize the queries and keys in the self-attention layers.
40
+ num_hidden_layers (`int`, *optional*, defaults to 48):
41
+ Number of hidden layers in the Transformer encoder.
42
+ use_flash_attn (`bool`, *optional*, defaults to `True`):
43
+ Whether to use flash attention mechanism.
44
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
45
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
46
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
47
+ layer_norm_eps (`float`, *optional*, defaults to 1e-6):
48
+ The epsilon used by the layer normalization layers.
49
+ dropout (`float`, *optional*, defaults to 0.0):
50
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
51
+ drop_path_rate (`float`, *optional*, defaults to 0.0):
52
+ Dropout rate for stochastic depth.
53
+ attention_dropout (`float`, *optional*, defaults to 0.0):
54
+ The dropout ratio for the attention probabilities.
55
+ initializer_range (`float`, *optional*, defaults to 0.02):
56
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
57
+ initializer_factor (`float`, *optional*, defaults to 0.1):
58
+ A factor for layer scale.
59
+ """
60
+
61
+ model_type = 'intern_vit_6b'
62
+
63
+ def __init__(
64
+ self,
65
+ num_channels=3,
66
+ patch_size=14,
67
+ image_size=224,
68
+ qkv_bias=False,
69
+ hidden_size=3200,
70
+ num_attention_heads=25,
71
+ intermediate_size=12800,
72
+ qk_normalization=True,
73
+ num_hidden_layers=48,
74
+ use_flash_attn=True,
75
+ hidden_act='gelu',
76
+ norm_type='rms_norm',
77
+ layer_norm_eps=1e-6,
78
+ dropout=0.0,
79
+ drop_path_rate=0.0,
80
+ attention_dropout=0.0,
81
+ initializer_range=0.02,
82
+ initializer_factor=0.1,
83
+ **kwargs,
84
+ ):
85
+ super().__init__(**kwargs)
86
+
87
+ self.hidden_size = hidden_size
88
+ self.intermediate_size = intermediate_size
89
+ self.dropout = dropout
90
+ self.drop_path_rate = drop_path_rate
91
+ self.num_hidden_layers = num_hidden_layers
92
+ self.num_attention_heads = num_attention_heads
93
+ self.num_channels = num_channels
94
+ self.patch_size = patch_size
95
+ self.image_size = image_size
96
+ self.initializer_range = initializer_range
97
+ self.initializer_factor = initializer_factor
98
+ self.attention_dropout = attention_dropout
99
+ self.layer_norm_eps = layer_norm_eps
100
+ self.hidden_act = hidden_act
101
+ self.norm_type = norm_type
102
+ self.qkv_bias = qkv_bias
103
+ self.qk_normalization = qk_normalization
104
+ self.use_flash_attn = use_flash_attn
105
+
106
+ @classmethod
107
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
108
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
109
+
110
+ if 'vision_config' in config_dict:
111
+ config_dict = config_dict['vision_config']
112
+
113
+ if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
114
+ logger.warning(
115
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
116
+ f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
117
+ )
118
+
119
+ return cls.from_dict(config_dict, **kwargs)
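`InternVisionConfig` (borrowed from InternVL) carries the ViT hyper-parameters; its `from_pretrained` override also accepts a composite checkpoint and pulls out the nested `vision_config` block. A minimal sketch that rebuilds the 300M/448px variant used in config.json rather than the 6B-scale class defaults; `patch_size=14` is the class default, since the excerpt above does not store it explicitly:

```python
# Minimal sketch, assuming configuration_intern_vit.py from this repo is importable.
from configuration_intern_vit import InternVisionConfig

# Values matching the vision_config block of config.json (InternViT-300M-448px).
vit_cfg = InternVisionConfig(
    hidden_size=1024,
    intermediate_size=4096,
    num_attention_heads=16,
    num_hidden_layers=24,
    image_size=448,
    patch_size=14,            # class default; not listed in the vision_config excerpt
    qkv_bias=True,
    qk_normalization=False,
    norm_type="layer_norm",
)
print(vit_cfg.model_type)                                # intern_vit_6b
print((vit_cfg.image_size // vit_cfg.patch_size) ** 2)   # 1024 patches per 448x448 tile
```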
configuration_voicelm.py ADDED
@@ -0,0 +1,63 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import copy
7
+
8
+ from transformers.configuration_utils import PretrainedConfig
9
+ from transformers.utils import logging
10
+ from transformers import LlamaConfig, Qwen2Config
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+ class VoiceLMConfig(PretrainedConfig):
15
+ def __init__(
16
+ self,
17
+ llm_input_size = 896,
18
+ llm_output_size = 896,
19
+ speech_token_size = 6561,
20
+ length_normalized_loss = True,
21
+ lsm_weight = 0,
22
+ llm_config=None,
23
+ sampling_config={
24
+ 'top_p': 0.8,
25
+ 'top_k': 25,
26
+ 'win_size': 10,
27
+ 'tau_r': 0.1,
28
+ },
29
+ **kwargs):
30
+ super().__init__(**kwargs)
31
+
32
+ self.llm_input_size = llm_input_size
33
+ self.llm_output_size = llm_output_size
34
+ self.speech_token_size = speech_token_size
35
+ self.length_normalized_loss = length_normalized_loss
36
+ self.lsm_weight = lsm_weight
37
+ self.sampling_config = sampling_config
38
+
39
+ if llm_config is None:
40
+ llm_config = {}
41
+ logger.info('llm_config is None. Initializing the Qwen2Config with default values (`Qwen2Config`).')
42
+
43
+ self.llm_config = Qwen2Config(**llm_config)
44
+ pass
45
+
46
+ def to_dict(self):
47
+ """
48
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
49
+
50
+ Returns:
51
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
52
+ """
53
+ output = copy.deepcopy(self.__dict__)
54
+ output['llm_input_size'] = self.llm_input_size
55
+ output['llm_output_size'] = self.llm_output_size
56
+ output['speech_token_size'] = self.speech_token_size
57
+ output['length_normalized_loss'] = self.length_normalized_loss
58
+ output['lsm_weight'] = self.lsm_weight
59
+ output['sampling_config'] = self.sampling_config
60
+ output['llm_config'] = self.llm_config.to_dict()
61
+
62
+ return output
63
+
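`VoiceLMConfig` wraps the speech-token language model: a compact Qwen2 backbone (896-dim, 24 layers in config.json) that autoregressively predicts tokens from a 6561-entry speech codebook, together with the top-k/top-p, `win_size` and `tau_r` sampling fields consumed at inference. A minimal sketch; `configuration_voicelm.py` imports only from `transformers`, so a direct import works once the file is on the path:

```python
# Minimal sketch, assuming configuration_voicelm.py from this repo is importable.
from configuration_voicelm import VoiceLMConfig

voicelm_cfg = VoiceLMConfig()                       # class defaults
print(voicelm_cfg.speech_token_size)                # 6561
print(voicelm_cfg.llm_config.model_type)            # qwen2 (speech-token backbone)

# config.json stores slightly different sampling settings than the class defaults:
voicelm_cfg = VoiceLMConfig(
    sampling_config={"top_p": 0.7, "top_k": 15, "win_size": 10, "tau_r": 0.1}
)
print(voicelm_cfg.to_dict()["sampling_config"]["top_k"])   # 15
```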
configuration_whisper.py ADDED
@@ -0,0 +1,340 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Whisper model configuration"""
16
+
17
+ from collections import OrderedDict
18
+ from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
19
+
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast
22
+ from transformers.utils import logging
23
+
24
+
25
+ if TYPE_CHECKING:
26
+ from transformers.feature_extraction_utils import FeatureExtractionMixin
27
+ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
28
+ from transformers.utils import TensorType
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ # fmt: off
34
+ NON_SPEECH_TOKENS = [
35
+ 1, 2, 7, 8, 9, 10, 14, 25,
36
+ 26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
37
+ 63, 90, 91, 92, 93, 357, 366, 438, 532, 685,
38
+ 705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377,
39
+ 1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211,
40
+ 4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786,
41
+ 11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791,
42
+ 17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409,
43
+ 34949, 40283, 40493, 40549, 47282, 49146, 50257, 50359, 50360, 50361
44
+ ]
45
+ NON_SPEECH_TOKENS_MULTI = [
46
+ 1, 2, 7, 8, 9, 10, 14, 25,
47
+ 26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
48
+ 63, 90, 91, 92, 93, 359, 503, 522, 542, 873,
49
+ 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627,
50
+ 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647,
51
+ 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793,
52
+ 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675,
53
+ 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865,
54
+ 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362
55
+ ]
56
+ # fmt: on
57
+
58
+
59
+ class WhisperConfig(PretrainedConfig):
60
+ r"""
61
+ This is the configuration class to store the configuration of a [`WhisperModel`]. It is used to instantiate a
62
+ Whisper model according to the specified arguments, defining the model architecture. Instantiating a configuration
63
+ with the defaults will yield a similar configuration to that of the Whisper
64
+ [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) architecture.
65
+
66
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
67
+ documentation from [`PretrainedConfig`] for more information.
68
+
69
+
70
+ Args:
71
+ vocab_size (`int`, *optional*, defaults to 51865):
72
+ Vocabulary size of the Whisper model. Defines the number of different tokens that can be represented by the
73
+ `decoder_input_ids` passed when calling [`WhisperModel`]
74
+ num_mel_bins (`int`, *optional*, defaults to 80):
75
+ Number of mel features used per input features. Should correspond to the value used in the
76
+ `WhisperProcessor` class.
77
+ encoder_layers (`int`, *optional*, defaults to 4):
78
+ Number of encoder layers.
79
+ decoder_layers (`int`, *optional*, defaults to 4):
80
+ Number of decoder layers.
81
+ encoder_attention_heads (`int`, *optional*, defaults to 6):
82
+ Number of attention heads for each attention layer in the Transformer encoder.
83
+ decoder_attention_heads (`int`, *optional*, defaults to 6):
84
+ Number of attention heads for each attention layer in the Transformer decoder.
85
+ encoder_ffn_dim (`int`, *optional*, defaults to 1536):
86
+ Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
87
+ decoder_ffn_dim (`int`, *optional*, defaults to 1536):
88
+ Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
89
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
90
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
91
+ for more details.
92
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
93
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
94
+ for more details.
95
+ decoder_start_token_id (`int`, *optional*, defaults to 50257):
96
+ Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
97
+ are provided to the `generate` function. It is used to guide the model's generation process depending on
98
+ the task.
99
+ use_cache (`bool`, *optional*, defaults to `True`):
100
+ Whether or not the model should return the last key/values attentions (not used by all models).
101
+ is_encoder_decoder (`bool`, *optional*, defaults to `True`):
102
+ Whether the model is used as an encoder/decoder or not.
103
+ activation_function (`str`, *optional*, defaults to `"gelu"`):
104
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
105
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
106
+ d_model (`int`, *optional*, defaults to 384):
107
+ Dimensionality of the layers.
108
+ dropout (`float`, *optional*, defaults to 0.1):
109
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
110
+ attention_dropout (`float`, *optional*, defaults to 0.0):
111
+ The dropout ratio for the attention probabilities.
112
+ activation_dropout (`float`, *optional*, defaults to 0.0):
113
+ The dropout ratio for activations inside the fully connected layer.
114
+ init_std (`float`, *optional*, defaults to 0.02):
115
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
116
+ scale_embedding (`bool`, *optional*, defaults to False):
117
+ Scale embeddings by dividing by sqrt(d_model).
118
+ max_source_positions (`int`, *optional*, defaults to 1500):
119
+ The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
120
+ max_target_positions (`int`, *optional*, defaults to 448):
121
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
122
+ just in case (e.g., 512 or 1024 or 2048).
123
+ pad_token_id (`int`, *optional*, defaults to 50256):
124
+ Padding token id.
125
+ bos_token_id (`int`, *optional*, defaults to 50256):
126
+ Begin of stream token id.
127
+ eos_token_id (`int`, *optional*, defaults to 50256):
128
+ End of stream token id.
129
+ suppress_tokens (`List[int]`, *optional*):
130
+ A list containing the non-speech tokens that will be used by the logit processor in the `generate`
131
+ function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI each correspond to the `english-only` and the
132
+ `multilingual` model.
133
+ begin_suppress_tokens (`List[int]`, *optional*, defaults to `[220,50256]`):
134
+ A list containing tokens that will be suppressed at the beginning of the sampling process. Initialized as
135
+ the token for `" "` (`blank_token_id`) and the `eos_token_id`
136
+ use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
137
+ Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
138
+ instance of [`WhisperForAudioClassification`].
139
+ classifier_proj_size (`int`, *optional*, defaults to 256):
140
+ Dimensionality of the projection before token mean-pooling for classification. Only relevant when using an
141
+ instance of [`WhisperForAudioClassification`].
142
+ apply_spec_augment (`bool`, *optional*, defaults to `False`):
143
+ Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
144
+ [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
145
+ Recognition](https://arxiv.org/abs/1904.08779).
146
+ mask_time_prob (`float`, *optional*, defaults to 0.05):
147
+ Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
148
+ procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
149
+ reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
150
+ masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
151
+ actual percentage of masked vectors. This is only relevant if `apply_spec_augment == True`.
152
+ mask_time_length (`int`, *optional*, defaults to 10):
153
+ Length of vector span along the time axis.
154
+ mask_time_min_masks (`int`, *optional*, defaults to 2):
155
+ The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step,
156
+ irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
157
+ mask_time_min_masks''
158
+ mask_feature_prob (`float`, *optional*, defaults to 0.0):
159
+ Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
160
+ masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks over
161
+ the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
162
+ span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
163
+ may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
164
+ True`.
165
+ mask_feature_length (`int`, *optional*, defaults to 10):
166
+ Length of vector span along the feature axis.
167
+ mask_feature_min_masks (`int`, *optional*, defaults to 0):
168
+ The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
169
+ step, irrespectively of `mask_feature_prob`. Only relevant if
170
+ `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`.
171
+ median_filter_width (`int`, *optional*, defaults to 7):
172
+ Width of the median filter used to smooth the cross-attention outputs when computing token timestamps.
173
+ Should be an odd number.
174
+
175
+ Example:
176
+
177
+ ```python
178
+ >>> from transformers import WhisperConfig, WhisperModel
179
+
180
+ >>> # Initializing a Whisper tiny style configuration
181
+ >>> configuration = WhisperConfig()
182
+
183
+ >>> # Initializing a model (with random weights) from the tiny style configuration
184
+ >>> model = WhisperModel(configuration)
185
+
186
+ >>> # Accessing the model configuration
187
+ >>> configuration = model.config
188
+ ```"""
189
+
190
+ model_type = "whisper"
191
+ keys_to_ignore_at_inference = ["past_key_values"]
192
+ attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
193
+
194
+ def __init__(
195
+ self,
196
+ vocab_size=51865,
197
+ num_mel_bins=80,
198
+ encoder_layers=4,
199
+ encoder_attention_heads=6,
200
+ decoder_layers=4,
201
+ decoder_attention_heads=6,
202
+ decoder_ffn_dim=1536,
203
+ encoder_ffn_dim=1536,
204
+ encoder_layerdrop=0.0,
205
+ decoder_layerdrop=0.0,
206
+ decoder_start_token_id=50257,
207
+ use_cache=True,
208
+ is_encoder_decoder=True,
209
+ activation_function="gelu",
210
+ d_model=384,
211
+ dropout=0.0,
212
+ attention_dropout=0.0,
213
+ activation_dropout=0.0,
214
+ init_std=0.02,
215
+ scale_embedding=False,
216
+ max_source_positions=1500,
217
+ max_target_positions=448,
218
+ pad_token_id=50256,
219
+ bos_token_id=50256,
220
+ eos_token_id=50256,
221
+ suppress_tokens=None,
222
+ begin_suppress_tokens=[220, 50256],
223
+ use_weighted_layer_sum=False,
224
+ classifier_proj_size=256,
225
+ apply_spec_augment=False,
226
+ mask_time_prob=0.05,
227
+ mask_time_length=10,
228
+ mask_time_min_masks=2,
229
+ mask_feature_prob=0.0,
230
+ mask_feature_length=10,
231
+ mask_feature_min_masks=0,
232
+ median_filter_width=7,
233
+ **kwargs,
234
+ ):
235
+ self.vocab_size = vocab_size
236
+ self.num_mel_bins = num_mel_bins
237
+ self.d_model = d_model
238
+ self.encoder_layers = encoder_layers
239
+ self.encoder_attention_heads = encoder_attention_heads
240
+ self.decoder_layers = decoder_layers
241
+ self.decoder_attention_heads = decoder_attention_heads
242
+ self.decoder_ffn_dim = decoder_ffn_dim
243
+ self.encoder_ffn_dim = encoder_ffn_dim
244
+ self.dropout = dropout
245
+ self.attention_dropout = attention_dropout
246
+ self.activation_dropout = activation_dropout
247
+ self.activation_function = activation_function
248
+ self.init_std = init_std
249
+ self.encoder_layerdrop = encoder_layerdrop
250
+ self.decoder_layerdrop = decoder_layerdrop
251
+ self.use_cache = use_cache
252
+ self.num_hidden_layers = encoder_layers
253
+ self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
254
+ self.max_source_positions = max_source_positions
255
+ self.max_target_positions = max_target_positions
256
+
257
+ # Audio Classification-specific parameters. Feel free to ignore for other classes.
258
+ self.classifier_proj_size = classifier_proj_size
259
+ self.use_weighted_layer_sum = use_weighted_layer_sum
260
+
261
+ # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779
262
+ self.apply_spec_augment = apply_spec_augment
263
+ self.mask_time_prob = mask_time_prob
264
+ self.mask_time_length = mask_time_length
265
+ self.mask_time_min_masks = mask_time_min_masks
266
+ self.mask_feature_prob = mask_feature_prob
267
+ self.mask_feature_length = mask_feature_length
268
+ self.mask_feature_min_masks = mask_feature_min_masks
269
+
270
+ self.median_filter_width = median_filter_width
271
+
272
+ super().__init__(
273
+ pad_token_id=pad_token_id,
274
+ bos_token_id=bos_token_id,
275
+ eos_token_id=eos_token_id,
276
+ is_encoder_decoder=is_encoder_decoder,
277
+ decoder_start_token_id=decoder_start_token_id,
278
+ suppress_tokens=suppress_tokens,
279
+ begin_suppress_tokens=begin_suppress_tokens,
280
+ **kwargs,
281
+ )
282
+
283
+
284
+ class WhisperOnnxConfig(OnnxSeq2SeqConfigWithPast):
285
+ @property
286
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
287
+ common_inputs = OrderedDict(
288
+ [
289
+ ("input_features", {0: "batch", 1: "feature_size", 2: "encoder_sequence"}),
290
+ ]
291
+ )
292
+ if self.use_past:
293
+ common_inputs["decoder_input_ids"] = {0: "batch"}
294
+ else:
295
+ common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
296
+
297
+ if self.use_past:
298
+ self.fill_with_past_key_values_(common_inputs, direction="inputs")
299
+
300
+ return common_inputs
301
+
302
+ def generate_dummy_inputs(
303
+ self,
304
+ preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"],
305
+ batch_size: int = -1,
306
+ seq_length: int = -1,
307
+ is_pair: bool = False,
308
+ framework: Optional["TensorType"] = None,
309
+ sampling_rate: int = 22050,
310
+ time_duration: float = 5.0,
311
+ frequency: int = 220,
312
+ ) -> Mapping[str, Any]:
313
+ dummy_inputs = OrderedDict()
314
+ encoder_inputs = OnnxConfig.generate_dummy_inputs(
315
+ self,
316
+ preprocessor=preprocessor.feature_extractor,
317
+ batch_size=batch_size,
318
+ framework=framework,
319
+ sampling_rate=sampling_rate,
320
+ time_duration=time_duration,
321
+ frequency=frequency,
322
+ )
323
+ encoder_sequence_length = encoder_inputs["input_features"].shape[2]
324
+ seq_length = encoder_sequence_length // 2 if self.use_past else seq_length
325
+
326
+ decoder_inputs = super().generate_dummy_inputs(
327
+ preprocessor.tokenizer, batch_size, seq_length, is_pair, framework
328
+ )
329
+
330
+ dummy_inputs["input_features"] = encoder_inputs.pop("input_features")
331
+ dummy_inputs["decoder_input_ids"] = decoder_inputs.pop("decoder_input_ids")
332
+
333
+ if "past_key_values" in decoder_inputs:
334
+ dummy_inputs["past_key_values"] = decoder_inputs.pop("past_key_values")
335
+
336
+ return dummy_inputs
337
+
338
+ @property
339
+ def atol_for_validation(self) -> float:
340
+ return 1e-3
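The audio encoder configuration is the stock `WhisperConfig` from `transformers`; its `attribute_map` aliases `hidden_size` to `d_model` and `num_attention_heads` to `encoder_attention_heads`. A minimal sketch with illustrative large-model sizes, since this repo's actual `audio_config` block is not part of the excerpt above:

```python
# Minimal sketch, assuming configuration_whisper.py from this repo is importable.
# The sizes below are illustrative only; the repo's real audio_config block is not
# shown in the excerpt above.
from configuration_whisper import WhisperConfig

audio_cfg = WhisperConfig(
    num_mel_bins=128,            # illustrative
    d_model=1280,                # illustrative
    encoder_layers=32,           # illustrative
    encoder_attention_heads=20,  # illustrative
)
print(audio_cfg.hidden_size)           # 1280 -> aliased to d_model via attribute_map
print(audio_cfg.num_attention_heads)   # 20   -> aliased to encoder_attention_heads
print(audio_cfg.max_source_positions)  # 1500 (30 s of audio at 50 encoder frames per second)
```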
conversation.py ADDED
@@ -0,0 +1,340 @@
1
+ """
2
+ Conversation prompt templates.
3
+
4
+ We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
+ If you have any changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
6
+ """
7
+
8
+ import dataclasses
9
+ from enum import IntEnum, auto
10
+ from typing import Any, Dict, List, Tuple, Union
11
+
12
+
13
+ class SeparatorStyle(IntEnum):
14
+ """Separator styles."""
15
+
16
+ ADD_COLON_SINGLE = auto()
17
+ ADD_COLON_TWO = auto()
18
+ ADD_COLON_SPACE_SINGLE = auto()
19
+ NO_COLON_SINGLE = auto()
20
+ NO_COLON_TWO = auto()
21
+ ADD_NEW_LINE_SINGLE = auto()
22
+ LLAMA2 = auto()
23
+ CHATGLM = auto()
24
+ CHATML = auto()
25
+ CHATINTERN = auto()
26
+ DOLLY = auto()
27
+ RWKV = auto()
28
+ PHOENIX = auto()
29
+ ROBIN = auto()
30
+ FALCON_CHAT = auto()
31
+ CHATGLM3 = auto()
32
+ MPT = auto()
33
+
34
+
35
+ @dataclasses.dataclass
36
+ class Conversation:
37
+ """A class that manages prompt templates and keeps all conversation history."""
38
+
39
+ # The name of this template
40
+ name: str
41
+ # The template of the system prompt
42
+ system_template: str = '{system_message}'
43
+ # The system message
44
+ system_message: str = ''
45
+ # The names of two roles
46
+ roles: Tuple[str] = ('USER', 'ASSISTANT')
47
+ # All messages. Each item is (role, message).
48
+ messages: List[List[str]] = ()
49
+ # The number of few shot examples
50
+ offset: int = 0
51
+ # The separator style and configurations
52
+ sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
53
+ sep: str = '\n'
54
+ sep2: str = None
55
+ # Stop criteria (the default one is EOS token)
56
+ stop_str: Union[str, List[str]] = None
57
+ # Stops generation if meeting any token in this list
58
+ stop_token_ids: List[int] = None
59
+
60
+ def get_prompt(self) -> str:
61
+ """Get the prompt for generation."""
62
+ system_prompt = self.system_template.format(system_message=self.system_message)
63
+ if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
64
+ ret = system_prompt + self.sep
65
+ for role, message in self.messages:
66
+ if message:
67
+ ret += role + ': ' + message + self.sep
68
+ else:
69
+ ret += role + ':'
70
+ return ret
71
+ elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
72
+ seps = [self.sep, self.sep2]
73
+ ret = system_prompt + seps[0]
74
+ for i, (role, message) in enumerate(self.messages):
75
+ if message:
76
+ ret += role + ': ' + message + seps[i % 2]
77
+ else:
78
+ ret += role + ':'
79
+ return ret
80
+ elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
81
+ ret = system_prompt + self.sep
82
+ for role, message in self.messages:
83
+ if message:
84
+ ret += role + ': ' + message + self.sep
85
+ else:
86
+ ret += role + ': '  # must end with a space
87
+ return ret
88
+ elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
89
+ ret = '' if system_prompt == '' else system_prompt + self.sep
90
+ for role, message in self.messages:
91
+ if message:
92
+ ret += role + '\n' + message + self.sep
93
+ else:
94
+ ret += role + '\n'
95
+ return ret
96
+ elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
97
+ ret = system_prompt
98
+ for role, message in self.messages:
99
+ if message:
100
+ ret += role + message + self.sep
101
+ else:
102
+ ret += role
103
+ return ret
104
+ elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
105
+ seps = [self.sep, self.sep2]
106
+ ret = system_prompt
107
+ for i, (role, message) in enumerate(self.messages):
108
+ if message:
109
+ ret += role + message + seps[i % 2]
110
+ else:
111
+ ret += role
112
+ return ret
113
+ elif self.sep_style == SeparatorStyle.RWKV:
114
+ ret = system_prompt
115
+ for i, (role, message) in enumerate(self.messages):
116
+ if message:
117
+ ret += (
118
+ role
119
+ + ': '
120
+ + message.replace('\r\n', '\n').replace('\n\n', '\n')
121
+ )
122
+ ret += '\n\n'
123
+ else:
124
+ ret += role + ':'
125
+ return ret
126
+ elif self.sep_style == SeparatorStyle.LLAMA2:
127
+ seps = [self.sep, self.sep2]
128
+ if self.system_message:
129
+ ret = system_prompt
130
+ else:
131
+ ret = '[INST] '
132
+ for i, (role, message) in enumerate(self.messages):
133
+ tag = self.roles[i % 2]
134
+ if message:
135
+ if i == 0:
136
+ ret += message + ' '
137
+ else:
138
+ ret += tag + ' ' + message + seps[i % 2]
139
+ else:
140
+ ret += tag
141
+ return ret
142
+ elif self.sep_style == SeparatorStyle.CHATGLM:
143
+ # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
144
+ # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
145
+ round_add_n = 1 if self.name == 'chatglm2' else 0
146
+ if system_prompt:
147
+ ret = system_prompt + self.sep
148
+ else:
149
+ ret = ''
150
+
151
+ for i, (role, message) in enumerate(self.messages):
152
+ if i % 2 == 0:
153
+ ret += f'[Round {i//2 + round_add_n}]{self.sep}'
154
+
155
+ if message:
156
+ ret += f'{role}:{message}{self.sep}'
157
+ else:
158
+ ret += f'{role}:'
159
+ return ret
160
+ elif self.sep_style == SeparatorStyle.CHATML:
161
+ ret = '' if system_prompt == '' else system_prompt + self.sep + '\n'
162
+ for role, message in self.messages:
163
+ if message:
164
+ ret += role + '\n' + message + self.sep + '\n'
165
+ else:
166
+ ret += role + '\n'
167
+ return ret
168
+ elif self.sep_style == SeparatorStyle.CHATGLM3:
169
+ ret = ''
170
+ if self.system_message:
171
+ ret += system_prompt
172
+ for role, message in self.messages:
173
+ if message:
174
+ ret += role + '\n' + ' ' + message
175
+ else:
176
+ ret += role
177
+ return ret
178
+ elif self.sep_style == SeparatorStyle.CHATINTERN:
179
+ # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
180
+ seps = [self.sep, self.sep2]
181
+ ret = system_prompt
182
+ for i, (role, message) in enumerate(self.messages):
183
+ # if i % 2 == 0:
184
+ # ret += "<s>"
185
+ if message:
186
+ ret += role + ':' + message + seps[i % 2] + '\n'
187
+ else:
188
+ ret += role + ':'
189
+ return ret
190
+ elif self.sep_style == SeparatorStyle.DOLLY:
191
+ seps = [self.sep, self.sep2]
192
+ ret = system_prompt
193
+ for i, (role, message) in enumerate(self.messages):
194
+ if message:
195
+ ret += role + ':\n' + message + seps[i % 2]
196
+ if i % 2 == 1:
197
+ ret += '\n\n'
198
+ else:
199
+ ret += role + ':\n'
200
+ return ret
201
+ elif self.sep_style == SeparatorStyle.PHOENIX:
202
+ ret = system_prompt
203
+ for role, message in self.messages:
204
+ if message:
205
+ ret += role + ': ' + '<s>' + message + '</s>'
206
+ else:
207
+ ret += role + ': ' + '<s>'
208
+ return ret
209
+ elif self.sep_style == SeparatorStyle.ROBIN:
210
+ ret = system_prompt + self.sep
211
+ for role, message in self.messages:
212
+ if message:
213
+ ret += role + ':\n' + message + self.sep
214
+ else:
215
+ ret += role + ':\n'
216
+ return ret
217
+ elif self.sep_style == SeparatorStyle.FALCON_CHAT:
218
+ ret = ''
219
+ if self.system_message:
220
+ ret += system_prompt + self.sep
221
+ for role, message in self.messages:
222
+ if message:
223
+ ret += role + ': ' + message + self.sep
224
+ else:
225
+ ret += role + ':'
226
+
227
+ return ret
228
+ elif self.sep_style == SeparatorStyle.MPT:
229
+ if self.system_message == '':
230
+ ret = ''
231
+ else:
232
+ ret = system_prompt + self.sep
233
+ for role, message in self.messages:
234
+ if message:
235
+ if type(message) is tuple:
236
+ message, _, _ = message
237
+ ret += role + message + self.sep
238
+ else:
239
+ ret += role
240
+ return ret
241
+ else:
242
+ raise ValueError(f'Invalid style: {self.sep_style}')
243
+
244
+ def set_system_message(self, system_message: str):
245
+ """Set the system message."""
246
+ self.system_message = system_message
247
+
248
+ def append_message(self, role: str, message: str):
249
+ """Append a new message."""
250
+ self.messages.append([role, message])
251
+
252
+ def update_last_message(self, message: str):
253
+ """Update the last output.
254
+
255
+ The last message is typically set to be None when constructing the prompt,
256
+ so we need to update it in-place after getting the response from a model.
257
+ """
258
+ self.messages[-1][1] = message
259
+
260
+ def to_gradio_chatbot(self):
261
+ """Convert the conversation to gradio chatbot format."""
262
+ ret = []
263
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
264
+ if i % 2 == 0:
265
+ ret.append([msg, None])
266
+ else:
267
+ ret[-1][-1] = msg
268
+ return ret
269
+
270
+ def to_openai_api_messages(self):
271
+ """Convert the conversation to OpenAI chat completion format."""
272
+ ret = [{'role': 'system', 'content': self.system_message}]
273
+
274
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
275
+ if i % 2 == 0:
276
+ ret.append({'role': 'user', 'content': msg})
277
+ else:
278
+ if msg is not None:
279
+ ret.append({'role': 'assistant', 'content': msg})
280
+ return ret
281
+
282
+ def copy(self):
283
+ return Conversation(
284
+ name=self.name,
285
+ system_template=self.system_template,
286
+ system_message=self.system_message,
287
+ roles=self.roles,
288
+ messages=[[x, y] for x, y in self.messages],
289
+ offset=self.offset,
290
+ sep_style=self.sep_style,
291
+ sep=self.sep,
292
+ sep2=self.sep2,
293
+ stop_str=self.stop_str,
294
+ stop_token_ids=self.stop_token_ids,
295
+ )
296
+
297
+ def dict(self):
298
+ return {
299
+ 'template_name': self.name,
300
+ 'system_message': self.system_message,
301
+ 'roles': self.roles,
302
+ 'messages': self.messages,
303
+ 'offset': self.offset,
304
+ }
305
+
306
+
307
+ # A global registry for all conversation templates
308
+ conv_templates: Dict[str, Conversation] = {}
309
+
310
+
311
+ def register_conv_template(template: Conversation, override: bool = False):
312
+ """Register a new conversation template."""
313
+ if not override:
314
+ assert (
315
+ template.name not in conv_templates
316
+ ), f'{template.name} has been registered.'
317
+
318
+ conv_templates[template.name] = template
319
+
320
+
321
+ def get_conv_template(name: str) -> Conversation:
322
+ """Get a conversation template."""
323
+ return conv_templates[name].copy()
324
+
325
+
326
+ register_conv_template(
327
+ Conversation(
328
+ name='interactiveomni_template',
329
+ system_template='<|im_start|>system\n{system_message}',
330
+ system_message='You are a highly advanced multimodal conversational AI designed for human-like interaction. You can perceive auditory, visual, speech, and textual inputs, and generate text and speech.',
331
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
332
+ sep_style=SeparatorStyle.MPT,
333
+ sep='<|im_end|>\n',
334
+ stop_token_ids=[
335
+ 2,
336
+ 92543,
337
+ 92542
338
+ ]
339
+ )
340
+ )
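For reference, here is a minimal, self-contained sketch of the prompt string the `interactiveomni_template` above produces under the MPT separator style. It re-implements the concatenation locally so it runs without importing the repository's conversation module; in the repo itself the equivalent result would come from `get_conv_template('interactiveomni_template')` followed by `append_message` and `get_prompt`.

```python
# Minimal sketch of the MPT-style prompt assembly used by 'interactiveomni_template'.
# The constants below are copied from the template registration above.
SEP = '<|im_end|>\n'
SYSTEM = ('<|im_start|>system\n'
          'You are a highly advanced multimodal conversational AI designed for human-like interaction. '
          'You can perceive auditory, visual, speech, and textual inputs, and generate text and speech.')
ROLES = ('<|im_start|>user\n', '<|im_start|>assistant\n')

def build_prompt(turns):
    """turns: list of (role_index, message); message=None marks the slot the model should fill."""
    ret = SYSTEM + SEP
    for role_idx, message in turns:
        if message:
            ret += ROLES[role_idx] + message + SEP
        else:
            ret += ROLES[role_idx]
    return ret

print(build_prompt([(0, 'Describe the audio clip.'), (1, None)]))
```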
generation_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.51.3"
4
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5b2da752eea0e481167b8203c4b792c8cd7b5f4dfe44490a577b8ed5db6ee15
3
+ size 4990472920
model-00002-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fb6caa54bb12b742ba39f1d44963057aa2cdc177206f39ccabb4a61a5922d27
3
+ size 4999848424
model-00003-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:849eeeb4f6b5233a4d4749eabacd79375f3ac4340c0057fdc85d93af65e4c45d
3
+ size 4983071360
model-00004-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10149f10dbd934bc38e316409cd12432aeb21061e35bbc754c8d70c387c2d6ee
3
+ size 4999999724
model-00005-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c57621a543541dc6e0fd8aa9f7bfcae153ddfd549a570435f106467d37654b0
3
+ size 129569282
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_flow.py ADDED
@@ -0,0 +1,2318 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from transformers.modeling_utils import PreTrainedModel
7
+ from typing import Dict, Tuple, Optional, Union, Any
8
+ from torch import nn
9
+ from torch.nn import functional as F
10
+ import torch
11
+ import copy
12
+ from omegaconf import DictConfig
13
+ import threading
14
+ import math
15
+ from abc import ABC
16
+
17
+ from diffusers.models.activations import get_activation
18
+ from einops import pack, rearrange, repeat
19
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
20
+ from diffusers.models.attention import (
21
+ GEGLU,
22
+ GELU,
23
+ AdaLayerNorm,
24
+ AdaLayerNormZero,
25
+ ApproximateGELU,
26
+ )
27
+ from diffusers.models.attention_processor import Attention
28
+ from diffusers.models.lora import LoRACompatibleLinear
29
+
30
+ from .configuration_flow import FlowConfig
31
+
32
+ def subsequent_chunk_mask(
33
+ size: int,
34
+ chunk_size: int,
35
+ num_left_chunks: int = -1,
36
+ device: torch.device = torch.device("cpu"),
37
+ ) -> torch.Tensor:
38
+ """Create mask for subsequent steps (size, size) with chunk size,
39
+ this is for streaming encoder
40
+
41
+ Args:
42
+ size (int): size of mask
43
+ chunk_size (int): size of chunk
44
+ num_left_chunks (int): number of left chunks
45
+ <0: use full chunk
46
+ >=0: use num_left_chunks
47
+ device (torch.device): "cpu" or "cuda" or torch.Tensor.device
48
+
49
+ Returns:
50
+ torch.Tensor: mask
51
+
52
+ Examples:
53
+ >>> subsequent_chunk_mask(4, 2)
54
+ [[1, 1, 0, 0],
55
+ [1, 1, 0, 0],
56
+ [1, 1, 1, 1],
57
+ [1, 1, 1, 1]]
58
+ """
59
+ # NOTE: this modified implementation meets ONNX export requirements, but it does not support num_left_chunks.
60
+ # It is no longer needed once the inference cache is implemented and will be removed later.
61
+ pos_idx = torch.arange(size, device=device)
62
+ block_value = (torch.div(pos_idx, chunk_size, rounding_mode='trunc') + 1) * chunk_size
63
+ ret = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
64
+ return ret
65
+
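A quick standalone check of the chunk-mask construction above; the two-line computation is reproduced inline (so the snippet runs without importing this file) and matches the docstring example:

```python
import torch

# Reproduces the chunk-mask computation from subsequent_chunk_mask for size=4, chunk_size=2.
size, chunk_size = 4, 2
pos_idx = torch.arange(size)
block_value = (torch.div(pos_idx, chunk_size, rounding_mode='trunc') + 1) * chunk_size
mask = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
print(mask.int())
# tensor([[1, 1, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 1],
#         [1, 1, 1, 1]])
```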
66
+ def add_optional_chunk_mask(xs: torch.Tensor,
67
+ masks: torch.Tensor,
68
+ use_dynamic_chunk: bool,
69
+ use_dynamic_left_chunk: bool,
70
+ decoding_chunk_size: int,
71
+ static_chunk_size: int,
72
+ num_decoding_left_chunks: int,
73
+ enable_full_context: bool = True):
74
+ """ Apply optional mask for encoder.
75
+
76
+ Args:
77
+ xs (torch.Tensor): padded input, (B, L, D), L for max length
78
+ masks (torch.Tensor): mask for xs, (B, 1, L)
79
+ use_dynamic_chunk (bool): whether to use dynamic chunk or not
80
+ use_dynamic_left_chunk (bool): whether to use dynamic left chunk for
81
+ training.
82
+ decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's
83
+ 0: default for training, use random dynamic chunk.
84
+ <0: for decoding, use full chunk.
85
+ >0: for decoding, use fixed chunk size as set.
86
+ static_chunk_size (int): chunk size for static chunk training/decoding
87
+ if it's greater than 0, if use_dynamic_chunk is true,
88
+ this parameter will be ignored
89
+ num_decoding_left_chunks: number of left chunks, this is for decoding,
90
+ the chunk size is decoding_chunk_size.
91
+ >=0: use num_decoding_left_chunks
92
+ <0: use all left chunks
93
+ enable_full_context (bool):
94
+ True: chunk size is either [1, 25] or full context(max_len)
95
+ False: chunk size ~ U[1, 25]
96
+
97
+ Returns:
98
+ torch.Tensor: chunk mask of the input xs.
99
+ """
100
+ # Whether to use chunk mask or not
101
+ if use_dynamic_chunk:
102
+ max_len = xs.size(1)
103
+ if decoding_chunk_size < 0:
104
+ chunk_size = max_len
105
+ num_left_chunks = -1
106
+ elif decoding_chunk_size > 0:
107
+ chunk_size = decoding_chunk_size
108
+ num_left_chunks = num_decoding_left_chunks
109
+ else:
110
+ # chunk size is either [1, 25] or full context(max_len).
111
+ # Since we use 4 times subsampling and allow up to 1s(100 frames)
112
+ # delay, the maximum frame is 100 / 4 = 25.
113
+ chunk_size = torch.randint(1, max_len, (1, )).item()
114
+ num_left_chunks = -1
115
+ if chunk_size > max_len // 2 and enable_full_context:
116
+ chunk_size = max_len
117
+ else:
118
+ chunk_size = chunk_size % 25 + 1
119
+ if use_dynamic_left_chunk:
120
+ max_left_chunks = (max_len - 1) // chunk_size
121
+ num_left_chunks = torch.randint(0, max_left_chunks,
122
+ (1, )).item()
123
+ chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size,
124
+ num_left_chunks,
125
+ xs.device) # (L, L)
126
+ chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L)
127
+ chunk_masks = masks & chunk_masks # (B, L, L)
128
+ elif static_chunk_size > 0:
129
+ num_left_chunks = num_decoding_left_chunks
130
+ chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size,
131
+ num_left_chunks,
132
+ xs.device) # (L, L)
133
+ chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L)
134
+ chunk_masks = masks & chunk_masks # (B, L, L)
135
+ else:
136
+ chunk_masks = masks
137
+ return chunk_masks
138
+
139
+ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
140
+ assert mask.dtype == torch.bool
141
+ assert dtype in [torch.float32, torch.bfloat16, torch.float16]
142
+ mask = mask.to(dtype)
143
+ # attention mask bias
144
+ # NOTE(Mddct): torch.finfo jit issues
145
+ # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
146
+ mask = (1.0 - mask) * torch.finfo(dtype).min
147
+ return mask
148
+
149
+ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
150
+ """Make mask tensor containing indices of padded part.
151
+
152
+ See description of make_non_pad_mask.
153
+
154
+ Args:
155
+ lengths (torch.Tensor): Batch of lengths (B,).
156
+ Returns:
157
+ torch.Tensor: Mask tensor containing indices of padded part.
158
+
159
+ Examples:
160
+ >>> lengths = [5, 3, 2]
161
+ >>> make_pad_mask(lengths)
162
+ masks = [[0, 0, 0, 0 ,0],
163
+ [0, 0, 0, 1, 1],
164
+ [0, 0, 1, 1, 1]]
165
+ """
166
+ batch_size = lengths.size(0)
167
+ max_len = max_len if max_len > 0 else lengths.max().item()
168
+ seq_range = torch.arange(0,
169
+ max_len,
170
+ dtype=torch.int64,
171
+ device=lengths.device)
172
+ seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
173
+ seq_length_expand = lengths.unsqueeze(-1)
174
+ mask = seq_range_expand >= seq_length_expand
175
+ return mask
176
+
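As a small standalone illustration of the padding-mask pattern above, the broadcasted comparison and the additive-bias conversion from `mask_to_bias` can be reproduced in a few lines (toy lengths assumed):

```python
import torch

# Pad mask: True marks padded positions, as in make_pad_mask([5, 3, 2]).
lengths = torch.tensor([5, 3, 2])
max_len = int(lengths.max())
seq_range = torch.arange(max_len)
pad_mask = seq_range.unsqueeze(0) >= lengths.unsqueeze(-1)
print(pad_mask.int())
# tensor([[0, 0, 0, 0, 0],
#         [0, 0, 0, 1, 1],
#         [0, 0, 1, 1, 1]])

# mask_to_bias turns a boolean keep-mask into an additive attention bias:
keep = ~pad_mask
bias = (1.0 - keep.float()) * torch.finfo(torch.float32).min
```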
177
+ class Swish(torch.nn.Module):
178
+ """Construct an Swish object."""
179
+
180
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
181
+ """Return Swish activation function."""
182
+ return x * torch.sigmoid(x)
183
+
184
+ class BASECFM(torch.nn.Module, ABC):
185
+ def __init__(
186
+ self,
187
+ n_feats,
188
+ cfm_params,
189
+ n_spks=1,
190
+ spk_emb_dim=128,
191
+ ):
192
+ super().__init__()
193
+ self.n_feats = n_feats
194
+ self.n_spks = n_spks
195
+ self.spk_emb_dim = spk_emb_dim
196
+ self.solver = cfm_params.solver
197
+ if hasattr(cfm_params, "sigma_min"):
198
+ self.sigma_min = cfm_params.sigma_min
199
+ else:
200
+ self.sigma_min = 1e-4
201
+
202
+ self.estimator = None
203
+
204
+ @torch.inference_mode()
205
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
206
+ """Forward diffusion
207
+
208
+ Args:
209
+ mu (torch.Tensor): output of encoder
210
+ shape: (batch_size, n_feats, mel_timesteps)
211
+ mask (torch.Tensor): output_mask
212
+ shape: (batch_size, 1, mel_timesteps)
213
+ n_timesteps (int): number of diffusion steps
214
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
215
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
216
+ shape: (batch_size, spk_emb_dim)
217
+ cond: Not used but kept for future purposes
218
+
219
+ Returns:
220
+ sample: generated mel-spectrogram
221
+ shape: (batch_size, n_feats, mel_timesteps)
222
+ """
223
+ z = torch.randn_like(mu) * temperature
224
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
225
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)
226
+
227
+ def solve_euler(self, x, t_span, mu, mask, spks, cond):
228
+ """
229
+ Fixed-step Euler solver for ODEs.
230
+ Args:
231
+ x (torch.Tensor): random noise
232
+ t_span (torch.Tensor): n_timesteps interpolated
233
+ shape: (n_timesteps + 1,)
234
+ mu (torch.Tensor): output of encoder
235
+ shape: (batch_size, n_feats, mel_timesteps)
236
+ mask (torch.Tensor): output_mask
237
+ shape: (batch_size, 1, mel_timesteps)
238
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
239
+ shape: (batch_size, spk_emb_dim)
240
+ cond: Not used but kept for future purposes
241
+ """
242
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
243
+
244
+ # Intermediate solutions are stored so they can be inspected or plotted while debugging.
245
+ # A return_all_steps flag may be added in the future.
246
+ sol = []
247
+
248
+ for step in range(1, len(t_span)):
249
+ dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
250
+
251
+ x = x + dt * dphi_dt
252
+ t = t + dt
253
+ sol.append(x)
254
+ if step < len(t_span) - 1:
255
+ dt = t_span[step + 1] - t
256
+
257
+ return sol[-1]
258
+
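The solver above is plain fixed-step Euler integration over `t_span`; a minimal standalone analogue, with a toy constant vector field standing in for the estimator, looks like this:

```python
import torch

# Fixed-step Euler integration of dx/dt = v over t in [0, 1], mirroring solve_euler.
def euler_integrate(x, t_span, vector_field):
    t, dt = t_span[0], t_span[1] - t_span[0]
    for step in range(1, len(t_span)):
        x = x + dt * vector_field(x, t)
        t = t + dt
        if step < len(t_span) - 1:
            dt = t_span[step + 1] - t
    return x

x0 = torch.zeros(3)
v = torch.tensor([1.0, 2.0, 3.0])
t_span = torch.linspace(0, 1, 11)                    # 10 Euler steps
print(euler_integrate(x0, t_span, lambda x, t: v))   # ~ tensor([1., 2., 3.])
```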
259
+ def compute_loss(self, x1, mask, mu, spks=None, cond=None):
260
+ """Computes diffusion loss
261
+
262
+ Args:
263
+ x1 (torch.Tensor): Target
264
+ shape: (batch_size, n_feats, mel_timesteps)
265
+ mask (torch.Tensor): target mask
266
+ shape: (batch_size, 1, mel_timesteps)
267
+ mu (torch.Tensor): output of encoder
268
+ shape: (batch_size, n_feats, mel_timesteps)
269
+ spks (torch.Tensor, optional): speaker embedding. Defaults to None.
270
+ shape: (batch_size, spk_emb_dim)
271
+
272
+ Returns:
273
+ loss: conditional flow matching loss
274
+ y: conditional flow
275
+ shape: (batch_size, n_feats, mel_timesteps)
276
+ """
277
+ b, _, t = mu.shape
278
+
279
+ # random timestep
280
+ t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
281
+ # sample noise p(x_0)
282
+ z = torch.randn_like(x1)
283
+
284
+ y = (1 - (1 - self.sigma_min) * t) * z + t * x1
285
+ u = x1 - (1 - self.sigma_min) * z
286
+
287
+ loss = F.mse_loss(self.estimator(y, mask, mu, t.squeeze(), spks), u, reduction="sum") / (
288
+ torch.sum(mask) * u.shape[1]
289
+ )
290
+ return loss, y
291
+
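The loss above follows the conditional flow-matching construction: for the interpolant y_t = (1 - (1 - σ_min) t) z + t x1, the velocity dy_t/dt is exactly u = x1 - (1 - σ_min) z. A quick numerical sanity check with toy tensors (shapes and σ_min chosen only for illustration):

```python
import torch

sigma_min = 1e-4
x1 = torch.randn(2, 80, 10)        # toy "target" mel
z = torch.randn_like(x1)           # noise sample
t = torch.rand(2, 1, 1)

y = (1 - (1 - sigma_min) * t) * z + t * x1      # interpolant y_t
u = x1 - (1 - sigma_min) * z                    # analytic velocity dy_t/dt

# Finite-difference check that u is indeed d y_t / d t:
eps = 1e-3
y_eps = (1 - (1 - sigma_min) * (t + eps)) * z + (t + eps) * x1
print(torch.allclose((y_eps - y) / eps, u, atol=1e-3))   # True
```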
292
+ class Transpose(torch.nn.Module):
293
+ def __init__(self, dim0: int, dim1: int):
294
+ super().__init__()
295
+ self.dim0 = dim0
296
+ self.dim1 = dim1
297
+
298
+ def forward(self, x: torch.Tensor):
299
+ x = torch.transpose(x, self.dim0, self.dim1)
300
+ return x
301
+
302
+
303
+ class Block1D(torch.nn.Module):
304
+ def __init__(self, dim, dim_out, groups=8):
305
+ super().__init__()
306
+ self.block = torch.nn.Sequential(
307
+ torch.nn.Conv1d(dim, dim_out, 3, padding=1),
308
+ torch.nn.GroupNorm(groups, dim_out),
309
+ nn.Mish(),
310
+ )
311
+
312
+ def forward(self, x, mask):
313
+ output = self.block(x * mask)
314
+ return output * mask
315
+
316
+ class CausalBlock1D(Block1D):
317
+ def __init__(self, dim: int, dim_out: int):
318
+ super(CausalBlock1D, self).__init__(dim, dim_out)
319
+ self.block = torch.nn.Sequential(
320
+ CausalConv1d(dim, dim_out, 3),
321
+ Transpose(1, 2),
322
+ nn.LayerNorm(dim_out),
323
+ Transpose(1, 2),
324
+ nn.Mish(),
325
+ )
326
+
327
+ def forward(self, x: torch.Tensor, mask: torch.Tensor):
328
+ output = self.block(x * mask)
329
+ return output * mask
330
+
331
+ class ResnetBlock1D(torch.nn.Module):
332
+ def __init__(self, dim, dim_out, time_emb_dim, groups=8):
333
+ super().__init__()
334
+ self.mlp = torch.nn.Sequential(nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out))
335
+
336
+ self.block1 = Block1D(dim, dim_out, groups=groups)
337
+ self.block2 = Block1D(dim_out, dim_out, groups=groups)
338
+
339
+ self.res_conv = torch.nn.Conv1d(dim, dim_out, 1)
340
+
341
+ def forward(self, x, mask, time_emb):
342
+ h = self.block1(x, mask)
343
+ h += self.mlp(time_emb).unsqueeze(-1)
344
+ h = self.block2(h, mask)
345
+ output = h + self.res_conv(x * mask)
346
+ return output
347
+
348
+
349
+ class CausalResnetBlock1D(ResnetBlock1D):
350
+ def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
351
+ super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups)
352
+ self.block1 = CausalBlock1D(dim, dim_out)
353
+ self.block2 = CausalBlock1D(dim_out, dim_out)
354
+
355
+
356
+ class CausalConv1d(torch.nn.Conv1d):
357
+ def __init__(
358
+ self,
359
+ in_channels: int,
360
+ out_channels: int,
361
+ kernel_size: int,
362
+ stride: int = 1,
363
+ dilation: int = 1,
364
+ groups: int = 1,
365
+ bias: bool = True,
366
+ padding_mode: str = 'zeros',
367
+ device=None,
368
+ dtype=None
369
+ ) -> None:
370
+ super(CausalConv1d, self).__init__(in_channels, out_channels,
371
+ kernel_size, stride,
372
+ padding=0, dilation=dilation,
373
+ groups=groups, bias=bias,
374
+ padding_mode=padding_mode,
375
+ device=device, dtype=dtype)
376
+ assert stride == 1
377
+ self.causal_padding = (kernel_size - 1, 0)
378
+
379
+ def forward(self, x: torch.Tensor):
380
+ x = F.pad(x, self.causal_padding)
381
+ x = super(CausalConv1d, self).forward(x)
382
+ return x
383
+
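The causal convolution above pads only on the left by kernel_size - 1, so the output keeps the input length and frame t never sees future frames; a brief standalone sketch of the same padding scheme:

```python
import torch
import torch.nn.functional as F

# Left-only padding, as in CausalConv1d: output length == input length, no look-ahead.
kernel_size = 3
conv = torch.nn.Conv1d(4, 4, kernel_size, padding=0)
x = torch.randn(1, 4, 10)
y = conv(F.pad(x, (kernel_size - 1, 0)))
print(x.shape, y.shape)   # torch.Size([1, 4, 10]) torch.Size([1, 4, 10])
```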
384
+ class ResnetBlock1D(torch.nn.Module):
385
+ def __init__(self, dim, dim_out, time_emb_dim, groups=8):
386
+ super().__init__()
387
+ self.mlp = torch.nn.Sequential(nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out))
388
+
389
+ self.block1 = Block1D(dim, dim_out, groups=groups)
390
+ self.block2 = Block1D(dim_out, dim_out, groups=groups)
391
+
392
+ self.res_conv = torch.nn.Conv1d(dim, dim_out, 1)
393
+
394
+ def forward(self, x, mask, time_emb):
395
+ h = self.block1(x, mask)
396
+ h += self.mlp(time_emb).unsqueeze(-1)
397
+ h = self.block2(h, mask)
398
+ output = h + self.res_conv(x * mask)
399
+ return output
400
+
401
+ class SinusoidalPosEmb(torch.nn.Module):
402
+ def __init__(self, dim):
403
+ super().__init__()
404
+ self.dim = dim
405
+ assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"
406
+
407
+ def forward(self, x, scale=1000):
408
+ if x.ndim < 1:
409
+ x = x.unsqueeze(0)
410
+ device = x.device
411
+ half_dim = self.dim // 2
412
+ emb = math.log(10000) / (half_dim - 1)
413
+ emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
414
+ emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
415
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
416
+ return emb
417
+
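The embedding above is the standard sinusoidal timestep embedding; a compact standalone reproduction (dim and scale chosen only for illustration):

```python
import math
import torch

# Sinusoidal timestep embedding as computed in SinusoidalPosEmb (dim must be even).
dim, scale = 8, 1000
t = torch.tensor([0.0, 0.5, 1.0])
half = dim // 2
freq = torch.exp(torch.arange(half).float() * -(math.log(10000) / (half - 1)))
emb = scale * t.unsqueeze(1) * freq.unsqueeze(0)
emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
print(emb.shape)    # torch.Size([3, 8])
```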
418
+ class SnakeBeta(nn.Module):
419
+ """
420
+ A modified Snake function which uses separate parameters for the magnitude of the periodic components
421
+ Shape:
422
+ - Input: (B, C, T)
423
+ - Output: (B, C, T), same shape as the input
424
+ Parameters:
425
+ - alpha - trainable parameter that controls frequency
426
+ - beta - trainable parameter that controls magnitude
427
+ References:
428
+ - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
429
+ https://arxiv.org/abs/2006.08195
430
+ Examples:
431
+ >>> a1 = SnakeBeta(256, 256)
432
+ >>> x = torch.randn(256)
433
+ >>> x = a1(x)
434
+ """
435
+
436
+ def __init__(self, in_features, out_features, alpha=1.0, alpha_trainable=True, alpha_logscale=True):
437
+ """
438
+ Initialization.
439
+ INPUT:
440
+ - in_features: shape of the input
441
+ - alpha - trainable parameter that controls frequency
442
+ - beta - trainable parameter that controls magnitude
443
+ alpha is initialized to 1 by default, higher values = higher-frequency.
444
+ beta is initialized to 1 by default, higher values = higher-magnitude.
445
+ alpha will be trained along with the rest of your model.
446
+ """
447
+ super().__init__()
448
+ self.in_features = out_features if isinstance(out_features, list) else [out_features]
449
+ self.proj = LoRACompatibleLinear(in_features, out_features)
450
+
451
+ # initialize alpha
452
+ self.alpha_logscale = alpha_logscale
453
+ if self.alpha_logscale: # log scale alphas initialized to zeros
454
+ self.alpha = nn.Parameter(torch.zeros(self.in_features) * alpha)
455
+ self.beta = nn.Parameter(torch.zeros(self.in_features) * alpha)
456
+ else: # linear scale alphas initialized to ones
457
+ self.alpha = nn.Parameter(torch.ones(self.in_features) * alpha)
458
+ self.beta = nn.Parameter(torch.ones(self.in_features) * alpha)
459
+
460
+ self.alpha.requires_grad = alpha_trainable
461
+ self.beta.requires_grad = alpha_trainable
462
+
463
+ self.no_div_by_zero = 0.000000001
464
+
465
+ def forward(self, x):
466
+ """
467
+ Forward pass of the function.
468
+ Applies the function to the input elementwise.
469
+ SnakeBeta := x + 1/beta * sin^2(alpha * x)
470
+ """
471
+ x = self.proj(x)
472
+ if self.alpha_logscale:
473
+ alpha = torch.exp(self.alpha)
474
+ beta = torch.exp(self.beta)
475
+ else:
476
+ alpha = self.alpha
477
+ beta = self.beta
478
+
479
+ x = x + (1.0 / (beta + self.no_div_by_zero)) * torch.pow(torch.sin(x * alpha), 2)
480
+
481
+ return x
482
+
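Stripped of the linear projection and the log-scale parameterization used in the module above, the underlying Snake-Beta activation is simply x + sin²(αx)/β; a minimal functional sketch:

```python
import torch

def snake_beta(x, alpha=1.0, beta=1.0, eps=1e-9):
    # SnakeBeta := x + (1 / beta) * sin^2(alpha * x), with eps guarding against division by zero.
    return x + (1.0 / (beta + eps)) * torch.sin(alpha * x) ** 2

x = torch.linspace(-3, 3, 7)
print(snake_beta(x))
```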
483
+ class FeedForward(nn.Module):
484
+ r"""
485
+ A feed-forward layer.
486
+
487
+ Parameters:
488
+ dim (`int`): The number of channels in the input.
489
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
490
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
491
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
492
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
493
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
494
+ """
495
+
496
+ def __init__(
497
+ self,
498
+ dim: int,
499
+ dim_out: Optional[int] = None,
500
+ mult: int = 4,
501
+ dropout: float = 0.0,
502
+ activation_fn: str = "geglu",
503
+ final_dropout: bool = False,
504
+ ):
505
+ super().__init__()
506
+ inner_dim = int(dim * mult)
507
+ dim_out = dim_out if dim_out is not None else dim
508
+
509
+ if activation_fn == "gelu":
510
+ act_fn = GELU(dim, inner_dim)
511
+ if activation_fn == "gelu-approximate":
512
+ act_fn = GELU(dim, inner_dim, approximate="tanh")
513
+ elif activation_fn == "geglu":
514
+ act_fn = GEGLU(dim, inner_dim)
515
+ elif activation_fn == "geglu-approximate":
516
+ act_fn = ApproximateGELU(dim, inner_dim)
517
+ elif activation_fn == "snakebeta":
518
+ act_fn = SnakeBeta(dim, inner_dim)
519
+
520
+ self.net = nn.ModuleList([])
521
+ # project in
522
+ self.net.append(act_fn)
523
+ # project dropout
524
+ self.net.append(nn.Dropout(dropout))
525
+ # project out
526
+ self.net.append(LoRACompatibleLinear(inner_dim, dim_out))
527
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
528
+ if final_dropout:
529
+ self.net.append(nn.Dropout(dropout))
530
+
531
+ def forward(self, hidden_states):
532
+ for module in self.net:
533
+ hidden_states = module(hidden_states)
534
+ return hidden_states
535
+
536
+ @maybe_allow_in_graph
537
+ class BasicTransformerBlock(nn.Module):
538
+ r"""
539
+ A basic Transformer block.
540
+
541
+ Parameters:
542
+ dim (`int`): The number of channels in the input and output.
543
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
544
+ attention_head_dim (`int`): The number of channels in each head.
545
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
546
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
547
+ only_cross_attention (`bool`, *optional*):
548
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
549
+ double_self_attention (`bool`, *optional*):
550
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
551
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
552
+ num_embeds_ada_norm (:
553
+ obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
554
+ attention_bias (:
555
+ obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
556
+ """
557
+
558
+ def __init__(
559
+ self,
560
+ dim: int,
561
+ num_attention_heads: int,
562
+ attention_head_dim: int,
563
+ dropout=0.0,
564
+ cross_attention_dim: Optional[int] = None,
565
+ activation_fn: str = "geglu",
566
+ num_embeds_ada_norm: Optional[int] = None,
567
+ attention_bias: bool = False,
568
+ only_cross_attention: bool = False,
569
+ double_self_attention: bool = False,
570
+ upcast_attention: bool = False,
571
+ norm_elementwise_affine: bool = True,
572
+ norm_type: str = "layer_norm",
573
+ final_dropout: bool = False,
574
+ ):
575
+ super().__init__()
576
+ self.only_cross_attention = only_cross_attention
577
+
578
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
579
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
580
+
581
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
582
+ raise ValueError(
583
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
584
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
585
+ )
586
+
587
+ # Define 3 blocks. Each block has its own normalization layer.
588
+ # 1. Self-Attn
589
+ if self.use_ada_layer_norm:
590
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
591
+ elif self.use_ada_layer_norm_zero:
592
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
593
+ else:
594
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
595
+ self.attn1 = Attention(
596
+ query_dim=dim,
597
+ heads=num_attention_heads,
598
+ dim_head=attention_head_dim,
599
+ dropout=dropout,
600
+ bias=attention_bias,
601
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
602
+ upcast_attention=upcast_attention,
603
+ )
604
+
605
+ # 2. Cross-Attn
606
+ if cross_attention_dim is not None or double_self_attention:
607
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
608
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
609
+ # the second cross attention block.
610
+ self.norm2 = (
611
+ AdaLayerNorm(dim, num_embeds_ada_norm)
612
+ if self.use_ada_layer_norm
613
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
614
+ )
615
+ self.attn2 = Attention(
616
+ query_dim=dim,
617
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
618
+ heads=num_attention_heads,
619
+ dim_head=attention_head_dim,
620
+ dropout=dropout,
621
+ bias=attention_bias,
622
+ upcast_attention=upcast_attention,
623
+ # scale_qk=False, # uncomment this to not to use flash attention
624
+ ) # is self-attn if encoder_hidden_states is none
625
+ else:
626
+ self.norm2 = None
627
+ self.attn2 = None
628
+
629
+ # 3. Feed-forward
630
+ self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
631
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
632
+
633
+ # let chunk size default to None
634
+ self._chunk_size = None
635
+ self._chunk_dim = 0
636
+
637
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
638
+ # Sets chunk feed-forward
639
+ self._chunk_size = chunk_size
640
+ self._chunk_dim = dim
641
+
642
+ def forward(
643
+ self,
644
+ hidden_states: torch.FloatTensor,
645
+ attention_mask: Optional[torch.FloatTensor] = None,
646
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
647
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
648
+ timestep: Optional[torch.LongTensor] = None,
649
+ cross_attention_kwargs: Dict[str, Any] = None,
650
+ class_labels: Optional[torch.LongTensor] = None,
651
+ ):
652
+ # Notice that normalization is always applied before the real computation in the following blocks.
653
+ # 1. Self-Attention
654
+ if self.use_ada_layer_norm:
655
+ norm_hidden_states = self.norm1(hidden_states, timestep)
656
+ elif self.use_ada_layer_norm_zero:
657
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
658
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
659
+ )
660
+ else:
661
+ norm_hidden_states = self.norm1(hidden_states)
662
+
663
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
664
+
665
+ attn_output = self.attn1(
666
+ norm_hidden_states,
667
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
668
+ attention_mask=encoder_attention_mask if self.only_cross_attention else attention_mask,
669
+ **cross_attention_kwargs,
670
+ )
671
+ if self.use_ada_layer_norm_zero:
672
+ attn_output = gate_msa.unsqueeze(1) * attn_output
673
+ hidden_states = attn_output + hidden_states
674
+
675
+ # 2. Cross-Attention
676
+ if self.attn2 is not None:
677
+ norm_hidden_states = (
678
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
679
+ )
680
+
681
+ attn_output = self.attn2(
682
+ norm_hidden_states,
683
+ encoder_hidden_states=encoder_hidden_states,
684
+ attention_mask=encoder_attention_mask,
685
+ **cross_attention_kwargs,
686
+ )
687
+ hidden_states = attn_output + hidden_states
688
+
689
+ # 3. Feed-forward
690
+ norm_hidden_states = self.norm3(hidden_states)
691
+
692
+ if self.use_ada_layer_norm_zero:
693
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
694
+
695
+ if self._chunk_size is not None:
696
+ # "feed_forward_chunk_size" can be used to save memory
697
+ if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
698
+ raise ValueError(
699
+ f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
700
+ )
701
+
702
+ num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
703
+ ff_output = torch.cat(
704
+ [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)],
705
+ dim=self._chunk_dim,
706
+ )
707
+ else:
708
+ ff_output = self.ff(norm_hidden_states)
709
+
710
+ if self.use_ada_layer_norm_zero:
711
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
712
+
713
+ hidden_states = ff_output + hidden_states
714
+
715
+ return hidden_states
716
+
717
+ class Downsample1D(nn.Module):
718
+ def __init__(self, dim):
719
+ super().__init__()
720
+ self.conv = torch.nn.Conv1d(dim, dim, 3, 2, 1)
721
+
722
+ def forward(self, x):
723
+ return self.conv(x)
724
+
725
+
726
+ class TimestepEmbedding(nn.Module):
727
+ def __init__(
728
+ self,
729
+ in_channels: int,
730
+ time_embed_dim: int,
731
+ act_fn: str = "silu",
732
+ out_dim: int = None,
733
+ post_act_fn: Optional[str] = None,
734
+ cond_proj_dim=None,
735
+ ):
736
+ super().__init__()
737
+
738
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim)
739
+
740
+ if cond_proj_dim is not None:
741
+ self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
742
+ else:
743
+ self.cond_proj = None
744
+
745
+ self.act = get_activation(act_fn)
746
+
747
+ if out_dim is not None:
748
+ time_embed_dim_out = out_dim
749
+ else:
750
+ time_embed_dim_out = time_embed_dim
751
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out)
752
+
753
+ if post_act_fn is None:
754
+ self.post_act = None
755
+ else:
756
+ self.post_act = get_activation(post_act_fn)
757
+
758
+ def forward(self, sample, condition=None):
759
+ if condition is not None:
760
+ sample = sample + self.cond_proj(condition)
761
+ sample = self.linear_1(sample)
762
+
763
+ if self.act is not None:
764
+ sample = self.act(sample)
765
+
766
+ sample = self.linear_2(sample)
767
+
768
+ if self.post_act is not None:
769
+ sample = self.post_act(sample)
770
+ return sample
771
+
772
+ class ConditionalDecoder(nn.Module):
773
+ def __init__(
774
+ self,
775
+ in_channels,
776
+ out_channels,
777
+ causal=False,
778
+ channels=(256, 256),
779
+ dropout=0.05,
780
+ attention_head_dim=64,
781
+ n_blocks=1,
782
+ num_mid_blocks=2,
783
+ num_heads=4,
784
+ act_fn="snake",
785
+ ):
786
+ """
787
+ This decoder requires an input with the same shape as the target. If your text content
788
+ is shorter or longer than the output, please resample it before feeding it to the decoder.
789
+ """
790
+ super().__init__()
791
+ channels = tuple(channels)
792
+ self.in_channels = in_channels
793
+ self.out_channels = out_channels
794
+ self.causal = causal
795
+ self.time_embeddings = SinusoidalPosEmb(in_channels)
796
+ time_embed_dim = channels[0] * 4
797
+ self.time_mlp = TimestepEmbedding(
798
+ in_channels=in_channels,
799
+ time_embed_dim=time_embed_dim,
800
+ act_fn="silu",
801
+ )
802
+ self.down_blocks = nn.ModuleList([])
803
+ self.mid_blocks = nn.ModuleList([])
804
+ self.up_blocks = nn.ModuleList([])
805
+
806
+ output_channel = in_channels
807
+ for i in range(len(channels)): # pylint: disable=consider-using-enumerate
808
+ input_channel = output_channel
809
+ output_channel = channels[i]
810
+ is_last = i == len(channels) - 1
811
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
812
+ ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
813
+ transformer_blocks = nn.ModuleList(
814
+ [
815
+ BasicTransformerBlock(
816
+ dim=output_channel,
817
+ num_attention_heads=num_heads,
818
+ attention_head_dim=attention_head_dim,
819
+ dropout=dropout,
820
+ activation_fn=act_fn,
821
+ )
822
+ for _ in range(n_blocks)
823
+ ]
824
+ )
825
+ downsample = (
826
+ Downsample1D(output_channel) if not is_last else
827
+ CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
828
+ )
829
+ self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
830
+
831
+ for _ in range(num_mid_blocks):
832
+ input_channel = channels[-1]
833
+ out_channels = channels[-1]
834
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
835
+ ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
836
+
837
+ transformer_blocks = nn.ModuleList(
838
+ [
839
+ BasicTransformerBlock(
840
+ dim=output_channel,
841
+ num_attention_heads=num_heads,
842
+ attention_head_dim=attention_head_dim,
843
+ dropout=dropout,
844
+ activation_fn=act_fn,
845
+ )
846
+ for _ in range(n_blocks)
847
+ ]
848
+ )
849
+
850
+ self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
851
+
852
+ channels = channels[::-1] + (channels[0],)
853
+ for i in range(len(channels) - 1):
854
+ input_channel = channels[i] * 2
855
+ output_channel = channels[i + 1]
856
+ is_last = i == len(channels) - 2
857
+ resnet = CausalResnetBlock1D(
858
+ dim=input_channel,
859
+ dim_out=output_channel,
860
+ time_emb_dim=time_embed_dim,
861
+ ) if self.causal else ResnetBlock1D(
862
+ dim=input_channel,
863
+ dim_out=output_channel,
864
+ time_emb_dim=time_embed_dim,
865
+ )
866
+ transformer_blocks = nn.ModuleList(
867
+ [
868
+ BasicTransformerBlock(
869
+ dim=output_channel,
870
+ num_attention_heads=num_heads,
871
+ attention_head_dim=attention_head_dim,
872
+ dropout=dropout,
873
+ activation_fn=act_fn,
874
+ )
875
+ for _ in range(n_blocks)
876
+ ]
877
+ )
878
+ upsample = (
879
+ Upsample1D(output_channel, use_conv_transpose=True)
880
+ if not is_last
881
+ else CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
882
+ )
883
+ self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
884
+ self.final_block = CausalBlock1D(channels[-1], channels[-1]) if self.causal else Block1D(channels[-1], channels[-1])
885
+ self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
886
+ self.initialize_weights()
887
+
888
+ def initialize_weights(self):
889
+ for m in self.modules():
890
+ if isinstance(m, nn.Conv1d):
891
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
892
+ if m.bias is not None:
893
+ nn.init.constant_(m.bias, 0)
894
+ elif isinstance(m, nn.GroupNorm):
895
+ nn.init.constant_(m.weight, 1)
896
+ nn.init.constant_(m.bias, 0)
897
+ elif isinstance(m, nn.Linear):
898
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
899
+ if m.bias is not None:
900
+ nn.init.constant_(m.bias, 0)
901
+
902
+ def forward(self, x, mask, mu, t, spks=None, cond=None):
903
+ """Forward pass of the UNet1DConditional model.
904
+
905
+ Args:
906
+ x (torch.Tensor): shape (batch_size, in_channels, time)
907
+ mask (torch.Tensor): shape (batch_size, 1, time)
908
+ t (torch.Tensor): shape (batch_size,)
909
+ spks (torch.Tensor, optional): shape (batch_size, condition_channels). Defaults to None.
910
+ cond (torch.Tensor, optional): placeholder for future use. Defaults to None.
911
+
912
+ Returns:
913
+ torch.Tensor: output tensor of shape (batch_size, out_channels, time).
918
+ """
919
+
920
+ t = self.time_embeddings(t).to(t.dtype)
921
+ t = self.time_mlp(t)
922
+
923
+ x = pack([x, mu], "b * t")[0]
924
+
925
+ if spks is not None:
926
+ spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
927
+ x = pack([x, spks], "b * t")[0]
928
+ if cond is not None:
929
+ x = pack([x, cond], "b * t")[0]
930
+
931
+ hiddens = []
932
+ masks = [mask]
933
+ for resnet, transformer_blocks, downsample in self.down_blocks:
934
+ mask_down = masks[-1]
935
+ x = resnet(x, mask_down, t)
936
+ x = rearrange(x, "b c t -> b t c").contiguous()
937
+ # attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
938
+ attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1)
939
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
940
+ for transformer_block in transformer_blocks:
941
+ x = transformer_block(
942
+ hidden_states=x,
943
+ attention_mask=attn_mask,
944
+ timestep=t,
945
+ )
946
+ x = rearrange(x, "b t c -> b c t").contiguous()
947
+ hiddens.append(x) # Save hidden states for skip connections
948
+ x = downsample(x * mask_down)
949
+ masks.append(mask_down[:, :, ::2])
950
+ masks = masks[:-1]
951
+ mask_mid = masks[-1]
952
+
953
+ for resnet, transformer_blocks in self.mid_blocks:
954
+ x = resnet(x, mask_mid, t)
955
+ x = rearrange(x, "b c t -> b t c").contiguous()
956
+ # attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
957
+ attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1)
958
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
959
+ for transformer_block in transformer_blocks:
960
+ x = transformer_block(
961
+ hidden_states=x,
962
+ attention_mask=attn_mask,
963
+ timestep=t,
964
+ )
965
+ x = rearrange(x, "b t c -> b c t").contiguous()
966
+
967
+ for resnet, transformer_blocks, upsample in self.up_blocks:
968
+ mask_up = masks.pop()
969
+ skip = hiddens.pop()
970
+ x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
971
+ x = resnet(x, mask_up, t)
972
+ x = rearrange(x, "b c t -> b t c").contiguous()
973
+ # attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
974
+ attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1)
975
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
976
+ for transformer_block in transformer_blocks:
977
+ x = transformer_block(
978
+ hidden_states=x,
979
+ attention_mask=attn_mask,
980
+ timestep=t,
981
+ )
982
+ x = rearrange(x, "b t c -> b c t").contiguous()
983
+ x = upsample(x * mask_up)
984
+ x = self.final_block(x, mask_up)
985
+ output = self.final_proj(x * mask_up)
986
+ return output * mask
987
+
988
+ class ConditionalCFM(BASECFM):
989
+ def __init__(self, in_channels=240, cfm_params=None, n_spks=1, spk_emb_dim=64, estimator_config= None):
990
+ super().__init__(
991
+ n_feats=in_channels,
992
+ cfm_params=cfm_params,
993
+ n_spks=n_spks,
994
+ spk_emb_dim=spk_emb_dim,
995
+ )
996
+ self.t_scheduler = cfm_params.t_scheduler
997
+ self.training_cfg_rate = cfm_params.training_cfg_rate
998
+ self.inference_cfg_rate = cfm_params.inference_cfg_rate
999
+ in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
1000
+ # Just change the architecture of the estimator here
1001
+ self.estimator = ConditionalDecoder(**estimator_config)
1002
+ self.lock = threading.Lock()
1003
+
1004
+ @torch.inference_mode()
1005
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, flow_cache=torch.zeros(1, 80, 0, 2)):
1006
+ """Forward diffusion
1007
+
1008
+ Args:
1009
+ mu (torch.Tensor): output of encoder
1010
+ shape: (batch_size, n_feats, mel_timesteps)
1011
+ mask (torch.Tensor): output_mask
1012
+ shape: (batch_size, 1, mel_timesteps)
1013
+ n_timesteps (int): number of diffusion steps
1014
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
1015
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
1016
+ shape: (batch_size, spk_emb_dim)
1017
+ cond: Not used but kept for future purposes
1018
+
1019
+ Returns:
1020
+ sample: generated mel-spectrogram
1021
+ shape: (batch_size, n_feats, mel_timesteps)
1022
+ """
1023
+
1024
+ z = torch.randn_like(mu).to(mu.device).to(mu.dtype) * temperature
1025
+ cache_size = flow_cache.shape[2]
1026
+ # fix prompt and overlap part mu and z
1027
+ if cache_size != 0:
1028
+ z[:, :, :cache_size] = flow_cache[:, :, :, 0]
1029
+ mu[:, :, :cache_size] = flow_cache[:, :, :, 1]
1030
+ z_cache = torch.concat([z[:, :, :prompt_len], z[:, :, -34:]], dim=2)
1031
+ mu_cache = torch.concat([mu[:, :, :prompt_len], mu[:, :, -34:]], dim=2)
1032
+ flow_cache = torch.stack([z_cache, mu_cache], dim=-1)
1033
+
1034
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
1035
+ if self.t_scheduler == 'cosine':
1036
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
1037
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), flow_cache
1038
+
1039
+ def solve_euler(self, x, t_span, mu, mask, spks, cond):
1040
+ """
1041
+ Fixed-step Euler solver for ODEs.
1042
+ Args:
1043
+ x (torch.Tensor): random noise
1044
+ t_span (torch.Tensor): n_timesteps interpolated
1045
+ shape: (n_timesteps + 1,)
1046
+ mu (torch.Tensor): output of encoder
1047
+ shape: (batch_size, n_feats, mel_timesteps)
1048
+ mask (torch.Tensor): output_mask
1049
+ shape: (batch_size, 1, mel_timesteps)
1050
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
1051
+ shape: (batch_size, spk_emb_dim)
1052
+ cond: Not used but kept for future purposes
1053
+ """
1054
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
1055
+ t = t.unsqueeze(dim=0)
1056
+
1057
+ # Intermediate solutions are stored so they can be inspected or plotted while debugging.
1058
+ # A return_all_steps flag may be added in the future.
1059
+ sol = []
1060
+
1061
+ # Do not use concat here: it may change the memory format and make TensorRT inference return wrong results!
1062
+ x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
1063
+ mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype)
1064
+ mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
1065
+ t_in = torch.zeros([2], device=x.device, dtype=x.dtype)
1066
+ spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype)
1067
+ cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
1068
+ for step in range(1, len(t_span)):
1069
+ # Classifier-Free Guidance inference introduced in VoiceBox
1070
+ x_in[:] = x
1071
+ mask_in[:] = mask
1072
+ mu_in[0] = mu
1073
+ t_in[:] = t.unsqueeze(0)
1074
+ spks_in[0] = spks
1075
+ cond_in[0] = cond
1076
+ dphi_dt = self.forward_estimator(
1077
+ x_in, mask_in,
1078
+ mu_in, t_in,
1079
+ spks_in,
1080
+ cond_in
1081
+ )
1082
+ dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
1083
+ dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
1084
+ x = x + dt * dphi_dt
1085
+ t = t + dt
1086
+ sol.append(x)
1087
+ if step < len(t_span) - 1:
1088
+ dt = t_span[step + 1] - t
1089
+
1090
+ return sol[-1].float()
1091
+
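The duplicated batch in the solver above implements classifier-free guidance: the conditional and unconditional estimates are extrapolated as (1 + r)·cond − r·uncond. A tiny standalone illustration of that combination (toy tensors; the guidance rate r = 0.7 is only an example value, not the model's configured `inference_cfg_rate`):

```python
import torch

r = 0.7                                   # example guidance rate
cond_pred = torch.randn(1, 80, 20)        # estimator output with condition
uncond_pred = torch.randn(1, 80, 20)      # estimator output with zeroed condition

guided = (1.0 + r) * cond_pred - r * uncond_pred
# Equivalent view: push the conditional prediction further away from the unconditional one.
assert torch.allclose(guided, cond_pred + r * (cond_pred - uncond_pred), atol=1e-6)
```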
1092
+ def forward_estimator(self, x, mask, mu, t, spks, cond):
1093
+ if isinstance(self.estimator, torch.nn.Module):
1094
+ return self.estimator.forward(x, mask, mu, t, spks, cond)
1095
+ else:
1096
+ with self.lock:
1097
+ self.estimator.set_input_shape('x', (2, 80, x.size(2)))
1098
+ self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
1099
+ self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
1100
+ self.estimator.set_input_shape('t', (2,))
1101
+ self.estimator.set_input_shape('spks', (2, 80))
1102
+ self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
1103
+ # run trt engine
1104
+ self.estimator.execute_v2([x.contiguous().data_ptr(),
1105
+ mask.contiguous().data_ptr(),
1106
+ mu.contiguous().data_ptr(),
1107
+ t.contiguous().data_ptr(),
1108
+ spks.contiguous().data_ptr(),
1109
+ cond.contiguous().data_ptr(),
1110
+ x.data_ptr()])
1111
+ return x
1112
+
1113
+ def compute_loss(self, x1, mask, mu, spks=None, cond=None):
1114
+ """Computes diffusion loss
1115
+
1116
+ Args:
1117
+ x1 (torch.Tensor): Target
1118
+ shape: (batch_size, n_feats, mel_timesteps)
1119
+ mask (torch.Tensor): target mask
1120
+ shape: (batch_size, 1, mel_timesteps)
1121
+ mu (torch.Tensor): output of encoder
1122
+ shape: (batch_size, n_feats, mel_timesteps)
1123
+ spks (torch.Tensor, optional): speaker embedding. Defaults to None.
1124
+ shape: (batch_size, spk_emb_dim)
1125
+
1126
+ Returns:
1127
+ loss: conditional flow matching loss
1128
+ y: conditional flow
1129
+ shape: (batch_size, n_feats, mel_timesteps)
1130
+ """
1131
+ b, _, t = mu.shape
1132
+
1133
+ # random timestep
1134
+ t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
1135
+ if self.t_scheduler == 'cosine':
1136
+ t = 1 - torch.cos(t * 0.5 * torch.pi)
1137
+ # sample noise p(x_0)
1138
+ z = torch.randn_like(x1)
1139
+
1140
+ y = (1 - (1 - self.sigma_min) * t) * z + t * x1
1141
+ u = x1 - (1 - self.sigma_min) * z
1142
+
1143
+ # during training, we randomly drop the condition to trade off mode coverage and sample fidelity
1144
+ if self.training_cfg_rate > 0:
1145
+ cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
1146
+ mu = mu * cfg_mask.view(-1, 1, 1)
1147
+ spks = spks * cfg_mask.view(-1, 1)
1148
+ cond = cond * cfg_mask.view(-1, 1, 1)
1149
+
1150
+ pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
1151
+ loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
1152
+ return loss, y
1153
+
1154
+
1155
+ class CausalConditionalCFM(ConditionalCFM):
1156
+ def __init__(self, in_channels=240, cfm_params=None, n_spks=1, spk_emb_dim=64, estimator_config = None):
1157
+ super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator_config)
1158
+ self.rand_noise = torch.randn([1, 80, 50 * 300])
1159
+
1160
+ @torch.inference_mode()
1161
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
1162
+ """Forward diffusion
1163
+
1164
+ Args:
1165
+ mu (torch.Tensor): output of encoder
1166
+ shape: (batch_size, n_feats, mel_timesteps)
1167
+ mask (torch.Tensor): output_mask
1168
+ shape: (batch_size, 1, mel_timesteps)
1169
+ n_timesteps (int): number of diffusion steps
1170
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
1171
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
1172
+ shape: (batch_size, spk_emb_dim)
1173
+ cond: Not used but kept for future purposes
1174
+
1175
+ Returns:
1176
+ sample: generated mel-spectrogram
1177
+ shape: (batch_size, n_feats, mel_timesteps)
1178
+ """
1179
+
1180
+ z = self.rand_noise[:, :, :mu.size(2)].to(mu.device).to(mu.dtype) * temperature
1181
+ # fix prompt and overlap part mu and z
1182
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
1183
+ if self.t_scheduler == 'cosine':
1184
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
1185
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), None
1186
+
1187
+ class PositionwiseFeedForward(torch.nn.Module):
1188
+ """Positionwise feed forward layer.
1189
+
1190
+ The feed-forward layers are applied at each position of the sequence.
1191
+ The output dimension is the same as the input dimension.
1192
+
1193
+ Args:
1194
+ idim (int): Input dimension.
1195
+ hidden_units (int): The number of hidden units.
1196
+ dropout_rate (float): Dropout rate.
1197
+ activation (torch.nn.Module): Activation function
1198
+ """
1199
+
1200
+ def __init__(
1201
+ self,
1202
+ idim: int,
1203
+ hidden_units: int,
1204
+ dropout_rate: float,
1205
+ activation: torch.nn.Module = torch.nn.ReLU(),
1206
+ ):
1207
+ """Construct a PositionwiseFeedForward object."""
1208
+ super(PositionwiseFeedForward, self).__init__()
1209
+ self.w_1 = torch.nn.Linear(idim, hidden_units)
1210
+ self.activation = activation
1211
+ self.dropout = torch.nn.Dropout(dropout_rate)
1212
+ self.w_2 = torch.nn.Linear(hidden_units, idim)
1213
+
1214
+ def forward(self, xs: torch.Tensor) -> torch.Tensor:
1215
+ """Forward function.
1216
+
1217
+ Args:
1218
+ xs: input tensor (B, L, D)
1219
+ Returns:
1220
+ output tensor, (B, L, D)
1221
+ """
1222
+ return self.w_2(self.dropout(self.activation(self.w_1(xs))))
1223
+
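Functionally this is just two linear layers applied independently at every time step; an equivalent standalone sketch with toy dimensions:

```python
import torch

# Position-wise FFN: Linear -> activation -> dropout -> Linear, applied per time step.
ffn = torch.nn.Sequential(
    torch.nn.Linear(8, 32),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.1),
    torch.nn.Linear(32, 8),
)
x = torch.randn(2, 5, 8)          # (B, L, D)
print(ffn(x).shape)               # torch.Size([2, 5, 8])
```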
1224
+ class ConformerEncoderLayer(nn.Module):
1225
+ """Encoder layer module.
1226
+ Args:
1227
+ size (int): Input dimension.
1228
+ self_attn (torch.nn.Module): Self-attention module instance.
1229
+ `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
1230
+ instance can be used as the argument.
1231
+ feed_forward (torch.nn.Module): Feed-forward module instance.
1232
+ `PositionwiseFeedForward` instance can be used as the argument.
1233
+ feed_forward_macaron (torch.nn.Module): Additional feed-forward module
1234
+ instance.
1235
+ `PositionwiseFeedForward` instance can be used as the argument.
1236
+ conv_module (torch.nn.Module): Convolution module instance.
1237
+ `ConvolutionModule` instance can be used as the argument.
1238
+ dropout_rate (float): Dropout rate.
1239
+ normalize_before (bool):
1240
+ True: use layer_norm before each sub-block.
1241
+ False: use layer_norm after each sub-block.
1242
+ """
1243
+
1244
+ def __init__(
1245
+ self,
1246
+ size: int,
1247
+ self_attn: torch.nn.Module,
1248
+ feed_forward: Optional[nn.Module] = None,
1249
+ feed_forward_macaron: Optional[nn.Module] = None,
1250
+ conv_module: Optional[nn.Module] = None,
1251
+ dropout_rate: float = 0.1,
1252
+ normalize_before: bool = True,
1253
+ ):
1254
+ """Construct an EncoderLayer object."""
1255
+ super().__init__()
1256
+ self.self_attn = self_attn
1257
+ self.feed_forward = feed_forward
1258
+ self.feed_forward_macaron = feed_forward_macaron
1259
+ self.conv_module = conv_module
1260
+ self.norm_ff = nn.LayerNorm(size, eps=1e-12) # for the FNN module
1261
+ self.norm_mha = nn.LayerNorm(size, eps=1e-12) # for the MHA module
1262
+ if feed_forward_macaron is not None:
1263
+ self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-12)
1264
+ self.ff_scale = 0.5
1265
+ else:
1266
+ self.ff_scale = 1.0
1267
+ if self.conv_module is not None:
1268
+ self.norm_conv = nn.LayerNorm(size, eps=1e-12) # for the CNN module
1269
+ self.norm_final = nn.LayerNorm(
1270
+ size, eps=1e-12) # for the final output of the block
1271
+ self.dropout = nn.Dropout(dropout_rate)
1272
+ self.size = size
1273
+ self.normalize_before = normalize_before
1274
+
1275
+ def forward(
1276
+ self,
1277
+ x: torch.Tensor,
1278
+ mask: torch.Tensor,
1279
+ pos_emb: torch.Tensor,
1280
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
1281
+ att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
1282
+ cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
1283
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
1284
+ """Compute encoded features.
1285
+
1286
+ Args:
1287
+ x (torch.Tensor): (#batch, time, size)
1288
+ mask (torch.Tensor): Mask tensor for the input (#batch, time, time),
1289
+ (0, 0, 0) means fake mask.
1290
+ pos_emb (torch.Tensor): positional encoding, must not be None
1291
+ for ConformerEncoderLayer.
1292
+ mask_pad (torch.Tensor): batch padding mask used for conv module.
1293
+ (#batch, 1, time), (0, 0, 0) means fake mask.
1294
+ att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
1295
+ (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
1296
+ cnn_cache (torch.Tensor): Convolution cache in conformer layer
1297
+ (#batch=1, size, cache_t2)
1298
+ Returns:
1299
+ torch.Tensor: Output tensor (#batch, time, size).
1300
+ torch.Tensor: Mask tensor (#batch, time, time).
1301
+ torch.Tensor: att_cache tensor,
1302
+ (#batch=1, head, cache_t1 + time, d_k * 2).
1303
+ torch.Tensor: cnn_cache tensor (#batch, size, cache_t2).
1304
+ """
1305
+
1306
+ # whether to use macaron style
1307
+ if self.feed_forward_macaron is not None:
1308
+ residual = x
1309
+ if self.normalize_before:
1310
+ x = self.norm_ff_macaron(x)
1311
+ x = residual + self.ff_scale * self.dropout(
1312
+ self.feed_forward_macaron(x))
1313
+ if not self.normalize_before:
1314
+ x = self.norm_ff_macaron(x)
1315
+
1316
+ # multi-headed self-attention module
1317
+ residual = x
1318
+ if self.normalize_before:
1319
+ x = self.norm_mha(x)
1320
+ x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb,
1321
+ att_cache)
1322
+ x = residual + self.dropout(x_att)
1323
+ if not self.normalize_before:
1324
+ x = self.norm_mha(x)
1325
+
1326
+ # convolution module
1327
+ # Fake new cnn cache here, and then change it in conv_module
1328
+ new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
1329
+ if self.conv_module is not None:
1330
+ residual = x
1331
+ if self.normalize_before:
1332
+ x = self.norm_conv(x)
1333
+ x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache)
1334
+ x = residual + self.dropout(x)
1335
+
1336
+ if not self.normalize_before:
1337
+ x = self.norm_conv(x)
1338
+
1339
+ # feed forward module
1340
+ residual = x
1341
+ if self.normalize_before:
1342
+ x = self.norm_ff(x)
1343
+
1344
+ x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
1345
+ if not self.normalize_before:
1346
+ x = self.norm_ff(x)
1347
+
1348
+ if self.conv_module is not None:
1349
+ x = self.norm_final(x)
1350
+
1351
+ return x, mask, new_att_cache, new_cnn_cache
1352
+
1353
+ class ConvolutionModule(nn.Module):
1354
+ """ConvolutionModule in Conformer model."""
1355
+
1356
+ def __init__(self,
1357
+ channels: int,
1358
+ kernel_size: int = 15,
1359
+ activation: nn.Module = nn.ReLU(),
1360
+ norm: str = "batch_norm",
1361
+ causal: bool = False,
1362
+ bias: bool = True):
1363
+ """Construct an ConvolutionModule object.
1364
+ Args:
1365
+ channels (int): The number of channels of conv layers.
1366
+ kernel_size (int): Kernel size of conv layers.
1367
+ causal (int): Whether use causal convolution or not
1368
+ """
1369
+ super().__init__()
1370
+
1371
+ self.pointwise_conv1 = nn.Conv1d(
1372
+ channels,
1373
+ 2 * channels,
1374
+ kernel_size=1,
1375
+ stride=1,
1376
+ padding=0,
1377
+ bias=bias,
1378
+ )
1379
+ # self.lorder is used to distinguish if it's a causal convolution,
1380
+ # if self.lorder > 0: it's a causal convolution, the input will be
1381
+ # padded with self.lorder frames on the left in forward.
1382
+ # else: it's a symmetrical convolution
1383
+ if causal:
1384
+ padding = 0
1385
+ self.lorder = kernel_size - 1
1386
+ else:
1387
+ # kernel_size should be an odd number for non-causal convolution
1388
+ assert (kernel_size - 1) % 2 == 0
1389
+ padding = (kernel_size - 1) // 2
1390
+ self.lorder = 0
1391
+ self.depthwise_conv = nn.Conv1d(
1392
+ channels,
1393
+ channels,
1394
+ kernel_size,
1395
+ stride=1,
1396
+ padding=padding,
1397
+ groups=channels,
1398
+ bias=bias,
1399
+ )
1400
+
1401
+ assert norm in ['batch_norm', 'layer_norm']
1402
+ if norm == "batch_norm":
1403
+ self.use_layer_norm = False
1404
+ self.norm = nn.BatchNorm1d(channels)
1405
+ else:
1406
+ self.use_layer_norm = True
1407
+ self.norm = nn.LayerNorm(channels)
1408
+
1409
+ self.pointwise_conv2 = nn.Conv1d(
1410
+ channels,
1411
+ channels,
1412
+ kernel_size=1,
1413
+ stride=1,
1414
+ padding=0,
1415
+ bias=bias,
1416
+ )
1417
+ self.activation = activation
1418
+
1419
+ def forward(
1420
+ self,
1421
+ x: torch.Tensor,
1422
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
1423
+ cache: torch.Tensor = torch.zeros((0, 0, 0)),
1424
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1425
+ """Compute convolution module.
1426
+ Args:
1427
+ x (torch.Tensor): Input tensor (#batch, time, channels).
1428
+ mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
1429
+ (0, 0, 0) means fake mask.
1430
+ cache (torch.Tensor): left context cache, it is only
1431
+ used in causal convolution (#batch, channels, cache_t),
1432
+ (0, 0, 0) means fake cache.
1433
+ Returns:
1434
+ torch.Tensor: Output tensor (#batch, time, channels).
1435
+ """
1436
+ # exchange the temporal dimension and the feature dimension
1437
+ x = x.transpose(1, 2) # (#batch, channels, time)
1438
+
1439
+ # mask batch padding
1440
+ if mask_pad.size(2) > 0: # time > 0
1441
+ x.masked_fill_(~mask_pad, 0.0)
1442
+
1443
+ if self.lorder > 0:
1444
+ if cache.size(2) == 0: # cache_t == 0
1445
+ x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0)
1446
+ else:
1447
+ assert cache.size(0) == x.size(0) # equal batch
1448
+ assert cache.size(1) == x.size(1) # equal channel
1449
+ x = torch.cat((cache, x), dim=2)
1450
+ assert (x.size(2) > self.lorder)
1451
+ new_cache = x[:, :, -self.lorder:]
1452
+ else:
1453
+ # It's better we just return None if no cache is required,
1454
+ # However, for JIT export, here we just fake one tensor instead of
1455
+ # None.
1456
+ new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
1457
+
1458
+ # GLU mechanism
1459
+ x = self.pointwise_conv1(x) # (batch, 2*channel, dim)
1460
+ x = nn.functional.glu(x, dim=1) # (batch, channel, dim)
1461
+
1462
+ # 1D Depthwise Conv
1463
+ x = self.depthwise_conv(x)
1464
+ if self.use_layer_norm:
1465
+ x = x.transpose(1, 2)
1466
+ x = self.activation(self.norm(x))
1467
+ if self.use_layer_norm:
1468
+ x = x.transpose(1, 2)
1469
+ x = self.pointwise_conv2(x)
1470
+ # mask batch padding
1471
+ if mask_pad.size(2) > 0: # time > 0
1472
+ x.masked_fill_(~mask_pad, 0.0)
1473
+
1474
+ return x.transpose(1, 2), new_cache
1475
+
1476
+ class Upsample1D(nn.Module):
1477
+ """A 1D upsampling layer with an optional convolution.
1478
+
1479
+ Parameters:
1480
+ channels (`int`):
1481
+ number of channels in the inputs and outputs.
1482
+ out_channels (`int`):
+ number of output channels.
+ stride (`int`, default `2`):
+ upsampling factor; the input is nearest-neighbour interpolated by this factor before the convolution.
1488
+ """
1489
+
1490
+ def __init__(self, channels: int, out_channels: int, stride: int = 2):
1491
+ super().__init__()
1492
+ self.channels = channels
1493
+ self.out_channels = out_channels
1494
+ self.stride = stride
1495
+ # In this mode, first repeat-interpolate, then conv with stride=1
1496
+ self.conv = nn.Conv1d(self.channels, self.out_channels, stride * 2 + 1, stride=1, padding=0)
1497
+
1498
+ def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor):
1499
+ outputs = F.interpolate(inputs, scale_factor=float(self.stride), mode="nearest")
1500
+ outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
1501
+ outputs = self.conv(outputs)
1502
+ return outputs, input_lengths * self.stride
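+ 
+ # Added commentary (not part of the original file): a minimal sanity-check
+ # sketch, assuming the default stride of 2; tensor shapes are illustrative only.
+ #   up = Upsample1D(channels=512, out_channels=512, stride=2)
+ #   x = torch.randn(1, 512, 40)            # (B, C, T)
+ #   y, y_len = up(x, torch.tensor([40]))
+ #   # y has shape (1, 512, 80): nearest-neighbour repeat by 2, left pad by
+ #   # stride * 2, then a kernel-size (stride * 2 + 1) conv with stride 1.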
1503
+
1504
+
1505
+ class PreLookaheadLayer(nn.Module):
1506
+ def __init__(self, channels: int, pre_lookahead_len: int = 1):
1507
+ super().__init__()
1508
+ self.channels = channels
1509
+ self.pre_lookahead_len = pre_lookahead_len
1510
+ self.conv1 = nn.Conv1d(
1511
+ channels, channels,
1512
+ kernel_size=pre_lookahead_len + 1,
1513
+ stride=1, padding=0,
1514
+ )
1515
+ self.conv2 = nn.Conv1d(
1516
+ channels, channels,
1517
+ kernel_size=3, stride=1, padding=0,
1518
+ )
1519
+
1520
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
1521
+ """
1522
+ inputs: (batch_size, seq_len, channels)
1523
+ """
1524
+ outputs = inputs.transpose(1, 2).contiguous()
1525
+ # look ahead
1526
+ outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
1527
+ outputs = F.leaky_relu(self.conv1(outputs))
1528
+ # outputs
1529
+ outputs = F.pad(outputs, (2, 0), mode='constant', value=0.0)
1530
+ outputs = self.conv2(outputs)
1531
+ outputs = outputs.transpose(1, 2).contiguous()
1532
+
1533
+ # residual connection
1534
+ outputs = outputs + inputs
1535
+ return outputs
1536
+
1537
+ class BaseSubsampling(torch.nn.Module):
1538
+
1539
+ def __init__(self):
1540
+ super().__init__()
1541
+ self.right_context = 0
1542
+ self.subsampling_rate = 1
1543
+
1544
+ def position_encoding(self, offset: Union[int, torch.Tensor],
1545
+ size: int) -> torch.Tensor:
1546
+ return self.pos_enc.position_encoding(offset, size)
1547
+
1548
+ class LinearNoSubsampling(BaseSubsampling):
1549
+ """Linear transform the input without subsampling
1550
+
1551
+ Args:
1552
+ idim (int): Input dimension.
1553
+ odim (int): Output dimension.
1554
+ dropout_rate (float): Dropout rate.
1555
+
1556
+ """
1557
+
1558
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
1559
+ pos_enc_class: torch.nn.Module):
1560
+ """Construct an linear object."""
1561
+ super().__init__()
1562
+ self.out = torch.nn.Sequential(
1563
+ torch.nn.Linear(idim, odim),
1564
+ torch.nn.LayerNorm(odim, eps=1e-5),
1565
+ torch.nn.Dropout(dropout_rate),
1566
+ )
1567
+ self.pos_enc = pos_enc_class
1568
+ self.right_context = 0
1569
+ self.subsampling_rate = 1
1570
+
1571
+ def forward(
1572
+ self,
1573
+ x: torch.Tensor,
1574
+ x_mask: torch.Tensor,
1575
+ offset: Union[int, torch.Tensor] = 0
1576
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
1577
+ """Input x.
1578
+
1579
+ Args:
1580
+ x (torch.Tensor): Input tensor (#batch, time, idim).
1581
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
1582
+
1583
+ Returns:
1584
+ torch.Tensor: linear input tensor (#batch, time', odim),
1585
+ where time' = time .
1586
+ torch.Tensor: linear input mask (#batch, 1, time'),
1587
+ where time' = time .
1588
+
1589
+ """
1590
+ x = self.out(x)
1591
+ x, pos_emb = self.pos_enc(x, offset)
1592
+ return x, pos_emb, x_mask
1593
+
1594
+ class EspnetRelPositionalEncoding(torch.nn.Module):
1595
+ """Relative positional encoding module (new implementation).
1596
+
1597
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
1598
+
1599
+ See : Appendix B in https://arxiv.org/abs/1901.02860
1600
+
1601
+ Args:
1602
+ d_model (int): Embedding dimension.
1603
+ dropout_rate (float): Dropout rate.
1604
+ max_len (int): Maximum input length.
1605
+
1606
+ """
1607
+
1608
+ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
1609
+ """Construct an PositionalEncoding object."""
1610
+ super(EspnetRelPositionalEncoding, self).__init__()
1611
+ self.d_model = d_model
1612
+ self.xscale = math.sqrt(self.d_model)
1613
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
1614
+ self.pe = None
1615
+ self.extend_pe(torch.tensor(0.0).expand(1, max_len))
1616
+
1617
+ def extend_pe(self, x: torch.Tensor):
1618
+ """Reset the positional encodings."""
1619
+ if self.pe is not None:
1620
+ # self.pe contains both positive and negative parts
1621
+ # the length of self.pe is 2 * input_len - 1
1622
+ if self.pe.size(1) >= x.size(1) * 2 - 1:
1623
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
1624
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
1625
+ return
1626
+ # Suppose `i` denotes the position of the query vector and `j` the
1627
+ # position of the key vector. We use positive relative positions when keys
1628
+ # are to the left (i>j) and negative relative positions otherwise (i<j).
1629
+ pe_positive = torch.zeros(x.size(1), self.d_model)
1630
+ pe_negative = torch.zeros(x.size(1), self.d_model)
1631
+ position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
1632
+ div_term = torch.exp(
1633
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
1634
+ * -(math.log(10000.0) / self.d_model)
1635
+ )
1636
+ pe_positive[:, 0::2] = torch.sin(position * div_term)
1637
+ pe_positive[:, 1::2] = torch.cos(position * div_term)
1638
+ pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
1639
+ pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
1640
+
1641
+ # Reverse the order of positive indices and concat both positive and
1642
+ # negative indices. This is used to support the shifting trick
1643
+ # as in https://arxiv.org/abs/1901.02860
1644
+ pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
1645
+ pe_negative = pe_negative[1:].unsqueeze(0)
1646
+ pe = torch.cat([pe_positive, pe_negative], dim=1)
1647
+ self.pe = pe.to(device=x.device, dtype=x.dtype)
1648
+
1649
+ def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0) \
1650
+ -> Tuple[torch.Tensor, torch.Tensor]:
1651
+ """Add positional encoding.
1652
+
1653
+ Args:
1654
+ x (torch.Tensor): Input tensor (batch, time, `*`).
1655
+
1656
+ Returns:
1657
+ torch.Tensor: Encoded tensor (batch, time, `*`).
1658
+
1659
+ """
1660
+ self.extend_pe(x)
1661
+ x = x * self.xscale
1662
+ pos_emb = self.position_encoding(size=x.size(1), offset=offset)
1663
+ return self.dropout(x), self.dropout(pos_emb)
1664
+
1665
+ def position_encoding(self,
1666
+ offset: Union[int, torch.Tensor],
1667
+ size: int) -> torch.Tensor:
1668
+ """ For getting encoding in a streaming fashion
1669
+
1670
+ Attention!!!!!
1671
+ we apply dropout only once at the whole utterance level in a non-
1672
+ streaming way, but will call this function several times with
1673
+ increasing input size in a streaming scenario, so the dropout will
1674
+ be applied several times.
1675
+
1676
+ Args:
1677
+ offset (int or torch.tensor): start offset
1678
+ size (int): required size of position encoding
1679
+
1680
+ Returns:
1681
+ torch.Tensor: Corresponding encoding
1682
+ """
1683
+ pos_emb = self.pe[
1684
+ :,
1685
+ self.pe.size(1) // 2 - size + 1: self.pe.size(1) // 2 + size,
1686
+ ]
1687
+ return pos_emb
1688
+
1689
+
1690
+ class MultiHeadedAttention(nn.Module):
1691
+ """Multi-Head Attention layer.
1692
+
1693
+ Args:
1694
+ n_head (int): The number of heads.
1695
+ n_feat (int): The number of features.
1696
+ dropout_rate (float): Dropout rate.
1697
+
1698
+ """
1699
+
1700
+ def __init__(self,
1701
+ n_head: int,
1702
+ n_feat: int,
1703
+ dropout_rate: float,
1704
+ key_bias: bool = True):
1705
+ """Construct an MultiHeadedAttention object."""
1706
+ super().__init__()
1707
+ assert n_feat % n_head == 0
1708
+ # We assume d_v always equals d_k
1709
+ self.d_k = n_feat // n_head
1710
+ self.h = n_head
1711
+ self.linear_q = nn.Linear(n_feat, n_feat)
1712
+ self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
1713
+ self.linear_v = nn.Linear(n_feat, n_feat)
1714
+ self.linear_out = nn.Linear(n_feat, n_feat)
1715
+ self.dropout = nn.Dropout(p=dropout_rate)
1716
+
1717
+ def forward_qkv(
1718
+ self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
1719
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
1720
+ """Transform query, key and value.
1721
+
1722
+ Args:
1723
+ query (torch.Tensor): Query tensor (#batch, time1, size).
1724
+ key (torch.Tensor): Key tensor (#batch, time2, size).
1725
+ value (torch.Tensor): Value tensor (#batch, time2, size).
1726
+
1727
+ Returns:
1728
+ torch.Tensor: Transformed query tensor, size
1729
+ (#batch, n_head, time1, d_k).
1730
+ torch.Tensor: Transformed key tensor, size
1731
+ (#batch, n_head, time2, d_k).
1732
+ torch.Tensor: Transformed value tensor, size
1733
+ (#batch, n_head, time2, d_k).
1734
+
1735
+ """
1736
+ n_batch = query.size(0)
1737
+ q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
1738
+ k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
1739
+ v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
1740
+ q = q.transpose(1, 2) # (batch, head, time1, d_k)
1741
+ k = k.transpose(1, 2) # (batch, head, time2, d_k)
1742
+ v = v.transpose(1, 2) # (batch, head, time2, d_k)
1743
+
1744
+ return q, k, v
1745
+
1746
+ def forward_attention(
1747
+ self,
1748
+ value: torch.Tensor,
1749
+ scores: torch.Tensor,
1750
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
1751
+ ) -> torch.Tensor:
1752
+ """Compute attention context vector.
1753
+
1754
+ Args:
1755
+ value (torch.Tensor): Transformed value, size
1756
+ (#batch, n_head, time2, d_k).
1757
+ scores (torch.Tensor): Attention score, size
1758
+ (#batch, n_head, time1, time2).
1759
+ mask (torch.Tensor): Mask, size (#batch, 1, time2) or
1760
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
1761
+
1762
+ Returns:
1763
+ torch.Tensor: Transformed value (#batch, time1, d_model)
1764
+ weighted by the attention score (#batch, time1, time2).
1765
+
1766
+ """
1767
+ n_batch = value.size(0)
1768
+ # NOTE(xcsong): When will `if mask.size(2) > 0` be True?
1769
+ # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the
1770
+ # 1st chunk to ease the onnx export.]
1771
+ # 2. pytorch training
1772
+ if mask.size(2) > 0: # time2 > 0
1773
+ mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
1774
+ # For last chunk, time2 might be larger than scores.size(-1)
1775
+ mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2)
1776
+ scores = scores.masked_fill(mask, -float('inf'))
1777
+ attn = torch.softmax(scores, dim=-1).masked_fill(
1778
+ mask, 0.0) # (batch, head, time1, time2)
1779
+ # NOTE(xcsong): When will `if mask.size(2) > 0` be False?
1780
+ # 1. onnx(16/-1, -1/-1, 16/0)
1781
+ # 2. jit (16/-1, -1/-1, 16/0, 16/4)
1782
+ else:
1783
+ attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
1784
+
1785
+ p_attn = self.dropout(attn)
1786
+ x = torch.matmul(p_attn, value) # (batch, head, time1, d_k)
1787
+ x = (x.transpose(1, 2).contiguous().view(n_batch, -1,
1788
+ self.h * self.d_k)
1789
+ ) # (batch, time1, d_model)
1790
+
1791
+ return self.linear_out(x) # (batch, time1, d_model)
1792
+
1793
+ def forward(
1794
+ self,
1795
+ query: torch.Tensor,
1796
+ key: torch.Tensor,
1797
+ value: torch.Tensor,
1798
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
1799
+ pos_emb: torch.Tensor = torch.empty(0),
1800
+ cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
1801
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1802
+ """Compute scaled dot product attention.
1803
+
1804
+ Args:
1805
+ query (torch.Tensor): Query tensor (#batch, time1, size).
1806
+ key (torch.Tensor): Key tensor (#batch, time2, size).
1807
+ value (torch.Tensor): Value tensor (#batch, time2, size).
1808
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
1809
+ (#batch, time1, time2).
1810
+ 1.When applying cross attention between decoder and encoder,
1811
+ the batch padding mask for input is in (#batch, 1, T) shape.
1812
+ 2.When applying self attention of encoder,
1813
+ the mask is in (#batch, T, T) shape.
1814
+ 3.When applying self attention of decoder,
1815
+ the mask is in (#batch, L, L) shape.
1816
+ 4.If the different position in decoder see different block
1817
+ of the encoder, such as Mocha, the passed in mask could be
1818
+ in (#batch, L, T) shape. But there is no such case in current
1819
+ CosyVoice.
1820
+ cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
1821
+ where `cache_t == chunk_size * num_decoding_left_chunks`
1822
+ and `head * d_k == size`
1823
+
1824
+
1825
+ Returns:
1826
+ torch.Tensor: Output tensor (#batch, time1, d_model).
1827
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
1828
+ where `cache_t == chunk_size * num_decoding_left_chunks`
1829
+ and `head * d_k == size`
1830
+
1831
+ """
1832
+ q, k, v = self.forward_qkv(query, key, value)
1833
+
1834
+ # NOTE(xcsong):
1835
+ # when export onnx model, for 1st chunk, we feed
1836
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
1837
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
1838
+ # In all modes, `if cache.size(0) > 0` will always be `True`
1839
+ # and we will always do splitting and
1840
+ # concatenation (this will simplify onnx export). Note that
1841
+ # it's OK to concat & split zero-shaped tensors(see code below).
1842
+ # when export jit model, for 1st chunk, we always feed
1843
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
1844
+ # >>> a = torch.ones((1, 2, 0, 4))
1845
+ # >>> b = torch.ones((1, 2, 3, 4))
1846
+ # >>> c = torch.cat((a, b), dim=2)
1847
+ # >>> torch.equal(b, c) # True
1848
+ # >>> d = torch.split(a, 2, dim=-1)
1849
+ # >>> torch.equal(d[0], d[1]) # True
1850
+ if cache.size(0) > 0:
1851
+ key_cache, value_cache = torch.split(cache,
1852
+ cache.size(-1) // 2,
1853
+ dim=-1)
1854
+ k = torch.cat([key_cache, k], dim=2)
1855
+ v = torch.cat([value_cache, v], dim=2)
1856
+ # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
1857
+ # non-trivial to calculate `next_cache_start` here.
1858
+ new_cache = torch.cat((k, v), dim=-1)
1859
+
1860
+ scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
1861
+ return self.forward_attention(v, scores, mask), new_cache
1862
+
1863
+
1864
+ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
1865
+ """Multi-Head Attention layer with relative position encoding.
1866
+ Paper: https://arxiv.org/abs/1901.02860
1867
+ Args:
1868
+ n_head (int): The number of heads.
1869
+ n_feat (int): The number of features.
1870
+ dropout_rate (float): Dropout rate.
1871
+ """
1872
+
1873
+ def __init__(self,
1874
+ n_head: int,
1875
+ n_feat: int,
1876
+ dropout_rate: float,
1877
+ key_bias: bool = True):
1878
+ """Construct an RelPositionMultiHeadedAttention object."""
1879
+ super().__init__(n_head, n_feat, dropout_rate, key_bias)
1880
+ # linear transformation for positional encoding
1881
+ self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
1882
+ # these two learnable bias are used in matrix c and matrix d
1883
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
1884
+ self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
1885
+ self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
1886
+ torch.nn.init.xavier_uniform_(self.pos_bias_u)
1887
+ torch.nn.init.xavier_uniform_(self.pos_bias_v)
1888
+
1889
+ def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
1890
+ """Compute relative positional encoding.
1891
+
1892
+ Args:
1893
+ x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
1894
+ time1 means the length of query vector.
1895
+
1896
+ Returns:
1897
+ torch.Tensor: Output tensor.
1898
+
1899
+ """
1900
+ zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1),
1901
+ device=x.device,
1902
+ dtype=x.dtype)
1903
+ x_padded = torch.cat([zero_pad, x], dim=-1)
1904
+
1905
+ x_padded = x_padded.view(x.size()[0],
1906
+ x.size()[1],
1907
+ x.size(3) + 1, x.size(2))
1908
+ x = x_padded[:, :, 1:].view_as(x)[
1909
+ :, :, :, : x.size(-1) // 2 + 1
1910
+ ] # only keep the positions from 0 to time2
1911
+ return x
1912
+
1913
+ def forward(
1914
+ self,
1915
+ query: torch.Tensor,
1916
+ key: torch.Tensor,
1917
+ value: torch.Tensor,
1918
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
1919
+ pos_emb: torch.Tensor = torch.empty(0),
1920
+ cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
1921
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1922
+ """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
1923
+ Args:
1924
+ query (torch.Tensor): Query tensor (#batch, time1, size).
1925
+ key (torch.Tensor): Key tensor (#batch, time2, size).
1926
+ value (torch.Tensor): Value tensor (#batch, time2, size).
1927
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
1928
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
1929
+ pos_emb (torch.Tensor): Positional embedding tensor
1930
+ (#batch, time2, size).
1931
+ cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
1932
+ where `cache_t == chunk_size * num_decoding_left_chunks`
1933
+ and `head * d_k == size`
1934
+ Returns:
1935
+ torch.Tensor: Output tensor (#batch, time1, d_model).
1936
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
1937
+ where `cache_t == chunk_size * num_decoding_left_chunks`
1938
+ and `head * d_k == size`
1939
+ """
1940
+ q, k, v = self.forward_qkv(query, key, value)
1941
+ q = q.transpose(1, 2) # (batch, time1, head, d_k)
1942
+
1943
+ # NOTE(xcsong):
1944
+ # when export onnx model, for 1st chunk, we feed
1945
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
1946
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
1947
+ # In all modes, `if cache.size(0) > 0` will always be `True`
1948
+ # and we will always do splitting and
1949
+ # concatenation (this will simplify onnx export). Note that
1950
+ # it's OK to concat & split zero-shaped tensors(see code below).
1951
+ # when export jit model, for 1st chunk, we always feed
1952
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
1953
+ # >>> a = torch.ones((1, 2, 0, 4))
1954
+ # >>> b = torch.ones((1, 2, 3, 4))
1955
+ # >>> c = torch.cat((a, b), dim=2)
1956
+ # >>> torch.equal(b, c) # True
1957
+ # >>> d = torch.split(a, 2, dim=-1)
1958
+ # >>> torch.equal(d[0], d[1]) # True
1959
+ if cache.size(0) > 0:
1960
+ key_cache, value_cache = torch.split(cache,
1961
+ cache.size(-1) // 2,
1962
+ dim=-1)
1963
+ k = torch.cat([key_cache, k], dim=2)
1964
+ v = torch.cat([value_cache, v], dim=2)
1965
+ # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
1966
+ # non-trivial to calculate `next_cache_start` here.
1967
+ new_cache = torch.cat((k, v), dim=-1)
1968
+
1969
+ n_batch_pos = pos_emb.size(0)
1970
+ p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
1971
+ p = p.transpose(1, 2) # (batch, head, time1, d_k)
1972
+
1973
+ # (batch, head, time1, d_k)
1974
+ q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
1975
+ # (batch, head, time1, d_k)
1976
+ q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
1977
+
1978
+ # compute attention score
1979
+ # first compute matrix a and matrix c
1980
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
1981
+ # (batch, head, time1, time2)
1982
+ matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
1983
+
1984
+ # compute matrix b and matrix d
1985
+ # (batch, head, time1, time2)
1986
+ matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
1987
+ # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used
1988
+ if matrix_ac.shape != matrix_bd.shape:
1989
+ matrix_bd = self.rel_shift(matrix_bd)
1990
+
1991
+ scores = (matrix_ac + matrix_bd) / math.sqrt(
1992
+ self.d_k) # (batch, head, time1, time2)
1993
+
1994
+ return self.forward_attention(v, scores, mask), new_cache
1995
+
1996
+ class UpsampleConformerEncoder(torch.nn.Module):
1997
+
1998
+ def __init__(
1999
+ self,
2000
+ input_size: int,
2001
+ output_size: int = 256,
2002
+ attention_heads: int = 4,
2003
+ linear_units: int = 2048,
2004
+ num_blocks: int = 6,
2005
+ dropout_rate: float = 0.1,
2006
+ positional_dropout_rate: float = 0.1,
2007
+ attention_dropout_rate: float = 0.0,
2008
+ input_layer: str = "conv2d",
2009
+ pos_enc_layer_type: str = "rel_pos",
2010
+ normalize_before: bool = True,
2011
+ static_chunk_size: int = 0,
2012
+ use_dynamic_chunk: bool = False,
2013
+ global_cmvn: torch.nn.Module = None,
2014
+ use_dynamic_left_chunk: bool = False,
2015
+ positionwise_conv_kernel_size: int = 1,
2016
+ macaron_style: bool = True,
2017
+ selfattention_layer_type: str = "rel_selfattn",
2018
+ activation_type: str = "swish",
2019
+ use_cnn_module: bool = True,
2020
+ cnn_module_kernel: int = 15,
2021
+ causal: bool = False,
2022
+ cnn_module_norm: str = "batch_norm",
2023
+ key_bias: bool = True,
2024
+ gradient_checkpointing: bool = False,
2025
+ ):
2026
+ """
2027
+ Args:
2028
+ input_size (int): input dim
2029
+ output_size (int): dimension of attention
2030
+ attention_heads (int): the number of heads of multi head attention
2031
+ linear_units (int): the hidden units number of position-wise feed
2032
+ forward
2033
+ num_blocks (int): the number of decoder blocks
2034
+ dropout_rate (float): dropout rate
2035
+ attention_dropout_rate (float): dropout rate in attention
2036
+ positional_dropout_rate (float): dropout rate after adding
2037
+ positional encoding
2038
+ input_layer (str): input layer type.
2039
+ optional [linear, conv2d, conv2d6, conv2d8]
2040
+ pos_enc_layer_type (str): Encoder positional encoding layer type.
2041
+ optional [abs_pos, scaled_abs_pos, rel_pos, no_pos]
2042
+ normalize_before (bool):
2043
+ True: use layer_norm before each sub-block of a layer.
2044
+ False: use layer_norm after each sub-block of a layer.
2045
+ static_chunk_size (int): chunk size for static chunk training and
2046
+ decoding
2047
+ use_dynamic_chunk (bool): whether use dynamic chunk size for
2048
+ training or not, You can only use fixed chunk(chunk_size > 0)
2049
+ or dynamic chunk size (use_dynamic_chunk = True)
2050
+ global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
2051
+ use_dynamic_left_chunk (bool): whether use dynamic left chunk in
2052
+ dynamic chunk training
2053
+ key_bias: whether use bias in attention.linear_k, False for whisper models.
2054
+ gradient_checkpointing: rerunning a forward-pass segment for each
2055
+ checkpointed segment during backward.
2056
+ """
2057
+ super().__init__()
2058
+ self._output_size = output_size
2059
+
2060
+ self.global_cmvn = global_cmvn
2061
+ # self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
2062
+ self.embed = LinearNoSubsampling(
2063
+ input_size,
2064
+ output_size,
2065
+ dropout_rate,
2066
+ # COSYVOICE_EMB_CLASSES[pos_enc_layer_type](
2067
+ EspnetRelPositionalEncoding(
2068
+ output_size,
2069
+ positional_dropout_rate,
2070
+ ),
2071
+ )
2072
+
2073
+ self.normalize_before = normalize_before
2074
+ self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
2075
+ self.static_chunk_size = static_chunk_size
2076
+ self.use_dynamic_chunk = use_dynamic_chunk
2077
+ self.use_dynamic_left_chunk = use_dynamic_left_chunk
2078
+ self.gradient_checkpointing = gradient_checkpointing
2079
+ # COSYVOICE_ACTIVATION_CLASSES[activation_type]()
2080
+ activation = getattr(torch.nn, "SiLU", Swish)()
2081
+ # self-attention module definition
2082
+ encoder_selfattn_layer_args = (
2083
+ attention_heads,
2084
+ output_size,
2085
+ attention_dropout_rate,
2086
+ key_bias,
2087
+ )
2088
+ # feed-forward module definition
2089
+ positionwise_layer_args = (
2090
+ output_size,
2091
+ linear_units,
2092
+ dropout_rate,
2093
+ activation,
2094
+ )
2095
+ # convolution module definition
2096
+ convolution_layer_args = (output_size, cnn_module_kernel, activation,
2097
+ cnn_module_norm, causal)
2098
+ self.pre_lookahead_layer = PreLookaheadLayer(channels=512, pre_lookahead_len=3)
2099
+ self.encoders = torch.nn.ModuleList([
2100
+ ConformerEncoderLayer(
2101
+ output_size,
2102
+ # COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
2103
+ RelPositionMultiHeadedAttention(
2104
+ *encoder_selfattn_layer_args),
2105
+ PositionwiseFeedForward(*positionwise_layer_args),
2106
+ PositionwiseFeedForward(
2107
+ *positionwise_layer_args) if macaron_style else None,
2108
+ ConvolutionModule(
2109
+ *convolution_layer_args) if use_cnn_module else None,
2110
+ dropout_rate,
2111
+ normalize_before,
2112
+ ) for _ in range(num_blocks)
2113
+ ])
2114
+ self.up_layer = Upsample1D(channels=512, out_channels=512, stride=2)
2115
+ # self.up_embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
2116
+ self.up_embed = LinearNoSubsampling(
2117
+ input_size,
2118
+ output_size,
2119
+ dropout_rate,
2120
+ # COSYVOICE_EMB_CLASSES[pos_enc_layer_type](
2121
+ EspnetRelPositionalEncoding(
2122
+ output_size,
2123
+ positional_dropout_rate,
2124
+ ),
2125
+ )
2126
+ self.up_encoders = torch.nn.ModuleList([
2127
+ ConformerEncoderLayer(
2128
+ output_size,
2129
+ # COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
2130
+ RelPositionMultiHeadedAttention(
2131
+ *encoder_selfattn_layer_args),
2132
+ PositionwiseFeedForward(*positionwise_layer_args),
2133
+ PositionwiseFeedForward(
2134
+ *positionwise_layer_args) if macaron_style else None,
2135
+ ConvolutionModule(
2136
+ *convolution_layer_args) if use_cnn_module else None,
2137
+ dropout_rate,
2138
+ normalize_before,
2139
+ ) for _ in range(4)
2140
+ ])
2141
+
2142
+ def output_size(self) -> int:
2143
+ return self._output_size
2144
+
2145
+ def forward(
2146
+ self,
2147
+ xs: torch.Tensor,
2148
+ xs_lens: torch.Tensor,
2149
+ decoding_chunk_size: int = 0,
2150
+ num_decoding_left_chunks: int = -1,
2151
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
2152
+ """Embed positions in tensor.
2153
+
2154
+ Args:
2155
+ xs: padded input tensor (B, T, D)
2156
+ xs_lens: input length (B)
2157
+ decoding_chunk_size: decoding chunk size for dynamic chunk
2158
+ 0: default for training, use random dynamic chunk.
2159
+ <0: for decoding, use full chunk.
2160
+ >0: for decoding, use fixed chunk size as set.
2161
+ num_decoding_left_chunks: number of left chunks, this is for decoding,
2162
+ the chunk size is decoding_chunk_size.
2163
+ >=0: use num_decoding_left_chunks
2164
+ <0: use all left chunks
2165
+ Returns:
2166
+ encoder output tensor xs, and subsampled masks
2167
+ xs: padded output tensor (B, T' ~= T/subsample_rate, D)
2168
+ masks: torch.Tensor batch padding mask after subsample
2169
+ (B, 1, T' ~= T/subsample_rate)
2170
+ NOTE(xcsong):
2171
+ We pass the `__call__` method of the modules instead of `forward` to the
2172
+ checkpointing API because `__call__` attaches all the hooks of the module.
2173
+ https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
2174
+ """
2175
+ T = xs.size(1)
2176
+ masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
2177
+ if self.global_cmvn is not None:
2178
+ xs = self.global_cmvn(xs)
2179
+ xs, pos_emb, masks = self.embed(xs, masks)
2180
+ mask_pad = masks # (B, 1, T/subsample_rate)
2181
+ chunk_masks = add_optional_chunk_mask(xs, masks,
2182
+ self.use_dynamic_chunk,
2183
+ self.use_dynamic_left_chunk,
2184
+ decoding_chunk_size,
2185
+ self.static_chunk_size,
2186
+ num_decoding_left_chunks)
2187
+ # lookahead + conformer encoder
2188
+ xs = self.pre_lookahead_layer(xs)
2189
+ xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
2190
+
2191
+ # upsample + conformer encoder
2192
+ xs = xs.transpose(1, 2).contiguous()
2193
+ xs, xs_lens = self.up_layer(xs, xs_lens)
2194
+ xs = xs.transpose(1, 2).contiguous()
2195
+ T = xs.size(1)
2196
+ masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
2197
+ xs, pos_emb, masks = self.up_embed(xs, masks)
2198
+ mask_pad = masks # (B, 1, T/subsample_rate)
2199
+ chunk_masks = add_optional_chunk_mask(xs, masks,
2200
+ self.use_dynamic_chunk,
2201
+ self.use_dynamic_left_chunk,
2202
+ decoding_chunk_size,
2203
+ self.static_chunk_size * self.up_layer.stride,
2204
+ num_decoding_left_chunks)
2205
+ xs = self.forward_up_layers(xs, chunk_masks, pos_emb, mask_pad)
2206
+
2207
+ if self.normalize_before:
2208
+ xs = self.after_norm(xs)
2209
+ # Here we assume the mask is not changed in encoder layers, so just
2210
+ # return the masks before encoder layers, and the masks will be used
2211
+ # for cross attention with decoder later
2212
+ return xs, masks
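+ 
+ # Added commentary (not part of the original file). Shape summary for this
+ # forward pass, assuming the default up_layer stride of 2:
+ #   xs in : (B, T, input_size)   speech-token embeddings
+ #   after pre_lookahead_layer + encoders: (B, T, output_size)
+ #   after up_layer + up_encoders: (B, 2 * T, output_size)
+ #   returned masks: (B, 1, 2 * T)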
2213
+
2214
+ def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
2215
+ pos_emb: torch.Tensor,
2216
+ mask_pad: torch.Tensor) -> torch.Tensor:
2217
+ for layer in self.encoders:
2218
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
2219
+ return xs
2220
+
2221
+ def forward_up_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
2222
+ pos_emb: torch.Tensor,
2223
+ mask_pad: torch.Tensor) -> torch.Tensor:
2224
+ for layer in self.up_encoders:
2225
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
2226
+ return xs
2227
+
2228
+ class CausalMaskedDiffWithXvec(PreTrainedModel):
2229
+ """
2230
+ CosyVoice 2.0 flow module: converts speech tokens into mel-spectrogram features.
2231
+ """
2232
+ def __init__(
2233
+ self,
2234
+ config: FlowConfig,
2235
+ mel_feat_conf: Dict = {
2236
+ 'n_fft': 1024,
2237
+ 'num_mels': 80,
2238
+ 'sampling_rate': 22050,
2239
+ 'hop_size': 256,
2240
+ 'win_size': 1024,
2241
+ 'fmin': 0,
2242
+ 'fmax': 8000,
2243
+ },
2244
+ ):
2245
+ super().__init__(config)
2246
+ self.input_size = config.input_size
2247
+ self.output_size = config.output_size
2248
+ self.decoder_conf = config.decoder_config
2249
+ self.mel_feat_conf = mel_feat_conf
2250
+ self.vocab_size = config.vocab_size # kept consistent with the speech tokenizer vocabulary (6561)
2251
+ self.output_type = config.output_type
2252
+ self.input_frame_rate = config.input_frame_rate
2253
+ self.input_embedding = nn.Embedding(config.vocab_size, config.input_size)
2254
+ self.spk_embed_affine_layer = torch.nn.Linear(config.spk_embed_dim, config.output_size)
2255
+ self.encoder = UpsampleConformerEncoder(**config.encoder_config)
2256
+ self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), config.output_size)
2257
+
2258
+ decoder_config = copy.deepcopy(config.decoder_config)
2259
+ decoder_config['cfm_params'] = DictConfig(decoder_config['cfm_params'])
2260
+ self.decoder = CausalConditionalCFM(**decoder_config)
2261
+
2262
+ self.only_mask_loss = config.only_mask_loss
2263
+ self.token_mel_ratio = config.token_mel_ratio
2264
+ self.pre_lookahead_len = config.pre_lookahead_len
2265
+
2266
+ @torch.inference_mode()
2267
+ def inference(
2268
+ self,
2269
+ token,
2270
+ token_len,
2271
+ prompt_token,
2272
+ prompt_token_len,
2273
+ prompt_feat,
2274
+ prompt_feat_len,
2275
+ embedding,
2276
+ finalize,
2277
+ ):
2278
+ # if self.fp16 is True:
2279
+ # prompt_feat = prompt_feat.half()
2280
+ # embedding = embedding.half()
2281
+ # process
2282
+
2283
+ embedding = embedding.to(self.spk_embed_affine_layer.weight.data.dtype) # noqa, TODO
2284
+ prompt_feat = prompt_feat.to(self.spk_embed_affine_layer.weight.data.dtype) # noqa, TODO
2285
+
2286
+ assert token.shape[0] == 1
2287
+ # xvec projection
2288
+ embedding = F.normalize(embedding, dim=1)
2289
+ embedding = self.spk_embed_affine_layer(embedding)
2290
+
2291
+ # concat prompt tokens and tokens to be generated
2292
+ token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len # concatenate the prompt tokens with the tokens to be generated
2293
+ mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
2294
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
2295
+
2296
+ # text encode
2297
+ h, h_lengths = self.encoder(token, token_len)
2298
+ if finalize is False:
2299
+ h = h[:, :-self.pre_lookahead_len * self.token_mel_ratio]
2300
+ mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
2301
+ h = self.encoder_proj(h)
2302
+
2303
+ # get conditions
2304
+ conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device).to(h.dtype)
2305
+ conds[:, :mel_len1] = prompt_feat # the prompt audio's mel features serve as the condition
2306
+ conds = conds.transpose(1, 2)
2307
+
2308
+ mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
2309
+ feat, _ = self.decoder(
2310
+ mu=h.transpose(1, 2).contiguous(),
2311
+ mask=mask.unsqueeze(1),
2312
+ spks=embedding,
2313
+ cond=conds,
2314
+ n_timesteps=10
2315
+ )
2316
+ feat = feat[:, :, mel_len1:]
2317
+ assert feat.shape[2] == mel_len2
2318
+ return feat.float(), None
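+ 
+ # Added commentary (not part of the original file): a hedged usage sketch.
+ # The shapes and the 80-dim mel are assumptions based on the default
+ # mel_feat_conf; consult FlowConfig for the actual dimensions.
+ #   flow = CausalMaskedDiffWithXvec(flow_config)
+ #   mel, _ = flow.inference(
+ #       token=token_ids,                        # (1, T_tok) speech tokens
+ #       token_len=torch.tensor([T_tok]),
+ #       prompt_token=prompt_ids,                # (1, T_prompt)
+ #       prompt_token_len=torch.tensor([T_prompt]),
+ #       prompt_feat=prompt_mel,                 # (1, T_mel, 80)
+ #       prompt_feat_len=None,                   # unused by this method
+ #       embedding=xvector,                      # (1, spk_embed_dim)
+ #       finalize=True,
+ #   )
+ #   # mel: (1, 80, T_out), to be vocoded by the HiFTGenerator below.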
modeling_hifigan.py ADDED
@@ -0,0 +1,479 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from typing import Dict, Optional, List
7
+ import numpy as np
8
+ from scipy.signal import get_window
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torch.nn import ConvTranspose1d, Conv1d, Parameter
12
+ from torch.nn.utils import remove_weight_norm
13
+ from torch.nn.utils.parametrizations import weight_norm
14
+ from torch.distributions.uniform import Uniform
15
+ from torch import nn, sin, pow
16
+ from transformers.modeling_utils import PreTrainedModel
17
+
18
+ from .configuration_hifigan import HiFiGanConfig
19
+
20
+ def get_padding(kernel_size, dilation=1):
21
+ return int((kernel_size * dilation - dilation) / 2)
22
+
23
+ def init_weights(m, mean=0.0, std=0.01):
24
+ classname = m.__class__.__name__
25
+ if classname.find("Conv") != -1:
26
+ m.weight.data.normal_(mean, std)
27
+ return
28
+
29
+ # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
30
+ # LICENSE is in incl_licenses directory.
31
+ class Snake(nn.Module):
32
+ '''
33
+ Implementation of a sine-based periodic activation function
34
+ Shape:
35
+ - Input: (B, C, T)
36
+ - Output: (B, C, T), same shape as the input
37
+ Parameters:
38
+ - alpha - trainable parameter
39
+ References:
40
+ - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
41
+ https://arxiv.org/abs/2006.08195
42
+ Examples:
43
+ >>> a1 = Snake(256)
44
+ >>> x = torch.randn(2, 256, 50)  # (B, C, T)
45
+ >>> x = a1(x)
46
+ '''
47
+ def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
48
+ '''
49
+ Initialization.
50
+ INPUT:
51
+ - in_features: shape of the input
52
+ - alpha: trainable parameter
53
+ alpha is initialized to 1 by default, higher values = higher-frequency.
54
+ alpha will be trained along with the rest of your model.
55
+ '''
56
+ super(Snake, self).__init__()
57
+ self.in_features = in_features
58
+
59
+ # initialize alpha
60
+ self.alpha_logscale = alpha_logscale
61
+ if self.alpha_logscale: # log scale alphas initialized to zeros
62
+ self.alpha = Parameter(torch.zeros(in_features) * alpha)
63
+ else: # linear scale alphas initialized to ones
64
+ self.alpha = Parameter(torch.ones(in_features) * alpha)
65
+
66
+ self.alpha.requires_grad = alpha_trainable
67
+
68
+ self.no_div_by_zero = 0.000000001
69
+
70
+ def forward(self, x):
71
+ '''
72
+ Forward pass of the function.
73
+ Applies the function to the input elementwise.
74
+ Snake ∶= x + 1/a * sin^2 (xa)
75
+ '''
76
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
77
+ if self.alpha_logscale:
78
+ alpha = torch.exp(alpha)
79
+ x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
80
+
81
+ return x
82
+
83
+ class ConvRNNF0Predictor(nn.Module):
84
+ def __init__(self,
85
+ num_class: int = 1,
86
+ in_channels: int = 80,
87
+ cond_channels: int = 512
88
+ ):
89
+ super().__init__()
90
+
91
+ self.num_class = num_class
92
+ self.condnet = nn.Sequential(
93
+ weight_norm(
94
+ nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
95
+ ),
96
+ nn.ELU(),
97
+ weight_norm(
98
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
99
+ ),
100
+ nn.ELU(),
101
+ weight_norm(
102
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
103
+ ),
104
+ nn.ELU(),
105
+ weight_norm(
106
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
107
+ ),
108
+ nn.ELU(),
109
+ weight_norm(
110
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
111
+ ),
112
+ nn.ELU(),
113
+ )
114
+ self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
115
+
116
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
117
+ x = self.condnet(x)
118
+ x = x.transpose(1, 2)
119
+ return torch.abs(self.classifier(x).squeeze(-1))
120
+
121
+ class ResBlock(torch.nn.Module):
122
+ """Residual block module in HiFiGAN/BigVGAN."""
123
+ def __init__(
124
+ self,
125
+ channels: int = 512,
126
+ kernel_size: int = 3,
127
+ dilations: List[int] = [1, 3, 5],
128
+ ):
129
+ super(ResBlock, self).__init__()
130
+ self.convs1 = nn.ModuleList()
131
+ self.convs2 = nn.ModuleList()
132
+
133
+ for dilation in dilations:
134
+ self.convs1.append(
135
+ weight_norm(
136
+ Conv1d(
137
+ channels,
138
+ channels,
139
+ kernel_size,
140
+ 1,
141
+ dilation=dilation,
142
+ padding=get_padding(kernel_size, dilation)
143
+ )
144
+ )
145
+ )
146
+ self.convs2.append(
147
+ weight_norm(
148
+ Conv1d(
149
+ channels,
150
+ channels,
151
+ kernel_size,
152
+ 1,
153
+ dilation=1,
154
+ padding=get_padding(kernel_size, 1)
155
+ )
156
+ )
157
+ )
158
+ self.convs1.apply(init_weights)
159
+ self.convs2.apply(init_weights)
160
+ self.activations1 = nn.ModuleList([
161
+ Snake(channels, alpha_logscale=False)
162
+ for _ in range(len(self.convs1))
163
+ ])
164
+ self.activations2 = nn.ModuleList([
165
+ Snake(channels, alpha_logscale=False)
166
+ for _ in range(len(self.convs2))
167
+ ])
168
+
169
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
170
+ for idx in range(len(self.convs1)):
171
+ xt = self.activations1[idx](x)
172
+ xt = self.convs1[idx](xt)
173
+ xt = self.activations2[idx](xt)
174
+ xt = self.convs2[idx](xt)
175
+ x = xt + x
176
+ return x
177
+
178
+ def remove_weight_norm(self):
179
+ for idx in range(len(self.convs1)):
180
+ remove_weight_norm(self.convs1[idx])
181
+ remove_weight_norm(self.convs2[idx])
182
+
183
+
184
+ class SineGen(torch.nn.Module):
185
+ """ Definition of sine generator
186
+ SineGen(samp_rate, harmonic_num = 0,
187
+ sine_amp = 0.1, noise_std = 0.003,
188
+ voiced_threshold = 0,
189
+ flag_for_pulse=False)
190
+ samp_rate: sampling rate in Hz
191
+ harmonic_num: number of harmonic overtones (default 0)
192
+ sine_amp: amplitude of sine-wavefrom (default 0.1)
193
+ noise_std: std of Gaussian noise (default 0.003)
194
+ voiced_threshold: F0 threshold for U/V classification (default 0)
195
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
196
+ Note: when flag_for_pulse is True, the first time step of a voiced
197
+ segment is always sin(np.pi) or cos(0)
198
+ """
199
+
200
+ def __init__(self, samp_rate, harmonic_num=0,
201
+ sine_amp=0.1, noise_std=0.003,
202
+ voiced_threshold=0):
203
+ super(SineGen, self).__init__()
204
+ self.sine_amp = sine_amp
205
+ self.noise_std = noise_std
206
+ self.harmonic_num = harmonic_num
207
+ self.sampling_rate = samp_rate
208
+ self.voiced_threshold = voiced_threshold
209
+
210
+ def _f02uv(self, f0):
211
+ # generate uv signal
212
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
213
+ return uv
214
+
215
+ @torch.no_grad()
216
+ def forward(self, f0):
217
+ """
218
+ :param f0: [B, 1, sample_len], Hz
219
+ :return: [B, 1, sample_len]
220
+ """
221
+
222
+ F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
223
+ for i in range(self.harmonic_num + 1):
224
+ F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
225
+
226
+ theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
227
+ u_dist = Uniform(low=-np.pi, high=np.pi)
228
+ phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
229
+ phase_vec[:, 0, :] = 0
230
+
231
+ # generate sine waveforms
232
+ sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
233
+
234
+ # generate uv signal
235
+ uv = self._f02uv(f0)
236
+
237
+ # noise: for unvoiced should be similar to sine_amp
238
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
239
+ # . for voiced regions is self.noise_std
240
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
241
+ noise = noise_amp * torch.randn_like(sine_waves)
242
+
243
+ # first: set the unvoiced part to 0 by uv
244
+ # then: additive noise
245
+ sine_waves = sine_waves * uv + noise
246
+ return sine_waves, uv, noise
247
+
248
+
249
+ class SourceModuleHnNSF(torch.nn.Module):
250
+ """ SourceModule for hn-nsf
251
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
252
+ add_noise_std=0.003, voiced_threshod=0)
253
+ sampling_rate: sampling_rate in Hz
254
+ harmonic_num: number of harmonic above F0 (default: 0)
255
+ sine_amp: amplitude of sine source signal (default: 0.1)
256
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
257
+ note that amplitude of noise in unvoiced is decided
258
+ by sine_amp
259
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
260
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
261
+ F0_sampled (batchsize, length, 1)
262
+ Sine_source (batchsize, length, 1)
263
+ noise_source (batchsize, length 1)
264
+ uv (batchsize, length, 1)
265
+ """
266
+
267
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
268
+ add_noise_std=0.003, voiced_threshod=0):
269
+ super(SourceModuleHnNSF, self).__init__()
270
+
271
+ self.sine_amp = sine_amp
272
+ self.noise_std = add_noise_std
273
+
274
+ # to produce sine waveforms
275
+ self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
276
+ sine_amp, add_noise_std, voiced_threshod)
277
+
278
+ # to merge source harmonics into a single excitation
279
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
280
+ self.l_tanh = torch.nn.Tanh()
281
+
282
+ def forward(self, x):
283
+ """
284
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
285
+ F0_sampled (batchsize, length, 1)
286
+ Sine_source (batchsize, length, 1)
287
+ noise_source (batchsize, length 1)
288
+ """
289
+ # source for harmonic branch
290
+ with torch.no_grad():
291
+ sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
292
+ sine_wavs = sine_wavs.transpose(1, 2)
293
+ uv = uv.transpose(1, 2)
294
+ sine_wavs = sine_wavs.to(self.l_linear.weight.data.dtype) # noqa, TODO
295
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
296
+
297
+ # source for noise branch, in the same shape as uv
298
+ noise = torch.randn_like(uv) * self.sine_amp / 3
299
+ return sine_merge, noise, uv
300
+
301
+ class HiFTGenerator(PreTrainedModel):
302
+ """
303
+ HiFTNet Generator: Neural Source Filter + ISTFTNet
304
+ https://arxiv.org/abs/2309.09493
305
+ """
306
+ def __init__(
307
+ self,
308
+ config: HiFiGanConfig
309
+ ):
310
+ super(HiFTGenerator, self).__init__(config)
311
+
312
+ self.out_channels = 1
313
+ self.nb_harmonics = config.nb_harmonics
314
+ self.sampling_rate = config.sampling_rate
315
+ self.istft_params = config.istft_params
316
+ self.lrelu_slope = config.lrelu_slope
317
+ self.audio_limit = config.audio_limit
318
+
319
+ self.num_kernels = len(config.resblock_kernel_sizes)
320
+ self.num_upsamples = len(config.upsample_rates)
321
+ self.m_source = SourceModuleHnNSF(
322
+ sampling_rate=config.sampling_rate,
323
+ upsample_scale=np.prod(config.upsample_rates) * config.istft_params["hop_len"],
324
+ harmonic_num=config.nb_harmonics,
325
+ sine_amp=config.nsf_alpha,
326
+ add_noise_std=config.nsf_sigma,
327
+ voiced_threshod=config.nsf_voiced_threshold)
328
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(config.upsample_rates) * config.istft_params["hop_len"])
329
+
330
+ self.conv_pre = weight_norm(
331
+ Conv1d(config.in_channels, config.base_channels, 7, 1, padding=3)
332
+ )
333
+
334
+ # Up
335
+ self.ups = nn.ModuleList()
336
+ for i, (u, k) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
337
+ self.ups.append(
338
+ weight_norm(
339
+ ConvTranspose1d(
340
+ config.base_channels // (2**i),
341
+ config.base_channels // (2**(i + 1)),
342
+ k,
343
+ u,
344
+ padding=(k - u) // 2,
345
+ )
346
+ )
347
+ )
348
+
349
+ # Down
350
+ self.source_downs = nn.ModuleList()
351
+ self.source_resblocks = nn.ModuleList()
352
+ downsample_rates = [1] + config.upsample_rates[::-1][:-1]
353
+ downsample_cum_rates = np.cumprod(downsample_rates)
354
+ for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], config.source_resblock_kernel_sizes, config.source_resblock_dilation_sizes)):
355
+ if u == 1:
356
+ self.source_downs.append(
357
+ Conv1d(config.istft_params["n_fft"] + 2, config.base_channels // (2 ** (i + 1)), 1, 1)
358
+ )
359
+ else:
360
+ self.source_downs.append(
361
+ Conv1d(config.istft_params["n_fft"] + 2, config.base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
362
+ )
363
+
364
+ self.source_resblocks.append(
365
+ ResBlock(config.base_channels // (2 ** (i + 1)), k, d)
366
+ )
367
+
368
+ self.resblocks = nn.ModuleList()
369
+ for i in range(len(self.ups)):
370
+ ch = config.base_channels // (2**(i + 1))
371
+ for _, (k, d) in enumerate(zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes)):
372
+ self.resblocks.append(ResBlock(ch, k, d))
373
+
374
+ self.conv_post = weight_norm(Conv1d(ch, config.istft_params["n_fft"] + 2, 7, 1, padding=3))
375
+ self.ups.apply(init_weights)
376
+ self.conv_post.apply(init_weights)
377
+ self.reflection_pad = nn.ReflectionPad1d((1, 0))
378
+ self.stft_window = torch.from_numpy(get_window("hann", config.istft_params["n_fft"], fftbins=True).astype(np.float32))
379
+ self.f0_predictor = ConvRNNF0Predictor(**config.f0_predictor_config)
380
+
381
+ def remove_weight_norm(self):
382
+ print('Removing weight norm...')
383
+ for l in self.ups:
384
+ remove_weight_norm(l)
385
+ for l in self.resblocks:
386
+ l.remove_weight_norm()
387
+ remove_weight_norm(self.conv_pre)
388
+ remove_weight_norm(self.conv_post)
389
+ self.m_source.remove_weight_norm()
390
+ for l in self.source_downs:
391
+ remove_weight_norm(l)
392
+ for l in self.source_resblocks:
393
+ l.remove_weight_norm()
394
+
395
+ def _stft(self, x):
396
+ spec = torch.stft(
397
+ x,
398
+ self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
399
+ return_complex=True)
400
+ spec = torch.view_as_real(spec) # [B, F, TT, 2]
401
+ return spec[..., 0], spec[..., 1]
402
+
403
+ def _istft(self, magnitude, phase):
404
+ magnitude = torch.clip(magnitude, max=1e2)
405
+ real = magnitude * torch.cos(phase)
406
+ img = magnitude * torch.sin(phase)
407
+ inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
408
+ self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
409
+ return inverse_transform
410
+
411
+ def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
412
+ s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
413
+ s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
414
+ s_stft = s_stft.to(x) # noqa TODO
415
+ x = self.conv_pre(x)
416
+ for i in range(self.num_upsamples):
417
+ x = F.leaky_relu(x, self.lrelu_slope)
418
+ x = self.ups[i](x)
419
+
420
+ if i == self.num_upsamples - 1:
421
+ x = self.reflection_pad(x)
422
+
423
+ # fusion
424
+ si = self.source_downs[i](s_stft)
425
+ si = self.source_resblocks[i](si)
426
+ x = x + si
427
+
428
+ xs = None
429
+ for j in range(self.num_kernels):
430
+ if xs is None:
431
+ xs = self.resblocks[i * self.num_kernels + j](x)
432
+ else:
433
+ xs += self.resblocks[i * self.num_kernels + j](x)
434
+ x = xs / self.num_kernels
435
+
436
+ x = F.leaky_relu(x)
437
+ x = self.conv_post(x)
438
+ magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
439
+ phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :]) # actually, the sin is redundant
440
+
441
+ magnitude = magnitude.to(torch.float) # noqa TODO
442
+ phase = phase.to(torch.float) # noqa TODO
443
+
444
+ x = self._istft(magnitude, phase)
445
+ x = torch.clamp(x, -self.audio_limit, self.audio_limit)
446
+ return x
447
+
448
+ def forward(
449
+ self,
450
+ batch: dict,
451
+ device: torch.device,
452
+ ) -> Dict[str, Optional[torch.Tensor]]:
453
+ speech_feat = batch['speech_feat'].transpose(1, 2).to(device)
454
+ # mel->f0
455
+ f0 = self.f0_predictor(speech_feat)
456
+ # f0->source
457
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
458
+ s, _, _ = self.m_source(s)
459
+ s = s.transpose(1, 2)
460
+ # mel+source->speech
461
+ generated_speech = self.decode(x=speech_feat, s=s)
462
+ return generated_speech, f0
463
+
464
+ @torch.inference_mode()
465
+ def inference(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
466
+ # process data
467
+ speech_feat = speech_feat.to(self.f0_predictor.classifier.weight.data.dtype) # noqa, TODO
468
+ cache_source = cache_source.to(self.f0_predictor.classifier.weight.data.dtype) # noqa, TODO
469
+ # mel->f0
470
+ f0 = self.f0_predictor(speech_feat)
471
+ # f0->source
472
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
473
+ s, _, _ = self.m_source(s)
474
+ s = s.transpose(1, 2)
475
+ # use cache_source to avoid glitch
476
+ if cache_source.shape[2] != 0:
477
+ s[:, :, :cache_source.shape[2]] = cache_source
478
+ generated_speech = self.decode(x=speech_feat, s=s)
479
+ return generated_speech, s
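For reference, a minimal usage sketch of the vocoder defined above (illustrative only, not part of the commit). It assumes `hift` is an already-constructed `HiFTGenerator` kept in float32 with 80 mel bins, as in this repo's config; the mel tensor is a random placeholder standing in for the flow model's output.

```python
import torch

mel = torch.randn(1, 80, 200)    # placeholder mel spectrogram, [batch, n_mels, frames]
cache = torch.zeros(1, 1, 0)     # empty excitation cache for the first chunk
speech, source = hift.inference(speech_feat=mel, cache_source=cache)
# `speech` is a [1, num_samples] waveform; `source` can be fed back as
# `cache_source` for the next chunk to avoid glitches at chunk boundaries.
```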
modeling_interactiveomni.py ADDED
@@ -0,0 +1,773 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import warnings
7
+ from typing import Any, List, Optional, Tuple, Union
8
+ import re
9
+ import json
10
+ import math
11
+ import librosa
12
+ import numpy as np
13
+ from PIL import Image
14
+ from decord import VideoReader, cpu
15
+ from torch import nn
16
+ import torch
17
+ import torchvision.transforms as T
18
+ from torchvision.transforms.functional import InterpolationMode
19
+ from transformers import (GenerationConfig, Qwen3ForCausalLM, WhisperFeatureExtractor)
20
+ from transformers.modeling_utils import PreTrainedModel
21
+ import onnxruntime
22
+ import torchaudio.compliance.kaldi as kaldi
23
+ import torchaudio
24
+ from transformers.utils.hub import cached_file
25
+
26
+ from .configuration_interactiveomni import InteractiveOmniConfig
27
+ from .modeling_intern_vit import InternVisionModel
28
+ from .modeling_whisper import AudioWhisperModel
29
+ from .modeling_voicelm import VoiceLM
30
+ from .conversation import get_conv_template
31
+
32
+ from .modeling_flow import CausalMaskedDiffWithXvec
33
+ from .modeling_hifigan import HiFTGenerator
34
+
35
+ import logging
36
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
37
+ logger = logging.getLogger(__name__)
38
+
39
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
40
+ IMAGENET_STD = (0.229, 0.224, 0.225)
41
+
42
+ IMG_START_TOKEN = '<img>'
43
+ IMG_END_TOKEN = '</img>'
44
+ IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
45
+ AUDIO_START_TOKEN = '<audio>'
46
+ AUDIO_END_TOKEN = '</audio>'
47
+ AUDIO_CONTEXT_TOKEN = '<AUDIO_CONTEXT>'
48
+
49
+
50
+ class InteractiveOmniModel(PreTrainedModel):
51
+ config_class = InteractiveOmniConfig
52
+ main_input_name = 'pixel_values'
53
+ base_model_prefix = 'language_model'
54
+ _no_split_modules = ['InternVisionModel', 'AudioWhisperModel', 'Qwen3DecoderLayer', 'Qwen2DecoderLayer']
55
+
56
+ def __init__(self, config: InteractiveOmniConfig, vision_model=None, language_model=None, audio_model=None):
57
+ super().__init__(config)
58
+
59
+ image_size = config.force_image_size or config.vision_config.image_size
60
+ patch_size = config.vision_config.patch_size
61
+ self.patch_size = patch_size
62
+ self.select_layer = config.select_layer
63
+ self.template = config.template
64
+ self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
65
+ self.downsample_ratio = config.downsample_ratio
66
+ self.ps_version = config.ps_version
67
+ self.audio_feature_extractor = WhisperFeatureExtractor(**config.audio_preprocessor_config)
68
+ self.transform = self.build_transform(input_size=image_size)
69
+
70
+ self.campplus_session = None
71
+ self.default_speaker_embedding = None
72
+ self.default_wav_path = None
73
+
74
+ logger.info(f'num_image_token: {self.num_image_token}')
75
+ logger.info(f'ps_version: {self.ps_version}')
76
+ if vision_model is not None:
77
+ self.vision_model = vision_model
78
+ else:
79
+ self.vision_model = InternVisionModel(config.vision_config)
80
+ if audio_model is not None:
81
+ self.audio_model = audio_model
82
+ else:
83
+ self.audio_model = AudioWhisperModel(config.audio_config)
84
+ if language_model is not None:
85
+ self.language_model = language_model
86
+ else:
87
+ self.language_model = Qwen3ForCausalLM(config.llm_config)
88
+
89
+ self.voicelm_model = VoiceLM(config.voicelm_config)
90
+ self.flow_model = CausalMaskedDiffWithXvec(config.flow_config).float()
91
+ self.hifigan_model = HiFTGenerator(config.hifigan_config).float()
92
+
93
+ vit_hidden_size = config.vision_config.hidden_size
94
+ audio_hidden_size = config.audio_config.d_model
95
+ llm_hidden_size = config.llm_config.hidden_size
96
+
97
+ self.mlp1 = nn.Sequential(
98
+ nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
99
+ nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
100
+ nn.GELU(),
101
+ nn.Linear(llm_hidden_size, llm_hidden_size)
102
+ )
103
+ self.mlp2 = nn.Sequential(
104
+ nn.LayerNorm(audio_hidden_size),
105
+ nn.Linear(audio_hidden_size, llm_hidden_size),
106
+ nn.GELU(),
107
+ nn.Linear(llm_hidden_size, llm_hidden_size)
108
+ )
109
+
110
+ self.mlp_llm2voicelm = nn.Sequential(
111
+ nn.LayerNorm(llm_hidden_size),
112
+ nn.Linear(llm_hidden_size, config.voicelm_config.llm_input_size),
113
+ nn.GELU(),
114
+ nn.Linear(config.voicelm_config.llm_input_size, config.voicelm_config.llm_input_size)
115
+ )
116
+ self.gate = nn.Sequential(
117
+ nn.Linear(2 * llm_hidden_size, llm_hidden_size),
118
+ nn.Sigmoid()
119
+ )
120
+
121
+ self.img_context_token_id = None
122
+ self.audio_context_token_id = None
123
+ self.neftune_alpha = None
124
+
125
+ self.post_init()
126
+ pass
127
+
128
+ def fusion(self, rep, emb):
129
+ gate = self.gate(torch.cat([rep, emb], dim=-1))
130
+ return rep * gate + emb * (1 - gate)
131
+
132
+ def __load_campplus_session(self, campplus_path:str):
133
+ '''load the campplus speaker-embedding onnx session'''
134
+ logger.info(f"load campplus session: {campplus_path}")
135
+ option = onnxruntime.SessionOptions()
136
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
137
+ option.intra_op_num_threads = 1
138
+ campplus_session = onnxruntime.InferenceSession(
139
+ campplus_path,
140
+ sess_options=option,
141
+ providers=["CPUExecutionProvider"],
142
+ )
143
+ self.campplus_session = campplus_session
144
+ return campplus_session
145
+
146
+ def extract_speaker_embedding(self, prompt_wav:str):
147
+ '''extract speaker embedding tensor'''
148
+ logger.info(f"extract speaker embedding: {prompt_wav}")
149
+ target_sr = 16000
150
+ prompt_speech_16k, sample_rate = torchaudio.load(prompt_wav)
151
+ prompt_speech_16k = prompt_speech_16k.mean(dim=0, keepdim=True)
152
+ if sample_rate != target_sr:
153
+ assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
154
+ prompt_speech_16k = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(prompt_speech_16k)
155
+
156
+ feat = kaldi.fbank(
157
+ prompt_speech_16k,
158
+ num_mel_bins=80,
159
+ dither=0,
160
+ sample_frequency=target_sr,
161
+ )
162
+ feat = feat - feat.mean(dim=0, keepdim=True)
163
+ speaker_embedding = self.campplus_session.run(
164
+ None,
165
+ {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()},
166
+ )[0].flatten().tolist()
167
+ speaker_embedding = torch.tensor([speaker_embedding])
168
+ return speaker_embedding
169
+
170
+ def build_transform(self, input_size):
171
+ MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
172
+ transform = T.Compose([
173
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
174
+ T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
175
+ T.ToTensor(),
176
+ T.Normalize(mean=MEAN, std=STD)
177
+ ])
178
+
179
+ return transform
180
+
181
+ def find_closest_aspect_ratio(self, image, min_num=1, max_num=6, image_size=448):
182
+ assert min_num == 1
183
+ original_width, original_height = image.size
184
+ log_ratio = math.log(original_width / original_height)
185
+ ratio = original_width * original_height / (image_size * image_size)
186
+ multiple = min(math.ceil(ratio), max_num)
187
+ if multiple <= 1:
188
+ return [1, 1]
189
+ candidate_split_grids_nums = []
190
+ for i in [multiple - 1, multiple, multiple + 1]:
191
+ if i > max_num:
192
+ continue
193
+ candidate_split_grids_nums.append(i)
194
+
195
+ candidate_grids = []
196
+ for split_grids_nums in candidate_split_grids_nums:
197
+ m = 1
198
+ while m <= split_grids_nums:
199
+ if split_grids_nums % m == 0:
200
+ candidate_grids.append([m, split_grids_nums // m])
201
+ m += 1
202
+ best_grid = [1, 1]
203
+ min_error = float("inf")
204
+ for grid in candidate_grids:
205
+ error = abs(log_ratio - math.log(grid[0] / grid[1]))
206
+ if error < min_error:
207
+ best_grid = grid
208
+ min_error = error
209
+
210
+ return best_grid
211
+
212
+ def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
213
+ target_aspect_ratio = self.find_closest_aspect_ratio(image, min_num, max_num, image_size)
214
+ target_width = image_size * target_aspect_ratio[0]
215
+ target_height = image_size * target_aspect_ratio[1]
216
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
217
+ # resize the image
218
+ resized_img = image.resize((target_width, target_height))
219
+ processed_images = []
220
+ for i in range(blocks):
221
+ box = (
222
+ (i % (target_width // image_size)) * image_size,
223
+ (i // (target_width // image_size)) * image_size,
224
+ ((i % (target_width // image_size)) + 1) * image_size,
225
+ ((i // (target_width // image_size)) + 1) * image_size
226
+ )
227
+ # split the image
228
+ split_img = resized_img.crop(box)
229
+ processed_images.append(split_img)
230
+ assert len(processed_images) == blocks
231
+ if use_thumbnail and len(processed_images) != 1:
232
+ thumbnail_img = image.resize((image_size, image_size))
233
+ processed_images.append(thumbnail_img)
234
+ return processed_images
235
+
236
+ def load_image(self, image, input_size=448, max_num=12):
237
+ if not isinstance(image, Image.Image):
238
+ image = Image.open(image).convert('RGB')
239
+ images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
240
+ return images
241
+
242
+ def pixel_shuffle(self, x, scale_factor=0.5):
243
+ n, w, h, c = x.size()
244
+ # N, W, H, C --> N, W, H * scale, C // scale
245
+ x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
246
+ # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
247
+ x = x.permute(0, 2, 1, 3).contiguous()
248
+ # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
249
+ x = x.view(n, int(h * scale_factor), int(w * scale_factor),
250
+ int(c / (scale_factor * scale_factor)))
251
+ if self.ps_version == 'v1':
252
+ warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
253
+ 'which results in a transposed image.')
254
+ else:
255
+ x = x.permute(0, 2, 1, 3).contiguous()
256
+ return x
257
+
258
+ def extract_feature(self, pixel_values):
259
+ if self.select_layer == -1:
260
+ vit_embeds = self.vision_model(
261
+ pixel_values=pixel_values,
262
+ output_hidden_states=False,
263
+ return_dict=True).last_hidden_state
264
+ else:
265
+ vit_embeds = self.vision_model(
266
+ pixel_values=pixel_values,
267
+ output_hidden_states=True,
268
+ return_dict=True).hidden_states[self.select_layer]
269
+ vit_embeds = vit_embeds[:, 1:, :]
270
+
271
+ if self.training and self.neftune_alpha is not None:
272
+ vit_embeds = self.noised_embed(vit_embeds, self.neftune_alpha)
273
+
274
+ h = w = int(vit_embeds.shape[1] ** 0.5)
275
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
276
+ vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
277
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
278
+ vit_embeds = self.mlp1(vit_embeds)#.to(pixel_values.device)
279
+ return vit_embeds
280
+
281
+ def get_T_after_cnn(self, L_in, dilation=1):
282
+ for (padding, kernel_size, stride) in [(1, 3, 1), (1, 3, 2)]: # the two front-end conv layers
283
+ L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
284
+ L_out = 1 + L_out // stride
285
+ L_in = L_out
286
+ return L_out
287
+
288
+ def process_audio(self, audio, return_tensors, sampling_rate=16000):
289
+ L = (audio.shape[0] if audio.shape[0] <= 480000 else 480000) # cap at 30 s (480000 samples at 16 kHz)
290
+ mel_len = L // 160
291
+ audio_len_after_cnn = self.get_T_after_cnn(mel_len)
292
+ audio_token_num = (audio_len_after_cnn - 2) // 2 + 1
293
+ inputs = self.audio_feature_extractor(audio, return_tensors=return_tensors, sampling_rate=sampling_rate)
294
+ inputs['audio_len_after_cnn'] = torch.tensor(audio_len_after_cnn, dtype=torch.long)
295
+ inputs['audio_token_num'] = torch.tensor(audio_token_num, dtype=torch.long)
296
+ return inputs
297
+
298
+ def load_audio(self, audio_file, sampling_rate=16000):
299
+ audio_values, _ = librosa.load(audio_file, sr=sampling_rate) # sample rate should be 16000
300
+
301
+ audio_process_values = self.process_audio(audio_values, sampling_rate=sampling_rate, return_tensors="pt")
302
+ input_features = audio_process_values['input_features']
303
+ audio_len_after_cnn = audio_process_values['audio_len_after_cnn']
304
+ audio_token_num = audio_process_values['audio_token_num']
305
+
306
+ audio_input_dict = {'audio_values': input_features,
307
+ 'audio_len_after_cnn': audio_len_after_cnn,
308
+ 'audio_token_num': audio_token_num,
309
+ }
310
+ return audio_input_dict
311
+
312
+ def extract_audio_feature(self, audio_values, audio_len_after_cnn):
313
+
314
+ audio_values = audio_values.squeeze(1)
315
+ max_len_in_batch = int(torch.max(audio_len_after_cnn).item())
316
+ padding_mask = torch.ones([audio_values.size(0), max_len_in_batch]).to(dtype=audio_values.dtype, device=audio_values.device)
317
+ for index in range(len(audio_values)):
318
+ padding_mask[index, :int(audio_len_after_cnn[index].item())] = 0
319
+
320
+ last_hidden_state = self.audio_model(audio_values, padding_mask, audio_len_after_cnn) # (bs, max_token_num, 1280)
321
+
322
+ audio_embeds = self.mlp2(last_hidden_state)
323
+
324
+ return audio_embeds
325
+
326
+ def get_index(self, bound, fps, max_frame, first_idx=0, num_segments=32):
327
+ if bound:
328
+ start, end = bound[0], bound[1]
329
+ else:
330
+ start, end = -100000, 100000
331
+ start_idx = max(first_idx, round(start * fps))
332
+ end_idx = min(round(end * fps), max_frame)
333
+ seg_size = float(end_idx - start_idx) / num_segments
334
+ frame_indices = np.array([
335
+ int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
336
+ for idx in range(num_segments)
337
+ ])
338
+ return frame_indices
339
+
340
+ def load_video(self, video_path, bound=None, num_segments=32):
341
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
342
+ max_frame = len(vr) - 1
343
+ fps = float(vr.get_avg_fps())
344
+ frame_indices = self.get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
345
+ frames = list()
346
+ for frame_index in frame_indices:
347
+ img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
348
+ frames.append(img)
349
+ return frames
350
+
351
+ def find_second_last_occurrence(self, input_ids_list, target_id):
352
+ '''find the index of the second-to-last occurrence of target_id'''
353
+ reversed_list = list(reversed(input_ids_list))
354
+ first_occurrence = -1
355
+ second_occurrence = -1
356
+ for idx, val in enumerate(reversed_list):
357
+ if val == target_id:
358
+ if first_occurrence == -1:
359
+ first_occurrence = idx # first index
360
+ elif second_occurrence == -1:
361
+ second_occurrence = idx # second index
362
+ break
363
+
364
+ if second_occurrence == -1:
365
+ return -1
366
+ return len(input_ids_list) - second_occurrence - 1
367
+
368
+ def decode_speech_tokens(
369
+ self,
370
+ speech_tokens,
371
+ speaker_embedding=None,
372
+ flow_prompt_speech_token=None,
373
+ prompt_speech_feat=None,
374
+ finalize=True,
375
+ token_offset=0,
376
+ ):
377
+ if speaker_embedding is None:
378
+ speaker_embedding = torch.zeros(1, 192)
379
+ pass
380
+ if flow_prompt_speech_token is None:
381
+ flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int32)
382
+ pass
383
+ if prompt_speech_feat is None:
384
+ prompt_speech_feat = torch.zeros(1, 0, 80)
385
+ pass
386
+
387
+ self.flow_model.encoder.static_chunk_size = 2 * self.flow_model.input_frame_rate # 50
388
+ self.flow_model.decoder.estimator.static_chunk_size = 2 * self.flow_model.input_frame_rate * self.flow_model.token_mel_ratio # 100
389
+ device = speech_tokens.device
390
+
391
+ tts_mel, _ = self.flow_model.inference(
392
+ token=speech_tokens.to(device),
393
+ token_len=torch.tensor([speech_tokens.shape[1]], dtype=torch.int32).to(device),
394
+ prompt_token=flow_prompt_speech_token.to(device),
395
+ prompt_token_len=torch.tensor([flow_prompt_speech_token.shape[1]], dtype=torch.int32).to(device),
396
+ prompt_feat=prompt_speech_feat.to(device),
397
+ prompt_feat_len=torch.tensor([prompt_speech_feat.shape[1]], dtype=torch.int32).to(device),
398
+ embedding=speaker_embedding.to(device),
399
+ finalize=finalize,
400
+ )
401
+ tts_mel = tts_mel[:, :, token_offset * self.config.flow_config.token_mel_ratio:]
402
+
403
+ hift_cache_source = torch.zeros(1, 1, 0)
404
+ tts_speech, tts_source = self.hifigan_model.inference(speech_feat=tts_mel, cache_source=hift_cache_source) # [1, sampling point num]
405
+
406
+ return tts_speech
407
+
408
+ @torch.no_grad()
409
+ def generate(
410
+ self,
411
+ pixel_values: torch.FloatTensor,
412
+ input_ids: torch.FloatTensor,
413
+ attention_mask: torch.LongTensor,
414
+ visual_features: Optional[torch.FloatTensor] = None,
415
+ audio_values: Optional[torch.FloatTensor] = None,
416
+ audio_len_after_cnn: Optional[bool] = None,
417
+ audio_token_num: Optional[bool] = None,
418
+ generation_config: Optional[GenerationConfig] = None,
419
+ output_hidden_states: Optional[bool] = None,
420
+ start_token_id:int = 151644,
421
+ generate_audio:bool = False,
422
+ speaker_embedding:torch.Tensor = torch.zeros(1, 192),
423
+ mix_ratio:list=[5,25],
424
+ **generate_kwargs,
425
+ ) -> torch.LongTensor:
426
+ assert self.img_context_token_id is not None
427
+ assert self.audio_context_token_id is not None
428
+
429
+ vit_embeds = None
430
+ if visual_features is not None:
431
+ vit_embeds = visual_features
432
+ elif pixel_values is not None:
433
+ vit_embeds = self.extract_feature(pixel_values)
434
+ cur_conv_start_id = self.find_second_last_occurrence(input_ids.tolist()[0], start_token_id)
435
+
436
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
437
+ B, N, C = input_embeds.shape
438
+ input_embeds = input_embeds.reshape(B * N, C)
439
+
440
+ input_ids = input_ids.reshape(B * N)
441
+
442
+ if vit_embeds is not None:
443
+ selected = (input_ids == self.img_context_token_id)
444
+ input_embeds[selected] = vit_embeds.reshape(-1, C)
445
+
446
+ if audio_values is not None and audio_len_after_cnn is not None and audio_token_num is not None:
447
+ audio_embeds = self.extract_audio_feature(audio_values, audio_len_after_cnn)
448
+ output_audios = []
449
+ for i in range(len(audio_token_num)):
450
+ token_num = int(audio_token_num[i].item())
451
+ audio = audio_embeds[i][:token_num]
452
+ output_audios.append(audio)
453
+ output_audios = torch.cat(output_audios, dim=0)
454
+ selected = (input_ids == self.audio_context_token_id)
455
+ input_embeds[selected] = output_audios.reshape(-1, C)
456
+
457
+ input_embeds = input_embeds.reshape(B, N, C)
458
+
459
+ outputs = self.language_model.generate(
460
+ inputs_embeds=input_embeds,
461
+ attention_mask=attention_mask,
462
+ generation_config=generation_config,
463
+ output_hidden_states=output_hidden_states or generate_audio,
464
+ return_dict_in_generate=generate_audio,
465
+ use_cache=True,
466
+ **generate_kwargs,
467
+ )
468
+ if not generate_audio:
469
+ return outputs, None, None
470
+
471
+ hidden_states = torch.cat(
472
+ [outputs.hidden_states[0][-1][:, -1:, :]] + [outputs.hidden_states[i][-1] for i in range(1, len(outputs.hidden_states))],
473
+ dim=1,
474
+ )
475
+ sampled_token = outputs.sequences
476
+ if sampled_token.shape[1] == hidden_states.shape[1] + 1:
477
+ sampled_token = sampled_token[:, 1:]
478
+ sampled_token_embeddings = self.language_model.get_input_embeddings()(sampled_token)
479
+ target_text_token_hidden_states = self.fusion(hidden_states, sampled_token_embeddings)
480
+
481
+ input_token_hidden_states = outputs.hidden_states[0][-1][:, cur_conv_start_id:-1, :]
482
+ question_input_embeddings = input_embeds[:, cur_conv_start_id+1:, :]
483
+ input_token_hidden_states = self.fusion(input_token_hidden_states, question_input_embeddings)
484
+
485
+ input_feature = self.mlp_llm2voicelm(input_token_hidden_states)
486
+ target_text_feature = self.mlp_llm2voicelm(target_text_token_hidden_states)
487
+
488
+ try:
489
+ speech_tokens = self.voicelm_model.inference_bistream(input_feature, target_text_feature, mix_ratio=mix_ratio)
490
+ speech_tokens = torch.LongTensor([speech_tokens]).to(input_feature.device)
491
+ tts_speech = self.decode_speech_tokens(
492
+ speech_tokens,
493
+ speaker_embedding=speaker_embedding,
494
+ )
495
+ except Exception as e:
496
+ logger.warning(f"=========voice lm except:{e}")
497
+ return outputs.sequences,None, None
498
+ return outputs.sequences, speech_tokens, tts_speech
499
+
500
+ def chat(
501
+ self,
502
+ tokenizer,
503
+ generation_config,
504
+ messages,
505
+ max_patch_num=12,
506
+ frame=8,
507
+ generate_audio=False,
508
+ speaker_embedding=torch.zeros(1, 192),
509
+ print_flag=True,
510
+ ):
511
+ if self.flow_model.dtype != torch.float32 or self.hifigan_model.dtype != torch.float32:
512
+ logger.info(f"reset flow model and higigan model dtype to float32")
513
+ self.reset_vocoder()
514
+ pass
515
+ if messages is None or len(messages) == 0:
516
+ raise RuntimeError('no messages')
517
+ role_transfer_dict = {
518
+ 'system': ['user'],
519
+ 'user': ['assistant'],
520
+ 'assistant': ['user'],
521
+ }
522
+
523
+ first_role = ['system', 'user']
524
+ last_role = ['user']
525
+ if messages[-1]['role'] not in last_role:
526
+ raise RuntimeError(f"last role error, expect {last_role}, but got {messages[-1]}")
527
+
528
+ current_role = None
529
+ dynamic_images = list()
530
+ dynamic_nums = list()
531
+ audio_values = list()
532
+ audio_len_after_cnn = list()
533
+ audio_token_num = list()
534
+ template = get_conv_template(self.template)
535
+ for index in range(len(messages)):
536
+ text = ''
537
+ audios = list()
538
+ images = list()
539
+ message = messages[index]
540
+ if index == 0:
541
+ if message['role'] not in first_role:
542
+ raise RuntimeError(f'first role error, expect {first_role}, but got {message}')
543
+ else:
544
+ if message['role'] not in current_role:
545
+ raise RuntimeError(f'role error expect {current_role}, but got {message}')
546
+ current_role = message['role']
547
+ if isinstance(message["content"], list):
548
+ for item in message["content"]:
549
+ if item['type'] == 'text':
550
+ if item.get('text', None) is None:
551
+ continue
552
+ text += item['text']
553
+ elif item['type'] == 'audio':
554
+ if item.get('audio', None) is None:
555
+ continue
556
+ if type(item['audio']) is list:
557
+ assert len(item['audio']) == 1, f'only 1 audio file per round is supported, but got {item["audio"]}'
558
+ audio = item['audio'][0]
559
+ else:
560
+ audio = item['audio']
561
+ audios.append(audio)
562
+ elif item['type'] == 'image':
563
+ if item.get('image', None) is None:
564
+ continue
565
+ if type(item['image']) is not list:
566
+ images.append(item['image'])
567
+ else:
568
+ images.extend(item['image'])
569
+ elif item['type'] == 'video':
570
+ if item.get('video', None) is None:
571
+ continue
572
+ if type(item['video']) is list:
573
+ assert len(item['video']) == 1, f'only 1 video file per round is supported, but got {item["video"]}'
574
+ video = item['video'][0]
575
+ else:
576
+ video = item['video']
577
+ frames = self.load_video(video, num_segments=frame)
578
+ images.extend(frames)
579
+ else:
580
+ assert isinstance(message["content"], str), message["content"]
581
+ text = message["content"]
582
+
583
+ if len(audios) != 0:
584
+ assert len(audios) == 1, f'only 1 audio file per round is supported, but got {audios}'
585
+ if '<audio>' in text:
586
+ matches = re.findall(r"<audio>", text)
587
+ assert len(matches) == len(audios), f'<audio> error {text} {len(audios)}' + text
588
+ text = re.sub(r'(<audio>)(?!\n)', r'\1\n', text)
589
+ else:
590
+ text = '<audio>\n'*len(audios) + text
591
+
592
+ audio_path = audios[0]
593
+ audio_input_dict = self.load_audio(audio_path)
594
+ assert audio_input_dict['audio_token_num'].item() != 0, f'audio_token_num of {audio_path} is 0.'
595
+ audio_values.append(audio_input_dict['audio_values'])
596
+ audio_len_after_cnn.append(audio_input_dict['audio_len_after_cnn'])
597
+ audio_token_num.append(audio_input_dict['audio_token_num'])
598
+
599
+ if images is not None:
600
+ if '<image>' in text:
601
+ matches = re.findall(r"<image>", text)
602
+ assert len(matches) == len(images), f'<image> error {text} {len(images)}' + text
603
+ text = re.sub(r'(<image>)(?!\n)', r'\1\n', text)
604
+ else:
605
+ text = '<image>\n'*len(images) + text
606
+
607
+ for image in images:
608
+ dynamic_image = self.load_image(image, max_num=max_patch_num)
609
+ dynamic_images += dynamic_image
610
+ dynamic_nums.append(len(dynamic_image))
611
+
612
+ if message['role'] == 'system':
613
+ template.set_system_message(text)
614
+ elif message['role'] == 'user':
615
+ template.append_message(template.roles[0], text)
616
+ elif message['role'] == 'assistant':
617
+ template.append_message(template.roles[1], text)
618
+ else:
619
+ raise ValueError('unexpected role')
620
+
621
+ current_role = role_transfer_dict[current_role]
622
+
623
+ template.append_message(template.roles[1], None)
624
+
625
+ if len(audio_values) != 0:
626
+ audio_values = torch.cat(audio_values, dim=0).to(dtype=self.dtype).cuda() # [num_audio, 128, 3000]
627
+ audio_len_after_cnn = torch.stack(audio_len_after_cnn, dim=0) # [num_audio]
628
+ audio_token_num = torch.stack(audio_token_num, dim=0) # [num_audio]
629
+ else:
630
+ audio_values = None
631
+ audio_len_after_cnn = None
632
+ audio_token_num = None
633
+
634
+ if len(dynamic_images) != 0:
635
+ pixel_values = [self.transform(image) for image in dynamic_images]
636
+ pixel_values = torch.stack(pixel_values)
637
+ pixel_values = pixel_values.to(torch.bfloat16).cuda()
638
+ else:
639
+ pixel_values = None
640
+ dynamic_nums = None
641
+
642
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
643
+ self.img_context_token_id = img_context_token_id
644
+ audio_context_token_id = tokenizer.convert_tokens_to_ids(AUDIO_CONTEXT_TOKEN)
645
+ self.audio_context_token_id = audio_context_token_id
646
+
647
+ # also include the end-of-assistant token in the eos token ids to avoid unnecessary generation
648
+ eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
649
+ start_token_id = tokenizer.convert_tokens_to_ids(["<|im_start|>"])[0]
650
+
651
+ query = template.get_prompt()
652
+
653
+ if audio_values is not None:
654
+ if print_flag:
655
+ logger.info(f'audio num: {len(audio_token_num)}')
656
+ audio_tokens_list = list()
657
+ for index in range(len(audio_token_num)):
658
+ audio_token_num_i = audio_token_num[index]
659
+ if print_flag:
660
+ logger.info(f'audio_token_num: {audio_token_num_i}')
661
+ audio_tokens = AUDIO_START_TOKEN + AUDIO_CONTEXT_TOKEN * audio_token_num_i + AUDIO_END_TOKEN
662
+ audio_tokens_list.append(audio_tokens)
663
+
664
+ audio_tokens_iter = iter(audio_tokens_list)
665
+
666
+ query = re.sub(r"<audio>", lambda match:next(audio_tokens_iter), query)
667
+
668
+ if pixel_values is not None:
669
+ if print_flag:
670
+ logger.info(f'image num: {len(dynamic_nums)}')
671
+ image_tokens_list = list()
672
+ total_dynamic_num = 0
673
+ for index in range(len(dynamic_nums)):
674
+ dynamic_num = dynamic_nums[index]
675
+ total_dynamic_num += dynamic_num
676
+ if print_flag:
677
+ logger.info(f'dynamic ViT batch size: {dynamic_num}')
678
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * dynamic_num + IMG_END_TOKEN
679
+ image_tokens_list.append(image_tokens)
680
+ assert total_dynamic_num == pixel_values.shape[0], f'dynamic num not equal, {total_dynamic_num}, {pixel_values.shape[0]}'
681
+
682
+ image_tokens_iter = iter(image_tokens_list)
683
+
684
+ query = re.sub(r"<image>", lambda match:next(image_tokens_iter), query)
685
+
686
+ model_inputs = tokenizer(query, return_tensors='pt', add_special_tokens=False)
687
+ input_ids = model_inputs['input_ids'].cuda()
688
+ attention_mask = model_inputs['attention_mask'].cuda()
689
+ generation_config['eos_token_id'] = eos_token_id
690
+ generation_output, speech_token, audio_bytes = self.generate(
691
+ pixel_values=pixel_values,
692
+ audio_values=audio_values,
693
+ audio_len_after_cnn=audio_len_after_cnn,
694
+ audio_token_num=audio_token_num,
695
+ input_ids=input_ids,
696
+ attention_mask=attention_mask,
697
+ generate_audio=generate_audio,
698
+ start_token_id=start_token_id,
699
+ speaker_embedding=speaker_embedding,
700
+ **generation_config
701
+ )
702
+ response = tokenizer.batch_decode(generation_output, skip_special_tokens=False)[0]
703
+ response = response.split("<|im_end|>")[0].replace('<|endoftext|>', '').strip()
704
+ query_to_print = query
705
+ if pixel_values is not None:
706
+ query_to_print = query_to_print.replace(IMG_CONTEXT_TOKEN, '')
707
+ query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
708
+ if audio_values is not None:
709
+ query_to_print = query_to_print.replace(AUDIO_CONTEXT_TOKEN, '')
710
+ query_to_print = query_to_print.replace(f'{AUDIO_START_TOKEN}{AUDIO_END_TOKEN}', '<audio>')
711
+ if print_flag:
712
+ logger.info('query: ' + json.dumps(query_to_print, ensure_ascii=False))
713
+ logger.info('response: ' + response)
714
+
715
+ if generate_audio:
716
+ return response, audio_bytes
717
+ return response
718
+
719
+ def __cache_file(self, pretrained_model_name_or_path:str, filename:str, **kw):
720
+ '''resolve and cache an auxiliary file from the checkpoint repo'''
721
+ full_path = cached_file(
722
+ pretrained_model_name_or_path,
723
+ filename,
724
+ subfolder=kw.pop("subfolder", None),
725
+ cache_dir=kw.pop("cache_dir", None),
726
+ force_download=kw.pop("force_download", False),
727
+ proxies=kw.pop("proxies", None),
728
+ resume_download=kw.pop("resume_download", None),
729
+ local_files_only=kw.pop("local_files_only", False),
730
+ token=kw.pop("use_auth_token", None),
731
+ revision=kw.pop("revision", None),
732
+ )
733
+ if full_path is None:
734
+ raise ValueError(f"""{pretrained_model_name_or_path}/{filename} not exists""")
735
+ return full_path
736
+
737
+ @classmethod
738
+ def from_pretrained(
739
+ cls,
740
+ pretrained_model_name_or_path,
741
+ *model_args,
742
+ config=None,
743
+ cache_dir=None,
744
+ ignore_mismatched_sizes=False,
745
+ force_download=False,
746
+ local_files_only=False,
747
+ token=None,
748
+ revision="main",
749
+ use_safetensors=None,
750
+ weights_only=True,
751
+ **kwargs,
752
+ ):
753
+ model = super().from_pretrained(
754
+ pretrained_model_name_or_path,
755
+ *model_args,
756
+ config=config,
757
+ cache_dir=cache_dir,
758
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
759
+ force_download=force_download,
760
+ local_files_only=local_files_only,
761
+ token=token,
762
+ revision=revision,
763
+ use_safetensors=use_safetensors,
764
+ weights_only=weights_only,
765
+ **kwargs,
766
+ )
767
+ campplus_path = model.__cache_file(pretrained_model_name_or_path, "campplus.onnx", **kwargs)
768
+ model.__load_campplus_session(campplus_path)
769
+ default_wav_path = model.__cache_file(pretrained_model_name_or_path, "taozi.wav", **kwargs)
770
+ model.default_wav_path = default_wav_path
771
+ model.default_speaker_embedding = model.extract_speaker_embedding(default_wav_path)
772
+
773
+ return model
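For orientation, a minimal end-to-end sketch of the `chat` API defined above (illustrative only). The repo id, the `trust_remote_code` loading path and the image file name are assumptions; `generation_config` is a plain dict because `chat` injects `eos_token_id` into it before calling `generate`.

```python
import torch
from transformers import AutoModel, AutoTokenizer

path = "sensenova/InteractiveOmni-8B"   # assumed repo id
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(path, torch_dtype=torch.bfloat16,
                                  trust_remote_code=True).eval().cuda()

messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": "example.jpg"},        # hypothetical local file
        {"type": "text", "text": "Describe this image."},
    ],
}]
generation_config = dict(max_new_tokens=512, do_sample=False)

# text-only reply
response = model.chat(tokenizer, generation_config, messages)

# text + speech reply, reusing the bundled default speaker embedding
response, audio = model.chat(tokenizer, generation_config, messages,
                             generate_audio=True,
                             speaker_embedding=model.default_speaker_embedding)
```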
modeling_intern_vit.py ADDED
@@ -0,0 +1,427 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2023 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from typing import Optional, Tuple, Union
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import torch.utils.checkpoint
11
+ from einops import rearrange
12
+ from timm.models.layers import DropPath
13
+ from torch import nn
14
+ from transformers.activations import ACT2FN
15
+ from transformers.modeling_outputs import (BaseModelOutput,
16
+ BaseModelOutputWithPooling)
17
+ from transformers.modeling_utils import PreTrainedModel
18
+ from transformers.utils import logging
19
+
20
+ from .configuration_intern_vit import InternVisionConfig
21
+
22
+ try:
23
+ from flash_attn.bert_padding import pad_input, unpad_input
24
+ from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
25
+ has_flash_attn = True
26
+ except Exception:
27
+ print('FlashAttention2 is not installed.')
28
+ has_flash_attn = False
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ class FlashAttention(nn.Module):
34
+ """Implement the scaled dot product attention with softmax.
35
+ Arguments
36
+ ---------
37
+ softmax_scale: The temperature to use for the softmax attention.
38
+ (default: 1/sqrt(d_keys) where d_keys is computed at
39
+ runtime)
40
+ attention_dropout: The dropout rate to apply to the attention
41
+ (default: 0.0)
42
+ """
43
+
44
+ def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
45
+ super().__init__()
46
+ self.softmax_scale = softmax_scale
47
+ self.dropout_p = attention_dropout
48
+
49
+ def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
50
+ max_s=None, need_weights=False):
51
+ """Implements the multihead softmax attention.
52
+ Arguments
53
+ ---------
54
+ qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
55
+ if unpadded: (nnz, 3, h, d)
56
+ key_padding_mask: a bool tensor of shape (B, S)
57
+ """
58
+ assert not need_weights
59
+ assert qkv.dtype in [torch.float16, torch.bfloat16]
60
+ assert qkv.is_cuda
61
+
62
+ if cu_seqlens is None:
63
+ batch_size = qkv.shape[0]
64
+ seqlen = qkv.shape[1]
65
+ if key_padding_mask is None:
66
+ qkv = rearrange(qkv, 'b s ... -> (b s) ...')
67
+ max_s = seqlen
68
+ cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
69
+ device=qkv.device)
70
+ output = flash_attn_varlen_qkvpacked_func(
71
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
72
+ softmax_scale=self.softmax_scale, causal=causal
73
+ )
74
+ output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
75
+ else:
76
+ nheads = qkv.shape[-2]
77
+ x = rearrange(qkv, 'b s three h d -> b s (three h d)')
78
+ x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
79
+ x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
80
+ output_unpad = flash_attn_varlen_qkvpacked_func(
81
+ x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
82
+ softmax_scale=self.softmax_scale, causal=causal
83
+ )
84
+ output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
85
+ indices, batch_size, seqlen),
86
+ 'b s (h d) -> b s h d', h=nheads)
87
+ else:
88
+ assert max_s is not None
89
+ output = flash_attn_varlen_qkvpacked_func(
90
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
91
+ softmax_scale=self.softmax_scale, causal=causal
92
+ )
93
+
94
+ return output, None
95
+
96
+
97
+ class InternRMSNorm(nn.Module):
98
+ def __init__(self, hidden_size, eps=1e-6):
99
+ super().__init__()
100
+ self.weight = nn.Parameter(torch.ones(hidden_size))
101
+ self.variance_epsilon = eps
102
+
103
+ def forward(self, hidden_states):
104
+ input_dtype = hidden_states.dtype
105
+ hidden_states = hidden_states.to(torch.float32)
106
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
107
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
108
+ return self.weight * hidden_states.to(input_dtype)
109
+
110
+
111
+ try:
112
+ from apex.normalization import FusedRMSNorm
113
+
114
+ InternRMSNorm = FusedRMSNorm # noqa
115
+
116
+ logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
117
+ except ImportError:
118
+ # using the normal InternRMSNorm
119
+ pass
120
+ except Exception:
121
+ logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
122
+ pass
123
+
124
+
125
+ NORM2FN = {
126
+ 'rms_norm': InternRMSNorm,
127
+ 'layer_norm': nn.LayerNorm,
128
+ }
129
+
130
+
131
+ class InternVisionEmbeddings(nn.Module):
132
+ def __init__(self, config: InternVisionConfig):
133
+ super().__init__()
134
+ self.config = config
135
+ self.embed_dim = config.hidden_size
136
+ self.image_size = config.image_size
137
+ self.patch_size = config.patch_size
138
+
139
+ self.class_embedding = nn.Parameter(
140
+ torch.randn(1, 1, self.embed_dim),
141
+ )
142
+
143
+ self.patch_embedding = nn.Conv2d(
144
+ in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
145
+ )
146
+
147
+ self.num_patches = (self.image_size // self.patch_size) ** 2
148
+ self.num_positions = self.num_patches + 1
149
+
150
+ self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
151
+
152
+ def _get_pos_embed(self, pos_embed, H, W):
153
+ target_dtype = pos_embed.dtype
154
+ pos_embed = pos_embed.float().reshape(
155
+ 1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
156
+ pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False).\
157
+ reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
158
+ return pos_embed
159
+
160
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
161
+ target_dtype = self.patch_embedding.weight.dtype
162
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, embed_dim, grid_height, grid_width]
163
+ batch_size, _, height, width = patch_embeds.shape
164
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
165
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
166
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
167
+ position_embedding = torch.cat([
168
+ self.position_embedding[:, :1, :],
169
+ self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
170
+ ], dim=1)
171
+ embeddings = embeddings + position_embedding.to(target_dtype)
172
+ return embeddings
173
+
174
+
175
+ class InternAttention(nn.Module):
176
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
177
+
178
+ def __init__(self, config: InternVisionConfig):
179
+ super().__init__()
180
+ self.config = config
181
+ self.embed_dim = config.hidden_size
182
+ self.num_heads = config.num_attention_heads
183
+ self.use_flash_attn = config.use_flash_attn and has_flash_attn
184
+ if config.use_flash_attn and not has_flash_attn:
185
+ print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
186
+ self.head_dim = self.embed_dim // self.num_heads
187
+ if self.head_dim * self.num_heads != self.embed_dim:
188
+ raise ValueError(
189
+ f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
190
+ f' {self.num_heads}).'
191
+ )
192
+
193
+ self.scale = self.head_dim ** -0.5
194
+ self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
195
+ self.attn_drop = nn.Dropout(config.attention_dropout)
196
+ self.proj_drop = nn.Dropout(config.dropout)
197
+
198
+ self.qk_normalization = config.qk_normalization
199
+
200
+ if self.qk_normalization:
201
+ self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
202
+ self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
203
+
204
+ if self.use_flash_attn:
205
+ self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
206
+ self.proj = nn.Linear(self.embed_dim, self.embed_dim)
207
+
208
+ def _naive_attn(self, x):
209
+ B, N, C = x.shape
210
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
211
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
212
+
213
+ if self.qk_normalization:
214
+ B_, H_, N_, D_ = q.shape
215
+ q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
216
+ k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
217
+
218
+ attn = ((q * self.scale) @ k.transpose(-2, -1))
219
+ attn = attn.softmax(dim=-1)
220
+ attn = self.attn_drop(attn)
221
+
222
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
223
+ x = self.proj(x)
224
+ x = self.proj_drop(x)
225
+ return x
226
+
227
+ def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
228
+ qkv = self.qkv(x)
229
+ qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
230
+
231
+ if self.qk_normalization:
232
+ q, k, v = qkv.unbind(2)
233
+ q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
234
+ k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
235
+ qkv = torch.stack([q, k, v], dim=2)
236
+
237
+ context, _ = self.inner_attn(
238
+ qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
239
+ )
240
+ outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
241
+ outs = self.proj_drop(outs)
242
+ return outs
243
+
244
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
245
+ x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
246
+ return x
247
+
248
+
249
+ class InternMLP(nn.Module):
250
+ def __init__(self, config: InternVisionConfig):
251
+ super().__init__()
252
+ self.config = config
253
+ self.act = ACT2FN[config.hidden_act]
254
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
255
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
256
+
257
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
258
+ hidden_states = self.fc1(hidden_states)
259
+ hidden_states = self.act(hidden_states)
260
+ hidden_states = self.fc2(hidden_states)
261
+ return hidden_states
262
+
263
+
264
+ class InternVisionEncoderLayer(nn.Module):
265
+ def __init__(self, config: InternVisionConfig, drop_path_rate: float):
266
+ super().__init__()
267
+ self.embed_dim = config.hidden_size
268
+ self.intermediate_size = config.intermediate_size
269
+ self.norm_type = config.norm_type
270
+
271
+ self.attn = InternAttention(config)
272
+ self.mlp = InternMLP(config)
273
+ self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
274
+ self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
275
+
276
+ self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
277
+ self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
278
+ self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
279
+ self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
280
+
281
+ def forward(
282
+ self,
283
+ hidden_states: torch.Tensor,
284
+ ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
285
+ """
286
+ Args:
287
+ hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
288
+ """
289
+ hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
290
+
291
+ hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)
292
+
293
+ return hidden_states
294
+
295
+
296
+ class InternVisionEncoder(nn.Module):
297
+ """
298
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
299
+ [`InternEncoderLayer`].
300
+
301
+ Args:
302
+ config (`InternConfig`):
303
+ The corresponding vision configuration for the `InternEncoder`.
304
+ """
305
+
306
+ def __init__(self, config: InternVisionConfig):
307
+ super().__init__()
308
+ self.config = config
309
+ # stochastic depth decay rule
310
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
311
+ self.layers = nn.ModuleList([
312
+ InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
313
+ self.gradient_checkpointing = True
314
+
315
+ def forward(
316
+ self,
317
+ inputs_embeds,
318
+ output_hidden_states: Optional[bool] = None,
319
+ return_dict: Optional[bool] = None,
320
+ ) -> Union[Tuple, BaseModelOutput]:
321
+ r"""
322
+ Args:
323
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
324
+ Embedded representation of the inputs. Should be float, not int tokens.
325
+ output_hidden_states (`bool`, *optional*):
326
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
327
+ for more detail.
328
+ return_dict (`bool`, *optional*):
329
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
330
+ """
331
+ output_hidden_states = (
332
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
333
+ )
334
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
335
+
336
+ encoder_states = () if output_hidden_states else None
337
+ hidden_states = inputs_embeds
338
+
339
+ for idx, encoder_layer in enumerate(self.layers):
340
+ if output_hidden_states:
341
+ encoder_states = encoder_states + (hidden_states,)
342
+ if self.gradient_checkpointing and self.training:
343
+ layer_outputs = torch.utils.checkpoint.checkpoint(
344
+ encoder_layer,
345
+ hidden_states)
346
+ else:
347
+ layer_outputs = encoder_layer(
348
+ hidden_states,
349
+ )
350
+ hidden_states = layer_outputs
351
+
352
+ if output_hidden_states:
353
+ encoder_states = encoder_states + (hidden_states,)
354
+
355
+ if not return_dict:
356
+ return tuple(v for v in [hidden_states, encoder_states] if v is not None)
357
+ return BaseModelOutput(
358
+ last_hidden_state=hidden_states, hidden_states=encoder_states
359
+ )
360
+
361
+
362
+ class InternVisionModel(PreTrainedModel):
363
+ main_input_name = 'pixel_values'
364
+ config_class = InternVisionConfig
365
+ _no_split_modules = ['InternVisionEncoderLayer']
366
+
367
+ def __init__(self, config: InternVisionConfig):
368
+ super().__init__(config)
369
+ self.config = config
370
+
371
+ self.embeddings = InternVisionEmbeddings(config)
372
+ self.encoder = InternVisionEncoder(config)
373
+
374
+ def resize_pos_embeddings(self, old_size, new_size, patch_size):
375
+ pos_emb = self.embeddings.position_embedding
376
+ _, num_positions, embed_dim = pos_emb.shape
377
+ cls_emb = pos_emb[:, :1, :]
378
+ pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
379
+ pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
380
+ pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
381
+ pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
382
+ self.embeddings.position_embedding = nn.Parameter(pos_emb)
383
+ self.embeddings.image_size = new_size
384
+ logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
385
+
386
+ def get_input_embeddings(self):
387
+ return self.embeddings
388
+
389
+ def forward(
390
+ self,
391
+ pixel_values: Optional[torch.FloatTensor] = None,
392
+ output_hidden_states: Optional[bool] = None,
393
+ return_dict: Optional[bool] = None,
394
+ pixel_embeds: Optional[torch.FloatTensor] = None,
395
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
396
+ output_hidden_states = (
397
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
398
+ )
399
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
400
+
401
+ if pixel_values is None and pixel_embeds is None:
402
+ raise ValueError('You have to specify pixel_values or pixel_embeds')
403
+
404
+ if pixel_embeds is not None:
405
+ hidden_states = pixel_embeds
406
+ else:
407
+ if len(pixel_values.shape) == 4:
408
+ hidden_states = self.embeddings(pixel_values)
409
+ else:
410
+ raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
411
+ encoder_outputs = self.encoder(
412
+ inputs_embeds=hidden_states,
413
+ output_hidden_states=output_hidden_states,
414
+ return_dict=return_dict,
415
+ )
416
+ last_hidden_state = encoder_outputs.last_hidden_state
417
+ pooled_output = last_hidden_state[:, 0, :]
418
+
419
+ if not return_dict:
420
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
421
+
422
+ return BaseModelOutputWithPooling(
423
+ last_hidden_state=last_hidden_state,
424
+ pooler_output=pooled_output,
425
+ hidden_states=encoder_outputs.hidden_states,
426
+ attentions=encoder_outputs.attentions,
427
+ )
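A small sketch of how this vision tower is called in isolation (illustrative only; `vision` stands for an already-loaded `InternVisionModel`, e.g. the wrapper's `model.vision_model`, here assumed to be in bfloat16 on GPU, and 448 matches the default tile size used by `load_image`).

```python
import torch

tiles = torch.randn(2, 3, 448, 448, dtype=torch.bfloat16, device="cuda")
out = vision(pixel_values=tiles, output_hidden_states=False, return_dict=True)
patch_tokens = out.last_hidden_state   # [2, 1 + (448 // patch_size) ** 2, hidden_size], CLS token first
pooled = out.pooler_output             # [2, hidden_size], the CLS position
```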
modeling_voicelm.py ADDED
@@ -0,0 +1,192 @@
1
+ # --------------------------------------------------------
2
+ # SenseTime
3
+ # Copyright (c) 2025 SenseTime
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from typing import List
7
+ import math
8
+ import torch
9
+ from torch import nn
10
+ from transformers import Qwen2ForCausalLM
11
+ from transformers import PreTrainedModel
12
+ import logging
13
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
14
+ logger = logging.getLogger(__name__)
15
+
16
+ from .configuration_voicelm import VoiceLMConfig
17
+
18
+ class Qwen2Encoder(torch.nn.Module):
19
+ def __init__(self, config):
20
+ super().__init__()
21
+ self.model = Qwen2ForCausalLM(config)
22
+ pass
23
+
24
+ def forward_one_step(self, xs, masks, cache=None):
25
+ input_masks = masks[:, -1, :]
26
+ outs = self.model(
27
+ inputs_embeds=xs,
28
+ attention_mask=input_masks,
29
+ output_hidden_states=True,
30
+ return_dict=True,
31
+ use_cache=True,
32
+ past_key_values=cache,
33
+ )
34
+ xs = outs.hidden_states[-1]
35
+ new_cache = outs.past_key_values
36
+ return xs, new_cache
37
+
38
+ class VoiceLM(PreTrainedModel):
39
+ """
40
+ voicelm model
41
+ """
42
+ def __init__(self, config: VoiceLMConfig):
43
+ super().__init__(config)
44
+ self.llm_input_size = config.llm_input_size
45
+ self.llm_output_size = config.llm_output_size
46
+ self.speech_token_size = config.speech_token_size # 6561
47
+ self.sampling_config = config.sampling_config
48
+
49
+ self.sos_eos = 0
50
+ self.task_id = 1
51
+ self.fill_token = 2
52
+
53
+ self.llm_embedding = torch.nn.Embedding(2, config.llm_input_size)
54
+ self.llm = Qwen2Encoder(config.llm_config)
55
+ self.llm_decoder = nn.Linear(config.llm_output_size, config.speech_token_size + 3)
56
+
57
+ # speech token embedding (6564, 896)
58
+ self.speech_embedding = torch.nn.Embedding(
59
+ config.speech_token_size + 3,
60
+ config.llm_input_size,
61
+ )
62
+ pass
63
+
64
+ # Repetition Aware Sampling in VALL-E 2
65
+ def ras_sampling(self, weighted_scores:torch.Tensor, decoded_tokens, sampling, top_p=0.8, top_k=25, win_size=10, tau_r=0.1):
66
+ top_ids = self.nucleus_sampling(weighted_scores, top_p=top_p, top_k=top_k)
67
+ rep_num = (torch.tensor(decoded_tokens[-win_size:]).to(weighted_scores.device) == top_ids).sum().item()
68
+ if rep_num >= win_size * tau_r:
69
+ top_ids = self.random_sampling(weighted_scores, decoded_tokens, sampling)
70
+ return top_ids
71
+
72
+ def nucleus_sampling(self, weighted_scores:torch.Tensor, top_p=0.8, top_k=25):
73
+ prob, indices = [], []
74
+ cum_prob = 0.0
75
+ sorted_value, sorted_idx = weighted_scores.softmax(dim=0).sort(descending=True, stable=True)
76
+ for i in range(len(sorted_idx)):
77
+ # sampling both top-p and numbers.
78
+ if cum_prob < top_p and len(prob) < top_k:
79
+ cum_prob += sorted_value[i]
80
+ prob.append(sorted_value[i])
81
+ indices.append(sorted_idx[i])
82
+ else:
83
+ break
84
+ prob = torch.tensor(prob).to(weighted_scores)
85
+ indices = torch.tensor(indices, dtype=torch.long).to(weighted_scores.device)
86
+ top_ids = indices[prob.multinomial(1, replacement=True)]
87
+ return top_ids
88
+
89
+ def random_sampling(self, weighted_scores:torch.Tensor, decoded_tokens, sampling):
90
+ top_ids = weighted_scores.softmax(dim=0).multinomial(1, replacement=True)
91
+ return top_ids
92
+
93
+ def sampling_ids(
94
+ self,
95
+ weighted_scores: torch.Tensor,
96
+ decoded_tokens: List,
97
+ sampling: int,
98
+ ignore_eos: bool = True,
99
+ ):
100
+ num_trials, max_trials = 0, 100
101
+ while True:
102
+ top_ids = self.ras_sampling(weighted_scores, decoded_tokens, sampling, **self.sampling_config)
103
+ if (not ignore_eos) or (self.speech_token_size not in top_ids):
104
+ break
105
+ num_trials += 1
106
+ if num_trials > max_trials:
107
+ raise RuntimeError('sampling reached max_trials {} and still got eos while ignore_eos is True, check your input!'.format(max_trials))
108
+ return top_ids
109
+
110
+ @torch.inference_mode()
111
+ def inference_bistream(
112
+ self,
113
+ input_feature: torch.Tensor,
114
+ target_text_feature: torch.Tensor,
115
+ sampling: int = 25,
116
+ mix_ratio: List[int] = [5, 25],
117
+ ):
118
+ text_token_len = target_text_feature.size(1)
119
+ # 1. prepare input
120
+ sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
121
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
122
+ lm_input = torch.concat([sos_eos_emb, input_feature], dim=1)
123
+
124
+ # 2. iterate text
125
+ out_tokens = []
126
+ return_out_tokens = []
127
+ cache = None
128
+
129
+ text_cache = target_text_feature
130
+ next_fill_index = -1
131
+
132
+ for j in range(int(math.floor((text_token_len) / mix_ratio[0] ))):
133
+ if (len(out_tokens) != 0 and out_tokens[-1] == self.speech_token_size + 2) or (len(out_tokens) == 0 and lm_input.size(1) == (1 + input_feature.size(1))):
134
+ logger.info('get fill token, need to append more text token')
135
+ if text_cache.size(1) >= mix_ratio[0]:
136
+ lm_input_text = text_cache[:, :mix_ratio[0]]
137
+ logger.info('append {} text token'.format(lm_input_text.size(1)))
138
+ if len(out_tokens) != 0 and out_tokens[-1] == self.speech_token_size + 2:
139
+ lm_input = lm_input_text
140
+ else:
141
+ lm_input = torch.concat([lm_input, lm_input_text], dim=1)
142
+ text_cache = text_cache[:, mix_ratio[0]:]
143
+ else:
144
+ logger.info('not enough text token to decode, wait for more')
145
+ continue
146
+ while True:
147
+ seq_len = lm_input.shape[1] if cache is None else lm_input.shape[1] + cache[0][0].size(2)
148
+ y_pred, cache = self.llm.forward_one_step(lm_input,
149
+ masks=torch.tril(torch.ones((1, seq_len, seq_len), device=lm_input.device)).to(torch.bool),
150
+ cache=cache)
151
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
152
+ if next_fill_index != -1 and len(out_tokens) == next_fill_index:
153
+ top_ids = self.speech_token_size + 2
154
+ next_fill_index += (mix_ratio[1] + 1)
155
+ else:
156
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True).item()
157
+ if top_ids == self.speech_token_size + 2:
158
+ next_fill_index = len(out_tokens) + mix_ratio[1] + 1
159
+ logger.info('fill_token index {} next fill_token index {}'.format(len(out_tokens), next_fill_index))
160
+ out_tokens.append(top_ids)
161
+ if top_ids >= self.speech_token_size:
162
+ if top_ids == self.speech_token_size + 2:
163
+ break
164
+ else:
165
+ raise ValueError('should not get token {}'.format(top_ids))
166
+ # yield top_ids
167
+
168
+ return_out_tokens.append(top_ids)
169
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
170
+
171
+ # 3. final decode
172
+ lm_input = torch.concat([lm_input, text_cache, task_id_emb], dim=1)
173
+ logger.info('no more text token, decode until met eos')
174
+ while True:
175
+ seq_len = lm_input.shape[1] if cache is None else lm_input.shape[1] + cache[0][0].size(2)
176
+ y_pred, cache = self.llm.forward_one_step(lm_input,
177
+ masks=torch.tril(torch.ones((1, seq_len, seq_len), device=lm_input.device)).to(torch.bool),
178
+ cache=cache)
179
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
180
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=False).item()
181
+ out_tokens.append(top_ids)
182
+ if top_ids >= self.speech_token_size:
183
+ if top_ids == self.speech_token_size:
184
+ break
185
+ else:
186
+ raise ValueError('should not get token {}'.format(top_ids))
187
+ # in stream mode, yield token one by one
188
+ # yield top_ids
189
+
190
+ return_out_tokens.append(top_ids)
191
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
192
+ return return_out_tokens
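
The sampling helpers above combine top-p/top-k (nucleus) sampling with the Repetition Aware Sampling fallback from VALL-E 2: when the nucleus candidate already accounts for at least `tau_r` of the last `win_size` decoded speech tokens, the sampler falls back to plain multinomial sampling over the full distribution. The sketch below reproduces that logic as standalone functions for quick experimentation; it is an illustrative approximation with the same default hyperparameters (`top_p=0.8`, `top_k=25`, `win_size=10`, `tau_r=0.1`), not the shipped class methods, and the random logits in the demo are placeholders.

```python
import torch
from typing import List


def nucleus_sample(logits: torch.Tensor, top_p: float = 0.8, top_k: int = 25) -> int:
    """Sample one id from the smallest prefix of the sorted distribution whose
    mass reaches top_p, capped at top_k candidates."""
    probs = logits.softmax(dim=0)
    sorted_probs, sorted_idx = probs.sort(descending=True, stable=True)
    cum = torch.cumsum(sorted_probs, dim=0)
    keep = min(top_k, int((cum < top_p).sum().item()) + 1)  # always keep at least one
    choice = torch.multinomial(sorted_probs[:keep], 1, replacement=True)
    return int(sorted_idx[choice])


def ras_sample(logits: torch.Tensor, decoded: List[int],
               top_p: float = 0.8, top_k: int = 25,
               win_size: int = 10, tau_r: float = 0.1) -> int:
    """Repetition Aware Sampling: reject the nucleus candidate if it already
    fills tau_r of the recent window, and resample from the full distribution."""
    cand = nucleus_sample(logits, top_p=top_p, top_k=top_k)
    if decoded[-win_size:].count(cand) >= win_size * tau_r:
        cand = int(torch.multinomial(logits.softmax(dim=0), 1, replacement=True))
    return cand


if __name__ == "__main__":
    torch.manual_seed(0)
    vocab_size = 6561 + 3          # speech codebook plus the three extra control ids
    decoded: List[int] = []
    for _ in range(30):            # placeholder logits stand in for the llm_decoder output
        decoded.append(ras_sample(torch.randn(vocab_size), decoded))
    print(decoded)
```

In `inference_bistream` this sampler is additionally interleaved with the text stream: every `mix_ratio[0]` (5) text tokens admit up to `mix_ratio[1]` (25) speech tokens before a fill token (`speech_token_size + 2`) hands control back to the text side, which is how the model keeps synthesizing speech while the text reply is still being produced.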
modeling_whisper.py ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,330 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<IMG_CONTEXT>",
17
+ "<img>",
18
+ "</img>",
19
+ "<quad>",
20
+ "</quad>",
21
+ "<ref>",
22
+ "</ref>",
23
+ "<box>",
24
+ "</box>",
25
+ "<|action_start|>",
26
+ "<|action_end|>",
27
+ "<|plugin|>",
28
+ "<|interpreter|>",
29
+ "<FAKE_PAD_0>",
30
+ "<FAKE_PAD_1>",
31
+ "<FAKE_PAD_2>",
32
+ "<FAKE_PAD_3>",
33
+ "<FAKE_PAD_4>",
34
+ "<FAKE_PAD_5>",
35
+ "<FAKE_PAD_6>",
36
+ "<FAKE_PAD_7>",
37
+ "<FAKE_PAD_8>",
38
+ "<FAKE_PAD_9>",
39
+ "<FAKE_PAD_10>",
40
+ "<FAKE_PAD_11>",
41
+ "<FAKE_PAD_12>",
42
+ "<FAKE_PAD_13>",
43
+ "<FAKE_PAD_14>",
44
+ "<FAKE_PAD_15>",
45
+ "<FAKE_PAD_16>",
46
+ "<FAKE_PAD_17>",
47
+ "<FAKE_PAD_18>",
48
+ "<FAKE_PAD_19>",
49
+ "<FAKE_PAD_20>",
50
+ "<FAKE_PAD_21>",
51
+ "<FAKE_PAD_22>",
52
+ "<FAKE_PAD_23>",
53
+ "<FAKE_PAD_24>",
54
+ "<FAKE_PAD_25>",
55
+ "<FAKE_PAD_26>",
56
+ "<FAKE_PAD_27>",
57
+ "<FAKE_PAD_28>",
58
+ "<FAKE_PAD_29>",
59
+ "<FAKE_PAD_30>",
60
+ "<FAKE_PAD_31>",
61
+ "<FAKE_PAD_32>",
62
+ "<FAKE_PAD_33>",
63
+ "<FAKE_PAD_34>",
64
+ "<FAKE_PAD_35>",
65
+ "<FAKE_PAD_36>",
66
+ "<FAKE_PAD_37>",
67
+ "<FAKE_PAD_38>",
68
+ "<FAKE_PAD_39>",
69
+ "<FAKE_PAD_40>",
70
+ "<FAKE_PAD_41>",
71
+ "<FAKE_PAD_42>",
72
+ "<FAKE_PAD_43>",
73
+ "<FAKE_PAD_44>",
74
+ "<FAKE_PAD_45>",
75
+ "<FAKE_PAD_46>",
76
+ "<FAKE_PAD_47>",
77
+ "<FAKE_PAD_48>",
78
+ "<FAKE_PAD_49>",
79
+ "<FAKE_PAD_50>",
80
+ "<FAKE_PAD_51>",
81
+ "<FAKE_PAD_52>",
82
+ "<FAKE_PAD_53>",
83
+ "<FAKE_PAD_54>",
84
+ "<FAKE_PAD_55>",
85
+ "<FAKE_PAD_56>",
86
+ "<FAKE_PAD_57>",
87
+ "<FAKE_PAD_58>",
88
+ "<FAKE_PAD_59>",
89
+ "<FAKE_PAD_60>",
90
+ "<FAKE_PAD_61>",
91
+ "<FAKE_PAD_62>",
92
+ "<FAKE_PAD_63>",
93
+ "<FAKE_PAD_64>",
94
+ "<FAKE_PAD_65>",
95
+ "<FAKE_PAD_66>",
96
+ "<FAKE_PAD_67>",
97
+ "<FAKE_PAD_68>",
98
+ "<FAKE_PAD_69>",
99
+ "<FAKE_PAD_70>",
100
+ "<FAKE_PAD_71>",
101
+ "<FAKE_PAD_72>",
102
+ "<FAKE_PAD_73>",
103
+ "<FAKE_PAD_74>",
104
+ "<FAKE_PAD_75>",
105
+ "<FAKE_PAD_76>",
106
+ "<FAKE_PAD_77>",
107
+ "<FAKE_PAD_78>",
108
+ "<FAKE_PAD_79>",
109
+ "<FAKE_PAD_80>",
110
+ "<FAKE_PAD_81>",
111
+ "<FAKE_PAD_82>",
112
+ "<FAKE_PAD_83>",
113
+ "<FAKE_PAD_84>",
114
+ "<FAKE_PAD_85>",
115
+ "<FAKE_PAD_86>",
116
+ "<FAKE_PAD_87>",
117
+ "<FAKE_PAD_88>",
118
+ "<FAKE_PAD_89>",
119
+ "<FAKE_PAD_90>",
120
+ "<FAKE_PAD_91>",
121
+ "<FAKE_PAD_92>",
122
+ "<FAKE_PAD_93>",
123
+ "<FAKE_PAD_94>",
124
+ "<FAKE_PAD_95>",
125
+ "<FAKE_PAD_96>",
126
+ "<FAKE_PAD_97>",
127
+ "<FAKE_PAD_98>",
128
+ "<FAKE_PAD_99>",
129
+ "<FAKE_PAD_100>",
130
+ "<FAKE_PAD_101>",
131
+ "<FAKE_PAD_102>",
132
+ "<FAKE_PAD_103>",
133
+ "<FAKE_PAD_104>",
134
+ "<FAKE_PAD_105>",
135
+ "<FAKE_PAD_106>",
136
+ "<FAKE_PAD_107>",
137
+ "<FAKE_PAD_108>",
138
+ "<FAKE_PAD_109>",
139
+ "<FAKE_PAD_110>",
140
+ "<FAKE_PAD_111>",
141
+ "<FAKE_PAD_112>",
142
+ "<FAKE_PAD_113>",
143
+ "<FAKE_PAD_114>",
144
+ "<FAKE_PAD_115>",
145
+ "<FAKE_PAD_116>",
146
+ "<FAKE_PAD_117>",
147
+ "<FAKE_PAD_118>",
148
+ "<FAKE_PAD_119>",
149
+ "<FAKE_PAD_120>",
150
+ "<FAKE_PAD_121>",
151
+ "<FAKE_PAD_122>",
152
+ "<FAKE_PAD_123>",
153
+ "<FAKE_PAD_124>",
154
+ "<FAKE_PAD_125>",
155
+ "<FAKE_PAD_126>",
156
+ "<FAKE_PAD_127>",
157
+ "<FAKE_PAD_128>",
158
+ "<FAKE_PAD_129>",
159
+ "<FAKE_PAD_130>",
160
+ "<FAKE_PAD_131>",
161
+ "<FAKE_PAD_132>",
162
+ "<FAKE_PAD_133>",
163
+ "<FAKE_PAD_134>",
164
+ "<FAKE_PAD_135>",
165
+ "<FAKE_PAD_136>",
166
+ "<FAKE_PAD_137>",
167
+ "<FAKE_PAD_138>",
168
+ "<FAKE_PAD_139>",
169
+ "<FAKE_PAD_140>",
170
+ "<FAKE_PAD_141>",
171
+ "<FAKE_PAD_142>",
172
+ "<FAKE_PAD_143>",
173
+ "<FAKE_PAD_144>",
174
+ "<FAKE_PAD_145>",
175
+ "<FAKE_PAD_146>",
176
+ "<FAKE_PAD_147>",
177
+ "<FAKE_PAD_148>",
178
+ "<FAKE_PAD_149>",
179
+ "<FAKE_PAD_150>",
180
+ "<FAKE_PAD_151>",
181
+ "<FAKE_PAD_152>",
182
+ "<FAKE_PAD_153>",
183
+ "<FAKE_PAD_154>",
184
+ "<FAKE_PAD_155>",
185
+ "<FAKE_PAD_156>",
186
+ "<FAKE_PAD_157>",
187
+ "<FAKE_PAD_158>",
188
+ "<FAKE_PAD_159>",
189
+ "<FAKE_PAD_160>",
190
+ "<FAKE_PAD_161>",
191
+ "<FAKE_PAD_162>",
192
+ "<FAKE_PAD_163>",
193
+ "<FAKE_PAD_164>",
194
+ "<FAKE_PAD_165>",
195
+ "<FAKE_PAD_166>",
196
+ "<FAKE_PAD_167>",
197
+ "<FAKE_PAD_168>",
198
+ "<FAKE_PAD_169>",
199
+ "<FAKE_PAD_170>",
200
+ "<FAKE_PAD_171>",
201
+ "<FAKE_PAD_172>",
202
+ "<FAKE_PAD_173>",
203
+ "<FAKE_PAD_174>",
204
+ "<FAKE_PAD_175>",
205
+ "<FAKE_PAD_176>",
206
+ "<FAKE_PAD_177>",
207
+ "<FAKE_PAD_178>",
208
+ "<FAKE_PAD_179>",
209
+ "<FAKE_PAD_180>",
210
+ "<FAKE_PAD_181>",
211
+ "<FAKE_PAD_182>",
212
+ "<FAKE_PAD_183>",
213
+ "<FAKE_PAD_184>",
214
+ "<FAKE_PAD_185>",
215
+ "<FAKE_PAD_186>",
216
+ "<FAKE_PAD_187>",
217
+ "<FAKE_PAD_188>",
218
+ "<FAKE_PAD_189>",
219
+ "<FAKE_PAD_190>",
220
+ "<FAKE_PAD_191>",
221
+ "<FAKE_PAD_192>",
222
+ "<FAKE_PAD_193>",
223
+ "<FAKE_PAD_194>",
224
+ "<FAKE_PAD_195>",
225
+ "<FAKE_PAD_196>",
226
+ "<FAKE_PAD_197>",
227
+ "<FAKE_PAD_198>",
228
+ "<FAKE_PAD_199>",
229
+ "<FAKE_PAD_200>",
230
+ "<FAKE_PAD_201>",
231
+ "<FAKE_PAD_202>",
232
+ "<FAKE_PAD_203>",
233
+ "<FAKE_PAD_204>",
234
+ "<FAKE_PAD_205>",
235
+ "<FAKE_PAD_206>",
236
+ "<FAKE_PAD_207>",
237
+ "<FAKE_PAD_208>",
238
+ "<FAKE_PAD_209>",
239
+ "<FAKE_PAD_210>",
240
+ "<FAKE_PAD_211>",
241
+ "<FAKE_PAD_212>",
242
+ "<FAKE_PAD_213>",
243
+ "<FAKE_PAD_214>",
244
+ "<FAKE_PAD_215>",
245
+ "<FAKE_PAD_216>",
246
+ "<FAKE_PAD_217>",
247
+ "<FAKE_PAD_218>",
248
+ "<FAKE_PAD_219>",
249
+ "<FAKE_PAD_220>",
250
+ "<FAKE_PAD_221>",
251
+ "<FAKE_PAD_222>",
252
+ "<FAKE_PAD_223>",
253
+ "<FAKE_PAD_224>",
254
+ "<FAKE_PAD_225>",
255
+ "<FAKE_PAD_226>",
256
+ "<FAKE_PAD_227>",
257
+ "<FAKE_PAD_228>",
258
+ "<FAKE_PAD_229>",
259
+ "<FAKE_PAD_230>",
260
+ "<FAKE_PAD_231>",
261
+ "<FAKE_PAD_232>",
262
+ "<FAKE_PAD_233>",
263
+ "<FAKE_PAD_234>",
264
+ "<FAKE_PAD_235>",
265
+ "<FAKE_PAD_236>",
266
+ "<FAKE_PAD_237>",
267
+ "<FAKE_PAD_238>",
268
+ "<FAKE_PAD_239>",
269
+ "<FAKE_PAD_240>",
270
+ "<FAKE_PAD_241>",
271
+ "<FAKE_PAD_242>",
272
+ "<FAKE_PAD_243>",
273
+ "<FAKE_PAD_244>",
274
+ "<FAKE_PAD_245>",
275
+ "<FAKE_PAD_246>",
276
+ "<FAKE_PAD_247>",
277
+ "<FAKE_PAD_248>",
278
+ "<FAKE_PAD_249>",
279
+ "<FAKE_PAD_250>",
280
+ "<FAKE_PAD_251>",
281
+ "<FAKE_PAD_252>",
282
+ "<FAKE_PAD_253>",
283
+ "<audio>",
284
+ "</audio>",
285
+ "<AUDIO_CONTEXT>",
286
+ "<interrupt>",
287
+ "<FAKE_PAD_PAD_0>",
288
+ "<FAKE_PAD_PAD_1>",
289
+ "<FAKE_PAD_PAD_2>",
290
+ "<FAKE_PAD_PAD_3>",
291
+ "<FAKE_PAD_PAD_4>",
292
+ "<FAKE_PAD_PAD_5>",
293
+ "<FAKE_PAD_PAD_6>",
294
+ "<FAKE_PAD_PAD_7>",
295
+ "<FAKE_PAD_PAD_8>",
296
+ "<FAKE_PAD_PAD_9>",
297
+ "<FAKE_PAD_PAD_10>",
298
+ "<FAKE_PAD_PAD_11>",
299
+ "<FAKE_PAD_PAD_12>",
300
+ "<FAKE_PAD_PAD_13>",
301
+ "<FAKE_PAD_PAD_14>",
302
+ "<FAKE_PAD_PAD_15>",
303
+ "<FAKE_PAD_PAD_16>",
304
+ "<FAKE_PAD_PAD_17>",
305
+ "<FAKE_PAD_PAD_18>",
306
+ "<FAKE_PAD_PAD_19>",
307
+ "<FAKE_PAD_PAD_20>",
308
+ "<FAKE_PAD_PAD_21>",
309
+ "<FAKE_PAD_PAD_22>",
310
+ "<FAKE_PAD_PAD_23>",
311
+ "<FAKE_PAD_PAD_24>",
312
+ "<FAKE_PAD_PAD_25>",
313
+ "<FAKE_PAD_PAD_26>",
314
+ "<FAKE_PAD_PAD_27>"
315
+ ],
316
+ "eos_token": {
317
+ "content": "<|im_end|>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false
322
+ },
323
+ "pad_token": {
324
+ "content": "<|endoftext|>",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false
329
+ }
330
+ }
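
special_tokens_map.json registers the chat and multimodal control tokens (image and audio context markers, the `<interrupt>` token used during interaction, and a block of `<FAKE_PAD_*>` placeholders that reserve vocabulary ids), and sets `<|im_end|>` as the end-of-sequence token and `<|endoftext|>` as padding. A minimal sketch of checking this after loading, assuming the standard Hugging Face `AutoTokenizer` API and a locally downloaded copy of the repository (the path below is illustrative):

```python
from transformers import AutoTokenizer

# Illustrative local path to the downloaded repository.
tokenizer = AutoTokenizer.from_pretrained("./InteractiveOmni", trust_remote_code=True)

print(tokenizer.eos_token)   # expected: <|im_end|>
print(tokenizer.pad_token)   # expected: <|endoftext|>

# Multimodal control tokens should map to single ids rather than being split by BPE.
for tok in ("<img>", "<IMG_CONTEXT>", "</img>", "<audio>", "<AUDIO_CONTEXT>", "</audio>", "<interrupt>"):
    print(tok, tokenizer.convert_tokens_to_ids(tok))
```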
taozi.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3d286d93323ff1ed598503c40cf028dc3faa946c662fa8d509b201165d56356
3
+ size 807404
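
The audio sample `taozi.wav` is tracked with Git LFS, so only a three-line pointer is committed: the pointer spec version, the SHA-256 object id, and the size in bytes; the waveform itself is fetched from LFS storage on checkout. A small illustrative helper for reading such a pointer file (not part of the repository code):

```python
from pathlib import Path


def read_lfs_pointer(path: str) -> dict:
    """Parse a Git LFS pointer file, which stores 'key value' pairs one per line."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields


# Only meaningful while the file is still the raw pointer,
# e.g. in a clone made with GIT_LFS_SKIP_SMUDGE=1.
print(read_lfs_pointer("taozi.wav"))
# Expected shape: {'version': 'https://git-lfs.github.com/spec/v1',
#                  'oid': 'sha256:<hash>', 'size': '807404'}
```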
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,2931 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<tool_response>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "151666": {
191
+ "content": "</tool_response>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "151667": {
199
+ "content": "<think>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "151668": {
207
+ "content": "</think>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "151669": {
215
+ "content": "<IMG_CONTEXT>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "151670": {
223
+ "content": "<img>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "151671": {
231
+ "content": "</img>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "151672": {
239
+ "content": "<quad>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "151673": {
247
+ "content": "</quad>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "151674": {
255
+ "content": "<ref>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "151675": {
263
+ "content": "</ref>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "151676": {
271
+ "content": "<box>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "151677": {
279
+ "content": "</box>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "151678": {
287
+ "content": "<|action_start|>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ },
294
+ "151679": {
295
+ "content": "<|action_end|>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": true
301
+ },
302
+ "151680": {
303
+ "content": "<|plugin|>",
304
+ "lstrip": false,
305
+ "normalized": false,
306
+ "rstrip": false,
307
+ "single_word": false,
308
+ "special": true
309
+ },
310
+ "151681": {
311
+ "content": "<|interpreter|>",
312
+ "lstrip": false,
313
+ "normalized": false,
314
+ "rstrip": false,
315
+ "single_word": false,
316
+ "special": true
317
+ },
318
+ "151682": {
319
+ "content": "<FAKE_PAD_0>",
320
+ "lstrip": false,
321
+ "normalized": false,
322
+ "rstrip": false,
323
+ "single_word": false,
324
+ "special": true
325
+ },
326
+ "151683": {
327
+ "content": "<FAKE_PAD_1>",
328
+ "lstrip": false,
329
+ "normalized": false,
330
+ "rstrip": false,
331
+ "single_word": false,
332
+ "special": true
333
+ },
334
+ "151684": {
335
+ "content": "<FAKE_PAD_2>",
336
+ "lstrip": false,
337
+ "normalized": false,
338
+ "rstrip": false,
339
+ "single_word": false,
340
+ "special": true
341
+ },
342
+ "151685": {
343
+ "content": "<FAKE_PAD_3>",
344
+ "lstrip": false,
345
+ "normalized": false,
346
+ "rstrip": false,
347
+ "single_word": false,
348
+ "special": true
349
+ },
350
+ "151686": {
351
+ "content": "<FAKE_PAD_4>",
352
+ "lstrip": false,
353
+ "normalized": false,
354
+ "rstrip": false,
355
+ "single_word": false,
356
+ "special": true
357
+ },
358
+ "151687": {
359
+ "content": "<FAKE_PAD_5>",
360
+ "lstrip": false,
361
+ "normalized": false,
362
+ "rstrip": false,
363
+ "single_word": false,
364
+ "special": true
365
+ },
366
+ "151688": {
367
+ "content": "<FAKE_PAD_6>",
368
+ "lstrip": false,
369
+ "normalized": false,
370
+ "rstrip": false,
371
+ "single_word": false,
372
+ "special": true
373
+ },
374
+ "151689": {
375
+ "content": "<FAKE_PAD_7>",
376
+ "lstrip": false,
377
+ "normalized": false,
378
+ "rstrip": false,
379
+ "single_word": false,
380
+ "special": true
381
+ },
382
+ "151690": {
383
+ "content": "<FAKE_PAD_8>",
384
+ "lstrip": false,
385
+ "normalized": false,
386
+ "rstrip": false,
387
+ "single_word": false,
388
+ "special": true
389
+ },
390
+ "151691": {
391
+ "content": "<FAKE_PAD_9>",
392
+ "lstrip": false,
393
+ "normalized": false,
394
+ "rstrip": false,
395
+ "single_word": false,
396
+ "special": true
397
+ },
398
+ "151692": {
399
+ "content": "<FAKE_PAD_10>",
400
+ "lstrip": false,
401
+ "normalized": false,
402
+ "rstrip": false,
403
+ "single_word": false,
404
+ "special": true
405
+ },
406
+ "151693": {
407
+ "content": "<FAKE_PAD_11>",
408
+ "lstrip": false,
409
+ "normalized": false,
410
+ "rstrip": false,
411
+ "single_word": false,
412
+ "special": true
413
+ },
414
+ "151694": {
415
+ "content": "<FAKE_PAD_12>",
416
+ "lstrip": false,
417
+ "normalized": false,
418
+ "rstrip": false,
419
+ "single_word": false,
420
+ "special": true
421
+ },
422
+ "151695": {
423
+ "content": "<FAKE_PAD_13>",
424
+ "lstrip": false,
425
+ "normalized": false,
426
+ "rstrip": false,
427
+ "single_word": false,
428
+ "special": true
429
+ },
430
+ "151696": {
431
+ "content": "<FAKE_PAD_14>",
432
+ "lstrip": false,
433
+ "normalized": false,
434
+ "rstrip": false,
435
+ "single_word": false,
436
+ "special": true
437
+ },
438
+ "151697": {
439
+ "content": "<FAKE_PAD_15>",
440
+ "lstrip": false,
441
+ "normalized": false,
442
+ "rstrip": false,
443
+ "single_word": false,
444
+ "special": true
445
+ },
446
+ "151698": {
447
+ "content": "<FAKE_PAD_16>",
448
+ "lstrip": false,
449
+ "normalized": false,
450
+ "rstrip": false,
451
+ "single_word": false,
452
+ "special": true
453
+ },
454
+ "151699": {
455
+ "content": "<FAKE_PAD_17>",
456
+ "lstrip": false,
457
+ "normalized": false,
458
+ "rstrip": false,
459
+ "single_word": false,
460
+ "special": true
461
+ },
462
+ "151700": {
463
+ "content": "<FAKE_PAD_18>",
464
+ "lstrip": false,
465
+ "normalized": false,
466
+ "rstrip": false,
467
+ "single_word": false,
468
+ "special": true
469
+ },
470
+ "151701": {
471
+ "content": "<FAKE_PAD_19>",
472
+ "lstrip": false,
473
+ "normalized": false,
474
+ "rstrip": false,
475
+ "single_word": false,
476
+ "special": true
477
+ },
478
+ "151702": {
479
+ "content": "<FAKE_PAD_20>",
480
+ "lstrip": false,
481
+ "normalized": false,
482
+ "rstrip": false,
483
+ "single_word": false,
484
+ "special": true
485
+ },
486
+ "151703": {
487
+ "content": "<FAKE_PAD_21>",
488
+ "lstrip": false,
489
+ "normalized": false,
490
+ "rstrip": false,
491
+ "single_word": false,
492
+ "special": true
493
+ },
494
+ "151704": {
495
+ "content": "<FAKE_PAD_22>",
496
+ "lstrip": false,
497
+ "normalized": false,
498
+ "rstrip": false,
499
+ "single_word": false,
500
+ "special": true
501
+ },
502
+ "151705": {
503
+ "content": "<FAKE_PAD_23>",
504
+ "lstrip": false,
505
+ "normalized": false,
506
+ "rstrip": false,
507
+ "single_word": false,
508
+ "special": true
509
+ },
510
+ "151706": {
511
+ "content": "<FAKE_PAD_24>",
512
+ "lstrip": false,
513
+ "normalized": false,
514
+ "rstrip": false,
515
+ "single_word": false,
516
+ "special": true
517
+ },
518
+ "151707": {
519
+ "content": "<FAKE_PAD_25>",
520
+ "lstrip": false,
521
+ "normalized": false,
522
+ "rstrip": false,
523
+ "single_word": false,
524
+ "special": true
525
+ },
526
+ "151708": {
527
+ "content": "<FAKE_PAD_26>",
528
+ "lstrip": false,
529
+ "normalized": false,
530
+ "rstrip": false,
531
+ "single_word": false,
532
+ "special": true
533
+ },
534
+ "151709": {
535
+ "content": "<FAKE_PAD_27>",
536
+ "lstrip": false,
537
+ "normalized": false,
538
+ "rstrip": false,
539
+ "single_word": false,
540
+ "special": true
541
+ },
542
+ "151710": {
543
+ "content": "<FAKE_PAD_28>",
544
+ "lstrip": false,
545
+ "normalized": false,
546
+ "rstrip": false,
547
+ "single_word": false,
548
+ "special": true
549
+ },
550
+ "151711": {
551
+ "content": "<FAKE_PAD_29>",
552
+ "lstrip": false,
553
+ "normalized": false,
554
+ "rstrip": false,
555
+ "single_word": false,
556
+ "special": true
557
+ },
558
+ "151712": {
559
+ "content": "<FAKE_PAD_30>",
560
+ "lstrip": false,
561
+ "normalized": false,
562
+ "rstrip": false,
563
+ "single_word": false,
564
+ "special": true
565
+ },
566
+ "151713": {
567
+ "content": "<FAKE_PAD_31>",
568
+ "lstrip": false,
569
+ "normalized": false,
570
+ "rstrip": false,
571
+ "single_word": false,
572
+ "special": true
573
+ },
574
+ "151714": {
575
+ "content": "<FAKE_PAD_32>",
576
+ "lstrip": false,
577
+ "normalized": false,
578
+ "rstrip": false,
579
+ "single_word": false,
580
+ "special": true
581
+ },
582
+ "151715": {
583
+ "content": "<FAKE_PAD_33>",
584
+ "lstrip": false,
585
+ "normalized": false,
586
+ "rstrip": false,
587
+ "single_word": false,
588
+ "special": true
589
+ },
590
+ "151716": {
591
+ "content": "<FAKE_PAD_34>",
592
+ "lstrip": false,
593
+ "normalized": false,
594
+ "rstrip": false,
595
+ "single_word": false,
596
+ "special": true
597
+ },
598
+ "151717": {
599
+ "content": "<FAKE_PAD_35>",
600
+ "lstrip": false,
601
+ "normalized": false,
602
+ "rstrip": false,
603
+ "single_word": false,
604
+ "special": true
605
+ },
606
+ "151718": {
607
+ "content": "<FAKE_PAD_36>",
608
+ "lstrip": false,
609
+ "normalized": false,
610
+ "rstrip": false,
611
+ "single_word": false,
612
+ "special": true
613
+ },
614
+ "151719": {
615
+ "content": "<FAKE_PAD_37>",
616
+ "lstrip": false,
617
+ "normalized": false,
618
+ "rstrip": false,
619
+ "single_word": false,
620
+ "special": true
621
+ },
622
+ "151720": {
623
+ "content": "<FAKE_PAD_38>",
624
+ "lstrip": false,
625
+ "normalized": false,
626
+ "rstrip": false,
627
+ "single_word": false,
628
+ "special": true
629
+ },
630
+ "151721": {
631
+ "content": "<FAKE_PAD_39>",
632
+ "lstrip": false,
633
+ "normalized": false,
634
+ "rstrip": false,
635
+ "single_word": false,
636
+ "special": true
637
+ },
638
+ "151722": {
639
+ "content": "<FAKE_PAD_40>",
640
+ "lstrip": false,
641
+ "normalized": false,
642
+ "rstrip": false,
643
+ "single_word": false,
644
+ "special": true
645
+ },
646
+ "151723": {
647
+ "content": "<FAKE_PAD_41>",
648
+ "lstrip": false,
649
+ "normalized": false,
650
+ "rstrip": false,
651
+ "single_word": false,
652
+ "special": true
653
+ },
654
+ "151724": {
655
+ "content": "<FAKE_PAD_42>",
656
+ "lstrip": false,
657
+ "normalized": false,
658
+ "rstrip": false,
659
+ "single_word": false,
660
+ "special": true
661
+ },
662
+ "151725": {
663
+ "content": "<FAKE_PAD_43>",
664
+ "lstrip": false,
665
+ "normalized": false,
666
+ "rstrip": false,
667
+ "single_word": false,
668
+ "special": true
669
+ },
670
+ "151726": {
671
+ "content": "<FAKE_PAD_44>",
672
+ "lstrip": false,
673
+ "normalized": false,
674
+ "rstrip": false,
675
+ "single_word": false,
676
+ "special": true
677
+ },
678
+ "151727": {
679
+ "content": "<FAKE_PAD_45>",
680
+ "lstrip": false,
681
+ "normalized": false,
682
+ "rstrip": false,
683
+ "single_word": false,
684
+ "special": true
685
+ },
686
+ "151728": {
687
+ "content": "<FAKE_PAD_46>",
688
+ "lstrip": false,
689
+ "normalized": false,
690
+ "rstrip": false,
691
+ "single_word": false,
692
+ "special": true
693
+ },
694
+ "151729": {
695
+ "content": "<FAKE_PAD_47>",
696
+ "lstrip": false,
697
+ "normalized": false,
698
+ "rstrip": false,
699
+ "single_word": false,
700
+ "special": true
701
+ },
702
+ "151730": {
703
+ "content": "<FAKE_PAD_48>",
704
+ "lstrip": false,
705
+ "normalized": false,
706
+ "rstrip": false,
707
+ "single_word": false,
708
+ "special": true
709
+ },
710
+ "151731": {
711
+ "content": "<FAKE_PAD_49>",
712
+ "lstrip": false,
713
+ "normalized": false,
714
+ "rstrip": false,
715
+ "single_word": false,
716
+ "special": true
717
+ },
718
+ "151732": {
719
+ "content": "<FAKE_PAD_50>",
720
+ "lstrip": false,
721
+ "normalized": false,
722
+ "rstrip": false,
723
+ "single_word": false,
724
+ "special": true
725
+ },
726
+ "151733": {
727
+ "content": "<FAKE_PAD_51>",
728
+ "lstrip": false,
729
+ "normalized": false,
730
+ "rstrip": false,
731
+ "single_word": false,
732
+ "special": true
733
+ },
734
+ "151734": {
735
+ "content": "<FAKE_PAD_52>",
736
+ "lstrip": false,
737
+ "normalized": false,
738
+ "rstrip": false,
739
+ "single_word": false,
740
+ "special": true
741
+ },
742
+ "151735": {
743
+ "content": "<FAKE_PAD_53>",
744
+ "lstrip": false,
745
+ "normalized": false,
746
+ "rstrip": false,
747
+ "single_word": false,
748
+ "special": true
749
+ },
750
+ "151736": {
751
+ "content": "<FAKE_PAD_54>",
752
+ "lstrip": false,
753
+ "normalized": false,
754
+ "rstrip": false,
755
+ "single_word": false,
756
+ "special": true
757
+ },
758
+ "151737": {
759
+ "content": "<FAKE_PAD_55>",
760
+ "lstrip": false,
761
+ "normalized": false,
762
+ "rstrip": false,
763
+ "single_word": false,
764
+ "special": true
765
+ },
766
+ "151738": {
767
+ "content": "<FAKE_PAD_56>",
768
+ "lstrip": false,
769
+ "normalized": false,
770
+ "rstrip": false,
771
+ "single_word": false,
772
+ "special": true
773
+ },
774
+ "151739": {
775
+ "content": "<FAKE_PAD_57>",
776
+ "lstrip": false,
777
+ "normalized": false,
778
+ "rstrip": false,
779
+ "single_word": false,
780
+ "special": true
781
+ },
782
+ "151740": {
783
+ "content": "<FAKE_PAD_58>",
784
+ "lstrip": false,
785
+ "normalized": false,
786
+ "rstrip": false,
787
+ "single_word": false,
788
+ "special": true
789
+ },
790
+ "151741": {
791
+ "content": "<FAKE_PAD_59>",
792
+ "lstrip": false,
793
+ "normalized": false,
794
+ "rstrip": false,
795
+ "single_word": false,
796
+ "special": true
797
+ },
798
+ "151742": {
799
+ "content": "<FAKE_PAD_60>",
800
+ "lstrip": false,
801
+ "normalized": false,
802
+ "rstrip": false,
803
+ "single_word": false,
804
+ "special": true
805
+ },
806
+ "151743": {
807
+ "content": "<FAKE_PAD_61>",
808
+ "lstrip": false,
809
+ "normalized": false,
810
+ "rstrip": false,
811
+ "single_word": false,
812
+ "special": true
813
+ },
814
+ "151744": {
815
+ "content": "<FAKE_PAD_62>",
816
+ "lstrip": false,
817
+ "normalized": false,
818
+ "rstrip": false,
819
+ "single_word": false,
820
+ "special": true
821
+ },
822
+ "151745": {
823
+ "content": "<FAKE_PAD_63>",
824
+ "lstrip": false,
825
+ "normalized": false,
826
+ "rstrip": false,
827
+ "single_word": false,
828
+ "special": true
829
+ },
830
+ "151746": {
831
+ "content": "<FAKE_PAD_64>",
832
+ "lstrip": false,
833
+ "normalized": false,
834
+ "rstrip": false,
835
+ "single_word": false,
836
+ "special": true
837
+ },
838
+ "151747": {
839
+ "content": "<FAKE_PAD_65>",
840
+ "lstrip": false,
841
+ "normalized": false,
842
+ "rstrip": false,
843
+ "single_word": false,
844
+ "special": true
845
+ },
846
+ "151748": {
847
+ "content": "<FAKE_PAD_66>",
848
+ "lstrip": false,
849
+ "normalized": false,
850
+ "rstrip": false,
851
+ "single_word": false,
852
+ "special": true
853
+ },
854
+ "151749": {
855
+ "content": "<FAKE_PAD_67>",
856
+ "lstrip": false,
857
+ "normalized": false,
858
+ "rstrip": false,
859
+ "single_word": false,
860
+ "special": true
861
+ },
862
+ "151750": {
863
+ "content": "<FAKE_PAD_68>",
864
+ "lstrip": false,
865
+ "normalized": false,
866
+ "rstrip": false,
867
+ "single_word": false,
868
+ "special": true
869
+ },
870
+ "151751": {
871
+ "content": "<FAKE_PAD_69>",
872
+ "lstrip": false,
873
+ "normalized": false,
874
+ "rstrip": false,
875
+ "single_word": false,
876
+ "special": true
877
+ },
878
+ "151752": {
879
+ "content": "<FAKE_PAD_70>",
880
+ "lstrip": false,
881
+ "normalized": false,
882
+ "rstrip": false,
883
+ "single_word": false,
884
+ "special": true
885
+ },
886
+ "151753": {
887
+ "content": "<FAKE_PAD_71>",
888
+ "lstrip": false,
889
+ "normalized": false,
890
+ "rstrip": false,
891
+ "single_word": false,
892
+ "special": true
893
+ },
894
+ "151754": {
895
+ "content": "<FAKE_PAD_72>",
896
+ "lstrip": false,
897
+ "normalized": false,
898
+ "rstrip": false,
899
+ "single_word": false,
900
+ "special": true
901
+ },
902
+ "151755": {
903
+ "content": "<FAKE_PAD_73>",
904
+ "lstrip": false,
905
+ "normalized": false,
906
+ "rstrip": false,
907
+ "single_word": false,
908
+ "special": true
909
+ },
910
+ "151756": {
911
+ "content": "<FAKE_PAD_74>",
912
+ "lstrip": false,
913
+ "normalized": false,
914
+ "rstrip": false,
915
+ "single_word": false,
916
+ "special": true
917
+ },
918
+ "151757": {
919
+ "content": "<FAKE_PAD_75>",
920
+ "lstrip": false,
921
+ "normalized": false,
922
+ "rstrip": false,
923
+ "single_word": false,
924
+ "special": true
925
+ },
926
+ "151758": {
927
+ "content": "<FAKE_PAD_76>",
928
+ "lstrip": false,
929
+ "normalized": false,
930
+ "rstrip": false,
931
+ "single_word": false,
932
+ "special": true
933
+ },
934
+ "151759": {
935
+ "content": "<FAKE_PAD_77>",
936
+ "lstrip": false,
937
+ "normalized": false,
938
+ "rstrip": false,
939
+ "single_word": false,
940
+ "special": true
941
+ },
942
+ "151760": {
943
+ "content": "<FAKE_PAD_78>",
944
+ "lstrip": false,
945
+ "normalized": false,
946
+ "rstrip": false,
947
+ "single_word": false,
948
+ "special": true
949
+ },
950
+ "151761": {
951
+ "content": "<FAKE_PAD_79>",
952
+ "lstrip": false,
953
+ "normalized": false,
954
+ "rstrip": false,
955
+ "single_word": false,
956
+ "special": true
957
+ },
958
+ "151762": {
959
+ "content": "<FAKE_PAD_80>",
960
+ "lstrip": false,
961
+ "normalized": false,
962
+ "rstrip": false,
963
+ "single_word": false,
964
+ "special": true
965
+ },
966
+ "151763": {
967
+ "content": "<FAKE_PAD_81>",
968
+ "lstrip": false,
969
+ "normalized": false,
970
+ "rstrip": false,
971
+ "single_word": false,
972
+ "special": true
973
+ },
974
+ "151764": {
975
+ "content": "<FAKE_PAD_82>",
976
+ "lstrip": false,
977
+ "normalized": false,
978
+ "rstrip": false,
979
+ "single_word": false,
980
+ "special": true
981
+ },
982
+ "151765": {
983
+ "content": "<FAKE_PAD_83>",
984
+ "lstrip": false,
985
+ "normalized": false,
986
+ "rstrip": false,
987
+ "single_word": false,
988
+ "special": true
989
+ },
990
+ "151766": {
991
+ "content": "<FAKE_PAD_84>",
992
+ "lstrip": false,
993
+ "normalized": false,
994
+ "rstrip": false,
995
+ "single_word": false,
996
+ "special": true
997
+ },
998
+ "151767": {
999
+ "content": "<FAKE_PAD_85>",
1000
+ "lstrip": false,
1001
+ "normalized": false,
1002
+ "rstrip": false,
1003
+ "single_word": false,
1004
+ "special": true
1005
+ },
1006
+ "151768": {
1007
+ "content": "<FAKE_PAD_86>",
1008
+ "lstrip": false,
1009
+ "normalized": false,
1010
+ "rstrip": false,
1011
+ "single_word": false,
1012
+ "special": true
1013
+ },
1014
+ "151769": {
1015
+ "content": "<FAKE_PAD_87>",
1016
+ "lstrip": false,
1017
+ "normalized": false,
1018
+ "rstrip": false,
1019
+ "single_word": false,
1020
+ "special": true
1021
+ },
1022
+ "151770": {
1023
+ "content": "<FAKE_PAD_88>",
1024
+ "lstrip": false,
1025
+ "normalized": false,
1026
+ "rstrip": false,
1027
+ "single_word": false,
1028
+ "special": true
1029
+ },
1030
+ "151771": {
1031
+ "content": "<FAKE_PAD_89>",
1032
+ "lstrip": false,
1033
+ "normalized": false,
1034
+ "rstrip": false,
1035
+ "single_word": false,
1036
+ "special": true
1037
+ },
1038
+ "151772": {
1039
+ "content": "<FAKE_PAD_90>",
1040
+ "lstrip": false,
1041
+ "normalized": false,
1042
+ "rstrip": false,
1043
+ "single_word": false,
1044
+ "special": true
1045
+ },
1046
+ "151773": {
1047
+ "content": "<FAKE_PAD_91>",
1048
+ "lstrip": false,
1049
+ "normalized": false,
1050
+ "rstrip": false,
1051
+ "single_word": false,
1052
+ "special": true
1053
+ },
1054
+ "151774": {
1055
+ "content": "<FAKE_PAD_92>",
1056
+ "lstrip": false,
1057
+ "normalized": false,
1058
+ "rstrip": false,
1059
+ "single_word": false,
1060
+ "special": true
1061
+ },
1062
+ "151775": {
1063
+ "content": "<FAKE_PAD_93>",
1064
+ "lstrip": false,
1065
+ "normalized": false,
1066
+ "rstrip": false,
1067
+ "single_word": false,
1068
+ "special": true
1069
+ },
1070
+ "151776": {
1071
+ "content": "<FAKE_PAD_94>",
1072
+ "lstrip": false,
1073
+ "normalized": false,
1074
+ "rstrip": false,
1075
+ "single_word": false,
1076
+ "special": true
1077
+ },
1078
+ "151777": {
1079
+ "content": "<FAKE_PAD_95>",
1080
+ "lstrip": false,
1081
+ "normalized": false,
1082
+ "rstrip": false,
1083
+ "single_word": false,
1084
+ "special": true
1085
+ },
1086
+ "151778": {
1087
+ "content": "<FAKE_PAD_96>",
1088
+ "lstrip": false,
1089
+ "normalized": false,
1090
+ "rstrip": false,
1091
+ "single_word": false,
1092
+ "special": true
1093
+ },
1094
+ "151779": {
1095
+ "content": "<FAKE_PAD_97>",
1096
+ "lstrip": false,
1097
+ "normalized": false,
1098
+ "rstrip": false,
1099
+ "single_word": false,
1100
+ "special": true
1101
+ },
1102
+ "151780": {
1103
+ "content": "<FAKE_PAD_98>",
1104
+ "lstrip": false,
1105
+ "normalized": false,
1106
+ "rstrip": false,
1107
+ "single_word": false,
1108
+ "special": true
1109
+ },
1110
+ "151781": {
1111
+ "content": "<FAKE_PAD_99>",
1112
+ "lstrip": false,
1113
+ "normalized": false,
1114
+ "rstrip": false,
1115
+ "single_word": false,
1116
+ "special": true
1117
+ },
1118
+ "151782": {
1119
+ "content": "<FAKE_PAD_100>",
1120
+ "lstrip": false,
1121
+ "normalized": false,
1122
+ "rstrip": false,
1123
+ "single_word": false,
1124
+ "special": true
1125
+ },
1126
+ "151783": {
1127
+ "content": "<FAKE_PAD_101>",
1128
+ "lstrip": false,
1129
+ "normalized": false,
1130
+ "rstrip": false,
1131
+ "single_word": false,
1132
+ "special": true
1133
+ },
1134
+ "151784": {
1135
+ "content": "<FAKE_PAD_102>",
1136
+ "lstrip": false,
1137
+ "normalized": false,
1138
+ "rstrip": false,
1139
+ "single_word": false,
1140
+ "special": true
1141
+ },
1142
+ "151785": {
1143
+ "content": "<FAKE_PAD_103>",
1144
+ "lstrip": false,
1145
+ "normalized": false,
1146
+ "rstrip": false,
1147
+ "single_word": false,
1148
+ "special": true
1149
+ },
1150
+ "151786": {
1151
+ "content": "<FAKE_PAD_104>",
1152
+ "lstrip": false,
1153
+ "normalized": false,
1154
+ "rstrip": false,
1155
+ "single_word": false,
1156
+ "special": true
1157
+ },
1158
+ "151787": {
1159
+ "content": "<FAKE_PAD_105>",
1160
+ "lstrip": false,
1161
+ "normalized": false,
1162
+ "rstrip": false,
1163
+ "single_word": false,
1164
+ "special": true
1165
+ },
1166
+ "151788": {
1167
+ "content": "<FAKE_PAD_106>",
1168
+ "lstrip": false,
1169
+ "normalized": false,
1170
+ "rstrip": false,
1171
+ "single_word": false,
1172
+ "special": true
1173
+ },
1174
+ "151789": {
1175
+ "content": "<FAKE_PAD_107>",
1176
+ "lstrip": false,
1177
+ "normalized": false,
1178
+ "rstrip": false,
1179
+ "single_word": false,
1180
+ "special": true
1181
+ },
1182
+ "151790": {
1183
+ "content": "<FAKE_PAD_108>",
1184
+ "lstrip": false,
1185
+ "normalized": false,
1186
+ "rstrip": false,
1187
+ "single_word": false,
1188
+ "special": true
1189
+ },
1190
+ "151791": {
1191
+ "content": "<FAKE_PAD_109>",
1192
+ "lstrip": false,
1193
+ "normalized": false,
1194
+ "rstrip": false,
1195
+ "single_word": false,
1196
+ "special": true
1197
+ },
1198
+ "151792": {
1199
+ "content": "<FAKE_PAD_110>",
1200
+ "lstrip": false,
1201
+ "normalized": false,
1202
+ "rstrip": false,
1203
+ "single_word": false,
1204
+ "special": true
1205
+ },
1206
+ "151793": {
1207
+ "content": "<FAKE_PAD_111>",
1208
+ "lstrip": false,
1209
+ "normalized": false,
1210
+ "rstrip": false,
1211
+ "single_word": false,
1212
+ "special": true
1213
+ },
1214
+ "151794": {
1215
+ "content": "<FAKE_PAD_112>",
1216
+ "lstrip": false,
1217
+ "normalized": false,
1218
+ "rstrip": false,
1219
+ "single_word": false,
1220
+ "special": true
1221
+ },
1222
+ "151795": {
1223
+ "content": "<FAKE_PAD_113>",
1224
+ "lstrip": false,
1225
+ "normalized": false,
1226
+ "rstrip": false,
1227
+ "single_word": false,
1228
+ "special": true
1229
+ },
1230
+ "151796": {
1231
+ "content": "<FAKE_PAD_114>",
1232
+ "lstrip": false,
1233
+ "normalized": false,
1234
+ "rstrip": false,
1235
+ "single_word": false,
1236
+ "special": true
1237
+ },
1238
+ "151797": {
1239
+ "content": "<FAKE_PAD_115>",
1240
+ "lstrip": false,
1241
+ "normalized": false,
1242
+ "rstrip": false,
1243
+ "single_word": false,
1244
+ "special": true
1245
+ },
1246
+ "151798": {
1247
+ "content": "<FAKE_PAD_116>",
1248
+ "lstrip": false,
1249
+ "normalized": false,
1250
+ "rstrip": false,
1251
+ "single_word": false,
1252
+ "special": true
1253
+ },
1254
+ "151799": {
1255
+ "content": "<FAKE_PAD_117>",
1256
+ "lstrip": false,
1257
+ "normalized": false,
1258
+ "rstrip": false,
1259
+ "single_word": false,
1260
+ "special": true
1261
+ },
1262
+ "151800": {
1263
+ "content": "<FAKE_PAD_118>",
1264
+ "lstrip": false,
1265
+ "normalized": false,
1266
+ "rstrip": false,
1267
+ "single_word": false,
1268
+ "special": true
1269
+ },
1270
+ "151801": {
1271
+ "content": "<FAKE_PAD_119>",
1272
+ "lstrip": false,
1273
+ "normalized": false,
1274
+ "rstrip": false,
1275
+ "single_word": false,
1276
+ "special": true
1277
+ },
1278
+ "151802": {
1279
+ "content": "<FAKE_PAD_120>",
1280
+ "lstrip": false,
1281
+ "normalized": false,
1282
+ "rstrip": false,
1283
+ "single_word": false,
1284
+ "special": true
1285
+ },
1286
+ "151803": {
1287
+ "content": "<FAKE_PAD_121>",
1288
+ "lstrip": false,
1289
+ "normalized": false,
1290
+ "rstrip": false,
1291
+ "single_word": false,
1292
+ "special": true
1293
+ },
1294
+ "151804": {
1295
+ "content": "<FAKE_PAD_122>",
1296
+ "lstrip": false,
1297
+ "normalized": false,
1298
+ "rstrip": false,
1299
+ "single_word": false,
1300
+ "special": true
1301
+ },
1302
+ "151805": {
1303
+ "content": "<FAKE_PAD_123>",
1304
+ "lstrip": false,
1305
+ "normalized": false,
1306
+ "rstrip": false,
1307
+ "single_word": false,
1308
+ "special": true
1309
+ },
1310
+ "151806": {
1311
+ "content": "<FAKE_PAD_124>",
1312
+ "lstrip": false,
1313
+ "normalized": false,
1314
+ "rstrip": false,
1315
+ "single_word": false,
1316
+ "special": true
1317
+ },
1318
+ "151807": {
1319
+ "content": "<FAKE_PAD_125>",
1320
+ "lstrip": false,
1321
+ "normalized": false,
1322
+ "rstrip": false,
1323
+ "single_word": false,
1324
+ "special": true
1325
+ },
1326
+ "151808": {
1327
+ "content": "<FAKE_PAD_126>",
1328
+ "lstrip": false,
1329
+ "normalized": false,
1330
+ "rstrip": false,
1331
+ "single_word": false,
1332
+ "special": true
1333
+ },
1334
+ "151809": {
1335
+ "content": "<FAKE_PAD_127>",
1336
+ "lstrip": false,
1337
+ "normalized": false,
1338
+ "rstrip": false,
1339
+ "single_word": false,
1340
+ "special": true
1341
+ },
1342
+ "151810": {
1343
+ "content": "<FAKE_PAD_128>",
1344
+ "lstrip": false,
1345
+ "normalized": false,
1346
+ "rstrip": false,
1347
+ "single_word": false,
1348
+ "special": true
1349
+ },
1350
+ "151811": {
1351
+ "content": "<FAKE_PAD_129>",
1352
+ "lstrip": false,
1353
+ "normalized": false,
1354
+ "rstrip": false,
1355
+ "single_word": false,
1356
+ "special": true
1357
+ },
1358
+ "151812": {
1359
+ "content": "<FAKE_PAD_130>",
1360
+ "lstrip": false,
1361
+ "normalized": false,
1362
+ "rstrip": false,
1363
+ "single_word": false,
1364
+ "special": true
1365
+ },
1366
+ "151813": {
1367
+ "content": "<FAKE_PAD_131>",
1368
+ "lstrip": false,
1369
+ "normalized": false,
1370
+ "rstrip": false,
1371
+ "single_word": false,
1372
+ "special": true
1373
+ },
1374
+ "151814": {
1375
+ "content": "<FAKE_PAD_132>",
1376
+ "lstrip": false,
1377
+ "normalized": false,
1378
+ "rstrip": false,
1379
+ "single_word": false,
1380
+ "special": true
1381
+ },
1382
+ "151815": {
1383
+ "content": "<FAKE_PAD_133>",
1384
+ "lstrip": false,
1385
+ "normalized": false,
1386
+ "rstrip": false,
1387
+ "single_word": false,
1388
+ "special": true
1389
+ },
1390
+ "151816": {
1391
+ "content": "<FAKE_PAD_134>",
1392
+ "lstrip": false,
1393
+ "normalized": false,
1394
+ "rstrip": false,
1395
+ "single_word": false,
1396
+ "special": true
1397
+ },
1398
+ "151817": {
1399
+ "content": "<FAKE_PAD_135>",
1400
+ "lstrip": false,
1401
+ "normalized": false,
1402
+ "rstrip": false,
1403
+ "single_word": false,
1404
+ "special": true
1405
+ },
1406
+ "151818": {
1407
+ "content": "<FAKE_PAD_136>",
1408
+ "lstrip": false,
1409
+ "normalized": false,
1410
+ "rstrip": false,
1411
+ "single_word": false,
1412
+ "special": true
1413
+ },
1414
+ "151819": {
1415
+ "content": "<FAKE_PAD_137>",
1416
+ "lstrip": false,
1417
+ "normalized": false,
1418
+ "rstrip": false,
1419
+ "single_word": false,
1420
+ "special": true
1421
+ },
1422
+ "151820": {
1423
+ "content": "<FAKE_PAD_138>",
1424
+ "lstrip": false,
1425
+ "normalized": false,
1426
+ "rstrip": false,
1427
+ "single_word": false,
1428
+ "special": true
1429
+ },
1430
+ "151821": {
1431
+ "content": "<FAKE_PAD_139>",
1432
+ "lstrip": false,
1433
+ "normalized": false,
1434
+ "rstrip": false,
1435
+ "single_word": false,
1436
+ "special": true
1437
+ },
1438
+ "151822": {
1439
+ "content": "<FAKE_PAD_140>",
1440
+ "lstrip": false,
1441
+ "normalized": false,
1442
+ "rstrip": false,
1443
+ "single_word": false,
1444
+ "special": true
1445
+ },
1446
+ "151823": {
1447
+ "content": "<FAKE_PAD_141>",
1448
+ "lstrip": false,
1449
+ "normalized": false,
1450
+ "rstrip": false,
1451
+ "single_word": false,
1452
+ "special": true
1453
+ },
1454
+ "151824": {
1455
+ "content": "<FAKE_PAD_142>",
1456
+ "lstrip": false,
1457
+ "normalized": false,
1458
+ "rstrip": false,
1459
+ "single_word": false,
1460
+ "special": true
1461
+ },
1462
+ "151825": {
1463
+ "content": "<FAKE_PAD_143>",
1464
+ "lstrip": false,
1465
+ "normalized": false,
1466
+ "rstrip": false,
1467
+ "single_word": false,
1468
+ "special": true
1469
+ },
1470
+ "151826": {
1471
+ "content": "<FAKE_PAD_144>",
1472
+ "lstrip": false,
1473
+ "normalized": false,
1474
+ "rstrip": false,
1475
+ "single_word": false,
1476
+ "special": true
1477
+ },
1478
+ "151827": {
1479
+ "content": "<FAKE_PAD_145>",
1480
+ "lstrip": false,
1481
+ "normalized": false,
1482
+ "rstrip": false,
1483
+ "single_word": false,
1484
+ "special": true
1485
+ },
1486
+ "151828": {
1487
+ "content": "<FAKE_PAD_146>",
1488
+ "lstrip": false,
1489
+ "normalized": false,
1490
+ "rstrip": false,
1491
+ "single_word": false,
1492
+ "special": true
1493
+ },
1494
+ "151829": {
1495
+ "content": "<FAKE_PAD_147>",
1496
+ "lstrip": false,
1497
+ "normalized": false,
1498
+ "rstrip": false,
1499
+ "single_word": false,
1500
+ "special": true
1501
+ },
1502
+ "151830": {
1503
+ "content": "<FAKE_PAD_148>",
1504
+ "lstrip": false,
1505
+ "normalized": false,
1506
+ "rstrip": false,
1507
+ "single_word": false,
1508
+ "special": true
1509
+ },
1510
+ "151831": {
1511
+ "content": "<FAKE_PAD_149>",
1512
+ "lstrip": false,
1513
+ "normalized": false,
1514
+ "rstrip": false,
1515
+ "single_word": false,
1516
+ "special": true
1517
+ },
1518
+ "151832": {
1519
+ "content": "<FAKE_PAD_150>",
1520
+ "lstrip": false,
1521
+ "normalized": false,
1522
+ "rstrip": false,
1523
+ "single_word": false,
1524
+ "special": true
1525
+ },
1526
+ "151833": {
1527
+ "content": "<FAKE_PAD_151>",
1528
+ "lstrip": false,
1529
+ "normalized": false,
1530
+ "rstrip": false,
1531
+ "single_word": false,
1532
+ "special": true
1533
+ },
1534
+ "151834": {
1535
+ "content": "<FAKE_PAD_152>",
1536
+ "lstrip": false,
1537
+ "normalized": false,
1538
+ "rstrip": false,
1539
+ "single_word": false,
1540
+ "special": true
1541
+ },
1542
+ "151835": {
1543
+ "content": "<FAKE_PAD_153>",
1544
+ "lstrip": false,
1545
+ "normalized": false,
1546
+ "rstrip": false,
1547
+ "single_word": false,
1548
+ "special": true
1549
+ },
1550
+ "151836": {
1551
+ "content": "<FAKE_PAD_154>",
1552
+ "lstrip": false,
1553
+ "normalized": false,
1554
+ "rstrip": false,
1555
+ "single_word": false,
1556
+ "special": true
1557
+ },
1558
+ "151837": {
1559
+ "content": "<FAKE_PAD_155>",
1560
+ "lstrip": false,
1561
+ "normalized": false,
1562
+ "rstrip": false,
1563
+ "single_word": false,
1564
+ "special": true
1565
+ },
1566
+ "151838": {
1567
+ "content": "<FAKE_PAD_156>",
1568
+ "lstrip": false,
1569
+ "normalized": false,
1570
+ "rstrip": false,
1571
+ "single_word": false,
1572
+ "special": true
1573
+ },
1574
+ "151839": {
1575
+ "content": "<FAKE_PAD_157>",
1576
+ "lstrip": false,
1577
+ "normalized": false,
1578
+ "rstrip": false,
1579
+ "single_word": false,
1580
+ "special": true
1581
+ },
1582
+ "151840": {
1583
+ "content": "<FAKE_PAD_158>",
1584
+ "lstrip": false,
1585
+ "normalized": false,
1586
+ "rstrip": false,
1587
+ "single_word": false,
1588
+ "special": true
1589
+ },
1590
+ "151841": {
1591
+ "content": "<FAKE_PAD_159>",
1592
+ "lstrip": false,
1593
+ "normalized": false,
1594
+ "rstrip": false,
1595
+ "single_word": false,
1596
+ "special": true
1597
+ },
1598
+ "151842": {
1599
+ "content": "<FAKE_PAD_160>",
1600
+ "lstrip": false,
1601
+ "normalized": false,
1602
+ "rstrip": false,
1603
+ "single_word": false,
1604
+ "special": true
1605
+ },
1606
+ "151843": {
1607
+ "content": "<FAKE_PAD_161>",
1608
+ "lstrip": false,
1609
+ "normalized": false,
1610
+ "rstrip": false,
1611
+ "single_word": false,
1612
+ "special": true
1613
+ },
1614
+ "151844": {
1615
+ "content": "<FAKE_PAD_162>",
1616
+ "lstrip": false,
1617
+ "normalized": false,
1618
+ "rstrip": false,
1619
+ "single_word": false,
1620
+ "special": true
1621
+ },
1622
+ "151845": {
1623
+ "content": "<FAKE_PAD_163>",
1624
+ "lstrip": false,
1625
+ "normalized": false,
1626
+ "rstrip": false,
1627
+ "single_word": false,
1628
+ "special": true
1629
+ },
1630
+ "151846": {
1631
+ "content": "<FAKE_PAD_164>",
1632
+ "lstrip": false,
1633
+ "normalized": false,
1634
+ "rstrip": false,
1635
+ "single_word": false,
1636
+ "special": true
1637
+ },
1638
+ "151847": {
1639
+ "content": "<FAKE_PAD_165>",
1640
+ "lstrip": false,
1641
+ "normalized": false,
1642
+ "rstrip": false,
1643
+ "single_word": false,
1644
+ "special": true
1645
+ },
1646
+ "151848": {
1647
+ "content": "<FAKE_PAD_166>",
1648
+ "lstrip": false,
1649
+ "normalized": false,
1650
+ "rstrip": false,
1651
+ "single_word": false,
1652
+ "special": true
1653
+ },
1654
+ "151849": {
1655
+ "content": "<FAKE_PAD_167>",
1656
+ "lstrip": false,
1657
+ "normalized": false,
1658
+ "rstrip": false,
1659
+ "single_word": false,
1660
+ "special": true
1661
+ },
1662
+ "151850": {
1663
+ "content": "<FAKE_PAD_168>",
1664
+ "lstrip": false,
1665
+ "normalized": false,
1666
+ "rstrip": false,
1667
+ "single_word": false,
1668
+ "special": true
1669
+ },
1670
+ "151851": {
1671
+ "content": "<FAKE_PAD_169>",
1672
+ "lstrip": false,
1673
+ "normalized": false,
1674
+ "rstrip": false,
1675
+ "single_word": false,
1676
+ "special": true
1677
+ },
1678
+ "151852": {
1679
+ "content": "<FAKE_PAD_170>",
1680
+ "lstrip": false,
1681
+ "normalized": false,
1682
+ "rstrip": false,
1683
+ "single_word": false,
1684
+ "special": true
1685
+ },
1686
+ "151853": {
1687
+ "content": "<FAKE_PAD_171>",
1688
+ "lstrip": false,
1689
+ "normalized": false,
1690
+ "rstrip": false,
1691
+ "single_word": false,
1692
+ "special": true
1693
+ },
1694
+ "151854": {
1695
+ "content": "<FAKE_PAD_172>",
1696
+ "lstrip": false,
1697
+ "normalized": false,
1698
+ "rstrip": false,
1699
+ "single_word": false,
1700
+ "special": true
1701
+ },
1702
+ "151855": {
1703
+ "content": "<FAKE_PAD_173>",
1704
+ "lstrip": false,
1705
+ "normalized": false,
1706
+ "rstrip": false,
1707
+ "single_word": false,
1708
+ "special": true
1709
+ },
1710
+ "151856": {
1711
+ "content": "<FAKE_PAD_174>",
1712
+ "lstrip": false,
1713
+ "normalized": false,
1714
+ "rstrip": false,
1715
+ "single_word": false,
1716
+ "special": true
1717
+ },
1718
+ "151857": {
1719
+ "content": "<FAKE_PAD_175>",
1720
+ "lstrip": false,
1721
+ "normalized": false,
1722
+ "rstrip": false,
1723
+ "single_word": false,
1724
+ "special": true
1725
+ },
1726
+ "151858": {
1727
+ "content": "<FAKE_PAD_176>",
1728
+ "lstrip": false,
1729
+ "normalized": false,
1730
+ "rstrip": false,
1731
+ "single_word": false,
1732
+ "special": true
1733
+ },
1734
+ "151859": {
1735
+ "content": "<FAKE_PAD_177>",
1736
+ "lstrip": false,
1737
+ "normalized": false,
1738
+ "rstrip": false,
1739
+ "single_word": false,
1740
+ "special": true
1741
+ },
1742
+ "151860": {
1743
+ "content": "<FAKE_PAD_178>",
1744
+ "lstrip": false,
1745
+ "normalized": false,
1746
+ "rstrip": false,
1747
+ "single_word": false,
1748
+ "special": true
1749
+ },
1750
+ "151861": {
1751
+ "content": "<FAKE_PAD_179>",
1752
+ "lstrip": false,
1753
+ "normalized": false,
1754
+ "rstrip": false,
1755
+ "single_word": false,
1756
+ "special": true
1757
+ },
1758
+ "151862": {
1759
+ "content": "<FAKE_PAD_180>",
1760
+ "lstrip": false,
1761
+ "normalized": false,
1762
+ "rstrip": false,
1763
+ "single_word": false,
1764
+ "special": true
1765
+ },
1766
+ "151863": {
1767
+ "content": "<FAKE_PAD_181>",
1768
+ "lstrip": false,
1769
+ "normalized": false,
1770
+ "rstrip": false,
1771
+ "single_word": false,
1772
+ "special": true
1773
+ },
1774
+ "151864": {
1775
+ "content": "<FAKE_PAD_182>",
1776
+ "lstrip": false,
1777
+ "normalized": false,
1778
+ "rstrip": false,
1779
+ "single_word": false,
1780
+ "special": true
1781
+ },
1782
+ "151865": {
1783
+ "content": "<FAKE_PAD_183>",
1784
+ "lstrip": false,
1785
+ "normalized": false,
1786
+ "rstrip": false,
1787
+ "single_word": false,
1788
+ "special": true
1789
+ },
1790
+ "151866": {
1791
+ "content": "<FAKE_PAD_184>",
1792
+ "lstrip": false,
1793
+ "normalized": false,
1794
+ "rstrip": false,
1795
+ "single_word": false,
1796
+ "special": true
1797
+ },
1798
+ "151867": {
1799
+ "content": "<FAKE_PAD_185>",
1800
+ "lstrip": false,
1801
+ "normalized": false,
1802
+ "rstrip": false,
1803
+ "single_word": false,
1804
+ "special": true
1805
+ },
1806
+ "151868": {
1807
+ "content": "<FAKE_PAD_186>",
1808
+ "lstrip": false,
1809
+ "normalized": false,
1810
+ "rstrip": false,
1811
+ "single_word": false,
1812
+ "special": true
1813
+ },
1814
+ "151869": {
1815
+ "content": "<FAKE_PAD_187>",
1816
+ "lstrip": false,
1817
+ "normalized": false,
1818
+ "rstrip": false,
1819
+ "single_word": false,
1820
+ "special": true
1821
+ },
1822
+ "151870": {
1823
+ "content": "<FAKE_PAD_188>",
1824
+ "lstrip": false,
1825
+ "normalized": false,
1826
+ "rstrip": false,
1827
+ "single_word": false,
1828
+ "special": true
1829
+ },
1830
+ "151871": {
1831
+ "content": "<FAKE_PAD_189>",
1832
+ "lstrip": false,
1833
+ "normalized": false,
1834
+ "rstrip": false,
1835
+ "single_word": false,
1836
+ "special": true
1837
+ },
1838
+ "151872": {
1839
+ "content": "<FAKE_PAD_190>",
1840
+ "lstrip": false,
1841
+ "normalized": false,
1842
+ "rstrip": false,
1843
+ "single_word": false,
1844
+ "special": true
1845
+ },
1846
+ "151873": {
1847
+ "content": "<FAKE_PAD_191>",
1848
+ "lstrip": false,
1849
+ "normalized": false,
1850
+ "rstrip": false,
1851
+ "single_word": false,
1852
+ "special": true
1853
+ },
1854
+ "151874": {
1855
+ "content": "<FAKE_PAD_192>",
1856
+ "lstrip": false,
1857
+ "normalized": false,
1858
+ "rstrip": false,
1859
+ "single_word": false,
1860
+ "special": true
1861
+ },
1862
+ "151875": {
1863
+ "content": "<FAKE_PAD_193>",
1864
+ "lstrip": false,
1865
+ "normalized": false,
1866
+ "rstrip": false,
1867
+ "single_word": false,
1868
+ "special": true
1869
+ },
1870
+ "151876": {
1871
+ "content": "<FAKE_PAD_194>",
1872
+ "lstrip": false,
1873
+ "normalized": false,
1874
+ "rstrip": false,
1875
+ "single_word": false,
1876
+ "special": true
1877
+ },
1878
+ "151877": {
1879
+ "content": "<FAKE_PAD_195>",
1880
+ "lstrip": false,
1881
+ "normalized": false,
1882
+ "rstrip": false,
1883
+ "single_word": false,
1884
+ "special": true
1885
+ },
1886
+ "151878": {
1887
+ "content": "<FAKE_PAD_196>",
1888
+ "lstrip": false,
1889
+ "normalized": false,
1890
+ "rstrip": false,
1891
+ "single_word": false,
1892
+ "special": true
1893
+ },
1894
+ "151879": {
1895
+ "content": "<FAKE_PAD_197>",
1896
+ "lstrip": false,
1897
+ "normalized": false,
1898
+ "rstrip": false,
1899
+ "single_word": false,
1900
+ "special": true
1901
+ },
1902
+ "151880": {
1903
+ "content": "<FAKE_PAD_198>",
1904
+ "lstrip": false,
1905
+ "normalized": false,
1906
+ "rstrip": false,
1907
+ "single_word": false,
1908
+ "special": true
1909
+ },
1910
+ "151881": {
1911
+ "content": "<FAKE_PAD_199>",
1912
+ "lstrip": false,
1913
+ "normalized": false,
1914
+ "rstrip": false,
1915
+ "single_word": false,
1916
+ "special": true
1917
+ },
1918
+ "151882": {
1919
+ "content": "<FAKE_PAD_200>",
1920
+ "lstrip": false,
1921
+ "normalized": false,
1922
+ "rstrip": false,
1923
+ "single_word": false,
1924
+ "special": true
1925
+ },
1926
+ "151883": {
1927
+ "content": "<FAKE_PAD_201>",
1928
+ "lstrip": false,
1929
+ "normalized": false,
1930
+ "rstrip": false,
1931
+ "single_word": false,
1932
+ "special": true
1933
+ },
1934
+ "151884": {
1935
+ "content": "<FAKE_PAD_202>",
1936
+ "lstrip": false,
1937
+ "normalized": false,
1938
+ "rstrip": false,
1939
+ "single_word": false,
1940
+ "special": true
1941
+ },
1942
+ "151885": {
1943
+ "content": "<FAKE_PAD_203>",
1944
+ "lstrip": false,
1945
+ "normalized": false,
1946
+ "rstrip": false,
1947
+ "single_word": false,
1948
+ "special": true
1949
+ },
1950
+ "151886": {
1951
+ "content": "<FAKE_PAD_204>",
1952
+ "lstrip": false,
1953
+ "normalized": false,
1954
+ "rstrip": false,
1955
+ "single_word": false,
1956
+ "special": true
1957
+ },
1958
+ "151887": {
1959
+ "content": "<FAKE_PAD_205>",
1960
+ "lstrip": false,
1961
+ "normalized": false,
1962
+ "rstrip": false,
1963
+ "single_word": false,
1964
+ "special": true
1965
+ },
1966
+ "151888": {
1967
+ "content": "<FAKE_PAD_206>",
1968
+ "lstrip": false,
1969
+ "normalized": false,
1970
+ "rstrip": false,
1971
+ "single_word": false,
1972
+ "special": true
1973
+ },
1974
+ "151889": {
1975
+ "content": "<FAKE_PAD_207>",
1976
+ "lstrip": false,
1977
+ "normalized": false,
1978
+ "rstrip": false,
1979
+ "single_word": false,
1980
+ "special": true
1981
+ },
1982
+ "151890": {
1983
+ "content": "<FAKE_PAD_208>",
1984
+ "lstrip": false,
1985
+ "normalized": false,
1986
+ "rstrip": false,
1987
+ "single_word": false,
1988
+ "special": true
1989
+ },
1990
+ "151891": {
1991
+ "content": "<FAKE_PAD_209>",
1992
+ "lstrip": false,
1993
+ "normalized": false,
1994
+ "rstrip": false,
1995
+ "single_word": false,
1996
+ "special": true
1997
+ },
1998
+ "151892": {
1999
+ "content": "<FAKE_PAD_210>",
2000
+ "lstrip": false,
2001
+ "normalized": false,
2002
+ "rstrip": false,
2003
+ "single_word": false,
2004
+ "special": true
2005
+ },
2006
+ "151893": {
2007
+ "content": "<FAKE_PAD_211>",
2008
+ "lstrip": false,
2009
+ "normalized": false,
2010
+ "rstrip": false,
2011
+ "single_word": false,
2012
+ "special": true
2013
+ },
2014
+ "151894": {
2015
+ "content": "<FAKE_PAD_212>",
2016
+ "lstrip": false,
2017
+ "normalized": false,
2018
+ "rstrip": false,
2019
+ "single_word": false,
2020
+ "special": true
2021
+ },
2022
+ "151895": {
2023
+ "content": "<FAKE_PAD_213>",
2024
+ "lstrip": false,
2025
+ "normalized": false,
2026
+ "rstrip": false,
2027
+ "single_word": false,
2028
+ "special": true
2029
+ },
2030
+ "151896": {
2031
+ "content": "<FAKE_PAD_214>",
2032
+ "lstrip": false,
2033
+ "normalized": false,
2034
+ "rstrip": false,
2035
+ "single_word": false,
2036
+ "special": true
2037
+ },
2038
+ "151897": {
2039
+ "content": "<FAKE_PAD_215>",
2040
+ "lstrip": false,
2041
+ "normalized": false,
2042
+ "rstrip": false,
2043
+ "single_word": false,
2044
+ "special": true
2045
+ },
2046
+ "151898": {
2047
+ "content": "<FAKE_PAD_216>",
2048
+ "lstrip": false,
2049
+ "normalized": false,
2050
+ "rstrip": false,
2051
+ "single_word": false,
2052
+ "special": true
2053
+ },
2054
+ "151899": {
2055
+ "content": "<FAKE_PAD_217>",
2056
+ "lstrip": false,
2057
+ "normalized": false,
2058
+ "rstrip": false,
2059
+ "single_word": false,
2060
+ "special": true
2061
+ },
2062
+ "151900": {
2063
+ "content": "<FAKE_PAD_218>",
2064
+ "lstrip": false,
2065
+ "normalized": false,
2066
+ "rstrip": false,
2067
+ "single_word": false,
2068
+ "special": true
2069
+ },
2070
+ "151901": {
2071
+ "content": "<FAKE_PAD_219>",
2072
+ "lstrip": false,
2073
+ "normalized": false,
2074
+ "rstrip": false,
2075
+ "single_word": false,
2076
+ "special": true
2077
+ },
2078
+ "151902": {
2079
+ "content": "<FAKE_PAD_220>",
2080
+ "lstrip": false,
2081
+ "normalized": false,
2082
+ "rstrip": false,
2083
+ "single_word": false,
2084
+ "special": true
2085
+ },
2086
+ "151903": {
2087
+ "content": "<FAKE_PAD_221>",
2088
+ "lstrip": false,
2089
+ "normalized": false,
2090
+ "rstrip": false,
2091
+ "single_word": false,
2092
+ "special": true
2093
+ },
2094
+ "151904": {
2095
+ "content": "<FAKE_PAD_222>",
2096
+ "lstrip": false,
2097
+ "normalized": false,
2098
+ "rstrip": false,
2099
+ "single_word": false,
2100
+ "special": true
2101
+ },
2102
+ "151905": {
2103
+ "content": "<FAKE_PAD_223>",
2104
+ "lstrip": false,
2105
+ "normalized": false,
2106
+ "rstrip": false,
2107
+ "single_word": false,
2108
+ "special": true
2109
+ },
2110
+ "151906": {
2111
+ "content": "<FAKE_PAD_224>",
2112
+ "lstrip": false,
2113
+ "normalized": false,
2114
+ "rstrip": false,
2115
+ "single_word": false,
2116
+ "special": true
2117
+ },
2118
+ "151907": {
2119
+ "content": "<FAKE_PAD_225>",
2120
+ "lstrip": false,
2121
+ "normalized": false,
2122
+ "rstrip": false,
2123
+ "single_word": false,
2124
+ "special": true
2125
+ },
2126
+ "151908": {
2127
+ "content": "<FAKE_PAD_226>",
2128
+ "lstrip": false,
2129
+ "normalized": false,
2130
+ "rstrip": false,
2131
+ "single_word": false,
2132
+ "special": true
2133
+ },
2134
+ "151909": {
2135
+ "content": "<FAKE_PAD_227>",
2136
+ "lstrip": false,
2137
+ "normalized": false,
2138
+ "rstrip": false,
2139
+ "single_word": false,
2140
+ "special": true
2141
+ },
2142
+ "151910": {
2143
+ "content": "<FAKE_PAD_228>",
2144
+ "lstrip": false,
2145
+ "normalized": false,
2146
+ "rstrip": false,
2147
+ "single_word": false,
2148
+ "special": true
2149
+ },
2150
+ "151911": {
2151
+ "content": "<FAKE_PAD_229>",
2152
+ "lstrip": false,
2153
+ "normalized": false,
2154
+ "rstrip": false,
2155
+ "single_word": false,
2156
+ "special": true
2157
+ },
2158
+ "151912": {
2159
+ "content": "<FAKE_PAD_230>",
2160
+ "lstrip": false,
2161
+ "normalized": false,
2162
+ "rstrip": false,
2163
+ "single_word": false,
2164
+ "special": true
2165
+ },
2166
+ "151913": {
2167
+ "content": "<FAKE_PAD_231>",
2168
+ "lstrip": false,
2169
+ "normalized": false,
2170
+ "rstrip": false,
2171
+ "single_word": false,
2172
+ "special": true
2173
+ },
2174
+ "151914": {
2175
+ "content": "<FAKE_PAD_232>",
2176
+ "lstrip": false,
2177
+ "normalized": false,
2178
+ "rstrip": false,
2179
+ "single_word": false,
2180
+ "special": true
2181
+ },
2182
+ "151915": {
2183
+ "content": "<FAKE_PAD_233>",
2184
+ "lstrip": false,
2185
+ "normalized": false,
2186
+ "rstrip": false,
2187
+ "single_word": false,
2188
+ "special": true
2189
+ },
2190
+ "151916": {
2191
+ "content": "<FAKE_PAD_234>",
2192
+ "lstrip": false,
2193
+ "normalized": false,
2194
+ "rstrip": false,
2195
+ "single_word": false,
2196
+ "special": true
2197
+ },
2198
+ "151917": {
2199
+ "content": "<FAKE_PAD_235>",
2200
+ "lstrip": false,
2201
+ "normalized": false,
2202
+ "rstrip": false,
2203
+ "single_word": false,
2204
+ "special": true
2205
+ },
2206
+ "151918": {
2207
+ "content": "<FAKE_PAD_236>",
2208
+ "lstrip": false,
2209
+ "normalized": false,
2210
+ "rstrip": false,
2211
+ "single_word": false,
2212
+ "special": true
2213
+ },
2214
+ "151919": {
2215
+ "content": "<FAKE_PAD_237>",
2216
+ "lstrip": false,
2217
+ "normalized": false,
2218
+ "rstrip": false,
2219
+ "single_word": false,
2220
+ "special": true
2221
+ },
2222
+ "151920": {
2223
+ "content": "<FAKE_PAD_238>",
2224
+ "lstrip": false,
2225
+ "normalized": false,
2226
+ "rstrip": false,
2227
+ "single_word": false,
2228
+ "special": true
2229
+ },
2230
+ "151921": {
2231
+ "content": "<FAKE_PAD_239>",
2232
+ "lstrip": false,
2233
+ "normalized": false,
2234
+ "rstrip": false,
2235
+ "single_word": false,
2236
+ "special": true
2237
+ },
2238
+ "151922": {
2239
+ "content": "<FAKE_PAD_240>",
2240
+ "lstrip": false,
2241
+ "normalized": false,
2242
+ "rstrip": false,
2243
+ "single_word": false,
2244
+ "special": true
2245
+ },
2246
+ "151923": {
2247
+ "content": "<FAKE_PAD_241>",
2248
+ "lstrip": false,
2249
+ "normalized": false,
2250
+ "rstrip": false,
2251
+ "single_word": false,
2252
+ "special": true
2253
+ },
2254
+ "151924": {
2255
+ "content": "<FAKE_PAD_242>",
2256
+ "lstrip": false,
2257
+ "normalized": false,
2258
+ "rstrip": false,
2259
+ "single_word": false,
2260
+ "special": true
2261
+ },
2262
+ "151925": {
2263
+ "content": "<FAKE_PAD_243>",
2264
+ "lstrip": false,
2265
+ "normalized": false,
2266
+ "rstrip": false,
2267
+ "single_word": false,
2268
+ "special": true
2269
+ },
2270
+ "151926": {
2271
+ "content": "<FAKE_PAD_244>",
2272
+ "lstrip": false,
2273
+ "normalized": false,
2274
+ "rstrip": false,
2275
+ "single_word": false,
2276
+ "special": true
2277
+ },
2278
+ "151927": {
2279
+ "content": "<FAKE_PAD_245>",
2280
+ "lstrip": false,
2281
+ "normalized": false,
2282
+ "rstrip": false,
2283
+ "single_word": false,
2284
+ "special": true
2285
+ },
2286
+ "151928": {
2287
+ "content": "<FAKE_PAD_246>",
2288
+ "lstrip": false,
2289
+ "normalized": false,
2290
+ "rstrip": false,
2291
+ "single_word": false,
2292
+ "special": true
2293
+ },
2294
+ "151929": {
2295
+ "content": "<FAKE_PAD_247>",
2296
+ "lstrip": false,
2297
+ "normalized": false,
2298
+ "rstrip": false,
2299
+ "single_word": false,
2300
+ "special": true
2301
+ },
2302
+ "151930": {
2303
+ "content": "<FAKE_PAD_248>",
2304
+ "lstrip": false,
2305
+ "normalized": false,
2306
+ "rstrip": false,
2307
+ "single_word": false,
2308
+ "special": true
2309
+ },
2310
+ "151931": {
2311
+ "content": "<FAKE_PAD_249>",
2312
+ "lstrip": false,
2313
+ "normalized": false,
2314
+ "rstrip": false,
2315
+ "single_word": false,
2316
+ "special": true
2317
+ },
2318
+ "151932": {
2319
+ "content": "<FAKE_PAD_250>",
2320
+ "lstrip": false,
2321
+ "normalized": false,
2322
+ "rstrip": false,
2323
+ "single_word": false,
2324
+ "special": true
2325
+ },
2326
+ "151933": {
2327
+ "content": "<FAKE_PAD_251>",
2328
+ "lstrip": false,
2329
+ "normalized": false,
2330
+ "rstrip": false,
2331
+ "single_word": false,
2332
+ "special": true
2333
+ },
2334
+ "151934": {
2335
+ "content": "<FAKE_PAD_252>",
2336
+ "lstrip": false,
2337
+ "normalized": false,
2338
+ "rstrip": false,
2339
+ "single_word": false,
2340
+ "special": true
2341
+ },
2342
+ "151935": {
2343
+ "content": "<FAKE_PAD_253>",
2344
+ "lstrip": false,
2345
+ "normalized": false,
2346
+ "rstrip": false,
2347
+ "single_word": false,
2348
+ "special": true
2349
+ },
2350
+ "151936": {
2351
+ "content": "<audio>",
2352
+ "lstrip": false,
2353
+ "normalized": false,
2354
+ "rstrip": false,
2355
+ "single_word": false,
2356
+ "special": true
2357
+ },
2358
+ "151937": {
2359
+ "content": "</audio>",
2360
+ "lstrip": false,
2361
+ "normalized": false,
2362
+ "rstrip": false,
2363
+ "single_word": false,
2364
+ "special": true
2365
+ },
2366
+ "151938": {
2367
+ "content": "<AUDIO_CONTEXT>",
2368
+ "lstrip": false,
2369
+ "normalized": false,
2370
+ "rstrip": false,
2371
+ "single_word": false,
2372
+ "special": true
2373
+ },
2374
+ "151939": {
2375
+ "content": "<interrupt>",
2376
+ "lstrip": false,
2377
+ "normalized": false,
2378
+ "rstrip": false,
2379
+ "single_word": false,
2380
+ "special": true
2381
+ },
2382
+ "151940": {
2383
+ "content": "<FAKE_PAD_PAD_0>",
2384
+ "lstrip": false,
2385
+ "normalized": false,
2386
+ "rstrip": false,
2387
+ "single_word": false,
2388
+ "special": true
2389
+ },
2390
+ "151941": {
2391
+ "content": "<FAKE_PAD_PAD_1>",
2392
+ "lstrip": false,
2393
+ "normalized": false,
2394
+ "rstrip": false,
2395
+ "single_word": false,
2396
+ "special": true
2397
+ },
2398
+ "151942": {
2399
+ "content": "<FAKE_PAD_PAD_2>",
2400
+ "lstrip": false,
2401
+ "normalized": false,
2402
+ "rstrip": false,
2403
+ "single_word": false,
2404
+ "special": true
2405
+ },
2406
+ "151943": {
2407
+ "content": "<FAKE_PAD_PAD_3>",
2408
+ "lstrip": false,
2409
+ "normalized": false,
2410
+ "rstrip": false,
2411
+ "single_word": false,
2412
+ "special": true
2413
+ },
2414
+ "151944": {
2415
+ "content": "<FAKE_PAD_PAD_4>",
2416
+ "lstrip": false,
2417
+ "normalized": false,
2418
+ "rstrip": false,
2419
+ "single_word": false,
2420
+ "special": true
2421
+ },
2422
+ "151945": {
2423
+ "content": "<FAKE_PAD_PAD_5>",
2424
+ "lstrip": false,
2425
+ "normalized": false,
2426
+ "rstrip": false,
2427
+ "single_word": false,
2428
+ "special": true
2429
+ },
2430
+ "151946": {
2431
+ "content": "<FAKE_PAD_PAD_6>",
2432
+ "lstrip": false,
2433
+ "normalized": false,
2434
+ "rstrip": false,
2435
+ "single_word": false,
2436
+ "special": true
2437
+ },
2438
+ "151947": {
2439
+ "content": "<FAKE_PAD_PAD_7>",
2440
+ "lstrip": false,
2441
+ "normalized": false,
2442
+ "rstrip": false,
2443
+ "single_word": false,
2444
+ "special": true
2445
+ },
2446
+ "151948": {
2447
+ "content": "<FAKE_PAD_PAD_8>",
2448
+ "lstrip": false,
2449
+ "normalized": false,
2450
+ "rstrip": false,
2451
+ "single_word": false,
2452
+ "special": true
2453
+ },
2454
+ "151949": {
2455
+ "content": "<FAKE_PAD_PAD_9>",
2456
+ "lstrip": false,
2457
+ "normalized": false,
2458
+ "rstrip": false,
2459
+ "single_word": false,
2460
+ "special": true
2461
+ },
2462
+ "151950": {
2463
+ "content": "<FAKE_PAD_PAD_10>",
2464
+ "lstrip": false,
2465
+ "normalized": false,
2466
+ "rstrip": false,
2467
+ "single_word": false,
2468
+ "special": true
2469
+ },
2470
+ "151951": {
2471
+ "content": "<FAKE_PAD_PAD_11>",
2472
+ "lstrip": false,
2473
+ "normalized": false,
2474
+ "rstrip": false,
2475
+ "single_word": false,
2476
+ "special": true
2477
+ },
2478
+ "151952": {
2479
+ "content": "<FAKE_PAD_PAD_12>",
2480
+ "lstrip": false,
2481
+ "normalized": false,
2482
+ "rstrip": false,
2483
+ "single_word": false,
2484
+ "special": true
2485
+ },
2486
+ "151953": {
2487
+ "content": "<FAKE_PAD_PAD_13>",
2488
+ "lstrip": false,
2489
+ "normalized": false,
2490
+ "rstrip": false,
2491
+ "single_word": false,
2492
+ "special": true
2493
+ },
2494
+ "151954": {
2495
+ "content": "<FAKE_PAD_PAD_14>",
2496
+ "lstrip": false,
2497
+ "normalized": false,
2498
+ "rstrip": false,
2499
+ "single_word": false,
2500
+ "special": true
2501
+ },
2502
+ "151955": {
2503
+ "content": "<FAKE_PAD_PAD_15>",
2504
+ "lstrip": false,
2505
+ "normalized": false,
2506
+ "rstrip": false,
2507
+ "single_word": false,
2508
+ "special": true
2509
+ },
2510
+ "151956": {
2511
+ "content": "<FAKE_PAD_PAD_16>",
2512
+ "lstrip": false,
2513
+ "normalized": false,
2514
+ "rstrip": false,
2515
+ "single_word": false,
2516
+ "special": true
2517
+ },
2518
+ "151957": {
2519
+ "content": "<FAKE_PAD_PAD_17>",
2520
+ "lstrip": false,
2521
+ "normalized": false,
2522
+ "rstrip": false,
2523
+ "single_word": false,
2524
+ "special": true
2525
+ },
2526
+ "151958": {
2527
+ "content": "<FAKE_PAD_PAD_18>",
2528
+ "lstrip": false,
2529
+ "normalized": false,
2530
+ "rstrip": false,
2531
+ "single_word": false,
2532
+ "special": true
2533
+ },
2534
+ "151959": {
2535
+ "content": "<FAKE_PAD_PAD_19>",
2536
+ "lstrip": false,
2537
+ "normalized": false,
2538
+ "rstrip": false,
2539
+ "single_word": false,
2540
+ "special": true
2541
+ },
2542
+ "151960": {
2543
+ "content": "<FAKE_PAD_PAD_20>",
2544
+ "lstrip": false,
2545
+ "normalized": false,
2546
+ "rstrip": false,
2547
+ "single_word": false,
2548
+ "special": true
2549
+ },
2550
+ "151961": {
2551
+ "content": "<FAKE_PAD_PAD_21>",
2552
+ "lstrip": false,
2553
+ "normalized": false,
2554
+ "rstrip": false,
2555
+ "single_word": false,
2556
+ "special": true
2557
+ },
2558
+ "151962": {
2559
+ "content": "<FAKE_PAD_PAD_22>",
2560
+ "lstrip": false,
2561
+ "normalized": false,
2562
+ "rstrip": false,
2563
+ "single_word": false,
2564
+ "special": true
2565
+ },
2566
+ "151963": {
2567
+ "content": "<FAKE_PAD_PAD_23>",
2568
+ "lstrip": false,
2569
+ "normalized": false,
2570
+ "rstrip": false,
2571
+ "single_word": false,
2572
+ "special": true
2573
+ },
2574
+ "151964": {
2575
+ "content": "<FAKE_PAD_PAD_24>",
2576
+ "lstrip": false,
2577
+ "normalized": false,
2578
+ "rstrip": false,
2579
+ "single_word": false,
2580
+ "special": true
2581
+ },
2582
+ "151965": {
2583
+ "content": "<FAKE_PAD_PAD_25>",
2584
+ "lstrip": false,
2585
+ "normalized": false,
2586
+ "rstrip": false,
2587
+ "single_word": false,
2588
+ "special": true
2589
+ },
2590
+ "151966": {
2591
+ "content": "<FAKE_PAD_PAD_26>",
2592
+ "lstrip": false,
2593
+ "normalized": false,
2594
+ "rstrip": false,
2595
+ "single_word": false,
2596
+ "special": true
2597
+ },
2598
+ "151967": {
2599
+ "content": "<FAKE_PAD_PAD_27>",
2600
+ "lstrip": false,
2601
+ "normalized": false,
2602
+ "rstrip": false,
2603
+ "single_word": false,
2604
+ "special": true
2605
+ }
2606
+ },
2607
+ "additional_special_tokens": [
2608
+ "<|im_start|>",
2609
+ "<|im_end|>",
2610
+ "<|object_ref_start|>",
2611
+ "<|object_ref_end|>",
2612
+ "<|box_start|>",
2613
+ "<|box_end|>",
2614
+ "<|quad_start|>",
2615
+ "<|quad_end|>",
2616
+ "<|vision_start|>",
2617
+ "<|vision_end|>",
2618
+ "<|vision_pad|>",
2619
+ "<|image_pad|>",
2620
+ "<|video_pad|>",
2621
+ "<IMG_CONTEXT>",
2622
+ "<img>",
2623
+ "</img>",
2624
+ "<quad>",
2625
+ "</quad>",
2626
+ "<ref>",
2627
+ "</ref>",
2628
+ "<box>",
2629
+ "</box>",
2630
+ "<|action_start|>",
2631
+ "<|action_end|>",
2632
+ "<|plugin|>",
2633
+ "<|interpreter|>",
2634
+ "<FAKE_PAD_0>",
2635
+ "<FAKE_PAD_1>",
2636
+ "<FAKE_PAD_2>",
2637
+ "<FAKE_PAD_3>",
2638
+ "<FAKE_PAD_4>",
2639
+ "<FAKE_PAD_5>",
2640
+ "<FAKE_PAD_6>",
2641
+ "<FAKE_PAD_7>",
2642
+ "<FAKE_PAD_8>",
2643
+ "<FAKE_PAD_9>",
2644
+ "<FAKE_PAD_10>",
2645
+ "<FAKE_PAD_11>",
2646
+ "<FAKE_PAD_12>",
2647
+ "<FAKE_PAD_13>",
2648
+ "<FAKE_PAD_14>",
2649
+ "<FAKE_PAD_15>",
2650
+ "<FAKE_PAD_16>",
2651
+ "<FAKE_PAD_17>",
2652
+ "<FAKE_PAD_18>",
2653
+ "<FAKE_PAD_19>",
2654
+ "<FAKE_PAD_20>",
2655
+ "<FAKE_PAD_21>",
2656
+ "<FAKE_PAD_22>",
2657
+ "<FAKE_PAD_23>",
2658
+ "<FAKE_PAD_24>",
2659
+ "<FAKE_PAD_25>",
2660
+ "<FAKE_PAD_26>",
2661
+ "<FAKE_PAD_27>",
2662
+ "<FAKE_PAD_28>",
2663
+ "<FAKE_PAD_29>",
2664
+ "<FAKE_PAD_30>",
2665
+ "<FAKE_PAD_31>",
2666
+ "<FAKE_PAD_32>",
2667
+ "<FAKE_PAD_33>",
2668
+ "<FAKE_PAD_34>",
2669
+ "<FAKE_PAD_35>",
2670
+ "<FAKE_PAD_36>",
2671
+ "<FAKE_PAD_37>",
2672
+ "<FAKE_PAD_38>",
2673
+ "<FAKE_PAD_39>",
2674
+ "<FAKE_PAD_40>",
2675
+ "<FAKE_PAD_41>",
2676
+ "<FAKE_PAD_42>",
2677
+ "<FAKE_PAD_43>",
2678
+ "<FAKE_PAD_44>",
2679
+ "<FAKE_PAD_45>",
2680
+ "<FAKE_PAD_46>",
2681
+ "<FAKE_PAD_47>",
2682
+ "<FAKE_PAD_48>",
2683
+ "<FAKE_PAD_49>",
2684
+ "<FAKE_PAD_50>",
2685
+ "<FAKE_PAD_51>",
2686
+ "<FAKE_PAD_52>",
2687
+ "<FAKE_PAD_53>",
2688
+ "<FAKE_PAD_54>",
2689
+ "<FAKE_PAD_55>",
2690
+ "<FAKE_PAD_56>",
2691
+ "<FAKE_PAD_57>",
2692
+ "<FAKE_PAD_58>",
2693
+ "<FAKE_PAD_59>",
2694
+ "<FAKE_PAD_60>",
2695
+ "<FAKE_PAD_61>",
2696
+ "<FAKE_PAD_62>",
2697
+ "<FAKE_PAD_63>",
2698
+ "<FAKE_PAD_64>",
2699
+ "<FAKE_PAD_65>",
2700
+ "<FAKE_PAD_66>",
2701
+ "<FAKE_PAD_67>",
2702
+ "<FAKE_PAD_68>",
2703
+ "<FAKE_PAD_69>",
2704
+ "<FAKE_PAD_70>",
2705
+ "<FAKE_PAD_71>",
2706
+ "<FAKE_PAD_72>",
2707
+ "<FAKE_PAD_73>",
2708
+ "<FAKE_PAD_74>",
2709
+ "<FAKE_PAD_75>",
2710
+ "<FAKE_PAD_76>",
2711
+ "<FAKE_PAD_77>",
2712
+ "<FAKE_PAD_78>",
2713
+ "<FAKE_PAD_79>",
2714
+ "<FAKE_PAD_80>",
2715
+ "<FAKE_PAD_81>",
2716
+ "<FAKE_PAD_82>",
2717
+ "<FAKE_PAD_83>",
2718
+ "<FAKE_PAD_84>",
2719
+ "<FAKE_PAD_85>",
2720
+ "<FAKE_PAD_86>",
2721
+ "<FAKE_PAD_87>",
2722
+ "<FAKE_PAD_88>",
2723
+ "<FAKE_PAD_89>",
2724
+ "<FAKE_PAD_90>",
2725
+ "<FAKE_PAD_91>",
2726
+ "<FAKE_PAD_92>",
2727
+ "<FAKE_PAD_93>",
2728
+ "<FAKE_PAD_94>",
2729
+ "<FAKE_PAD_95>",
2730
+ "<FAKE_PAD_96>",
2731
+ "<FAKE_PAD_97>",
2732
+ "<FAKE_PAD_98>",
2733
+ "<FAKE_PAD_99>",
2734
+ "<FAKE_PAD_100>",
2735
+ "<FAKE_PAD_101>",
2736
+ "<FAKE_PAD_102>",
2737
+ "<FAKE_PAD_103>",
2738
+ "<FAKE_PAD_104>",
2739
+ "<FAKE_PAD_105>",
2740
+ "<FAKE_PAD_106>",
2741
+ "<FAKE_PAD_107>",
2742
+ "<FAKE_PAD_108>",
2743
+ "<FAKE_PAD_109>",
2744
+ "<FAKE_PAD_110>",
2745
+ "<FAKE_PAD_111>",
2746
+ "<FAKE_PAD_112>",
2747
+ "<FAKE_PAD_113>",
2748
+ "<FAKE_PAD_114>",
2749
+ "<FAKE_PAD_115>",
2750
+ "<FAKE_PAD_116>",
2751
+ "<FAKE_PAD_117>",
2752
+ "<FAKE_PAD_118>",
2753
+ "<FAKE_PAD_119>",
2754
+ "<FAKE_PAD_120>",
2755
+ "<FAKE_PAD_121>",
2756
+ "<FAKE_PAD_122>",
2757
+ "<FAKE_PAD_123>",
2758
+ "<FAKE_PAD_124>",
2759
+ "<FAKE_PAD_125>",
2760
+ "<FAKE_PAD_126>",
2761
+ "<FAKE_PAD_127>",
2762
+ "<FAKE_PAD_128>",
2763
+ "<FAKE_PAD_129>",
2764
+ "<FAKE_PAD_130>",
2765
+ "<FAKE_PAD_131>",
2766
+ "<FAKE_PAD_132>",
2767
+ "<FAKE_PAD_133>",
2768
+ "<FAKE_PAD_134>",
2769
+ "<FAKE_PAD_135>",
2770
+ "<FAKE_PAD_136>",
2771
+ "<FAKE_PAD_137>",
2772
+ "<FAKE_PAD_138>",
2773
+ "<FAKE_PAD_139>",
2774
+ "<FAKE_PAD_140>",
2775
+ "<FAKE_PAD_141>",
2776
+ "<FAKE_PAD_142>",
2777
+ "<FAKE_PAD_143>",
2778
+ "<FAKE_PAD_144>",
2779
+ "<FAKE_PAD_145>",
2780
+ "<FAKE_PAD_146>",
2781
+ "<FAKE_PAD_147>",
2782
+ "<FAKE_PAD_148>",
2783
+ "<FAKE_PAD_149>",
2784
+ "<FAKE_PAD_150>",
2785
+ "<FAKE_PAD_151>",
2786
+ "<FAKE_PAD_152>",
2787
+ "<FAKE_PAD_153>",
2788
+ "<FAKE_PAD_154>",
2789
+ "<FAKE_PAD_155>",
2790
+ "<FAKE_PAD_156>",
2791
+ "<FAKE_PAD_157>",
2792
+ "<FAKE_PAD_158>",
2793
+ "<FAKE_PAD_159>",
2794
+ "<FAKE_PAD_160>",
2795
+ "<FAKE_PAD_161>",
2796
+ "<FAKE_PAD_162>",
2797
+ "<FAKE_PAD_163>",
2798
+ "<FAKE_PAD_164>",
2799
+ "<FAKE_PAD_165>",
2800
+ "<FAKE_PAD_166>",
2801
+ "<FAKE_PAD_167>",
2802
+ "<FAKE_PAD_168>",
2803
+ "<FAKE_PAD_169>",
2804
+ "<FAKE_PAD_170>",
2805
+ "<FAKE_PAD_171>",
2806
+ "<FAKE_PAD_172>",
2807
+ "<FAKE_PAD_173>",
2808
+ "<FAKE_PAD_174>",
2809
+ "<FAKE_PAD_175>",
2810
+ "<FAKE_PAD_176>",
2811
+ "<FAKE_PAD_177>",
2812
+ "<FAKE_PAD_178>",
2813
+ "<FAKE_PAD_179>",
2814
+ "<FAKE_PAD_180>",
2815
+ "<FAKE_PAD_181>",
2816
+ "<FAKE_PAD_182>",
2817
+ "<FAKE_PAD_183>",
2818
+ "<FAKE_PAD_184>",
2819
+ "<FAKE_PAD_185>",
2820
+ "<FAKE_PAD_186>",
2821
+ "<FAKE_PAD_187>",
2822
+ "<FAKE_PAD_188>",
2823
+ "<FAKE_PAD_189>",
2824
+ "<FAKE_PAD_190>",
2825
+ "<FAKE_PAD_191>",
2826
+ "<FAKE_PAD_192>",
2827
+ "<FAKE_PAD_193>",
2828
+ "<FAKE_PAD_194>",
2829
+ "<FAKE_PAD_195>",
2830
+ "<FAKE_PAD_196>",
2831
+ "<FAKE_PAD_197>",
2832
+ "<FAKE_PAD_198>",
2833
+ "<FAKE_PAD_199>",
2834
+ "<FAKE_PAD_200>",
2835
+ "<FAKE_PAD_201>",
2836
+ "<FAKE_PAD_202>",
2837
+ "<FAKE_PAD_203>",
2838
+ "<FAKE_PAD_204>",
2839
+ "<FAKE_PAD_205>",
2840
+ "<FAKE_PAD_206>",
2841
+ "<FAKE_PAD_207>",
2842
+ "<FAKE_PAD_208>",
2843
+ "<FAKE_PAD_209>",
2844
+ "<FAKE_PAD_210>",
2845
+ "<FAKE_PAD_211>",
2846
+ "<FAKE_PAD_212>",
2847
+ "<FAKE_PAD_213>",
2848
+ "<FAKE_PAD_214>",
2849
+ "<FAKE_PAD_215>",
2850
+ "<FAKE_PAD_216>",
2851
+ "<FAKE_PAD_217>",
2852
+ "<FAKE_PAD_218>",
2853
+ "<FAKE_PAD_219>",
2854
+ "<FAKE_PAD_220>",
2855
+ "<FAKE_PAD_221>",
2856
+ "<FAKE_PAD_222>",
2857
+ "<FAKE_PAD_223>",
2858
+ "<FAKE_PAD_224>",
2859
+ "<FAKE_PAD_225>",
2860
+ "<FAKE_PAD_226>",
2861
+ "<FAKE_PAD_227>",
2862
+ "<FAKE_PAD_228>",
2863
+ "<FAKE_PAD_229>",
2864
+ "<FAKE_PAD_230>",
2865
+ "<FAKE_PAD_231>",
2866
+ "<FAKE_PAD_232>",
2867
+ "<FAKE_PAD_233>",
2868
+ "<FAKE_PAD_234>",
2869
+ "<FAKE_PAD_235>",
2870
+ "<FAKE_PAD_236>",
2871
+ "<FAKE_PAD_237>",
2872
+ "<FAKE_PAD_238>",
2873
+ "<FAKE_PAD_239>",
2874
+ "<FAKE_PAD_240>",
2875
+ "<FAKE_PAD_241>",
2876
+ "<FAKE_PAD_242>",
2877
+ "<FAKE_PAD_243>",
2878
+ "<FAKE_PAD_244>",
2879
+ "<FAKE_PAD_245>",
2880
+ "<FAKE_PAD_246>",
2881
+ "<FAKE_PAD_247>",
2882
+ "<FAKE_PAD_248>",
2883
+ "<FAKE_PAD_249>",
2884
+ "<FAKE_PAD_250>",
2885
+ "<FAKE_PAD_251>",
2886
+ "<FAKE_PAD_252>",
2887
+ "<FAKE_PAD_253>",
2888
+ "<audio>",
2889
+ "</audio>",
2890
+ "<AUDIO_CONTEXT>",
2891
+ "<interrupt>",
2892
+ "<FAKE_PAD_PAD_0>",
2893
+ "<FAKE_PAD_PAD_1>",
2894
+ "<FAKE_PAD_PAD_2>",
2895
+ "<FAKE_PAD_PAD_3>",
2896
+ "<FAKE_PAD_PAD_4>",
2897
+ "<FAKE_PAD_PAD_5>",
2898
+ "<FAKE_PAD_PAD_6>",
2899
+ "<FAKE_PAD_PAD_7>",
2900
+ "<FAKE_PAD_PAD_8>",
2901
+ "<FAKE_PAD_PAD_9>",
2902
+ "<FAKE_PAD_PAD_10>",
2903
+ "<FAKE_PAD_PAD_11>",
2904
+ "<FAKE_PAD_PAD_12>",
2905
+ "<FAKE_PAD_PAD_13>",
2906
+ "<FAKE_PAD_PAD_14>",
2907
+ "<FAKE_PAD_PAD_15>",
2908
+ "<FAKE_PAD_PAD_16>",
2909
+ "<FAKE_PAD_PAD_17>",
2910
+ "<FAKE_PAD_PAD_18>",
2911
+ "<FAKE_PAD_PAD_19>",
2912
+ "<FAKE_PAD_PAD_20>",
2913
+ "<FAKE_PAD_PAD_21>",
2914
+ "<FAKE_PAD_PAD_22>",
2915
+ "<FAKE_PAD_PAD_23>",
2916
+ "<FAKE_PAD_PAD_24>",
2917
+ "<FAKE_PAD_PAD_25>",
2918
+ "<FAKE_PAD_PAD_26>",
2919
+ "<FAKE_PAD_PAD_27>"
2920
+ ],
2921
+ "bos_token": null,
2922
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and 
enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
2923
+ "clean_up_tokenization_spaces": false,
2924
+ "eos_token": "<|im_end|>",
2925
+ "errors": "replace",
2926
+ "model_max_length": 4096,
2927
+ "pad_token": "<|endoftext|>",
2928
+ "split_special_tokens": false,
2929
+ "tokenizer_class": "Qwen2Tokenizer",
2930
+ "unk_token": null
2931
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff
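
For readers skimming this commit: the tokenizer_config.json diff above registers the audio-interaction special tokens (`<audio>`, `</audio>`, `<AUDIO_CONTEXT>`, `<interrupt>`) at ids 151936–151939, a long run of `<FAKE_PAD_*>` / `<FAKE_PAD_PAD_*>` entries that appear to simply pad the vocabulary out to a fixed size, and a Qwen-style chat template (`<|im_start|>` / `<|im_end|>`, `Qwen2Tokenizer`, `eos_token` = `<|im_end|>`) with an optional `enable_thinking` flag. The sketch below is illustrative only and is not part of the commit; it assumes the uploaded files load through `AutoTokenizer` with `trust_remote_code=True`, and the repo id `sensenova/InteractiveOmni-8B` is a placeholder for whichever checkpoint this upload targets.

```python
# Minimal sketch (not from this repo): inspect the special tokens and chat
# template added in this commit. Assumes the files load via AutoTokenizer;
# the repo id below is an assumption and may need to be changed.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "sensenova/InteractiveOmni-8B", trust_remote_code=True
)

# The audio-interaction tokens registered in tokenizer_config.json should map
# to the ids shown in the diff (151936..151939).
for tok in ["<audio>", "</audio>", "<AUDIO_CONTEXT>", "<interrupt>"]:
    print(tok, tokenizer.convert_tokens_to_ids(tok))

# Recent transformers versions forward extra kwargs (such as enable_thinking)
# to the Jinja chat template, matching the template added in this commit.
messages = [{"role": "user", "content": "Describe the audio clip."}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
)
print(prompt)
```

If the tokenizer loads as expected, the printed ids should match the entries added in this diff, and the rendered prompt should wrap the conversation in `<|im_start|>` / `<|im_end|>` markers, ending with an empty `<think>` block when thinking is disabled.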