pushing the projector weights only
- .gitattributes +1 -0
- config.json +22 -32
- generation_config.json +1 -1
- model.safetensors +3 -0
- special_tokens_map.json +7 -1
- tokenizer.json +0 -0
- ultravox_config.py +0 -2
- ultravox_model.py +49 -43
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json
CHANGED
@@ -1,9 +1,19 @@
 {
-  "_name_or_path": "…",
+  "_name_or_path": "fixie-ai/ultravox-v0_4-llama-3_1-70b",
   "architectures": [
     "UltravoxModel"
   ],
   "audio_model_id": "openai/whisper-medium",
+  "audio_model_lora_config": {
+    "lora_alpha": 8,
+    "r": 0,
+    "target_modules": [
+      "k_proj",
+      "q_proj",
+      "linear_k",
+      "linear_q"
+    ]
+  },
   "audio_token_index": 32000,
   "auto_map": {
     "AutoConfig": "ultravox_config.UltravoxConfig",
@@ -27,38 +37,18 @@
   "pad_token_id": 128009,
   "projector_act": "swiglu",
   "stack_factor": 8,
-  "text_config": {
-    …
-    ],
-    "hidden_size": 8192,
-    "intermediate_size": 28672,
-    "max_position_embeddings": 131072,
-    "model_type": "llama",
-    "num_attention_heads": 64,
-    "num_hidden_layers": 80,
-    "num_key_value_heads": 8,
-    "rms_norm_eps": 1e-05,
-    "rope_scaling": {
-      "factor": 8.0,
-      "high_freq_factor": 4.0,
-      "low_freq_factor": 1.0,
-      "original_max_position_embeddings": 8192,
-      "rope_type": "llama3"
-    },
-    "rope_theta": 500000.0,
-    "torch_dtype": "bfloat16",
-    "vocab_size": 128256
+  "text_model_id": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+  "text_model_lora_config": {
+    "lora_alpha": 8,
+    "r": 0,
+    "target_modules": [
+      "k_proj",
+      "q_proj",
+      "linear_k",
+      "linear_q"
+    ]
   },
-  "text_model_id": null,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.…",
+  "transformers_version": "4.45.0",
   "vocab_size": 128256
 }
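Net effect of the config change: the inlined Llama `text_config` block is gone, replaced by a `text_model_id` pointer to the base checkpoint, and both LoRA blocks are recorded with `r: 0` (a rank of zero, i.e. no adapter weights to store). A minimal inspection sketch, assuming network access; `trust_remote_code=True` is needed because `auto_map` routes `AutoConfig` to the repo's own `ultravox_config.UltravoxConfig`, and attribute names are assumed to mirror the JSON keys above:

import transformers

# Fetch only the config (no weights are downloaded).
cfg = transformers.AutoConfig.from_pretrained(
    "fixie-ai/ultravox-v0_4-llama-3_1-70b", trust_remote_code=True
)
print(cfg.audio_model_id)  # openai/whisper-medium
print(cfg.text_model_id)   # meta-llama/Meta-Llama-3.1-70B-Instruct
print(cfg.text_model_lora_config)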
generation_config.json
CHANGED
@@ -7,5 +7,5 @@
     128009
   ],
   "pad_token_id": 128009,
-  "transformers_version": "4.…"
+  "transformers_version": "4.45.0"
 }
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a09b15fd86a2015f62df808820cea641aa974968732c290d7c725d41631ba67
+size 100696544
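The new file is a Git LFS pointer to a roughly 100 MB safetensors checkpoint, which matches the commit title: only the projector weights live in this repo. A small verification sketch, assuming the LFS object has been pulled locally; the expected key prefix follows from the `save_pretrained` change in ultravox_model.py below:

from safetensors import safe_open

# Lazily open the checkpoint and list the tensors it contains.
with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    keys = list(f.keys())

print(len(keys), "tensors")
# Expectation for this commit: every key starts with "multi_modal_projector."
print(all(k.startswith("multi_modal_projector.") for k in keys))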
special_tokens_map.json
CHANGED
@@ -13,5 +13,11 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": …
+  "pad_token": {
+    "content": "<|eot_id|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }
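The pad token is now declared as a full added-token object (`<|eot_id|>`) rather than a bare value. A quick check of what a consumer sees after this change; the repo id is taken from `_name_or_path` in config.json above, not from this file:

import transformers

tok = transformers.AutoTokenizer.from_pretrained("fixie-ai/ultravox-v0_4-llama-3_1-70b")
print(tok.pad_token)     # expected: <|eot_id|>
print(tok.pad_token_id)  # expected: 128009, matching pad_token_id in config.json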
tokenizer.json
CHANGED
The diff for this file is too large to render. See raw diff.
ultravox_config.py
CHANGED
@@ -99,7 +99,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
         audio_model_id: Optional[str] = None,
         text_model_id: Optional[str] = None,
         ignore_index: int = -100,
-        audio_token_index: int = 32000,
         hidden_size: int = 4096,
         stack_factor: int = 8,
         norm_init: float = 0.4,
@@ -112,7 +111,6 @@

         self.audio_model_id = audio_model_id
         self.text_model_id = text_model_id
-        self.audio_token_index = audio_token_index

         self.hidden_size = hidden_size
         self.stack_factor = stack_factor
ultravox_model.py
CHANGED
@@ -51,36 +51,18 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         self.vocab_size = config.vocab_size

         self.audio_tower = self._create_audio_tower(config)
-        self.multi_modal_projector = UltravoxProjector(config)
+        self.multi_modal_projector = self._create_multi_modal_projector(config)
         self.language_model = self._create_language_model(config)

-        # Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
-        # some of the layer types are not found in the model.
+        # Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
+        # FSDP throws an error if some of the layer types are not found in the model.
         # This would be something like ["LlamaDecoderLayer", "WhisperEncoderLayer"]
-        self._no_split_modules = (
-            self.language_model._no_split_modules + self.audio_tower._no_split_modules
+        self._no_split_modules = (self.language_model._no_split_modules or []) + (
+            self.audio_tower._no_split_modules or []
         )

         self.loss_config = LossConfig()
         self.post_init()
-        self.multi_modal_projector.to(dtype=config.torch_dtype)
-
-    def save_pretrained(
-        self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs
-    ):
-        if state_dict is None:
-            state_dict = super().state_dict()
-
-        named_params = dict(self.named_parameters())
-
-        state_dict = {
-            k: v
-            for k, v in state_dict.items()
-            if k in self.keep_params
-            or (k in named_params and named_params[k].requires_grad)
-        }
-
-        super().save_pretrained(*args, state_dict=state_dict, **kwargs)

     def get_input_embeddings(self):
         return self.language_model.get_input_embeddings()
@@ -290,6 +272,14 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):

         return model_input

+    @classmethod
+    def _create_multi_modal_projector(
+        cls, config: UltravoxConfig
+    ) -> "UltravoxProjector":
+        projector = UltravoxProjector(config)
+        projector.to(config.torch_dtype)
+        return projector
+
     @classmethod
     def _create_audio_tower(
         cls, config: UltravoxConfig
@@ -311,7 +301,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         # we only ever use from_config if the weights are retrained, hence initializing is not
         # required. This makes the model quite creation faster since init on CPU is quite slow.
         audio_tower = transformers.AutoModel.from_config(
-            config.audio_config
+            config.audio_config
         )

         if isinstance(
@@ -341,14 +331,18 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         # we only ever use from_config if the weights are retrained, hence initializing is not
         # required. This makes the model quite creation faster since init on CPU is quite slow.
         language_model = transformers.AutoModelForCausalLM.from_config(
-            config.text_config,
+            config.text_config,
+            attn_implementation=config._attn_implementation,
+            torch_dtype=config.torch_dtype,
         )

         language_model = apply_lora(language_model, config.text_model_lora_config)
         return language_model

-    def _add_language_model_weights_to_keep(self):
-        if self.config.text_model_id is not None:
+    def merge_and_unload(self):
+        if isinstance(self.language_model, peft.PeftModel):
+            self.language_model = self.language_model.merge_and_unload()
+            # no need to download base language model weights anymore, so we can remove the id
             self.config.text_model_id = None
             self.keep_params.update(
                 set(
@@ -359,8 +353,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
                 )
             )

-    def _add_audio_tower_weights_to_keep(self):
-        if self.config.audio_model_id is not None:
+        if isinstance(self.audio_tower, peft.PeftModel):
+            self.audio_tower = self.audio_tower.merge_and_unload()
+            # no need to download base audio model weights anymore, so we can remove the id
             self.config.audio_model_id = None
             self.keep_params.update(
                 set(
@@ -371,17 +366,6 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
                 )
             )

-    def merge_and_unload(self):
-        if isinstance(self.language_model, peft.PeftModel):
-            self.language_model = self.language_model.merge_and_unload()
-            # no need to download base language model weights anymore, so we can remove the id
-            self._add_language_model_weights_to_keep()
-
-        if isinstance(self.audio_tower, peft.PeftModel):
-            self.audio_tower = self.audio_tower.merge_and_unload()
-            # no need to download base audio model weights anymore, so we can remove the id
-            self._add_audio_tower_weights_to_keep()
-
         for param in ["text_model_lora_config", "audio_model_lora_config"]:
             if hasattr(self.config, param):
                 delattr(self.config, param)
@@ -391,6 +375,31 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         self.to(self.language_model.dtype)
         return super().push_to_hub(*args, **kwargs)

+    def save_pretrained(
+        self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs
+    ):
+        if state_dict is None:
+            state_dict = {}
+            for module, keep in [
+                ("multi_modal_projector", True),
+                ("audio_tower", self.config.audio_model_id is None),
+                ("language_model", self.config.text_model_id is None),
+            ]:
+                if keep:
+                    state_dict.update(
+                        {
+                            f"{module}.{name}": param
+                            for name, param in getattr(self, module)
+                            .state_dict()
+                            .items()
+                        }
+                    )
+
+        super().save_pretrained(*args, state_dict=state_dict, **kwargs)
+
+    def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs):
+        self.keep_params.update(set(state_dict.keys()))
+
     def print_trainable_parameters(self):
         """
         Prints the number of trainable parameters in the model (reuses Peft model's method)
@@ -419,9 +428,6 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
            f" || Projector: {100 * projector_trainable_params / projector_all_params:.1f}%"
         )

-    def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs):
-        self.keep_params.update(set(state_dict.keys()))
-

 def is_cache_empty(
     past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]]
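Taken together, the new `save_pretrained` serializes only the submodules whose base weights cannot be re-downloaded from another repo: the projector is always kept, while the audio tower and language model are skipped whenever `audio_model_id` / `text_model_id` are set (as they now are in config.json). A self-contained toy sketch of that filtering idea, with hypothetical module names standing in for the real towers:

import torch.nn as nn

# Hypothetical stand-ins for the real submodules of UltravoxModel.
class ToyMultimodal(nn.Module):
    def __init__(self):
        super().__init__()
        self.multi_modal_projector = nn.Linear(8, 8)  # always saved
        self.audio_tower = nn.Linear(8, 8)            # skipped when audio_model_id is set
        self.language_model = nn.Linear(8, 8)         # skipped when text_model_id is set

model = ToyMultimodal()
audio_model_id = "openai/whisper-medium"                  # set -> audio weights omitted
text_model_id = "meta-llama/Meta-Llama-3.1-70B-Instruct"  # set -> LM weights omitted

# Same selection logic as the new save_pretrained above.
state_dict = {}
for module, keep in [
    ("multi_modal_projector", True),
    ("audio_tower", audio_model_id is None),
    ("language_model", text_model_id is None),
]:
    if keep:
        state_dict.update(
            {f"{module}.{k}": v for k, v in getattr(model, module).state_dict().items()}
        )

print(sorted(state_dict))  # only multi_modal_projector.* keys remain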