pushing the projector weights only
- .gitattributes +1 -0
- config.json +22 -32
- generation_config.json +1 -1
- model.safetensors +3 -0
- special_tokens_map.json +7 -1
- tokenizer.json +0 -0
- ultravox_config.py +0 -2
- ultravox_model.py +49 -43
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json
CHANGED
@@ -1,9 +1,19 @@
 {
-  "_name_or_path": "…",
+  "_name_or_path": "fixie-ai/ultravox-v0_4-llama-3_1-70b",
   "architectures": [
     "UltravoxModel"
   ],
   "audio_model_id": "openai/whisper-medium",
+  "audio_model_lora_config": {
+    "lora_alpha": 8,
+    "r": 0,
+    "target_modules": [
+      "k_proj",
+      "q_proj",
+      "linear_k",
+      "linear_q"
+    ]
+  },
   "audio_token_index": 32000,
   "auto_map": {
     "AutoConfig": "ultravox_config.UltravoxConfig",
@@ -27,38 +37,18 @@
   "pad_token_id": 128009,
   "projector_act": "swiglu",
   "stack_factor": 8,
-  "text_config": {
-    …
-    ],
-    "hidden_size": 8192,
-    "intermediate_size": 28672,
-    "max_position_embeddings": 131072,
-    "model_type": "llama",
-    "num_attention_heads": 64,
-    "num_hidden_layers": 80,
-    "num_key_value_heads": 8,
-    "rms_norm_eps": 1e-05,
-    "rope_scaling": {
-      "factor": 8.0,
-      "high_freq_factor": 4.0,
-      "low_freq_factor": 1.0,
-      "original_max_position_embeddings": 8192,
-      "rope_type": "llama3"
-    },
-    "rope_theta": 500000.0,
-    "torch_dtype": "bfloat16",
-    "vocab_size": 128256
+  "text_model_id": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+  "text_model_lora_config": {
+    "lora_alpha": 8,
+    "r": 0,
+    "target_modules": [
+      "k_proj",
+      "q_proj",
+      "linear_k",
+      "linear_q"
+    ]
   },
-  "text_model_id": null,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.…",
+  "transformers_version": "4.45.0",
   "vocab_size": 128256
 }
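Net effect of the config change: the inlined Llama `text_config` block is gone, replaced by a `text_model_id` pointer to the base checkpoint, and both LoRA blocks are recorded with `r: 0` (a rank of zero, i.e. no adapter weights to store). A minimal inspection sketch, assuming network access; `trust_remote_code=True` is needed because `auto_map` routes `AutoConfig` to the repo's own `ultravox_config.UltravoxConfig`, and attribute names are assumed to mirror the JSON keys above:

import transformers

# Fetch only the config (no weights are downloaded).
cfg = transformers.AutoConfig.from_pretrained(
    "fixie-ai/ultravox-v0_4-llama-3_1-70b", trust_remote_code=True
)
print(cfg.audio_model_id)  # openai/whisper-medium
print(cfg.text_model_id)   # meta-llama/Meta-Llama-3.1-70B-Instruct
print(cfg.text_model_lora_config)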
generation_config.json
CHANGED
@@ -7,5 +7,5 @@
     128009
   ],
   "pad_token_id": 128009,
-  "transformers_version": "4.…"
+  "transformers_version": "4.45.0"
 }
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a09b15fd86a2015f62df808820cea641aa974968732c290d7c725d41631ba67
+size 100696544
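The new file is a Git LFS pointer to a roughly 100 MB safetensors checkpoint, which matches the commit title: only the projector weights live in this repo. A small verification sketch, assuming the LFS object has been pulled locally; the expected key prefix follows from the `save_pretrained` change in ultravox_model.py below:

from safetensors import safe_open

# Lazily open the checkpoint and list the tensors it contains.
with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    keys = list(f.keys())

print(len(keys), "tensors")
# Expectation for this commit: every key starts with "multi_modal_projector."
print(all(k.startswith("multi_modal_projector.") for k in keys))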
special_tokens_map.json
CHANGED
@@ -13,5 +13,11 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": …
+  "pad_token": {
+    "content": "<|eot_id|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }
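The pad token is now declared as a full added-token object (`<|eot_id|>`) rather than a bare value. A quick check of what a consumer sees after this change; the repo id is taken from `_name_or_path` in config.json above, not from this file:

import transformers

tok = transformers.AutoTokenizer.from_pretrained("fixie-ai/ultravox-v0_4-llama-3_1-70b")
print(tok.pad_token)     # expected: <|eot_id|>
print(tok.pad_token_id)  # expected: 128009, matching pad_token_id in config.json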
tokenizer.json
CHANGED
The diff for this file is too large to render. See raw diff.
ultravox_config.py
CHANGED
@@ -99,7 +99,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
         audio_model_id: Optional[str] = None,
         text_model_id: Optional[str] = None,
         ignore_index: int = -100,
-        audio_token_index: int = 32000,
         hidden_size: int = 4096,
         stack_factor: int = 8,
         norm_init: float = 0.4,
@@ -112,7 +111,6 @@

         self.audio_model_id = audio_model_id
         self.text_model_id = text_model_id
-        self.audio_token_index = audio_token_index

         self.hidden_size = hidden_size
         self.stack_factor = stack_factor
ultravox_model.py
CHANGED
@@ -51,36 +51,18 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         self.vocab_size = config.vocab_size

         self.audio_tower = self._create_audio_tower(config)
-        self.multi_modal_projector = UltravoxProjector(config)
+        self.multi_modal_projector = self._create_multi_modal_projector(config)
         self.language_model = self._create_language_model(config)

-        # Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
-        # some of the layer types are not found in the model.
+        # Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
+        # FSDP throws an error if some of the layer types are not found in the model.
         # This would be something like ["LlamaDecoderLayer", "WhisperEncoderLayer"]
-        self._no_split_modules = (
-            self.language_model._no_split_modules + self.audio_tower._no_split_modules
+        self._no_split_modules = (self.language_model._no_split_modules or []) + (
+            self.audio_tower._no_split_modules or []
         )

         self.loss_config = LossConfig()
         self.post_init()
-        self.multi_modal_projector.to(dtype=config.torch_dtype)
-
-    def save_pretrained(
-        self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs
-    ):
-        if state_dict is None:
-            state_dict = super().state_dict()
-
-        named_params = dict(self.named_parameters())
-
-        state_dict = {
-            k: v
-            for k, v in state_dict.items()
-            if k in self.keep_params
-            or (k in named_params and named_params[k].requires_grad)
-        }
-
-        super().save_pretrained(*args, state_dict=state_dict, **kwargs)

     def get_input_embeddings(self):
         return self.language_model.get_input_embeddings()
@@ -290,6 +272,14 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):

         return model_input

+    @classmethod
+    def _create_multi_modal_projector(
+        cls, config: UltravoxConfig
+    ) -> "UltravoxProjector":
+        projector = UltravoxProjector(config)
+        projector.to(config.torch_dtype)
+        return projector
+
     @classmethod
     def _create_audio_tower(
         cls, config: UltravoxConfig
@@ -311,7 +301,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         # we only ever use from_config if the weights are retrained, hence initializing is not
         # required. This makes the model quite creation faster since init on CPU is quite slow.
         audio_tower = transformers.AutoModel.from_config(
-            config.audio_config
+            config.audio_config
         )

         if isinstance(
@@ -341,14 +331,18 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         # we only ever use from_config if the weights are retrained, hence initializing is not
         # required. This makes the model quite creation faster since init on CPU is quite slow.
         language_model = transformers.AutoModelForCausalLM.from_config(
-            config.text_config,
+            config.text_config,
+            attn_implementation=config._attn_implementation,
+            torch_dtype=config.torch_dtype,
         )

         language_model = apply_lora(language_model, config.text_model_lora_config)
         return language_model

-    def _add_language_model_weights_to_keep(self):
-        if self.config.text_model_id is not None:
+    def merge_and_unload(self):
+        if isinstance(self.language_model, peft.PeftModel):
+            self.language_model = self.language_model.merge_and_unload()
+            # no need to download base language model weights anymore, so we can remove the id
             self.config.text_model_id = None
             self.keep_params.update(
                 set(
@@ -359,8 +353,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
                 )
             )

-    def _add_audio_tower_weights_to_keep(self):
-        if self.config.audio_model_id is not None:
+        if isinstance(self.audio_tower, peft.PeftModel):
+            self.audio_tower = self.audio_tower.merge_and_unload()
+            # no need to download base audio model weights anymore, so we can remove the id
             self.config.audio_model_id = None
             self.keep_params.update(
                 set(
@@ -371,17 +366,6 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
                 )
             )

-    def merge_and_unload(self):
-        if isinstance(self.language_model, peft.PeftModel):
-            self.language_model = self.language_model.merge_and_unload()
-            # no need to download base language model weights anymore, so we can remove the id
-            self._add_language_model_weights_to_keep()
-
-        if isinstance(self.audio_tower, peft.PeftModel):
-            self.audio_tower = self.audio_tower.merge_and_unload()
-            # no need to download base audio model weights anymore, so we can remove the id
-            self._add_audio_tower_weights_to_keep()
-
         for param in ["text_model_lora_config", "audio_model_lora_config"]:
             if hasattr(self.config, param):
                 delattr(self.config, param)
@@ -391,6 +375,31 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         self.to(self.language_model.dtype)
         return super().push_to_hub(*args, **kwargs)

+    def save_pretrained(
+        self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs
+    ):
+        if state_dict is None:
+            state_dict = {}
+            for module, keep in [
+                ("multi_modal_projector", True),
+                ("audio_tower", self.config.audio_model_id is None),
+                ("language_model", self.config.text_model_id is None),
+            ]:
+                if keep:
+                    state_dict.update(
+                        {
+                            f"{module}.{name}": param
+                            for name, param in getattr(self, module)
+                            .state_dict()
+                            .items()
+                        }
+                    )
+
+        super().save_pretrained(*args, state_dict=state_dict, **kwargs)
+
+    def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs):
+        self.keep_params.update(set(state_dict.keys()))
+
     def print_trainable_parameters(self):
         """
         Prints the number of trainable parameters in the model (reuses Peft model's method)
@@ -419,9 +428,6 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
            f" || Projector: {100 * projector_trainable_params / projector_all_params:.1f}%"
         )

-    def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs):
-        self.keep_params.update(set(state_dict.keys()))
-

 def is_cache_empty(
     past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]]
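Taken together, the new `save_pretrained` serializes only the submodules whose base weights cannot be re-downloaded from another repo: the projector is always kept, while the audio tower and language model are skipped whenever `audio_model_id` / `text_model_id` are set (as they now are in config.json). A self-contained toy sketch of that filtering idea, with hypothetical module names standing in for the real towers:

import torch.nn as nn

# Hypothetical stand-ins for the real submodules of UltravoxModel.
class ToyMultimodal(nn.Module):
    def __init__(self):
        super().__init__()
        self.multi_modal_projector = nn.Linear(8, 8)  # always saved
        self.audio_tower = nn.Linear(8, 8)            # skipped when audio_model_id is set
        self.language_model = nn.Linear(8, 8)         # skipped when text_model_id is set

model = ToyMultimodal()
audio_model_id = "openai/whisper-medium"                  # set -> audio weights omitted
text_model_id = "meta-llama/Meta-Llama-3.1-70B-Instruct"  # set -> LM weights omitted

# Same selection logic as the new save_pretrained above.
state_dict = {}
for module, keep in [
    ("multi_modal_projector", True),
    ("audio_tower", audio_model_id is None),
    ("language_model", text_model_id is None),
]:
    if keep:
        state_dict.update(
            {f"{module}.{k}": v for k, v in getattr(model, module).state_dict().items()}
        )

print(sorted(state_dict))  # only multi_modal_projector.* keys remain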