mazesmazes
/

tiny-audio

@@ -1,4 +1,3 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 tokenizer_config.json -filter -diff -merge text
-tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *.safetensors filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 tokenizer_config.json -filter -diff -merge text

asr_modeling.py CHANGED Viewed

@@ -38,7 +38,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
     _is_loading_from_pretrained: bool = False
     _pretrained_model_path: Optional[str] = None
-    TRANSCRIBE_PROMPT = "Please transcribe this audio into text: "
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> "ASRModel":
@@ -543,7 +543,10 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             messages: list[dict[str, str]] = []
             if system_prompt:
                 messages.append({"role": "system", "content": system_prompt})
-            messages.append({"role": "user", "content": self.TRANSCRIBE_PROMPT + audio_placeholder})
             chat_result = self.tokenizer.apply_chat_template(
                 messages,
@@ -618,7 +621,10 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         messages: list[dict[str, str]] = []
         if system_prompt:
             messages.append({"role": "system", "content": system_prompt})
-        messages.append({"role": "user", "content": self.TRANSCRIBE_PROMPT + audio_placeholder})
         chat_result = self.tokenizer.apply_chat_template(
             messages,
@@ -778,6 +784,8 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             shutil.copy(asr_file, save_dir / asr_file.name)
         # Copy projectors module
         shutil.copy(src_dir / "projectors.py", save_dir / "projectors.py")
     def push_to_hub(self, repo_id: str, **kwargs) -> str:
         """Push model to HuggingFace Hub, ensuring adapter_config points to repo.

     _is_loading_from_pretrained: bool = False
     _pretrained_model_path: Optional[str] = None
+    TRANSCRIBE_PROMPT = "Transcribe speech to text"  # Audio tokens come BEFORE this
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> "ASRModel":
             messages: list[dict[str, str]] = []
             if system_prompt:
                 messages.append({"role": "system", "content": system_prompt})
+            # Put audio BEFORE the prompt: with causal attention, prompt tokens can then attend to it
+            messages.append(
+                {"role": "user", "content": audio_placeholder + " " + self.TRANSCRIBE_PROMPT}
+            )
             chat_result = self.tokenizer.apply_chat_template(
                 messages,
         messages: list[dict[str, str]] = []
         if system_prompt:
             messages.append({"role": "system", "content": system_prompt})
+        # Put audio BEFORE the prompt: with causal attention, prompt tokens can then attend to it
+        messages.append(
+            {"role": "user", "content": audio_placeholder + " " + self.TRANSCRIBE_PROMPT}
+        )
         chat_result = self.tokenizer.apply_chat_template(
             messages,
             shutil.copy(asr_file, save_dir / asr_file.name)
         # Copy projectors module
         shutil.copy(src_dir / "projectors.py", save_dir / "projectors.py")
+        # Copy diarization module
+        shutil.copy(src_dir / "diarization.py", save_dir / "diarization.py")
     def push_to_hub(self, repo_id: str, **kwargs) -> str:
         """Push model to HuggingFace Hub, ensuring adapter_config points to repo.

asr_processing.py CHANGED Viewed

@@ -17,7 +17,7 @@ class ASRProcessor(ProcessorMixin):
     feature_extractor_class = "AutoFeatureExtractor"
     tokenizer_class = "AutoTokenizer"
     AUDIO_TOKEN = "<audio>"
-    TRANSCRIBE_PROMPT = "Transcribe: "
     # Default conv layers for Whisper/GLM-ASR: [(pad, kernel, stride), ...]
     DEFAULT_ENCODER_CONV_LAYERS = [(1, 3, 1), (1, 3, 2)]
@@ -89,10 +89,11 @@ class ASRProcessor(ProcessorMixin):
         else:
             num_audio_tokens = 0
-        # Build prompt with audio token placeholders
-        user_content = self.TRANSCRIBE_PROMPT
         if num_audio_tokens > 0:
-            user_content += self.AUDIO_TOKEN * num_audio_tokens
         messages = []
         if system_prompt:

     feature_extractor_class = "AutoFeatureExtractor"
     tokenizer_class = "AutoTokenizer"
     AUDIO_TOKEN = "<audio>"
+    TRANSCRIBE_PROMPT = "Transcribe speech to text"
     # Default conv layers for Whisper/GLM-ASR: [(pad, kernel, stride), ...]
     DEFAULT_ENCODER_CONV_LAYERS = [(1, 3, 1), (1, 3, 2)]
         else:
             num_audio_tokens = 0
+        # Build prompt with audio token placeholders (audio BEFORE prompt)
         if num_audio_tokens > 0:
+            user_content = self.AUDIO_TOKEN * num_audio_tokens + " " + self.TRANSCRIBE_PROMPT
+        else:
+            user_content = self.TRANSCRIBE_PROMPT
         messages = []
         if system_prompt:

projectors.py CHANGED Viewed

@@ -33,11 +33,12 @@ class MLPAudioProjector(nn.Module):
         encoder_dim = getattr(config, "encoder_dim", 768)
         llm_dim = getattr(config, "llm_dim", 2048)
-        self.k = getattr(config, "projector_pool_stride", 2)
         # Frame stacking: concat k adjacent frames then project
         in_dim = encoder_dim * self.k
-        hidden_dim = llm_dim
         self.linear_1 = nn.Linear(in_dim, hidden_dim)
         self.act = nn.GELU()
         self.linear_2 = nn.Linear(hidden_dim, llm_dim)
@@ -85,6 +86,7 @@ class SimpleAdapter(nn.Module):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Run ``x`` through ``fc1``, the activation, then ``fc2`` and return the result."""
         activated = self.act(self.fc1(x))
         return self.fc2(activated)
 class MOSAProjector(nn.Module):
     """MOSA-Base projector: simple 2-layer ReLU router with 4 simple adapters.
@@ -126,7 +128,10 @@ class MOSAProjector(nn.Module):
         # --- 3. Experts (Simple 2-layer GELU adapters) ---
         # Each expert: llm_dim -> hidden -> llm_dim (much smaller than frame-stacking)
         self.experts = nn.ModuleList(
-            [SimpleAdapter(self.llm_dim, adapter_hidden, self.llm_dim) for _ in range(self.num_experts)]
         )
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -149,18 +154,15 @@ class MOSAProjector(nn.Module):
         routing_weights = F.softmax(self.router(x), dim=-1)  # (B, out_len, num_experts)
         # --- 3. Expert Mixture (Dense Execution) ---
-        expert_outputs = torch.stack(
-            [expert(x) for expert in self.experts]
-        )  # (E, B, out_len, D)
         return torch.einsum("ebsd, bse -> bsd", expert_outputs, routing_weights)
     def get_output_length(self, input_length: int) -> int:
         """Calculate output sequence length after Conv1d downsampling (4x reduction)."""
         # Conv1d with stride 2, kernel 3, padding 1: out = (in + 2*1 - 3) // 2 + 1 = (in - 1) // 2 + 1
         # Applied twice for 4x total reduction
-        length = (input_length + 2 * 1 - 3) // 2 + 1  # First conv
-        length = (length + 2 * 1 - 3) // 2 + 1  # Second conv
-        return length
 # =============================================================================

         encoder_dim = getattr(config, "encoder_dim", 768)
         llm_dim = getattr(config, "llm_dim", 2048)
+        self.k = getattr(config, "projector_pool_stride", 4)
         # Frame stacking: concat k adjacent frames then project
+        # Hidden dim uses 2x expansion like GLM-ASR's GlmAsrMultiModalProjector
         in_dim = encoder_dim * self.k
+        hidden_dim = llm_dim * 2
         self.linear_1 = nn.Linear(in_dim, hidden_dim)
         self.act = nn.GELU()
         self.linear_2 = nn.Linear(hidden_dim, llm_dim)
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Run ``x`` through ``fc1``, the activation, then ``fc2`` and return the result."""
         activated = self.act(self.fc1(x))
         return self.fc2(activated)
 class MOSAProjector(nn.Module):
     """MOSA-Base projector: simple 2-layer ReLU router with 4 simple adapters.
         # --- 3. Experts (Simple 2-layer GELU adapters) ---
         # Each expert: llm_dim -> hidden -> llm_dim (much smaller than frame-stacking)
         self.experts = nn.ModuleList(
+            [
+                SimpleAdapter(self.llm_dim, adapter_hidden, self.llm_dim)
+                for _ in range(self.num_experts)
+            ]
         )
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         routing_weights = F.softmax(self.router(x), dim=-1)  # (B, out_len, num_experts)
         # --- 3. Expert Mixture (Dense Execution) ---
+        expert_outputs = torch.stack([expert(x) for expert in self.experts])  # (E, B, out_len, D)
         return torch.einsum("ebsd, bse -> bsd", expert_outputs, routing_weights)
     def get_output_length(self, input_length: int) -> int:
         """Calculate output sequence length after Conv1d downsampling (4x reduction).

         The stride-2 length formula is applied twice, once per convolution.

         Args:
             input_length: Sequence length fed to the first convolution.

         Returns:
             Sequence length remaining after both convolutions.
         """
         # Conv1d with stride 2, kernel 3, padding 1: out = (in + 2*1 - 3) // 2 + 1 = (in - 1) // 2 + 1
         # Applied twice for 4x total reduction
+        after_conv1 = (input_length + 2 * 1 - 3) // 2 + 1
+        return (after_conv1 + 2 * 1 - 3) // 2 + 1
 # =============================================================================