Upload DeepSeekV2Lite model
modeling_deepseek.py CHANGED (+28 -28)
@@ -566,7 +566,7 @@ class DeepseekV2MoE(nn.Module):
             )

     def forward_original(self, hidden_states):
-        """
+        """Original forward method, kept for comparison and rollback"""
         identity = hidden_states
         orig_shape = hidden_states.shape
         topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
@@ -590,20 +590,20 @@ class DeepseekV2MoE(nn.Module):

     def forward(self, hidden_states):
         """
-
-
+        Forward pass implementing dense backward:
+        The forward output is identical to the official implementation (the sparse computation result), but gradients flow back through the dense computation during the backward pass.

-        Dense Backward
-        1.
-        2.
-        3.
-        4.
+        Dense Backward mechanism:
+        1. During the forward pass, only the top-k experts contribute to the output (sparse forward).
+        2. During the backward pass, gradients flow through the dense computation over all experts (dense backward).
+        3. Uses a straight-through gradient trick: sparse_output.detach() + (dense_output - dense_output.detach()).
+        4. Uses register_hook so that each expert is only updated by the tokens that actually routed to it.

         Args:
-            hidden_states:
+            hidden_states: Input tensor of shape (batch_size, seq_length, hidden_dim)

         Returns:
-
+            Output tensor of shape (batch_size, seq_length, hidden_dim)
         """
         batch_size, seq_length, hidden_dim = hidden_states.shape
         dtype = hidden_states.dtype
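The straight-through combination named in this docstring can be exercised in isolation. The following standalone sketch (toy scalars, not part of modeling_deepseek.py) confirms that the combined tensor takes its value from the sparse path while its gradient comes from the dense path:

import torch

# Toy parameter feeding both a "sparse" and a "dense" result.
w = torch.tensor(2.0, requires_grad=True)
sparse_out = 3.0 * w   # stand-in for the top-k (sparse) expert mix
dense_out = 5.0 * w    # stand-in for the all-experts (dense) mix

# Straight-through: value of sparse_out, gradient of dense_out.
final = sparse_out.detach() + (dense_out - dense_out.detach())

print(final.item())    # 6.0 -> matches the sparse forward value
final.backward()
print(w.grad.item())   # 5.0 -> gradient flows through the dense path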
@@ -612,14 +612,14 @@ class DeepseekV2MoE(nn.Module):
         identity = hidden_states
         orig_shape = hidden_states.shape

-        # Step 1:
+        # Step 1: Compute routing logic, select top-k experts
         topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
         flat_hidden = hidden_states.view(-1, hidden_dim) # (B*seq_len, hidden_dim)
         N_tokens = flat_hidden.size(0)
         flat_topk_idx = topk_idx.view(-1)

-        # Step 2:
-        #
+        # Step 2: Compute complete routing weights (for dense backward)
+        # Note: V2 version forces float32 computation
         router_logits = F.linear(
             flat_hidden.type(torch.float32),
             self.gate.weight.type(torch.float32),
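Step 2 rebuilds a routing distribution over every expert from the gate's weight matrix, instead of reusing only the top-k weights returned by self.gate; the unchanged lines between 625 and 632 that turn router_logits into routing_weights are not shown in this diff. A minimal standalone sketch of the assumed continuation (float32 logits followed by a softmax over the expert dimension, then a cast back to the activation dtype):

import torch
import torch.nn.functional as F

n_tokens, hidden_dim, n_experts = 4, 8, 6
flat_hidden = torch.randn(n_tokens, hidden_dim, dtype=torch.bfloat16)
gate_weight = torch.randn(n_experts, hidden_dim, dtype=torch.bfloat16)

# Logits for every expert, in float32 as in the hunk above.
router_logits = F.linear(flat_hidden.type(torch.float32),
                         gate_weight.type(torch.float32))

# Assumed continuation (not shown in the diff): softmax over experts,
# then cast back to the activation dtype.
routing_weights = F.softmax(router_logits, dim=-1).to(dtype=flat_hidden.dtype)

print(routing_weights.shape)        # torch.Size([4, 6])
print(routing_weights.sum(dim=-1))  # each row sums to ~1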
@@ -632,49 +632,49 @@ class DeepseekV2MoE(nn.Module):

         routing_weights = routing_weights.to(dtype=dtype)

-        # Step 3:
+        # Step 3: Prepare output accumulators
         dense_outputs = torch.zeros((N_tokens, hidden_dim), dtype=dtype, device=device)
         sparse_outputs = torch.zeros((N_tokens, hidden_dim), dtype=dtype, device=device)

-        # Step 4:
+        # Step 4: Compute for each expert
         if self.training:
-            #
+            # Training mode: implement dense backward
             for expert_idx in range(self.config.n_routed_experts):
-                # V2
+                # V2 version experts may be None (in EP mode)
                 if self.experts[expert_idx] is None:
                     continue

                 expert_layer = self.experts[expert_idx]

-                #
+                # Compute current expert output for all tokens (dense computation)
                 expert_output = expert_layer(flat_hidden) # (N_tokens, hidden_dim)

-                #
+                # Create activation mask: mark which tokens selected this expert
                 activation_mask = (topk_idx == expert_idx).any(dim=1).float().unsqueeze(-1).to(dtype)

-                #
+                # Register hook: only selected tokens can pass gradients to this expert
                 if expert_output.requires_grad:
                     expert_output.register_hook(lambda grad, mask=activation_mask: grad * mask)

                 expert_output = expert_output.to(dtype=dtype)

-                # Dense accumulation:
+                # Dense accumulation: use complete routing weights
                 weight_full = routing_weights[:, expert_idx].unsqueeze(-1) # (N_tokens, 1)
                 dense_outputs = dense_outputs + expert_output * weight_full

-                # Sparse accumulation:
+                # Sparse accumulation: only accumulate selected expert outputs
                 matches = (topk_idx == expert_idx)
                 if matches.any():
                     token_indices, k_indices = torch.where(matches)
                     weights_topk = topk_weight[token_indices, k_indices].unsqueeze(-1).to(sparse_outputs.dtype) # (num_matches, 1)
                     sparse_outputs[token_indices] = sparse_outputs[token_indices] + expert_output[token_indices] * weights_topk
         else:
-            #
+            # Inference mode: use original sparse computation logic
             sparse_outputs = self.moe_infer(flat_hidden, topk_idx, topk_weight)
-            #
+            # Dense outputs not needed during inference
             dense_outputs = sparse_outputs

-        # Step 5:
+        # Step 5: Add shared experts (if any)
         if self.config.n_shared_experts is not None:
             shared_expert_output = self.shared_experts(identity)
             sparse_outputs = sparse_outputs.view(*orig_shape) + shared_expert_output
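The register_hook call above is what reconciles the dense forward computation with sparse expert updates: every expert processes every token, but the hook zeroes the incoming gradient for tokens that did not route to that expert, so the expert's parameters are only updated from its own tokens. A standalone sketch of the mechanism with a toy expert (hypothetical shapes, not the real DeepseekV2MLP):

import torch

torch.manual_seed(0)
n_tokens, hidden_dim = 4, 3
expert = torch.nn.Linear(hidden_dim, hidden_dim)  # toy stand-in for one expert MLP
flat_hidden = torch.randn(n_tokens, hidden_dim)

# Pretend only tokens 0 and 2 routed to this expert.
activation_mask = torch.tensor([[1.0], [0.0], [1.0], [0.0]])  # (n_tokens, 1)

expert_output = expert(flat_hidden)  # dense: computed for all tokens
# Zero the gradient of non-selected tokens before it reaches the expert's weights.
expert_output.register_hook(lambda grad, mask=activation_mask: grad * mask)

expert_output.sum().backward()
# Only tokens 0 and 2 contribute to the weight gradient.
print(expert.weight.grad)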
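Step 4's two accumulators differ only in which tokens and which weights they use for a given expert: dense_outputs weights every token by the full routing probability, while sparse_outputs adds the expert's output only for tokens that selected it, weighted by the gate's top-k weight. A small standalone sketch of the two accumulation rules for a single expert (toy tensors, names chosen to mirror the hunk above):

import torch

n_tokens, hidden_dim, top_k = 4, 3, 2
expert_idx = 1
expert_output = torch.randn(n_tokens, hidden_dim)                 # this expert, all tokens
routing_weights = torch.softmax(torch.randn(n_tokens, 6), dim=-1) # full probabilities over 6 experts
topk_idx = torch.tensor([[1, 3], [0, 2], [1, 5], [4, 0]])         # top-2 expert ids per token
topk_weight = torch.rand(n_tokens, top_k)                         # gate weights for the top-2

dense_outputs = torch.zeros(n_tokens, hidden_dim)
sparse_outputs = torch.zeros(n_tokens, hidden_dim)

# Dense accumulation: every token, weighted by the full routing probability.
dense_outputs += expert_output * routing_weights[:, expert_idx].unsqueeze(-1)

# Sparse accumulation: only tokens whose top-k set contains this expert.
matches = (topk_idx == expert_idx)
token_indices, k_indices = torch.where(matches)
weights_topk = topk_weight[token_indices, k_indices].unsqueeze(-1)
sparse_outputs[token_indices] += expert_output[token_indices] * weights_topk

print(dense_outputs.abs().sum(dim=-1))   # nonzero for every token
print(sparse_outputs.abs().sum(dim=-1))  # nonzero only for tokens 0 and 2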
@@ -683,11 +683,11 @@ class DeepseekV2MoE(nn.Module):
         sparse_outputs = sparse_outputs.view(*orig_shape)
         dense_outputs = dense_outputs.view(*orig_shape)

-        # Step 6:
+        # Step 6: Use straight-through gradient technique to combine sparse forward and dense backward
         if self.training:
-            #
+            # Forward uses sparse, backward uses dense
             final_output = sparse_outputs.detach() + (dense_outputs - dense_outputs.detach())
-            #
+            # Add auxiliary loss
             final_output = AddAuxiliaryLoss.apply(final_output, aux_loss)
         else:
             final_output = sparse_outputs
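AddAuxiliaryLoss is defined elsewhere in modeling_deepseek.py and is not part of this diff; it returns the activations unchanged while wiring the gate's load-balancing loss into the backward pass. The sketch below shows one common way such a pass-through autograd.Function is written; it is an illustration under that assumption, not the file's actual definition:

import torch

class AddAuxLossSketch(torch.autograd.Function):
    """Identity on the activations; injects a gradient of 1 into aux_loss."""

    @staticmethod
    def forward(ctx, x, aux_loss):
        ctx.loss_requires_grad = aux_loss.requires_grad
        ctx.loss_dtype = aux_loss.dtype
        return x

    @staticmethod
    def backward(ctx, grad_output):
        grad_loss = None
        if ctx.loss_requires_grad:
            grad_loss = torch.ones(1, dtype=ctx.loss_dtype, device=grad_output.device)
        return grad_output, grad_loss

x = torch.randn(2, 3, requires_grad=True)
aux_loss = torch.tensor([0.5], requires_grad=True)  # toy load-balancing loss

out = AddAuxLossSketch.apply(x, aux_loss)
out.sum().backward()

print(torch.equal(out, x))  # True: the forward output is unchanged
print(aux_loss.grad)        # tensor([1.]): the auxiliary loss still trains the gate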