cfli
/

test_minicpm

Text Generation

Transformers

custom_code

Model card Files Files and versions

xet

Community

cfli committed on Mar 4, 2024

Commit

cbcf91e

verified ·

1 Parent(s): 183b636

Update modeling_minicpm.py

Browse files

Files changed (1) hide show

modeling_minicpm.py +172 -111

modeling_minicpm.py CHANGED Viewed

@@ -36,7 +36,8 @@ from transformers.modeling_attn_mask_utils import (
     _prepare_4d_causal_attention_mask,
     _prepare_4d_causal_attention_mask_for_sdpa,
 )
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
 from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
 from transformers.utils import (
@@ -57,7 +58,6 @@ try:
 except:
     pass
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
 if is_torch_fx_available():
@@ -66,7 +66,6 @@ if is_torch_fx_available():
     _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
 logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "MiniCPMConfig"
@@ -92,7 +91,7 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
 def _make_causal_mask(
-    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
 ):
     warnings.warn(
         "Calling `transformers.models.minicpm.modeling_minicpm._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.minicpm.modeling_minicpm.AttentionMaskConverter._make_causal_mask"
@@ -101,6 +100,7 @@ def _make_causal_mask(
         input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length
     )
 # @torch.jit.script  # type: ignore
 def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float):
     old_dtype = hidden.dtype
@@ -193,7 +193,7 @@ class MiniCPMDynamicNTKScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
         if seq_len > self.max_position_embeddings:
             base = self.base * (
-                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
             ) ** (self.dim / (self.dim - 2))
             inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
             self.register_buffer("inv_freq", inv_freq, persistent=False)
@@ -211,7 +211,7 @@ class MiniCPMDynamicNTKScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
     x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
     return torch.cat((-x2, x1), dim=-1)
@@ -249,6 +249,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin)
     return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype)
 class MiniCPMMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -295,7 +296,6 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 class MiniCPMAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
@@ -363,14 +363,14 @@ class MiniCPMAttention(nn.Module):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
     def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         if "padding_mask" in kwargs:
             warnings.warn(
@@ -463,7 +463,7 @@ class MiniCPMAttention(nn.Module):
         if not output_attentions:
             attn_weights = None
         return attn_output, attn_weights, past_key_value
@@ -483,14 +483,14 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
     def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         # MiniCPMFlashAttention2 attention does not support output_attentions
         if "padding_mask" in kwargs:
@@ -571,7 +571,7 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         return attn_output, attn_weights, past_key_value
     def _flash_attention_forward(
-        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
     ):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
@@ -675,13 +675,13 @@ class MiniCPMSdpaAttention(MiniCPMAttention):
     # Adapted from MiniCPMAttention.forward
     def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         if output_attentions:
             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
@@ -774,14 +774,14 @@ class MiniCPMDecoderLayer(nn.Module):
         self.num_hidden_layers = config.num_hidden_layers
     def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        **kwargs,
     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
         """
         Args:
@@ -814,7 +814,7 @@ class MiniCPMDecoderLayer(nn.Module):
             use_cache=use_cache,
             **kwargs,
         )
         hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
         # Fully Connected
@@ -952,7 +952,7 @@ MINICPM_INPUTS_DOCSTRING = r"""
     "The bare MiniCPM Model outputting raw hidden-states without any specific head on top.",
     MINICPM_START_DOCSTRING,
 )
-class MiniCPMModel(MiniCPMPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniCPMDecoderLayer`]
@@ -986,17 +986,17 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
     @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
     def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        layer_cutoff: Optional[int] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1066,11 +1066,21 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         all_self_attns = () if output_attentions else None
         next_decoder_cache = None
         for idx, decoder_layer in enumerate(self.layers):
-            if layer_cutoff is not None and idx == layer_cutoff:
                 break
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
             if self.gradient_checkpointing and self.training:
                 layer_outputs = self._gradient_checkpointing_func(
@@ -1103,7 +1113,7 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         hidden_states = self.norm(hidden_states)
         # add hidden states from the last decoder layer
-        if output_hidden_states:
             all_hidden_states += (hidden_states,)
         next_cache = None
@@ -1119,14 +1129,21 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         )
-class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
     def __init__(self, config):
         super().__init__(config)
-        self.model = MiniCPMModel(config)
         self.vocab_size = config.vocab_size
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         # Initialize weights and apply final processing
         self.post_init()
@@ -1152,18 +1169,19 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
     @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        layer_cutoff: Optional[int] = None,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         r"""
         Args:
@@ -1196,6 +1214,19 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model(
             input_ids=input_ids,
@@ -1205,32 +1236,62 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
             output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-            layer_cutoff=layer_cutoff
         )
         hidden_states = outputs[0]
-        if self.config.pretraining_tp > 1:
-            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
-            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
-            logits = torch.cat(logits, dim=-1)
         else:
-            logits = self.lm_head(hidden_states / (self.config.hidden_size / self.config.dim_model_base))
-        logits = logits.float()
         loss = None
-        if labels is not None:
             # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            shift_logits = shift_logits.view(-1, self.config.vocab_size)
-            shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
-            shift_labels = shift_labels.to(shift_logits.device)
-            loss = loss_fct(shift_logits, shift_labels)
         if not return_dict:
             output = (logits,) + outputs[1:]
@@ -1245,7 +1306,7 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
         )
     def prepare_inputs_for_generation(
-        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
@@ -1261,7 +1322,7 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
             # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
             # input)
             if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
-                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
             # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
             # input_ids based on the past_length.
             elif past_length < input_ids.shape[1]:
@@ -1270,9 +1331,9 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
             # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
             if (
-                max_cache_length is not None
-                and attention_mask is not None
-                and cache_length + input_ids.shape[1] > max_cache_length
             ):
                 attention_mask = attention_mask[:, -max_cache_length:]
@@ -1282,7 +1343,7 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
             if past_key_values:
-                position_ids = position_ids[:, -input_ids.shape[1] :]
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and past_key_values is None:
@@ -1308,7 +1369,7 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
                 tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
             )
         return reordered_past
     @torch.inference_mode()
     def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
              max_length: int = 4096, num_beams=1, do_sample=True, top_p=0.8, temperature=0.3, logits_processor=None,
@@ -1317,11 +1378,11 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
             history = []
         if logits_processor:
             gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
-                        "temperature": temperature, "logits_processor": logits_processor, **kwargs}
         else:
             gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
-                        "temperature": temperature, "logits_processor": logits_processor, **kwargs}
         history.append({"role": role, "content": query})
         history_str = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=False)
         inputs = tokenizer(history_str, return_tensors='pt').to(self.device)
@@ -1369,17 +1430,17 @@ class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
     @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
     def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
     ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):

     _prepare_4d_causal_attention_mask,
     _prepare_4d_causal_attention_mask_for_sdpa,
 )
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, \
+    SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
 from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
 from transformers.utils import (
 except:
     pass
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
 if is_torch_fx_available():
     _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
 logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "MiniCPMConfig"
 def _make_causal_mask(
+        input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
 ):
     warnings.warn(
         "Calling `transformers.models.minicpm.modeling_minicpm._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.minicpm.modeling_minicpm.AttentionMaskConverter._make_causal_mask"
         input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length
     )
 # @torch.jit.script  # type: ignore
 def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float):
     old_dtype = hidden.dtype
         if seq_len > self.max_position_embeddings:
             base = self.base * (
+                    (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
             ) ** (self.dim / (self.dim - 2))
             inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
             self.register_buffer("inv_freq", inv_freq, persistent=False)
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
     x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2:]
     return torch.cat((-x2, x1), dim=-1)
     k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin)
     return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype)
 class MiniCPMMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 class MiniCPMAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
     def forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_value: Optional[Cache] = None,
+            output_attentions: bool = False,
+            use_cache: bool = False,
+            **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         if "padding_mask" in kwargs:
             warnings.warn(
         if not output_attentions:
             attn_weights = None
         return attn_output, attn_weights, past_key_value
         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
     def forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.LongTensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_value: Optional[Cache] = None,
+            output_attentions: bool = False,
+            use_cache: bool = False,
+            **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         # MiniCPMFlashAttention2 attention does not support output_attentions
         if "padding_mask" in kwargs:
         return attn_output, attn_weights, past_key_value
     def _flash_attention_forward(
+            self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
     ):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
     # Adapted from MiniCPMAttention.forward
     def forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_value: Optional[Cache] = None,
+            output_attentions: bool = False,
+            use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         if output_attentions:
             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
         self.num_hidden_layers = config.num_hidden_layers
     def forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            output_attentions: Optional[bool] = False,
+            use_cache: Optional[bool] = False,
+            **kwargs,
     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
         """
         Args:
             use_cache=use_cache,
             **kwargs,
         )
         hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
         # Fully Connected
     "The bare MiniCPM Model outputting raw hidden-states without any specific head on top.",
     MINICPM_START_DOCSTRING,
 )
+class LayerWiseMiniCPMModel(MiniCPMPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniCPMDecoderLayer`]
     @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
     def forward(
+            self,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+            cutoff_layers: Optional[Union[int, List]] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
         all_self_attns = () if output_attentions else None
         next_decoder_cache = None
+        if cutoff_layers is None:
+            max_layer = self.config.num_hidden_layers
+            cutoff_layers = [max_layer]
+        if isinstance(cutoff_layers, int):
+            max_layer = cutoff_layers
+            cutoff_layers = [cutoff_layers]
+        else:
+            max_layer = max(cutoff_layers)
         for idx, decoder_layer in enumerate(self.layers):
+            if idx in cutoff_layers and output_hidden_states:
+                all_hidden_states += (self.norm(hidden_states),)
+            if idx == max_layer:
                 break
             if self.gradient_checkpointing and self.training:
                 layer_outputs = self._gradient_checkpointing_func(
         hidden_states = self.norm(hidden_states)
         # add hidden states from the last decoder layer
+        if output_hidden_states and self.config.num_hidden_layers == max_layer:
             all_hidden_states += (hidden_states,)
         next_cache = None
         )
+class LayerWiseMiniCPMForCausalLM(MiniCPMPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
     def __init__(self, config):
         super().__init__(config)
+        self.model = LayerWiseMiniCPMModel(config)
         self.vocab_size = config.vocab_size
+        if not self.config.classifier_multi:
+            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        else:
+            self.lm_head = nn.ModuleList([nn.Linear(
+                config.hidden_size, config.vocab_size, bias=False) for _ in range(
+                self.config.start_layer,
+                self.model.config.num_hidden_layers)])
         # Initialize weights and apply final processing
         self.post_init()
     @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
+            self,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            labels: Optional[torch.LongTensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+            cutoff_layers: Optional[Union[int, List]] = None,
+            only_for_one_logit: Optional[int] = None
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         r"""
         Args:
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if cutoff_layers is None:
+            cutoff_layers = [self.config.num_hidden_layers]
+        elif isinstance(cutoff_layers, int):
+            cutoff_layers = [cutoff_layers]
+        remove_layers = [i for i in cutoff_layers if self.config.start_layer > i or i > self.config.num_hidden_layers]
+        if len(remove_layers) > 0:
+            logger.warning_once(
+                f"layers {remove_layers} is incompatible with the setting. They will be removed..."
+            )
+        cutoff_layers = [i for i in cutoff_layers if i not in remove_layers]
         # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model(
             input_ids=input_ids,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
             output_attentions=output_attentions,
+            output_hidden_states=True,
             return_dict=return_dict,
+            cutoff_layers=cutoff_layers
         )
         hidden_states = outputs[0]
+        all_logits = ()
+        if only_for_one_logit is None:
+            for i in range(len(outputs.hidden_states)):
+                if self.config.classifier_multi == False:
+                    if self.config.pretraining_tp > 1:
+                        lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+                        logits = [F.linear(outputs.hidden_states[i], lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+                        logits = torch.cat(logits, dim=-1)
+                    else:
+                        logits = self.lm_head(outputs.hidden_states[i] / (self.config.hidden_size / self.config.dim_model_base))
+                else:
+                    if self.config.pretraining_tp > 1:
+                        lm_head_slices = self.lm_head[cutoff_layers[i] - self.config.start_layer].weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+                        logits = [F.linear(outputs.hidden_states[i], lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+                        logits = torch.cat(logits, dim=-1)
+                    else:
+                        logits = self.lm_head[cutoff_layers[i] - self.config.start_layer](outputs.hidden_states[i] / (self.config.hidden_size / self.config.dim_model_base))
+                logits = logits.float()
+                all_logits = all_logits + (logits, )
         else:
+            if self.config.classifier_multi == False:
+                lm_head_slices = self.lm_head.weight.split(1, dim=0)
+                for i in range(len(outputs.hidden_states)):
+                    logits = F.linear(outputs.hidden_states[i], lm_head_slices[only_for_one_logit])
+                    logits = logits.float()
+                    all_logits = all_logits + (logits,)
+            else:
+                for i in range(len(outputs.hidden_states)):
+                    lm_head_slices = self.lm_head[cutoff_layers[i] - self.config.start_layer].weight.split(1, dim=0)
+                    logits = F.linear(outputs.hidden_states[i], lm_head_slices[only_for_one_logit])
+                    logits = logits.float()
+                    all_logits = all_logits + (logits, )
         loss = None
+        if labels is not None and not only_for_one_logit:
             # Shift so that tokens < n predict n
+            loss = 0
+            for logits in all_logits:
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+                # Flatten the tokens
+                loss_fct = CrossEntropyLoss()
+                shift_logits = shift_logits.view(-1, self.config.vocab_size)
+                shift_labels = shift_labels.view(-1)
+                # Enable model parallelism
+                shift_labels = shift_labels.to(shift_logits.device)
+                loss += loss_fct(shift_logits, shift_labels)
+        outputs.hidden_states = None if not output_hidden_states else outputs.hidden_states
         if not return_dict:
             output = (logits,) + outputs[1:]
         )
     def prepare_inputs_for_generation(
+            self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
             # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
             # input)
             if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
             # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
             # input_ids based on the past_length.
             elif past_length < input_ids.shape[1]:
             # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
             if (
+                    max_cache_length is not None
+                    and attention_mask is not None
+                    and cache_length + input_ids.shape[1] > max_cache_length
             ):
                 attention_mask = attention_mask[:, -max_cache_length:]
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
             if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1]:]
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and past_key_values is None:
                 tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
             )
         return reordered_past
     @torch.inference_mode()
     def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
              max_length: int = 4096, num_beams=1, do_sample=True, top_p=0.8, temperature=0.3, logits_processor=None,
             history = []
         if logits_processor:
             gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
+                          "temperature": temperature, "logits_processor": logits_processor, **kwargs}
         else:
             gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
+                          "temperature": temperature, "logits_processor": logits_processor, **kwargs}
         history.append({"role": role, "content": query})
         history_str = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=False)
         inputs = tokenizer(history_str, return_tensors='pt').to(self.device)
     @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING)
     def forward(
+            self,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            labels: Optional[torch.LongTensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
     ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):