Update modeling_minicpm.py
modeling_minicpm.py  +93 -42
@@ -70,13 +70,14 @@ from functools import lru_cache
 def compressed_attention(
     q: torch.Tensor,
     k: torch.Tensor,
-
+    k2: torch.Tensor,
     kernel_size: int,
     kernel_stride: int,
     block_size: int,
     topk: int,
     cu_seqlens_q: torch.Tensor,
     cu_seqlens_k: torch.Tensor,
+    cu_seqlens_k2: torch.Tensor,
     max_seqlen_q: int,
     max_seqlen_k: int,
     sm_scale: float = None,
@@ -106,9 +107,10 @@ def compressed_attention(
     score = infllmv2_attn_stage1(
         q.contiguous(),
         k.contiguous(),
-
+        k2.contiguous(),
         cu_seqlens_q=cu_seqlens_q,
         cu_seqlens_k=cu_seqlens_k,
+        cu_seqlens_v=cu_seqlens_k2,
         max_seqlen_q=max_seqlen_q,
         max_seqlen_k=max_seqlen_k,
         causal=is_prefilling
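`compressed_attention` now scores queries against two compressed key streams: the existing fine-grained one (`k`, `cu_seqlens_k`) and a coarser one (`k2`, `cu_seqlens_k2`), which the stage-1 kernel receives through its `cu_seqlens_v` argument. The sketch below only illustrates the varlen bookkeeping behind the two `cu_seqlens` tensors; it does not call the fused `infllmv2_attn_stage1` kernel, and the window-count formula is an assumption (the real counts come from `calc_chunks_with_stride`), using the 32/16 and 128/64 kernel/stride pairs that appear as defaults later in this diff.

```python
import torch

def num_windows(seq_len: int, kernel_size: int, kernel_stride: int) -> int:
    # Assumed sliding-window count: one compressed position per full window.
    return 0 if seq_len < kernel_size else (seq_len - kernel_size) // kernel_stride + 1

def compressed_cu_seqlens(raw_lens, kernel_size, kernel_stride):
    # Same cu_seqlens convention as the varlen attention APIs: [0, n0, n0+n1, ...], int32.
    counts = torch.tensor([num_windows(n, kernel_size, kernel_stride) for n in raw_lens], dtype=torch.int32)
    return torch.cat([torch.zeros(1, dtype=torch.int32), torch.cumsum(counts, dim=0, dtype=torch.int32)])

raw_lens = [300, 1024]  # two sequences packed into one varlen batch
cu_seqlens_k = compressed_cu_seqlens(raw_lens, kernel_size=32, kernel_stride=16)    # fine level  -> k
cu_seqlens_k2 = compressed_cu_seqlens(raw_lens, kernel_size=128, kernel_stride=64)  # coarse level -> k2
print(cu_seqlens_k)   # tensor([ 0, 17, 80], dtype=torch.int32)
print(cu_seqlens_k2)  # tensor([ 0,  3, 18], dtype=torch.int32)
```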
@@ -142,12 +144,10 @@ def compressed_attention(
 def calc_chunks_with_stride(cu_seqlen, chunk_size, kernel_stride):
     """
     Compute the chunks that require Sparse attention, with stride support.
-
     Args:
         cu_seqlen (torch.Tensor): Cumulative sequence lengths for each sample.
         chunk_size (int): Chunk size used for Sparse attention.
         kernel_stride (int): Stride size when sliding over the sequence.
-
     Returns:
         filtered_indices (torch.Tensor): Indices used to directly index into the key/value tensors.
         cu_seqlens_compressed (torch.Tensor): Cumulative sequence lengths after compression.
@@ -190,7 +190,6 @@ class CompressK(torch.nn.Module):
     def __init__(self, head_num_k, head_dim, kernel_size, kernel_stride=16):
         """
         Module for compressing key (K) representations.
-
         Args:
             head_num_k (int): Number of key attention heads.
             head_dim (int): Dimension of each attention head.
@@ -206,15 +205,12 @@ class CompressK(torch.nn.Module):
     def forward(self, k: torch.Tensor, cu_seqlens):
         """
         Forward pass for compressing the key (K) tensor.
-
         Args:
             k (torch.Tensor): Input key tensor of shape (total_seq_len, num_heads, head_dim).
             cu_seqlens (torch.Tensor): Cumulative sequence lengths for each sample in the batch, typically used for handling variable-length sequences.
-
         Returns:
             compress_k (torch.Tensor): Compressed key tensor.
             cu_seqlens_compressed (torch.Tensor): Updated cumulative sequence lengths after compression.
-
         """
         # Compute chunk-related metadata, with stride support
         filtered_k_indices, cu_seqlens_compressed = calc_chunks_with_stride(
@@ -241,6 +237,11 @@ class InfLLMv2CacheLayer(DynamicLayer):
         self.no_compress_k_cache = []
         self.cached_compressed_cu_seqlens = torch.tensor([], dtype=torch.int32)
         self.compress_k_cache_varlen = torch.tensor([], dtype=torch.float32)
+        # Add support for compress_k2
+        self.compress_k2_cache = []
+        self.cached_compressed_cu_seqlens2 = torch.tensor([], dtype=torch.int32)
+        self.compress_k2_cache_varlen = torch.tensor([], dtype=torch.float32)
+        self.no_compress_k2_cache = []

     def update_no_rope_key(self, key_states):
         if self.no_rope_keys.numel() == 0:
@@ -282,6 +283,39 @@ class InfLLMv2CacheLayer(DynamicLayer):
                 k_chunk_list.append(None)
         return k_chunk_list

+    def update_compress_k2(self, key_states, cu_seqlens=None):
+        if len(self.compress_k2_cache) == 0:
+            if cu_seqlens is not None:
+                self.cached_compressed_cu_seqlens2 = cu_seqlens.clone()
+                self.compress_k2_cache_varlen = key_states
+                split_sizes = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+                self.compress_k2_cache = list(torch.split(key_states, split_sizes))
+        else:
+            for index, k in enumerate(key_states):
+                if k is not None:
+                    self.compress_k2_cache[index] = torch.cat([self.compress_k2_cache[index], k], dim=0)
+            new_seq_lens = torch.tensor([tensor.shape[0] for tensor in self.compress_k2_cache], dtype=torch.int32)
+            new_cumsum = torch.cumsum(new_seq_lens, dim=0, dtype=torch.int32)
+
+            self.compress_k2_cache_varlen = torch.cat(self.compress_k2_cache, dim=0)
+            self.cached_compressed_cu_seqlens2 = torch.cat([torch.tensor([0], dtype=torch.int32), new_cumsum]).to(self.compress_k2_cache_varlen.device)
+        return self.compress_k2_cache_varlen, self.cached_compressed_cu_seqlens2
+
+    def update_no_compress_k2(self, key_states, kernel_size=128, kernel_stride=64):
+        k_chunk_list = []
+        for index, k in enumerate(key_states):
+            if len(self.no_compress_k2_cache) <= index:
+                self.no_compress_k2_cache.append(k)
+            else:
+                self.no_compress_k2_cache[index] = torch.cat([self.no_compress_k2_cache[index], k], dim=0)
+            current_len = self.no_compress_k2_cache[index].shape[0]
+            if current_len >= kernel_size:
+                k_chunk_list.append(self.no_compress_k2_cache[index][:kernel_size])
+                self.no_compress_k2_cache[index] = self.no_compress_k2_cache[index][kernel_stride:]
+            else:
+                k_chunk_list.append(None)
+        return k_chunk_list
+
 class InfLLMv2Cache(DynamicCache):
     def __init__(self, config,num_hidden_layers: Optional[int] = None) -> None:
         super().__init__(config=config)
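The new buffers mirror the per-layer cache that already exists for the fine level: `update_compress_k2` keeps a flattened varlen copy plus per-sequence segments of the coarse compressed keys, while `update_no_compress_k2` is a rolling buffer that collects decode-step keys until a full coarse window is available. A toy, self-contained replay of that rolling-buffer rule (scaled down from the 128/64 defaults to 4/2 so the printout stays short; all shapes are made up):

```python
import torch

kernel_size, kernel_stride = 4, 2          # stands in for the 128/64 defaults of update_no_compress_k2
buffer = torch.empty(0, 2, 8)              # (n_tokens, n_heads_k, head_dim), hypothetical sizes

for step in range(6):
    new_key = torch.randn(1, 2, 8)         # each decode step appends one key for this sequence
    buffer = torch.cat([buffer, new_key], dim=0)
    if buffer.shape[0] >= kernel_size:
        window = buffer[:kernel_size]      # emitted so it can be compressed into one coarse key
        buffer = buffer[kernel_stride:]    # advance by the stride, keeping the overlap
        print(f"step {step}: emit window {tuple(window.shape)}, buffer keeps {buffer.shape[0]} tokens")
    else:
        print(f"step {step}: waiting ({buffer.shape[0]}/{kernel_size} tokens)")
```

Because the buffer advances by `kernel_stride` rather than `kernel_size`, consecutive emitted windows overlap by `kernel_size - kernel_stride` tokens, which matches the strided compression used at prefill time.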
@@ -303,6 +337,12 @@ class InfLLMv2Cache(DynamicCache):
     def update_no_compress_k(self, key_states, layer_idx, kernel_size=32, kernel_stride=16, cache_kwargs=None):
         return self.layers[layer_idx].update_no_compress_k(key_states, kernel_size, kernel_stride)

+    def update_compress_k2(self, key_states, layer_idx, cu_seqlens=None, cache_kwargs=None):
+        return self.layers[layer_idx].update_compress_k2(key_states, cu_seqlens)
+
+    def update_no_compress_k2(self, key_states, layer_idx, kernel_size=128, kernel_stride=64, cache_kwargs=None):
+        return self.layers[layer_idx].update_no_compress_k2(key_states, kernel_size, kernel_stride)
+
     def crop(self, max_length):
         for layer in self.layers:
             layer.crop(max_length)
@@ -489,7 +529,6 @@ def rotate_half(x):

 def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.
-
     Args:
         q (`torch.Tensor`): The query tensor.
         k (`torch.Tensor`): The key tensor.
@@ -860,7 +899,6 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
-
         Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
@@ -976,7 +1014,9 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
         self.local_blocks = self.window_size // self.block_size # local_blocks
         self.topk = self.config.sparse_config.get('topk', 64) + (self.window_size//self.block_size)
         self.use_nope = self.config.sparse_config.get('use_nope', False)
+
         self.compress_k = CompressK(self.num_key_value_heads, self.head_dim, kernel_size=self.kernel_size, kernel_stride=self.kernel_stride)
+        self.compress_k2 = CompressK(self.num_key_value_heads, self.head_dim, kernel_size=self.kernel_size*4, kernel_stride=self.kernel_stride*4)

     def forward(
         self,
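The attention module now owns two `CompressK` instances: the existing one at (`kernel_size`, `kernel_stride`) and a second one at four times that granularity, so with the 32/16 defaults used elsewhere in this diff the coarse level works on 128-token windows advanced by 64 tokens. The snippet below is a stand-in rather than the real module: it approximates what a strided key compressor produces (one pooled key per window) with a plain `unfold` plus mean, just to make the output shapes of the two granularities concrete.

```python
import torch

def meanpool_compress(k: torch.Tensor, kernel_size: int, kernel_stride: int) -> torch.Tensor:
    """Illustrative stand-in for CompressK on one sequence; k is (seq_len, n_heads_k, head_dim)."""
    if k.shape[0] < kernel_size:
        return k.new_zeros(0, *k.shape[1:])
    windows = k.unfold(0, kernel_size, kernel_stride)  # (num_windows, n_heads_k, head_dim, kernel_size)
    return windows.mean(dim=-1)                        # one pooled key per window

k = torch.randn(1024, 2, 64)                           # hypothetical unpadded keys for one sequence
fine = meanpool_compress(k, kernel_size=32, kernel_stride=16)
coarse = meanpool_compress(k, kernel_size=128, kernel_stride=64)
print(fine.shape, coarse.shape)                        # torch.Size([63, 2, 64]) torch.Size([15, 2, 64])
```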
@@ -1088,7 +1128,6 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
-
         Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
@@ -1114,7 +1153,7 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
         batch_size = query_states.shape[0]
         # assert batch_size == 1, 'Only batch_size=1 is supported at the moment.'
         if past_key_value!=None:
-            compressed_k, compressed_cu_seqlens = self.get_compress_k(
+            compressed_k, compressed_cu_seqlens, compressed_k2, compressed_cu_seqlens2 = self.get_compress_k(
                 key_states=key_states if self.use_nope ==False else no_rope_param['key_states_no_rope'], # This can be optimized a bit;
                 attention_mask=attention_mask,
                 past_key_value=past_key_value,
@@ -1135,6 +1174,10 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
         if past_key_value==None:
             # compress_k use varlen form
             compressed_k, compressed_cu_seqlens = self.compress_k(key_states,cu_seqlens_k)
+            compressed_k2, compressed_cu_seqlens2 = self.compress_k2(key_states,cu_seqlens_k)
+        else:
+            # compressed_k and compressed_k2 already retrieved from get_compress_k above
+            pass


         attn_output_unpad = self.sparse_forward(
@@ -1146,7 +1189,8 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
             max_seqlen_in_batch_q,
             max_seqlen_in_batch_k,
             no_rope_param=no_rope_param,
-            compressed_k=compressed_k, compressed_cu_seqlens=compressed_cu_seqlens
+            compressed_k=compressed_k, compressed_cu_seqlens=compressed_cu_seqlens,
+            compressed_k2=compressed_k2, compressed_cu_seqlens2=compressed_cu_seqlens2
         )

         attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
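Seen from the caller, the two paths stay symmetric: without a cache (prefill) both compression levels are computed directly from the varlen keys, and with a cache they come back, already maintained incrementally, from `get_compress_k`; either way all four values are handed to `sparse_forward`. A compressed control-flow sketch with the attention-specific pieces stubbed out (names and callables here are illustrative, not the module's API):

```python
from typing import Optional, Tuple


def pick_compressed_keys(
    past_key_value: Optional[object],
    compress_fine,    # callable: (keys, cu_seqlens) -> (compressed_k, cu_seqlens_k)
    compress_coarse,  # callable: (keys, cu_seqlens) -> (compressed_k2, cu_seqlens_k2)
    get_compress_k,   # cache-aware path that already returns all four values
    key_states,
    cu_seqlens_k,
) -> Tuple:
    if past_key_value is None:           # prefill: compress both levels from scratch
        compressed_k, cu_k = compress_fine(key_states, cu_seqlens_k)
        compressed_k2, cu_k2 = compress_coarse(key_states, cu_seqlens_k)
    else:                                # decode: the cache-aware helper supplies both levels
        compressed_k, cu_k, compressed_k2, cu_k2 = get_compress_k(key_states, cu_seqlens_k)
    return compressed_k, cu_k, compressed_k2, cu_k2


# Smoke test with placeholder callables; the returned strings only stand in for tensors.
fine = lambda k, cu: ("k", "cu_k")
coarse = lambda k, cu: ("k2", "cu_k2")
cached = lambda k, cu: ("k", "cu_k", "k2", "cu_k2")
assert pick_compressed_keys(None, fine, coarse, cached, None, None) == ("k", "cu_k", "k2", "cu_k2")
assert pick_compressed_keys(object(), fine, coarse, cached, None, None) == ("k", "cu_k", "k2", "cu_k2")
```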
@@ -1166,7 +1210,7 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
             no_rope_param: Optional parameter containing key states without rope

         Returns:
-            Tuple of (compressed_k, compressed_cu_seqlens)
+            Tuple of (compressed_k, compressed_cu_seqlens, compressed_k2, compressed_cu_seqlens2)
         """

         # Check if this is prefilling or initial compression condition
@@ -1182,9 +1226,12 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
             unpadded_key_states, indices, cu_seqlens, max_seqlen_in_batch = _unpad_one_tensor(key_states,attention_mask=attention_mask)
             # Compress the keys
             compressed_k, compressed_cu_seqlens = self.compress_k(unpadded_key_states, cu_seqlens)
+            compressed_k2, compressed_cu_seqlens2 = self.compress_k2(unpadded_key_states, cu_seqlens)

             past_key_value.update_compress_k(
                 compressed_k, self.layer_idx, compressed_cu_seqlens)
+            past_key_value.update_compress_k2(
+                compressed_k2, self.layer_idx, compressed_cu_seqlens2)

             no_compress_k_list = []
             # Compute and update no_compress_k
@@ -1196,6 +1243,17 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
             past_key_value.update_no_compress_k(
                 no_compress_k_list, self.layer_idx,kernel_stride=self.kernel_stride,
                 kernel_size=self.kernel_size)
+
+            # Also update no_compress_k2
+            no_compress_k2_list = []
+            for i in range(len(compressed_cu_seqlens2)-1):
+                no_compress_k2_start = (compressed_cu_seqlens2[i+1]- compressed_cu_seqlens2[i]) * self.kernel_stride * 4
+
+                no_compress_k2_list.append(unpadded_key_states[cu_seqlens[i]+no_compress_k2_start:cu_seqlens[i+1]].clone())
+
+            past_key_value.update_no_compress_k2(
+                no_compress_k2_list, self.layer_idx,kernel_stride=self.kernel_stride*4,
+                kernel_size=self.kernel_size*4)

         else:
             # Decode case: incremental update
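The tail offset follows from how many raw tokens the coarse windows have consumed: each coarse compressed position accounts for `kernel_stride * 4` tokens (64 with the defaults), so everything from `num_coarse_windows * 64` onward has not yet been folded into a full 128-token window and is buffered instead. A worked example, again assuming the sliding-window count used in the earlier sketches:

```python
# Assumed settings: kernel_size=32, kernel_stride=16, so the coarse level uses 128/64.
seq_len = 300                      # prompt length of one sequence
kernel_size2, kernel_stride2 = 128, 64

num_coarse = (seq_len - kernel_size2) // kernel_stride2 + 1  # 3 coarse compressed positions
tail_start = num_coarse * kernel_stride2                     # 3 * 64 = 192, i.e. no_compress_k2_start
tail_len = seq_len - tail_start                              # 108 raw keys carried into the buffer

print(num_coarse, tail_start, tail_len)                      # 3 192 108
assert kernel_size2 - kernel_stride2 <= tail_len < kernel_size2
```

The buffered tail always starts inside the last emitted window, so it carries the `kernel_size - kernel_stride` overlap tokens plus whatever has not yet formed a new window, which is exactly the state the decode-time rolling buffer expects.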
@@ -1220,8 +1278,23 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
                 else:
                     new_compressed_k_list.append(None)
             compressed_k, compressed_cu_seqlens = past_key_value.update_compress_k(new_compressed_k_list, self.layer_idx,)
+
+            # For compress_k2, update no_compress_k2 buffer and compress when ready
+            no_compress_k2_list = past_key_value.update_no_compress_k2(
+                key_states_split, self.layer_idx,
+                kernel_stride=self.kernel_stride*4,
+                kernel_size=self.kernel_size*4)
+            new_compressed_k2_list = []
+            for no_compress_k2 in no_compress_k2_list:
+                if no_compress_k2 is not None:
+                    # We have enough tokens to compress for k2
+                    new_compressed_k2 = no_compress_k2.mean(dim=0, keepdim=True) # [1, n_heads_k, head_dim]
+                    new_compressed_k2_list.append(new_compressed_k2)
+                else:
+                    new_compressed_k2_list.append(None)
+            compressed_k2, compressed_cu_seqlens2 = past_key_value.update_compress_k2(new_compressed_k2_list, self.layer_idx,)

-        return compressed_k, compressed_cu_seqlens
+        return compressed_k, compressed_cu_seqlens, compressed_k2, compressed_cu_seqlens2
     def sparse_forward(self,
                        query_layer,
                        key_layer,
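At decode time the coarse level does not re-run `CompressK`: once a sequence's buffer holds a full 128-token window, the diff collapses it with a plain mean over the token dimension, producing exactly one new compressed key of shape `[1, n_heads_k, head_dim]` that is appended to that sequence's segment of the varlen cache. A minimal reproduction with made-up sizes:

```python
import torch

n_heads_k, head_dim = 2, 64
kernel_size2 = 128                                      # kernel_size * 4 with the default of 32

# A full coarse window as returned by update_no_compress_k2 for one sequence.
window = torch.randn(kernel_size2, n_heads_k, head_dim)

new_compressed_k2 = window.mean(dim=0, keepdim=True)    # same op as in the diff
print(new_compressed_k2.shape)                          # torch.Size([1, 2, 64])
```

Sequences whose buffer is still short contribute `None` for this step, so `update_compress_k2` only extends the segments that actually produced a new key before rebuilding `cached_compressed_cu_seqlens2`.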
@@ -1231,7 +1304,8 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
                        max_seqlen_in_batch_q,
                        max_seqlen_in_batch_k,
                        no_rope_param=None,
-                       compressed_k=None, compressed_cu_seqlens=None):
+                       compressed_k=None, compressed_cu_seqlens=None,
+                       compressed_k2=None, compressed_cu_seqlens2=None):
         compressed_seqlens = compressed_cu_seqlens[1:] - compressed_cu_seqlens[:-1]
         cache_lens = None
         if max_seqlen_in_batch_q==1 and max_seqlen_in_batch_k>1: #decoding
@@ -1241,13 +1315,14 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
         topk_idx = compressed_attention(
             query_layer if no_rope_param is None else no_rope_param['query_states_no_rope'],
             compressed_k,
-
+            compressed_k2,
             self.kernel_size,
             self.kernel_stride,
             self.block_size,
             self.topk,
             cu_seqlens_q,
             compressed_cu_seqlens,
+            compressed_cu_seqlens2,
             max_seqlen_in_batch_q,
             compressed_seqlens.max().item(),
             None,
@@ -1280,7 +1355,6 @@ class MiniCPMInfLLMv2Attention(MiniCPMAttention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
-
         Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
@@ -1544,11 +1618,9 @@ MINICPM_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
-
     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
     and behavior.
-
     Parameters:
         config ([`MiniCPMConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -1588,50 +1660,38 @@ MINICPM_INPUTS_DOCSTRING = r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
             it.
-
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
-
             [What are input IDs?](../glossary#input-ids)
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
-
             [What are attention masks?](../glossary#attention-mask)
-
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
-
             If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
             `past_key_values`).
-
             If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
             and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
             information on the default strategy.
-
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
             config.n_positions - 1]`.
-
             [What are position IDs?](../glossary#position-ids)
         past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
             Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
             blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
             Two formats are allowed:
             - a [`~cache_utils.Cache`] instance;
             - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
             shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
             cache format.
-
             The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
             legacy cache format will be returned.
-
             If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
             have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
             of shape `(batch_size, sequence_length)`.
@@ -1660,7 +1720,6 @@ MINICPM_INPUTS_DOCSTRING = r"""
 class MiniCPMModel(MiniCPMPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MiniCPMDecoderLayer`]
-
     Args:
         config: MiniCPMConfig
     """
@@ -1887,20 +1946,14 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
                 Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                 (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         Returns:
-
         Example:
-
         ```python
         >>> from transformers import AutoTokenizer, MiniCPMForCausalLM
-
         >>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
-
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")
-
         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -2080,10 +2133,8 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
 @add_start_docstrings(
     """
     The MiniCPM Model transformer with a sequence classification head on top (linear layer).
-
     [`MiniCPMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
-
     Since it does classification on the last token, it requires to know the position of the last token. If a
     `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
@@ -2196,4 +2247,4 @@ class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
             past_key_values=transformer_outputs.past_key_values,
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
-        )
+        )