autoprogrammer
/

dream_rcr

@@ -1,6 +1,17 @@
 # coding=utf-8
-# Copyright 2024 The Dream team, HKUNLP Group and the HuggingFace
-# Licensed under the Apache License, Version 2.0
 import warnings
 import copy
@@ -23,32 +34,30 @@ def top_p_logits(logits, top_p=None):
     sorted_logits, sorted_indices = torch.sort(logits, descending=True)
     cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
     sorted_indices_to_remove = cumulative_probs > top_p
-    # keep first token above threshold
     sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
     sorted_indices_to_remove[..., 0] = 0
     mask = torch.zeros_like(logits, dtype=torch.bool, device=logits.device)
     mask = mask.scatter_(-1, sorted_indices, sorted_indices_to_remove)
-    return logits.masked_fill(mask, torch.finfo(logits.dtype).min)
 def top_k_logits(logits, top_k=None):
     if top_k is None:
         return logits
-    top_k = min(top_k, logits.size(-1))
-    thresh = torch.topk(logits, top_k)[0][..., -1, None]
-    indices_to_remove = logits < thresh
-    return logits.masked_fill(indices_to_remove, torch.finfo(logits.dtype).min)
-def sample_tokens(
-    logits,
-    temperature: float = 0.0,
-    top_p: Optional[float] = None,
-    top_k: Optional[int] = None,
-    margin_confidence: bool = False,
-    neg_entropy: bool = False,
-):
-    # 保持 dtype 与 logits 一致（包含 bf16/fp16）
     if temperature and temperature > 0:
         logits = logits / temperature
     if top_p is not None and top_p < 1:
@@ -58,27 +67,27 @@ def sample_tokens(
     probs = torch.softmax(logits, dim=-1)
     if temperature and temperature > 0:
-        # 采样
         try:
             x0 = dists.Categorical(probs=probs).sample()
             confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
         except Exception:
             confidence, x0 = probs.max(dim=-1)
     else:
-        # 贪心
         confidence, x0 = probs.max(dim=-1)
     if margin_confidence:
         sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
-        top1_probs = sorted_probs[..., 0]
-        top2_probs = sorted_probs[..., 1]
         confidence = top1_probs - top2_probs
     if neg_entropy:
-        eps = probs.new_tensor(1e-10)
-        log_probs = torch.log(probs + eps)
-        # 负熵（和为负数），数值上越大（绝对值越小）表示不确定；此处直接用于排序
         confidence = torch.sum(probs * log_probs, dim=-1)
     return confidence, x0
@@ -92,23 +101,27 @@ class DreamModelOutput(ModelOutput):
 class DreamGenerationConfig(GenerationConfig):
     def __init__(self, **kwargs):
         self.temperature: float = kwargs.pop("temperature", 0.0)
         self.top_p: Optional[float] = kwargs.pop("top_p", None)
         self.top_k: Optional[int] = kwargs.pop("top_k", None)
         self.max_length = kwargs.pop("max_length", 20)
         self.max_new_tokens = kwargs.pop("max_new_tokens", None)
         # diffusion specific params
         self.eps: float = kwargs.pop("eps", 1e-3)
         self.steps: int = kwargs.pop("steps", 512)
-        self.alg: str = kwargs.pop("alg", "origin")
         self.alg_temp: Optional[float] = kwargs.pop("alg_temp", None)
-        # RCR 参数（默认不生效）
         self.rcr: bool = kwargs.pop("rcr", False)
-        self.conf_alg: str = kwargs.pop("conf_alg", "maskgit_plus")
-        # generate 输出控制
         self.num_return_sequences: int = kwargs.pop("num_return_sequences", 1)
         self.return_dict_in_generate: bool = kwargs.pop("return_dict_in_generate", False)
         self.output_history: bool = kwargs.pop("output_history", False)
@@ -119,9 +132,10 @@ class DreamGenerationConfig(GenerationConfig):
         self.bos_token_id = kwargs.pop("bos_token_id", None)
         self.eos_token_id = kwargs.pop("eos_token_id", None)
         self.generation_kwargs = kwargs.pop("generation_kwargs", {})
-        # hub meta
         self._from_model_config = kwargs.pop("_from_model_config", False)
         self._commit_hash = kwargs.pop("_commit_hash", None)
         self.transformers_version = kwargs.pop("transformers_version", __version__)
@@ -137,7 +151,6 @@ class DreamGenerationConfig(GenerationConfig):
         self.validate(is_init=True)
     def validate(self, is_init=False):
-        # 保留空实现，兼容 upstream
         pass
@@ -146,7 +159,7 @@ class DreamGenerationMixin:
     def _expand_inputs_for_generation(
         expand_size: int = 1,
         input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.LongTensor] = None,
     ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
         if expand_size == 1:
             return input_ids, attention_mask
@@ -156,13 +169,16 @@ class DreamGenerationMixin:
             attention_mask = attention_mask.repeat_interleave(expand_size, dim=0)
         return input_ids, attention_mask
     def _apply_rcr_logic(
         self,
-        x: torch.LongTensor,
-        x0_sel: torch.LongTensor,
-        conf_sel: torch.Tensor,
         mask_index: torch.Tensor,
-        overtime_confidence: torch.Tensor,
         mask_token_id: int,
         step: int,
         total_steps: int,
@@ -170,56 +186,68 @@ class DreamGenerationMixin:
         t: torch.Tensor,
     ):
         """
-        Running Confidence Remasking (RCR)
-        - 按 Dream 原调度计算每步应转移的 token 数；
-        - 先把本步最高置信度的若干个位置从 [MASK] 转为预测；
-        - 再根据“截至本步的目标累计数量”，把最低置信度的多余部分回遮回 [MASK]。
-        仅在 rcr=True 时调用。
         """
         device = x.device
-        dtype = overtime_confidence.dtype  # == logits.dtype
-        B = x.shape[0]
-        # 当前 batch 平均剩余 mask 数
-        num_mask_token = mask_index.sum() / mask_index.shape[0]
-        # 本步的转移数量（与 Dream 调度一致）
-        number_transfer_tokens = int(num_mask_token * (1 - s / t)) if step < total_steps - 1 else int(num_mask_token)
-        # 构造“全长”置信度与候选 token（非 mask 位置分别设为 -inf / mask_token_id）
-        full_conf = torch.full(x.shape, float("-inf"), device=device, dtype=dtype)
-        x_temp = torch.full_like(x, fill_value=mask_token_id, dtype=torch.long, device=device)
-        full_conf[mask_index] = conf_sel
-        x_temp[mask_index] = x0_sel
         for j in range(B):
             masked_j = int(mask_index[j].sum().item())
-            if masked_j == 0:
-                continue
             k_j = min(number_transfer_tokens, masked_j)
             if k_j > 0:
-                # 选出本步 top-k_j 的位置
-                _, select_idx = torch.topk(full_conf[j], k=k_j, largest=True)
-                x[j, select_idx] = x_temp[j, select_idx]
-                # 记录这些位置的置信度，用于累计与回遮判断
-                overtime_confidence[j, select_idx] = full_conf[j, select_idx]
-            # 目标累计（与原 Dream 线性进度对齐）
             if step < total_steps - 1:
-                target_cum = int(num_mask_token * (1 - s / t))  # 累计目标到当前步
-                gen_mask = overtime_confidence[j] > overtime_confidence.new_tensor(0)
-                current_gen = int(gen_mask.sum().item())
-                overflow = max(0, current_gen - target_cum)
-                if overflow > 0:
-                    gen_indices = torch.where(gen_mask)[0]
-                    if gen_indices.numel() > 0:
-                        gen_conf = overtime_confidence[j, gen_indices]
-                        overflow = min(overflow, int(gen_indices.numel()))
-                        # 选“最低置信度”的 overflow 个位置回遮
-                        _, low_local = torch.topk(gen_conf, k=overflow, largest=False)
-                        low_global = gen_indices[low_local]
-                        x[j, low_global] = mask_token_id
-                        overtime_confidence[j, low_global] = overtime_confidence.new_zeros(low_global.shape)
     def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
         if is_torchdynamo_compiling():
@@ -232,9 +260,11 @@ class DreamGenerationMixin:
                 UserWarning,
             )
         if input_ids_length >= generation_config.max_length:
             raise ValueError(
-                f"Input length is {input_ids_length}, but `max_length` is {generation_config.max_length}. "
-                "Consider increasing `max_length` or setting `max_new_tokens`."
             )
     def _prepare_generated_length(self, generation_config, has_default_max_length, input_ids_length):
@@ -242,20 +272,20 @@ class DreamGenerationMixin:
             if not has_default_max_length and generation_config.max_length is not None:
                 logger.warning(
                     f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
-                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence."
                 )
             generation_config.max_length = generation_config.max_new_tokens + input_ids_length
         elif has_default_max_length:
             if generation_config.max_length == DreamGenerationConfig().max_length:
                 generation_config.max_length = generation_config.max_length + input_ids_length
-                mpe = getattr(self.config, "max_position_embeddings", None)
-                if mpe is not None:
-                    generation_config.max_length = min(generation_config.max_length, mpe)
         return generation_config
-    def _prepare_generation_config(
-        self, generation_config: Optional[DreamGenerationConfig], **kwargs: Dict
-    ) -> DreamGenerationConfig:
         using_model_generation_config = False
         if generation_config is None:
             generation_config = DreamGenerationConfig.from_model_config(self.config)
@@ -273,11 +303,10 @@ class DreamGenerationMixin:
                     generation_config.pad_token_id = self.generation_config.pad_token_id
                 if generation_config.mask_token_id is None:
                     generation_config.mask_token_id = self.generation_config.mask_token_id
         return generation_config
-    def _prepare_special_tokens(
-        self, generation_config: DreamGenerationConfig, device: Optional[Union[torch.device, str]] = None
-    ):
         def _tensor_or_none(token, device=None):
             if token is None:
                 return token
@@ -332,18 +361,22 @@ class DreamGenerationMixin:
         if not is_torchdynamo_compiling() and self.device.type != input_ids.device.type:
             warnings.warn(
-                "You are calling .generate() with `input_ids` on a device type different than your model's device. "
-                f"`input_ids` is on {input_ids.device.type}, model is on {self.device.type}.",
                 UserWarning,
             )
         if (
             hasattr(generation_config, "pad_token_id")
             and torch.any(input_ids == generation_config.pad_token_id)
             and attention_mask is None
         ):
             warnings.warn(
-                "Padding was detected but no attention mask is passed. For correct results, set `attention_mask` when batch-padding inputs.",
                 UserWarning,
             )
@@ -368,9 +401,9 @@ class DreamGenerationMixin:
         attention_mask: Optional[torch.LongTensor],
         generation_config: DreamGenerationConfig,
         generation_tokens_hook_func,
-        generation_logits_hook_func,
     ) -> Union[DreamModelOutput, torch.LongTensor]:
-        # 原变量
         output_history = generation_config.output_history
         return_dict_in_generate = generation_config.return_dict_in_generate
         max_length = generation_config.max_length
@@ -383,13 +416,13 @@ class DreamGenerationMixin:
         top_p = generation_config.top_p
         top_k = generation_config.top_k
-        # RCR 控制
         rcr = generation_config.rcr
         conf_alg = generation_config.conf_alg
         histories = [] if (return_dict_in_generate and output_history) else None
-        # pad 到 max_length
         x = F.pad(input_ids, (0, max_length - input_ids.shape[1]), value=mask_token_id)
         if attention_mask is not None and torch.any(attention_mask == 0.0):
@@ -406,104 +439,104 @@ class DreamGenerationMixin:
         timesteps = torch.linspace(1, eps, steps + 1, device=x.device)
-        # 置信度累计缓冲，延迟到拿到 logits.dtype 后再初始化，避免 dtype 错误
-        overtime_confidence = None  # dtype = logits.dtype（初始化时设置）
-        # 允许用户控制中间 tokens
         x = generation_tokens_hook_func(None, x, None)
         for i in range(steps):
             mask_index = (x == mask_token_id)
             logits = self(x, attention_mask, tok_idx).logits
             logits = torch.cat([logits[:, :1], logits[:, :-1]], dim=1)
-            # 允许用户控制中间 logits
             logits = generation_logits_hook_func(i, x, logits)
             mask_logits = logits[mask_index]
             t = timesteps[i]
             s = timesteps[i + 1]
-            # 首次根据 logits.dtype 初始化 overtime_confidence（避免 Float/BFloat16 冲突）
-            if rcr and overtime_confidence is None:
-                overtime_confidence = torch.zeros_like(x, dtype=logits.dtype, device=x.device)
-            if alg == "origin":
-                # 原始 Dream 逻辑（不动）
                 p_transfer = 1 - s / t if i < steps - 1 else 1
-                x0 = torch.full_like(x[mask_index], fill_value=mask_token_id, dtype=torch.long, device=self.device)
                 transfer_index_t_s = torch.rand(*x0.shape, device=self.device) < p_transfer
                 _, x0[transfer_index_t_s] = sample_tokens(
                     mask_logits[transfer_index_t_s], temperature=temperature, top_p=top_p, top_k=top_k
                 )
                 x[mask_index] = x0.clone()
             else:
-                # 选择置信度算法
-                use_alg = alg
-                if rcr:
-                    # rcr=True 时，置信度算法由 conf_alg 决定（不影响 baseline）
-                    use_alg = conf_alg
-                if use_alg == "maskgit_plus":
                     confidence, x0 = sample_tokens(mask_logits, temperature=temperature, top_p=top_p, top_k=top_k)
-                elif use_alg == "topk_margin":
                     confidence, x0 = sample_tokens(
                         mask_logits, temperature=temperature, top_p=top_p, top_k=top_k, margin_confidence=True
                     )
-                elif use_alg == "entropy":
                     confidence, x0 = sample_tokens(
-                        mask_logits, temperature, top_p=top_p, top_k=top_k, neg_entropy=True
                     )
                 else:
-                    raise RuntimeError(f"Unknown alg: {alg}")
-                # 统一 full_confidence 的 dtype = logits.dtype（避免 int/float 混合）
-                full_confidence = torch.full(
-                    x.shape, float("-inf"), device=self.device, dtype=logits.dtype
-                )
-                full_confidence[mask_index] = confidence
                 if rcr:
-                    # === RCR 分支：先转移 top-k，再根据累计目标回遮 ===
                     self._apply_rcr_logic(
                         x=x,
-                        x0_sel=x0,
-                        conf_sel=confidence,
                         mask_index=mask_index,
-                        overtime_confidence=overtime_confidence,
                         mask_token_id=mask_token_id,
                         step=i,
                         total_steps=steps,
-                        s=s,
-                        t=t,
                     )
                 else:
-                    # === baseline 分支：保持 Dream 逻辑不变 ===
-                    num_mask_token = mask_index.sum() / mask_index.shape[0]
-                    number_transfer_tokens = (
-                        int(num_mask_token * (1 - s / t)) if i < steps - 1 else int(num_mask_token)
-                    )
                     if number_transfer_tokens > 0:
                         if alg_temp is None or alg_temp == 0:
                             _, transfer_index = torch.topk(full_confidence, number_transfer_tokens)
                         else:
-                            fc = full_confidence / alg_temp
-                            fc = F.softmax(fc, dim=-1)
-                            transfer_index = torch.multinomial(fc, num_samples=number_transfer_tokens)
-                        x_ = torch.full_like(x, fill_value=mask_token_id, dtype=torch.long, device=self.device)
                         x_[mask_index] = x0.clone()
                         row_indices = torch.arange(x.size(0), device=self.device).unsqueeze(1).expand_as(transfer_index)
                         x[row_indices, transfer_index] = x_[row_indices, transfer_index]
-            # 允许用户控制中间 tokens
             x = generation_tokens_hook_func(i, x, logits)
             if histories is not None:
                 histories.append(x.clone())
         if return_dict_in_generate:
-            return DreamModelOutput(sequences=x, history=histories)
         else:
             return x

 # coding=utf-8
+# Copyright 2024 The Dream team, HKUNLP Group and the HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import warnings
 import copy
     sorted_logits, sorted_indices = torch.sort(logits, descending=True)
     cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
     sorted_indices_to_remove = cumulative_probs > top_p
+    # Shift the indices to the right to keep the first token above the threshold
     sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
     sorted_indices_to_remove[..., 0] = 0
     mask = torch.zeros_like(logits, dtype=torch.bool, device=logits.device)
     mask = mask.scatter_(-1, sorted_indices, sorted_indices_to_remove)
+    logits = logits.masked_fill(mask, torch.finfo(logits.dtype).min)
+    return logits
 def top_k_logits(logits, top_k=None):
+    """Keep only the `top_k` largest logits along the last dimension.
+
+    Entries strictly below the k-th largest value are replaced with the
+    minimum finite value of `logits.dtype`; entries tied with the k-th value
+    are kept. `logits` is returned unchanged when `top_k` is None or, after
+    clamping to the last-dimension size, is not positive.
+    """
     if top_k is None:
         return logits
+    top_k = int(min(top_k, logits.size(-1)))  # Safety check: k cannot exceed the last-dimension size
+    if top_k <= 0:
+        return logits
+    # Mask every entry whose logit is below the k-th largest logit of its row
+    indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+    logits = logits.masked_fill(indices_to_remove, torch.finfo(logits.dtype).min)
+    return logits
+def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confidence=False, neg_entropy=False):
+    """Pick one token per row of `logits` and return `(confidence, x0)`.
+
+    `logits` is divided by `temperature` when it is positive; tokens are then
+    sampled from the softmax distribution, otherwise the argmax is taken.
+    `top_p` / `top_k` are filtering thresholds (the filtering calls themselves
+    are elided from this diff hunk -- confirm against the full file).
+
+    `confidence` is the probability of the chosen token by default, the
+    top-1 minus top-2 probability when `margin_confidence` is set, and the
+    negative entropy when `neg_entropy` is set (`neg_entropy` wins if both
+    flags are set, because it is applied last).
+    """
+    # logits: [N, V]
     if temperature and temperature > 0:
         logits = logits / temperature
     if top_p is not None and top_p < 1:
     probs = torch.softmax(logits, dim=-1)
+    # Sample when temperature > 0, otherwise take the greedy argmax
     if temperature and temperature > 0:
         try:
             x0 = dists.Categorical(probs=probs).sample()
             confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
         except Exception:
+            # NOTE(review): any sampling failure silently falls back to the greedy choice
             confidence, x0 = probs.max(dim=-1)
     else:
         confidence, x0 = probs.max(dim=-1)
+    # Optionally switch the confidence definition
     if margin_confidence:
         sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
+        # The [:, k] indexing requires 2-D probs with at least two columns
+        top1_probs = sorted_probs[:, 0]
+        top2_probs = sorted_probs[:, 1]
         confidence = top1_probs - top2_probs
     if neg_entropy:
+        # Negative entropy (approximately <= 0; larger, i.e. closer to 0, means more certain)
+        epsilon = 1e-10
+        log_probs = torch.log(probs + epsilon)
         confidence = torch.sum(probs * log_probs, dim=-1)
     return confidence, x0
 class DreamGenerationConfig(GenerationConfig):
+    """Generation settings for Dream sampling; each visible option is popped from `kwargs` with a default."""
     def __init__(self, **kwargs):
+        # sampling
         self.temperature: float = kwargs.pop("temperature", 0.0)
         self.top_p: Optional[float] = kwargs.pop("top_p", None)
         self.top_k: Optional[int] = kwargs.pop("top_k", None)
+        # length
         self.max_length = kwargs.pop("max_length", 20)
         self.max_new_tokens = kwargs.pop("max_new_tokens", None)
         # diffusion specific params
         self.eps: float = kwargs.pop("eps", 1e-3)
         self.steps: int = kwargs.pop("steps", 512)
+        self.alg: str = kwargs.pop("alg", 'origin')  # 'origin' | 'maskgit_plus' | 'topk_margin' | 'entropy'
         self.alg_temp: Optional[float] = kwargs.pop("alg_temp", None)
+        # === RCR parameters (new; off by default, so the original logic is unaffected) ===
         self.rcr: bool = kwargs.pop("rcr", False)
+        # Selects the confidence algorithm only when rcr=True; ignored when rcr=False
+        self.conf_alg: str = kwargs.pop("conf_alg", 'maskgit_plus')
+        # generate outputs
         self.num_return_sequences: int = kwargs.pop("num_return_sequences", 1)
         self.return_dict_in_generate: bool = kwargs.pop("return_dict_in_generate", False)
         self.output_history: bool = kwargs.pop("output_history", False)
         self.bos_token_id = kwargs.pop("bos_token_id", None)
         self.eos_token_id = kwargs.pop("eos_token_id", None)
+        # misc
         self.generation_kwargs = kwargs.pop("generation_kwargs", {})
+        # bookkeeping
         self._from_model_config = kwargs.pop("_from_model_config", False)
         self._commit_hash = kwargs.pop("_commit_hash", None)
         self.transformers_version = kwargs.pop("transformers_version", __version__)
         self.validate(is_init=True)
     def validate(self, is_init=False):
+        # No checks are performed here.
         pass
     def _expand_inputs_for_generation(
         expand_size: int = 1,
         input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None
     ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
         if expand_size == 1:
             return input_ids, attention_mask
             attention_mask = attention_mask.repeat_interleave(expand_size, dim=0)
         return input_ids, attention_mask
+    # === New version: RCR core (history-confidence variant) ===
     def _apply_rcr_logic(
         self,
+        x: torch.Tensor,
+        x0: torch.Tensor,
+        conf_now: torch.Tensor,
         mask_index: torch.Tensor,
+        fixed_conf: torch.Tensor,
+        gen_mask: torch.Tensor,
+        init_mask_count: torch.Tensor,
         mask_token_id: int,
         step: int,
         total_steps: int,
         t: torch.Tensor,
     ):
         """
+        Running Confidence Remasking (history-confidence variant).
+
+        Mutates `x`, `fixed_conf` and `gen_mask` in place; nothing is returned.
+          1) Among the currently masked positions, commit (write the token of) the
+             top-k_j positions ranked by this step's confidence `conf_now`.
+          2) For the newly committed positions, update the running history
+             confidence: fixed_conf = max(fixed_conf, conf_now).
+          3) Compute the cumulative commit quota target_cum = init_mask_count * (1 - s/t)
+             (init_mask_count itself on the last step). If the committed set `gen_mask`
+             exceeds it, re-mask the excess positions with the lowest `fixed_conf`.
+        Notes:
+          - `conf_now` is kept in float32 to avoid dtype errors when mixing with
+            bfloat16 values.
+          - With the entropy confidence, `conf_now` is the negative entropy (larger,
+            i.e. closer to 0, means more certain), which works with topk(largest=True).
+          - NOTE(review): (1 - s/t) is the same per-step ratio used for
+            `number_transfer_tokens`; confirm it is the intended cumulative quota.
+          - `s` is used below and passed by the caller, but its parameter line is
+            elided from this diff view.
         """
         device = x.device
+        B, L = x.shape
+        # Number of tokens to commit this step (same rule as vanilla: average remaining masks * (1 - s/t))
+        avg_mask_now = (mask_index.sum().item() / max(1, mask_index.shape[0]))
+        ratio = (1.0 - (s.item() / t.item())) if step < total_steps - 1 else 1.0
+        number_transfer_tokens = int(avg_mask_now * ratio)
+        # Make sure this step's confidence is float32
+        conf_now = conf_now.to(torch.float32)
+        # Full-length views that are only meaningful at masked positions
+        full_conf_now = torch.full((B, L), float("-inf"), dtype=torch.float32, device=device)
+        full_x0 = torch.full((B, L), mask_token_id, dtype=torch.long, device=device)
+        full_conf_now[mask_index] = conf_now
+        full_x0[mask_index] = x0
+        # Process each sample separately
         for j in range(B):
             masked_j = int(mask_index[j].sum().item())
             k_j = min(number_transfer_tokens, masked_j)
             if k_j > 0:
+                conf_row = full_conf_now[j]  # float32
+                # Pick this step's top-k_j positions
+                _, sel_idx = torch.topk(conf_row, k=k_j, largest=True)
+                # Write the tokens and mark them as committed
+                x[j, sel_idx] = full_x0[j, sel_idx]
+                gen_mask[j, sel_idx] = True
+                # History confidence keeps the running max
+                fixed_conf[j, sel_idx] = torch.maximum(fixed_conf[j, sel_idx], conf_row[sel_idx])
+            # Cumulative commit quota (relative to the initial mask count)
+            init_m = int(init_mask_count[j].item())
             if step < total_steps - 1:
+                target_cum = int(init_m * (1.0 - (s.item() / t.item())))
+            else:
+                target_cum = init_m  # the last step may commit everything
+            current_gen = int(gen_mask[j].sum().item())
+            over = max(0, current_gen - target_cum)
+            if over > 0:
+                # Re-mask the lowest-history-confidence positions among the committed set
+                gen_idx = torch.where(gen_mask[j])[0]
+                if gen_idx.numel() > 0:
+                    hist_vals = fixed_conf[j, gen_idx]  # float32
+                    over = min(over, int(gen_idx.numel()))
+                    _, low_local = torch.topk(hist_vals, k=over, largest=False)
+                    low_global = gen_idx[low_local]
+                    # Re-mask: restore MASK, clear the committed flag and reset the history confidence
+                    x[j, low_global] = mask_token_id
+                    gen_mask[j, low_global] = False
+                    fixed_conf[j, low_global] = float("-inf")
     def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
         if is_torchdynamo_compiling():
                 UserWarning,
             )
         if input_ids_length >= generation_config.max_length:
+            input_ids_string = "input_ids"
             raise ValueError(
+                f"Input length of {input_ids_string} is {input_ids_length}, but `max_length` is set to"
+                f" {generation_config.max_length}. You should consider increasing `max_length` or, better yet,"
+                " setting `max_new_tokens`."
             )
     def _prepare_generated_length(self, generation_config, has_default_max_length, input_ids_length):
             if not has_default_max_length and generation_config.max_length is not None:
                 logger.warning(
                     f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
+                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
+                    "Please refer to the documentation for more information."
                 )
             generation_config.max_length = generation_config.max_new_tokens + input_ids_length
         elif has_default_max_length:
             if generation_config.max_length == DreamGenerationConfig().max_length:
                 generation_config.max_length = generation_config.max_length + input_ids_length
+                max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
+                if max_position_embeddings is not None:
+                    generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
         return generation_config
+    def _prepare_generation_config(self, generation_config: Optional[DreamGenerationConfig], **kwargs: Dict) -> DreamGenerationConfig:
         using_model_generation_config = False
         if generation_config is None:
             generation_config = DreamGenerationConfig.from_model_config(self.config)
                     generation_config.pad_token_id = self.generation_config.pad_token_id
                 if generation_config.mask_token_id is None:
                     generation_config.mask_token_id = self.generation_config.mask_token_id
         return generation_config
+    def _prepare_special_tokens(self, generation_config: DreamGenerationConfig, device: Optional[Union[torch.device, str]] = None):
         def _tensor_or_none(token, device=None):
             if token is None:
                 return token
         if not is_torchdynamo_compiling() and self.device.type != input_ids.device.type:
             warnings.warn(
+                "You are calling .generate() with the `input_ids` being on a device type different"
+                f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model"
+                f" is on {self.device.type}. You may experience unexpected behaviors or slower generation."
+                " Please make sure that you have put `input_ids` to the"
+                f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before"
+                " running `.generate()`.",
                 UserWarning,
             )
         if (
             hasattr(generation_config, "pad_token_id")
             and torch.any(input_ids == generation_config.pad_token_id)
             and attention_mask is None
         ):
             warnings.warn(
+                "Padding was detected but no attention mask is passed here. For correct "
+                "generation results, please set `attention_mask` when batch-padding inputs.",
                 UserWarning,
             )
         attention_mask: Optional[torch.LongTensor],
         generation_config: DreamGenerationConfig,
         generation_tokens_hook_func,
+        generation_logits_hook_func
     ) -> Union[DreamModelOutput, torch.LongTensor]:
+        # === 基本变量 ===
         output_history = generation_config.output_history
         return_dict_in_generate = generation_config.return_dict_in_generate
         max_length = generation_config.max_length
         top_p = generation_config.top_p
         top_k = generation_config.top_k
+        # === RCR 控制变量 ===
         rcr = generation_config.rcr
         conf_alg = generation_config.conf_alg
         histories = [] if (return_dict_in_generate and output_history) else None
+        # pad input_ids to max_length
         x = F.pad(input_ids, (0, max_length - input_ids.shape[1]), value=mask_token_id)
         if attention_mask is not None and torch.any(attention_mask == 0.0):
         timesteps = torch.linspace(1, eps, steps + 1, device=x.device)
+        # === RCR 缓冲（仅 rcr=True 时启用） ===
+        if rcr:
+            init_mask_count = (x == mask_token_id).sum(dim=1)  # [B]
+            fixed_conf = torch.full(
+                x.shape, float("-inf"), dtype=torch.float32, device=x.device
+            )  # 历史置信度
+            gen_mask = torch.zeros_like(x, dtype=torch.bool)    # 已确认集合
+        else:
+            init_mask_count = None
+            fixed_conf = None
+            gen_mask = None
+        # hooks：允许用户中间控制
         x = generation_tokens_hook_func(None, x, None)
         for i in range(steps):
             mask_index = (x == mask_token_id)
             logits = self(x, attention_mask, tok_idx).logits
+            # 右移一位（Dream 原实现）
             logits = torch.cat([logits[:, :1], logits[:, :-1]], dim=1)
             logits = generation_logits_hook_func(i, x, logits)
             mask_logits = logits[mask_index]
             t = timesteps[i]
             s = timesteps[i + 1]
+            if alg == 'origin':
+                # === 原版 origin：随机按比例转移（不涉及置信度） ===
                 p_transfer = 1 - s / t if i < steps - 1 else 1
+                x0 = torch.zeros_like(x[mask_index], device=self.device, dtype=torch.long) + mask_token_id
                 transfer_index_t_s = torch.rand(*x0.shape, device=self.device) < p_transfer
                 _, x0[transfer_index_t_s] = sample_tokens(
                     mask_logits[transfer_index_t_s], temperature=temperature, top_p=top_p, top_k=top_k
                 )
                 x[mask_index] = x0.clone()
             else:
+                # === 置信度算法选择（vanilla 与 RCR 复用此处） ===
+                use_alg = conf_alg if rcr else alg
+                if use_alg == 'maskgit_plus':
                     confidence, x0 = sample_tokens(mask_logits, temperature=temperature, top_p=top_p, top_k=top_k)
+                elif use_alg == 'topk_margin':
                     confidence, x0 = sample_tokens(
                         mask_logits, temperature=temperature, top_p=top_p, top_k=top_k, margin_confidence=True
                     )
+                elif use_alg == 'entropy':
                     confidence, x0 = sample_tokens(
+                        mask_logits, temperature=temperature, top_p=top_p, top_k=top_k, neg_entropy=True
                     )
                 else:
+                    raise RuntimeError(f"Unknown alg/conf_alg: {use_alg}")
                 if rcr:
+                    # === 历史置信度版 RCR ===
                     self._apply_rcr_logic(
                         x=x,
+                        x0=x0,
+                        conf_now=confidence,
                         mask_index=mask_index,
+                        fixed_conf=fixed_conf,
+                        gen_mask=gen_mask,
+                        init_mask_count=init_mask_count,
                         mask_token_id=mask_token_id,
                         step=i,
                         total_steps=steps,
+                        s=s, t=t,
                     )
                 else:
+                    # === 原版 Dream（vanilla）：本步 top-k，永久确认，不回遮 ===
+                    # number_transfer_tokens 基于“当前平均剩余 mask * (1 - s/t)”
+                    avg_mask_now = (mask_index.sum().item() / max(1, mask_index.shape[0]))
+                    ratio = (1.0 - (s.item() / t.item())) if i < steps - 1 else 1.0
+                    number_transfer_tokens = int(avg_mask_now * ratio)
+                    full_confidence = torch.full_like(x, -torch.inf, device=self.device, dtype=logits.dtype)
+                    full_confidence[mask_index] = confidence
                     if number_transfer_tokens > 0:
                         if alg_temp is None or alg_temp == 0:
                             _, transfer_index = torch.topk(full_confidence, number_transfer_tokens)
                         else:
+                            full_confidence = full_confidence / alg_temp
+                            full_confidence = F.softmax(full_confidence, dim=-1)
+                            transfer_index = torch.multinomial(full_confidence, num_samples=number_transfer_tokens)
+                        x_ = torch.zeros_like(x, device=self.device, dtype=torch.long) + mask_token_id
                         x_[mask_index] = x0.clone()
                         row_indices = torch.arange(x.size(0), device=self.device).unsqueeze(1).expand_as(transfer_index)
                         x[row_indices, transfer_index] = x_[row_indices, transfer_index]
             x = generation_tokens_hook_func(i, x, logits)
             if histories is not None:
                 histories.append(x.clone())
         if return_dict_in_generate:
+            return DreamModelOutput(
+                sequences=x,
+                history=histories,
+            )
         else:
             return x