Commit f5509c2 (parent dd8ce86): Duration fixes

app.py CHANGED
@@ -11,6 +11,16 @@ Fix (July 3, 2024): Corrected a tensor device mismatch error in the
 `enhance_prompt_with_llm` function for ZeroGPU compatibility. The fix
 involves manually tokenizing the input and moving the resulting tensors
 to the CUDA device before calling the model's generate method.
+
+Fix (July 3, 2025): Addressed an attention_mask warning and clarified ZeroGPU
+behavior. The model is now correctly fed both input_ids and attention_mask.
+Added comments explaining why the model is moved to CUDA on each call, which
+is the expected behavior for ZeroGPU Spaces.
+
+Fix (July 3, 2025): Corrected the GPU duration estimation logic (`get_t2v_duration`)
+to prevent timeouts on longer or high-resolution videos. The new logic now
+considers video resolution (width and height) in addition to steps and duration,
+and uses more generous time allocations.
 """

 # --- 1. Imports ---
@@ -82,17 +92,41 @@ def sanitize_prompt_for_filename(prompt: str) -> str:
     return sanitized[:50]

 def get_t2v_duration(
+    prompt: str, height: int, width: int, negative_prompt: str,
+    duration_seconds: float, steps: int, seed: int,
+    randomize_seed: bool, selected_lora: str,
+    lora_weight: float
 ) -> int:
+    """
+    Estimates GPU time for Text-to-Video generation.
+    The logic is tiered and considers duration, steps, and resolution to prevent timeouts.
+    """
+    # Calculate a resolution multiplier. A higher resolution will significantly increase generation time.
+    # Base resolution is considered 640x480 pixels.
+    base_pixels = DEFAULT_W_SLIDER_VALUE * DEFAULT_H_SLIDER_VALUE
+    current_pixels = width * height
+    # Check if the current resolution is significantly larger than the base.
+    is_high_res = current_pixels > (base_pixels * 1.5)
+
+    # Tiered duration based on video length and number of inference steps.
+    if steps > 10 or duration_seconds > 4:
+        # Longest generations (e.g., high step count or long duration).
+        base_duration = 600
+    elif steps > 10 or duration_seconds > 3:
+        # Medium-length generations.
+        base_duration = 400
     else:
+        # Shortest/quickest generations.
+        base_duration = 250
+
+    # Apply a multiplier for high-resolution videos.
+    final_duration = base_duration * 2 if is_high_res else base_duration
+
+    # Cap the duration at a maximum value (900s = 15 minutes) to comply with typical free-tier limits.
+    final_duration = min(final_duration, 900)
+
+    print(f"Requesting {final_duration}s of GPU time for {steps} steps, {duration_seconds:.1f}s duration, and {width}x{height} resolution.")
+    return final_duration

 def get_available_presets(repo_id, subfolder):
     """
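A signature that mirrors every input of the generation function is what ZeroGPU's dynamic-duration support expects: `@spaces.GPU` accepts a callable for its `duration` argument and invokes it with the same arguments as the decorated call, before the GPU is allocated. A minimal sketch of the assumed wiring; the decorated function's name and body below are placeholders, not code from app.py:

import spaces

# Hypothetical wiring: ZeroGPU calls get_t2v_duration with the request's
# arguments and reserves that many seconds of GPU time for the call.
@spaces.GPU(duration=get_t2v_duration)
def generate_t2v_video(prompt, height, width, negative_prompt,
                       duration_seconds, steps, seed,
                       randomize_seed, selected_lora, lora_weight):
    ...  # run the text-to-video pipeline and return the video path

With the 640x480 defaults, a 5-second request at 8 steps lands in the first tier (600 s), is not high-res (307,200 pixels against the 460,800-pixel threshold), and is therefore requested as 600 s of GPU time, under the 900 s cap.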
@@ -286,9 +320,12 @@ def enhance_prompt_with_llm(prompt: str, enhancer_pipeline):
         gr.Warning("LLM enhancer is not available.")
         return prompt

+    # In a Hugging Face ZeroGPU Space, the GPU is provisioned on-demand for functions
+    # decorated with @spaces.GPU and de-provisioned afterward. Therefore, the model,
+    # which is loaded on the CPU at startup, must be moved to the GPU for every call.
+    # The "Moving enhancer model to CUDA..." message is expected and correct for this setup.
     if enhancer_pipeline.model.device.type != 'cuda':
-        print("Moving enhancer model to CUDA...")
+        print("Moving enhancer model to CUDA for on-demand GPU execution...")
         enhancer_pipeline.model.to("cuda")

     messages = [{"role": "system", "content": T2V_CINEMATIC_PROMPT_SYSTEM}, {"role": "user", "content": prompt}]
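For context, the loading pattern these comments describe usually looks like the sketch below. The model id, dtype, and the `enhance` wrapper are illustrative assumptions (the Space's actual loading code is outside this diff); only `enhancer_pipe` and `enhance_prompt_with_llm` are names that appear in app.py. The pipeline is built on the CPU at import time, because a ZeroGPU Space has no GPU until a decorated function runs.

import spaces
import torch
from transformers import pipeline

# Assumed startup code: the enhancer is loaded on the CPU because no GPU is
# attached outside @spaces.GPU calls. The model id is a placeholder.
enhancer_pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2-1.5B-Instruct",
    torch_dtype=torch.bfloat16,
    device="cpu",
)

@spaces.GPU(duration=60)
def enhance(prompt: str) -> str:
    # The GPU exists only for the lifetime of this call, which is why
    # enhance_prompt_with_llm moves the model to CUDA on every invocation.
    return enhance_prompt_with_llm(prompt, enhancer_pipe)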
@@ -298,31 +335,38 @@ def enhance_prompt_with_llm(prompt: str, enhancer_pipeline):
     # 1. Get the tokenizer from the pipeline.
     tokenizer = enhancer_pipeline.tokenizer

+    # FIX: Set pad_token to eos_token if not set. This is a common requirement for
+    # models like Qwen2 and helps prevent warnings about attention masks.
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # 2. Apply the chat template and tokenize. This returns a dictionary containing
+    # 'input_ids' and 'attention_mask' as PyTorch tensors.
+    tokenized_inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
         add_generation_prompt=True,
         return_tensors="pt"
+    )
+
+    # 3. FIX: Move each tensor in the dictionary to the CUDA device.
+    inputs_on_cuda = {k: v.to("cuda") for k, v in tokenized_inputs.items()}

+    # 4. Use the model's generate() method, unpacking the dictionary to pass
+    # both `input_ids` and `attention_mask`. This resolves the warning.
     generated_ids = enhancer_pipeline.model.generate(
+        **inputs_on_cuda,
         max_new_tokens=256,
         do_sample=True,
         temperature=0.7,
         top_p=0.95
     )

+    # 5. The output from generate() includes the input tokens. We need to decode only the newly generated part.
+    input_token_length = inputs_on_cuda['input_ids'].shape[1]
     newly_generated_ids = generated_ids[:, input_token_length:]

+    # 6. Decode the new tokens back into a string.
     final_answer = tokenizer.decode(newly_generated_ids[0], skip_special_tokens=True)

     print(f"Enhanced prompt: '{final_answer.strip()}'")
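A caveat on step 2 above: in recent transformers releases, `apply_chat_template` with `tokenize=True` and `return_tensors="pt"` returns a bare tensor of token ids unless `return_dict=True` is also passed; only with that flag does it produce a `BatchEncoding` carrying both `input_ids` and `attention_mask`, which is what the `.items()` comprehension and the `**inputs_on_cuda` unpacking rely on. A sketch of that variant, assuming the Space pins a transformers version that supports the flag (the pinned version is not visible in this diff):

# Assumes a transformers release where apply_chat_template accepts return_dict;
# with it, the call yields a dict-like BatchEncoding rather than a bare tensor.
tokenized_inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,   # needed to get input_ids and attention_mask together
    return_tensors="pt",
)
inputs_on_cuda = {k: v.to("cuda") for k, v in tokenized_inputs.items()}

Since `BatchEncoding` also implements `.to()`, `inputs_on_cuda = tokenized_inputs.to("cuda")` is an equivalent, shorter form.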
@@ -512,4 +556,4 @@ if __name__ == "__main__":
     available_loras = get_available_presets(DYNAMIC_LORA_REPO_ID, DYNAMIC_LORA_SUBFOLDER)

     app_ui = build_ui(t2v_pipe, enhancer_pipe, available_loras)
-    app_ui.queue(max_size=10).launch()
+    app_ui.queue(max_size=10).launch()