thankfulcarp committed
Commit f5509c2 · 1 Parent(s): dd8ce86

Duration fixes

Files changed (1)
  1. app.py +67 -23
app.py CHANGED
@@ -11,6 +11,16 @@ Fix (July 3, 2024): Corrected a tensor device mismatch error in the
 `enhance_prompt_with_llm` function for ZeroGPU compatibility. The fix
 involves manually tokenizing the input and moving the resulting tensors
 to the CUDA device before calling the model's generate method.
+
+Fix (July 3, 2025): Addressed an attention_mask warning and clarified ZeroGPU
+behavior. The model is now correctly fed both input_ids and attention_mask.
+Added comments explaining why the model is moved to CUDA on each call, which
+is the expected behavior for ZeroGPU Spaces.
+
+Fix (July 3, 2025): Corrected the GPU duration estimation logic (`get_t2v_duration`)
+to prevent timeouts on longer or high-resolution videos. The new logic now
+considers video resolution (width and height) in addition to steps and duration,
+and uses more generous time allocations.
 """
 
 # --- 1. Imports ---
@@ -82,17 +92,41 @@ def sanitize_prompt_for_filename(prompt: str) -> str:
     return sanitized[:50]
 
 def get_t2v_duration(
-    steps: int, duration_seconds: float
+    prompt: str, height: int, width: int, negative_prompt: str,
+    duration_seconds: float, steps: int, seed: int,
+    randomize_seed: bool, selected_lora: str,
+    lora_weight: float
 ) -> int:
-    """Estimates GPU time for Text-to-Video generation using tiered logic."""
-    if steps > 15 and duration_seconds > 4:
-        duration = 700
-    elif steps > 15 or duration_seconds > 4:
-        duration = 400
+    """
+    Estimates GPU time for Text-to-Video generation.
+    The logic is tiered and considers duration, steps, and resolution to prevent timeouts.
+    """
+    # Calculate a resolution multiplier. A higher resolution will significantly increase generation time.
+    # Base resolution is considered 640x480 pixels.
+    base_pixels = DEFAULT_W_SLIDER_VALUE * DEFAULT_H_SLIDER_VALUE
+    current_pixels = width * height
+    # Check if the current resolution is significantly larger than the base.
+    is_high_res = current_pixels > (base_pixels * 1.5)
+
+    # Tiered duration based on video length and number of inference steps.
+    if steps > 10 or duration_seconds > 4:
+        # Longest generations (e.g., high step count or long duration).
+        base_duration = 600
+    elif steps > 10 or duration_seconds > 3:
+        # Medium-length generations.
+        base_duration = 400
     else:
-        duration = 200
-    print(f"Requesting {duration}s of GPU time for {steps} steps and {duration_seconds:.1f}s duration.")
-    return duration
+        # Shortest/quickest generations.
+        base_duration = 250
+
+    # Apply a multiplier for high-resolution videos.
+    final_duration = base_duration * 2 if is_high_res else base_duration
+
+    # Cap the duration at a maximum value (900s = 15 minutes) to comply with typical free-tier limits.
+    final_duration = min(final_duration, 900)
+
+    print(f"Requesting {final_duration}s of GPU time for {steps} steps, {duration_seconds:.1f}s duration, and {width}x{height} resolution.")
+    return final_duration
 
 def get_available_presets(repo_id, subfolder):
     """
@@ -286,9 +320,12 @@ def enhance_prompt_with_llm(prompt: str, enhancer_pipeline):
         gr.Warning("LLM enhancer is not available.")
         return prompt
 
-    # Move the pipeline's underlying model to the GPU if it's not already there.
+    # In a Hugging Face ZeroGPU Space, the GPU is provisioned on-demand for functions
+    # decorated with @spaces.GPU and de-provisioned afterward. Therefore, the model,
+    # which is loaded on the CPU at startup, must be moved to the GPU for every call.
+    # The "Moving enhancer model to CUDA..." message is expected and correct for this setup.
     if enhancer_pipeline.model.device.type != 'cuda':
-        print("Moving enhancer model to CUDA...")
+        print("Moving enhancer model to CUDA for on-demand GPU execution...")
         enhancer_pipeline.model.to("cuda")
 
     messages = [{"role": "system", "content": T2V_CINEMATIC_PROMPT_SYSTEM}, {"role": "user", "content": prompt}]
@@ -298,31 +335,38 @@ def enhance_prompt_with_llm(prompt: str, enhancer_pipeline):
     # 1. Get the tokenizer from the pipeline.
     tokenizer = enhancer_pipeline.tokenizer
 
-    # 2. Apply the chat template to format the input and tokenize it.
-    # `return_tensors="pt"` creates PyTorch tensors.
-    # Crucially, `.to("cuda")` moves these tensors to the GPU, resolving the device mismatch.
-    inputs = tokenizer.apply_chat_template(
+    # FIX: Set pad_token to eos_token if not set. This is a common requirement for
+    # models like Qwen2 and helps prevent warnings about attention masks.
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # 2. Apply the chat template and tokenize. This returns a dictionary containing
+    # 'input_ids' and 'attention_mask' as PyTorch tensors.
+    tokenized_inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
         add_generation_prompt=True,
         return_tensors="pt"
-    ).to("cuda")
+    )
+
+    # 3. FIX: Move each tensor in the dictionary to the CUDA device.
+    inputs_on_cuda = {k: v.to("cuda") for k, v in tokenized_inputs.items()}
 
-    # 3. Use the model's generate() method directly with the GPU tensors.
-    # This bypasses the high-level pipeline call that was causing the error.
+    # 4. Use the model's generate() method, unpacking the dictionary to pass
+    # both `input_ids` and `attention_mask`. This resolves the warning.
     generated_ids = enhancer_pipeline.model.generate(
-        inputs,
+        **inputs_on_cuda,
         max_new_tokens=256,
         do_sample=True,
        temperature=0.7,
         top_p=0.95
     )
 
-    # 4. The output from generate() includes the input tokens. We need to decode only the newly generated part.
-    input_token_length = inputs.shape[1]
+    # 5. The output from generate() includes the input tokens. We need to decode only the newly generated part.
+    input_token_length = inputs_on_cuda['input_ids'].shape[1]
     newly_generated_ids = generated_ids[:, input_token_length:]
 
-    # 5. Decode the new tokens back into a string.
+    # 6. Decode the new tokens back into a string.
     final_answer = tokenizer.decode(newly_generated_ids[0], skip_special_tokens=True)
 
     print(f"Enhanced prompt: '{final_answer.strip()}'")
@@ -512,4 +556,4 @@ if __name__ == "__main__":
     available_loras = get_available_presets(DYNAMIC_LORA_REPO_ID, DYNAMIC_LORA_SUBFOLDER)
 
     app_ui = build_ui(t2v_pipe, enhancer_pipe, available_loras)
-    app_ui.queue(max_size=10).launch()
+    app_ui.queue(max_size=10).launch()
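
A note on the widened `get_t2v_duration` signature: ZeroGPU Spaces support a dynamic duration, where `@spaces.GPU(duration=...)` can take a callable that is invoked with the same arguments as the decorated generation function. That would explain why the estimator now mirrors the full argument list even though it only reads `height`, `width`, `duration_seconds`, and `steps`. A minimal sketch of that wiring, assuming the dynamic-duration API; the decorated function's name and body are placeholders, not code from this commit:

import spaces

# get_t2v_duration is the estimator defined in app.py above.
@spaces.GPU(duration=get_t2v_duration)  # called with the same arguments as the function below
def generate_t2v(prompt, height, width, negative_prompt,
                 duration_seconds, steps, seed,
                 randomize_seed, selected_lora, lora_weight):
    # Placeholder body: run the text-to-video pipeline while the on-demand GPU is attached.
    ...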
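To make the tiering concrete, here is the arithmetic for one hypothetical request (1280x720, 20 steps, 5 seconds), assuming DEFAULT_W_SLIDER_VALUE = 640 and DEFAULT_H_SLIDER_VALUE = 480 as the in-code comment suggests; the actual slider defaults are defined elsewhere in app.py:

base_pixels = 640 * 480                           # 307,200
current_pixels = 1280 * 720                       # 921,600
is_high_res = current_pixels > base_pixels * 1.5  # 921,600 > 460,800 -> True
base_duration = 600                               # steps > 10, so the top tier applies
final_duration = min(base_duration * 2, 900)      # doubled for high-res, then capped at 900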
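One caveat on the tokenization change: in current transformers releases, `tokenizer.apply_chat_template(..., tokenize=True, return_tensors="pt")` returns a plain tensor of input IDs unless `return_dict=True` is also passed, whereas the `{k: v.to("cuda") for k, v in tokenized_inputs.items()}` comprehension assumes a dictionary. A sketch of the explicitly dict-returning call, as an assumption about the intended behavior rather than code from this commit:

tokenized_inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,    # ensures a BatchEncoding with both input_ids and attention_mask
    return_tensors="pt",
)
inputs_on_cuda = {k: v.to("cuda") for k, v in tokenized_inputs.items()}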