Commit f5509c2 (parent dd8ce86): Duration fixes

app.py CHANGED
@@ -11,6 +11,16 @@ Fix (July 3, 2024): Corrected a tensor device mismatch error in the
 `enhance_prompt_with_llm` function for ZeroGPU compatibility. The fix
 involves manually tokenizing the input and moving the resulting tensors
 to the CUDA device before calling the model's generate method.
+
+Fix (July 3, 2025): Addressed an attention_mask warning and clarified ZeroGPU
+behavior. The model is now correctly fed both input_ids and attention_mask.
+Added comments explaining why the model is moved to CUDA on each call, which
+is the expected behavior for ZeroGPU Spaces.
+
+Fix (July 3, 2025): Corrected the GPU duration estimation logic (`get_t2v_duration`)
+to prevent timeouts on longer or high-resolution videos. The new logic now
+considers video resolution (width and height) in addition to steps and duration,
+and uses more generous time allocations.
 """

 # --- 1. Imports ---
@@ -82,17 +92,41 @@ def sanitize_prompt_for_filename(prompt: str) -> str:
     return sanitized[:50]

 def get_t2v_duration(
+    prompt: str, height: int, width: int, negative_prompt: str,
+    duration_seconds: float, steps: int, seed: int,
+    randomize_seed: bool, selected_lora: str,
+    lora_weight: float
 ) -> int:
+    """
+    Estimates GPU time for Text-to-Video generation.
+    The logic is tiered and considers duration, steps, and resolution to prevent timeouts.
+    """
+    # Calculate a resolution multiplier. A higher resolution will significantly increase generation time.
+    # Base resolution is considered 640x480 pixels.
+    base_pixels = DEFAULT_W_SLIDER_VALUE * DEFAULT_H_SLIDER_VALUE
+    current_pixels = width * height
+    # Check if the current resolution is significantly larger than the base.
+    is_high_res = current_pixels > (base_pixels * 1.5)
+
+    # Tiered duration based on video length and number of inference steps.
+    if steps > 10 or duration_seconds > 4:
+        # Longest generations (e.g., high step count or long duration).
+        base_duration = 600
+    elif steps > 10 or duration_seconds > 3:
+        # Medium-length generations.
+        base_duration = 400
     else:
+        # Shortest/quickest generations.
+        base_duration = 250
+
+    # Apply a multiplier for high-resolution videos.
+    final_duration = base_duration * 2 if is_high_res else base_duration
+
+    # Cap the duration at a maximum value (900s = 15 minutes) to comply with typical free-tier limits.
+    final_duration = min(final_duration, 900)
+
+    print(f"Requesting {final_duration}s of GPU time for {steps} steps, {duration_seconds:.1f}s duration, and {width}x{height} resolution.")
+    return final_duration

 def get_available_presets(repo_id, subfolder):
     """
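A signature that mirrors every input of the generation function is what ZeroGPU's dynamic-duration support expects: `@spaces.GPU` accepts a callable for its `duration` argument and invokes it with the same arguments as the decorated call, before the GPU is allocated. A minimal sketch of the assumed wiring; the decorated function's name and body below are placeholders, not code from app.py:

import spaces

# Hypothetical wiring: ZeroGPU calls get_t2v_duration with the request's
# arguments and reserves that many seconds of GPU time for the call.
@spaces.GPU(duration=get_t2v_duration)
def generate_t2v_video(prompt, height, width, negative_prompt,
                       duration_seconds, steps, seed,
                       randomize_seed, selected_lora, lora_weight):
    ...  # run the text-to-video pipeline and return the video path

With the 640x480 defaults, a 5-second request at 8 steps lands in the first tier (600 s), is not high-res (307,200 pixels against the 460,800-pixel threshold), and is therefore requested as 600 s of GPU time, under the 900 s cap.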
@@ -286,9 +320,12 @@ def enhance_prompt_with_llm(prompt: str, enhancer_pipeline):
         gr.Warning("LLM enhancer is not available.")
         return prompt

+    # In a Hugging Face ZeroGPU Space, the GPU is provisioned on-demand for functions
+    # decorated with @spaces.GPU and de-provisioned afterward. Therefore, the model,
+    # which is loaded on the CPU at startup, must be moved to the GPU for every call.
+    # The "Moving enhancer model to CUDA..." message is expected and correct for this setup.
     if enhancer_pipeline.model.device.type != 'cuda':
-        print("Moving enhancer model to CUDA...")
+        print("Moving enhancer model to CUDA for on-demand GPU execution...")
         enhancer_pipeline.model.to("cuda")

     messages = [{"role": "system", "content": T2V_CINEMATIC_PROMPT_SYSTEM}, {"role": "user", "content": prompt}]
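For context, the loading pattern these comments describe usually looks like the sketch below. The model id, dtype, and the `enhance` wrapper are illustrative assumptions (the Space's actual loading code is outside this diff); only `enhancer_pipe` and `enhance_prompt_with_llm` are names that appear in app.py. The pipeline is built on the CPU at import time, because a ZeroGPU Space has no GPU until a decorated function runs.

import spaces
import torch
from transformers import pipeline

# Assumed startup code: the enhancer is loaded on the CPU because no GPU is
# attached outside @spaces.GPU calls. The model id is a placeholder.
enhancer_pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2-1.5B-Instruct",
    torch_dtype=torch.bfloat16,
    device="cpu",
)

@spaces.GPU(duration=60)
def enhance(prompt: str) -> str:
    # The GPU exists only for the lifetime of this call, which is why
    # enhance_prompt_with_llm moves the model to CUDA on every invocation.
    return enhance_prompt_with_llm(prompt, enhancer_pipe)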
@@ -298,31 +335,38 @@ def enhance_prompt_with_llm(prompt: str, enhancer_pipeline):
     # 1. Get the tokenizer from the pipeline.
     tokenizer = enhancer_pipeline.tokenizer

+    # FIX: Set pad_token to eos_token if not set. This is a common requirement for
+    # models like Qwen2 and helps prevent warnings about attention masks.
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # 2. Apply the chat template and tokenize. This returns a dictionary containing
+    # 'input_ids' and 'attention_mask' as PyTorch tensors.
+    tokenized_inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
         add_generation_prompt=True,
         return_tensors="pt"
+    )
+
+    # 3. FIX: Move each tensor in the dictionary to the CUDA device.
+    inputs_on_cuda = {k: v.to("cuda") for k, v in tokenized_inputs.items()}

+    # 4. Use the model's generate() method, unpacking the dictionary to pass
+    # both `input_ids` and `attention_mask`. This resolves the warning.
     generated_ids = enhancer_pipeline.model.generate(
+        **inputs_on_cuda,
         max_new_tokens=256,
         do_sample=True,
         temperature=0.7,
         top_p=0.95
     )

+    # 5. The output from generate() includes the input tokens. We need to decode only the newly generated part.
+    input_token_length = inputs_on_cuda['input_ids'].shape[1]
     newly_generated_ids = generated_ids[:, input_token_length:]

+    # 6. Decode the new tokens back into a string.
     final_answer = tokenizer.decode(newly_generated_ids[0], skip_special_tokens=True)

     print(f"Enhanced prompt: '{final_answer.strip()}'")
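A caveat on step 2 above: in recent transformers releases, `apply_chat_template` with `tokenize=True` and `return_tensors="pt"` returns a bare tensor of token ids unless `return_dict=True` is also passed; only with that flag does it produce a `BatchEncoding` carrying both `input_ids` and `attention_mask`, which is what the `.items()` comprehension and the `**inputs_on_cuda` unpacking rely on. A sketch of that variant, assuming the Space pins a transformers version that supports the flag (the pinned version is not visible in this diff):

# Assumes a transformers release where apply_chat_template accepts return_dict;
# with it, the call yields a dict-like BatchEncoding rather than a bare tensor.
tokenized_inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,   # needed to get input_ids and attention_mask together
    return_tensors="pt",
)
inputs_on_cuda = {k: v.to("cuda") for k, v in tokenized_inputs.items()}

Since `BatchEncoding` also implements `.to()`, `inputs_on_cuda = tokenized_inputs.to("cuda")` is an equivalent, shorter form.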
@@ -512,4 +556,4 @@ if __name__ == "__main__":
     available_loras = get_available_presets(DYNAMIC_LORA_REPO_ID, DYNAMIC_LORA_SUBFOLDER)

     app_ui = build_ui(t2v_pipe, enhancer_pipe, available_loras)
-    app_ui.queue(max_size=10).launch()
+    app_ui.queue(max_size=10).launch()