add image-to-video gen
app.py CHANGED
@@ -28,6 +28,8 @@ from huggingface_hub import HfApi
 import tempfile
 from openai import OpenAI
 from mistralai import Mistral
+import uuid
+import threading
 
 # Gradio supported languages for syntax highlighting
 GRADIO_SUPPORTED_LANGUAGES = [
@@ -86,6 +88,64 @@ Structural requirements:
 Return ONLY the code inside a single ```html ... ``` code block. No additional text before or after.
 """
 
+# ---------------------------------------------------------------------------
+# Video temp-file management (per-session tracking and cleanup)
+# ---------------------------------------------------------------------------
+VIDEO_TEMP_DIR = os.path.join(tempfile.gettempdir(), "anycoder_videos")
+VIDEO_FILE_TTL_SECONDS = 6 * 60 * 60  # 6 hours
+_SESSION_VIDEO_FILES: Dict[str, List[str]] = {}
+_VIDEO_FILES_LOCK = threading.Lock()
+
+
+def _ensure_video_dir_exists() -> None:
+    try:
+        os.makedirs(VIDEO_TEMP_DIR, exist_ok=True)
+    except Exception:
+        pass
+
+
+def _register_video_for_session(session_id: Optional[str], file_path: str) -> None:
+    if not session_id or not file_path:
+        return
+    with _VIDEO_FILES_LOCK:
+        if session_id not in _SESSION_VIDEO_FILES:
+            _SESSION_VIDEO_FILES[session_id] = []
+        _SESSION_VIDEO_FILES[session_id].append(file_path)
+
+
+def cleanup_session_videos(session_id: Optional[str]) -> None:
+    if not session_id:
+        return
+    with _VIDEO_FILES_LOCK:
+        file_list = _SESSION_VIDEO_FILES.pop(session_id, [])
+    for path in file_list:
+        try:
+            if path and os.path.exists(path):
+                os.unlink(path)
+        except Exception:
+            # Best-effort cleanup
+            pass
+
+
+def reap_old_videos(ttl_seconds: int = VIDEO_FILE_TTL_SECONDS) -> None:
+    """Delete old video files in the temp directory based on modification time."""
+    try:
+        _ensure_video_dir_exists()
+        now_ts = time.time()
+        for name in os.listdir(VIDEO_TEMP_DIR):
+            path = os.path.join(VIDEO_TEMP_DIR, name)
+            try:
+                if not os.path.isfile(path):
+                    continue
+                mtime = os.path.getmtime(path)
+                if now_ts - mtime > ttl_seconds:
+                    os.unlink(path)
+            except Exception:
+                pass
+    except Exception:
+        # Temp dir might not exist or be accessible; ignore
+        pass
+
 TRANSFORMERS_JS_SYSTEM_PROMPT = """You are an expert web developer creating a transformers.js application. You will generate THREE separate files: index.html, index.js, and style.css.
 
 IMPORTANT: You MUST output ALL THREE files in the following format:
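
Taken together, these helpers implement per-session bookkeeping of temp videos plus a global TTL sweep. A minimal sketch of the intended lifecycle, assuming the functions above are in scope (id and path values illustrative):

import uuid

session_id = str(uuid.uuid4())  # one id per browser session

# ... a generated clip is written under VIDEO_TEMP_DIR and tracked via
# _register_video_for_session(session_id, file_path) ...

# On the next Generate click for the same session:
cleanup_session_videos(session_id)  # drop the files tracked for this session
reap_old_videos()                   # drop anything older than the 6-hour TTL
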
@@ -1236,6 +1296,129 @@ def generate_image_to_image(input_image_data, prompt: str) -> str:
         print(f"Image-to-image generation error: {str(e)}")
         return f"Error generating image (image-to-image): {str(e)}"
 
+def generate_video_from_image(input_image_data, prompt: str, session_id: Optional[str] = None) -> str:
+    """Generate a video from an input image and prompt using Hugging Face InferenceClient.
+
+    Returns an HTML <video> tag whose source points to a local file URL (file://...).
+    """
+    try:
+        print("[Image2Video] Starting video generation")
+        if not os.getenv('HF_TOKEN'):
+            print("[Image2Video] Missing HF_TOKEN")
+            return "Error: HF_TOKEN environment variable is not set. Please set it to your Hugging Face API token."
+
+        # Prepare client
+        client = InferenceClient(
+            provider="auto",
+            api_key=os.getenv('HF_TOKEN'),
+            bill_to="huggingface",
+        )
+        print(f"[Image2Video] InferenceClient initialized (provider=auto)")
+
+        # Normalize input image to bytes
+        import io
+        from PIL import Image
+        try:
+            import numpy as np
+        except Exception:
+            np = None
+
+        print(f"[Image2Video] Normalizing input image type={type(input_image_data)}")
+        if hasattr(input_image_data, 'read'):
+            raw = input_image_data.read()
+            pil_image = Image.open(io.BytesIO(raw))
+        elif hasattr(input_image_data, 'mode') and hasattr(input_image_data, 'size'):
+            pil_image = input_image_data
+        elif np is not None and isinstance(input_image_data, np.ndarray):
+            pil_image = Image.fromarray(input_image_data)
+        elif isinstance(input_image_data, (bytes, bytearray)):
+            pil_image = Image.open(io.BytesIO(input_image_data))
+        else:
+            pil_image = Image.open(io.BytesIO(bytes(input_image_data)))
+
+        if pil_image.mode != 'RGB':
+            pil_image = pil_image.convert('RGB')
+        try:
+            print(f"[Image2Video] Input PIL image size={pil_image.size} mode={pil_image.mode}")
+        except Exception:
+            pass
+
+        buf = io.BytesIO()
+        pil_image.save(buf, format='PNG')
+        input_bytes = buf.getvalue()
+
+        # Call image-to-video; require method support
+        model_id = "Lightricks/LTX-Video-0.9.8-13B-distilled"
+        image_to_video_method = getattr(client, "image_to_video", None)
+        if not callable(image_to_video_method):
+            print("[Image2Video] InferenceClient.image_to_video not available in this huggingface_hub version")
+            return (
+                "Error generating video (image-to-video): Your installed huggingface_hub version "
+                "does not expose InferenceClient.image_to_video. Please upgrade with "
+                "`pip install -U huggingface_hub` and try again."
+            )
+        print(f"[Image2Video] Calling image_to_video with model={model_id}, prompt length={len(prompt or '')}")
+        video_bytes = image_to_video_method(
+            input_bytes,
+            prompt=prompt,
+            model=model_id,
+        )
+        print(f"[Image2Video] Received video bytes: {len(video_bytes) if hasattr(video_bytes, '__len__') else 'unknown length'}")
+
+        # Save to temp file for this session (for cleanup on next Generate)
+        try:
+            _ensure_video_dir_exists()
+            file_name = f"{uuid.uuid4()}.mp4"
+            file_path = os.path.join(VIDEO_TEMP_DIR, file_name)
+            with open(file_path, "wb") as f:
+                f.write(video_bytes)
+            _register_video_for_session(session_id, file_path)
+            try:
+                file_size = os.path.getsize(file_path)
+            except Exception:
+                file_size = -1
+            print(f"[Image2Video] Saved video to temp file: {file_path} (size={file_size} bytes)")
+        except Exception as save_exc:
+            print(f"[Image2Video] Warning: could not persist temp video file: {save_exc}")
+
+        # Always use a file URL for the video source.
+        video_html = ""
+        file_url = None
+        try:
+            if 'file_path' in locals() and file_path:
+                # Build a proper file:// URL for absolute paths (e.g., file:///var/.../uuid.mp4)
+                try:
+                    from pathlib import Path
+                    file_url = Path(file_path).as_uri()
+                except Exception:
+                    # Fallback to manual construction; ensure three slashes
+                    # Note: this may not be fully standards-compliant on Windows
+                    if file_path.startswith('/'):
+                        file_url = f"file:///{file_path.lstrip('/')}"  # file:///abs/path
+                    else:
+                        file_url = f"file:///{file_path}"
+        except Exception:
+            file_url = None
+
+        if file_url:
+            video_html = (
+                f"<video controls style=\"max-width: 100%; height: auto; border-radius: 8px; margin: 10px 0;\">"
+                f"<source src=\"{file_url}\" type=\"video/mp4\" />"
+                f"Your browser does not support the video tag."
+                f"</video>"
+            )
+        else:
+            # If a file URL cannot be constructed, signal error to avoid embedding data URIs.
+            return "Error generating video (image-to-video): Could not persist video to a local file."
+        print("[Image2Video] Successfully generated video HTML tag")
+        return video_html
+    except Exception as e:
+        import traceback
+        print("[Image2Video] Exception during generation:")
+        traceback.print_exc()
+        print(f"Image-to-video generation error: {str(e)}")
+        return f"Error generating video (image-to-video): {str(e)}"
+
 def extract_image_prompts_from_text(text: str, num_images_needed: int = 1) -> list:
     """Extract image generation prompts from the full text based on number of images needed"""
     # Use the entire text as the base prompt for image generation
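
Stripped of logging and fallbacks, the core of the function above is a single `InferenceClient.image_to_video` call. A standalone sketch under the same assumptions the code checks for (a recent huggingface_hub that exposes the method, HF_TOKEN set); file names are illustrative and the Space-specific `bill_to` argument is omitted:

import os
from huggingface_hub import InferenceClient

client = InferenceClient(provider="auto", api_key=os.getenv("HF_TOKEN"))
with open("input.png", "rb") as src:
    video_bytes = client.image_to_video(
        src.read(),
        prompt="The cat starts to dance",
        model="Lightricks/LTX-Video-0.9.8-13B-distilled",
    )
with open("clip.mp4", "wb") as dst:
    dst.write(video_bytes)
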
@@ -1308,7 +1491,8 @@ def create_image_replacement_blocks(html_content: str, user_prompt: str) -> str:
     # If no placeholder images found, look for any img tags
     if not placeholder_images:
         img_pattern = r'<img[^>]*>'
-        placeholder_images = re.findall(img_pattern, html_content)
+        # Case-insensitive to catch <IMG> or mixed-case tags
+        placeholder_images = re.findall(img_pattern, html_content, re.IGNORECASE)
 
     # Also look for div elements that might be image placeholders
     div_placeholder_patterns = [
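
The added `re.IGNORECASE` flag is what picks up upper- and mixed-case tags the old call missed, for example:

import re

html = '<IMG src="a.png"> <img src="b.png">'
print(re.findall(r'<img[^>]*>', html))                 # ['<img src="b.png">']
print(re.findall(r'<img[^>]*>', html, re.IGNORECASE))  # ['<IMG src="a.png">', '<img src="b.png">']
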
@@ -1543,17 +1727,127 @@ def create_image_replacement_blocks_from_input_image(html_content: str, user_pro
 
     return '\n\n'.join(replacement_blocks)
 
+def create_video_replacement_blocks_from_input_image(html_content: str, user_prompt: str, input_image_data, session_id: Optional[str] = None) -> str:
+    """Create search/replace blocks that replace the first <img> (or placeholder) with a generated <video>.
+
+    Uses generate_video_from_image to produce a single video and swaps it in.
+    """
+    if not user_prompt:
+        return ""
+
+    import re
+    print("[Image2Video] Creating replacement blocks for video insertion")
+
+    placeholder_patterns = [
+        r'<img[^>]*src=["\'](?:placeholder|dummy|sample|example)[^"\']*["\'][^>]*>',
+        r'<img[^>]*src=["\']https?://via\.placeholder\.com[^"\']*["\'][^>]*>',
+        r'<img[^>]*src=["\']https?://picsum\.photos[^"\']*["\'][^>]*>',
+        r'<img[^>]*src=["\']https?://dummyimage\.com[^"\']*["\'][^>]*>',
+        r'<img[^>]*alt=["\'][^"\']*placeholder[^"\']*["\'][^>]*>',
+        r'<img[^>]*class=["\'][^"\']*placeholder[^"\']*["\'][^>]*>',
+        r'<img[^>]*id=["\'][^"\']*placeholder[^"\']*["\'][^>]*>',
+        r'<img[^>]*src=["\']data:image[^"\']*["\'][^>]*>',
+        r'<img[^>]*src=["\']#["\'][^>]*>',
+        r'<img[^>]*src=["\']about:blank["\'][^>]*>',
+    ]
+
+    placeholder_images = []
+    for pattern in placeholder_patterns:
+        matches = re.findall(pattern, html_content, re.IGNORECASE)
+        if matches:
+            placeholder_images.extend(matches)
+
+    if not placeholder_images:
+        img_pattern = r'<img[^>]*>'
+        placeholder_images = re.findall(img_pattern, html_content)
+    print(f"[Image2Video] Found {len(placeholder_images)} candidate <img> elements")
+
+    video_html = generate_video_from_image(input_image_data, user_prompt, session_id=session_id)
+    try:
+        has_file_src = 'src="' in video_html and video_html.count('src="') >= 1 and 'data:video/mp4;base64' not in video_html.split('src="', 1)[1]
+        print(f"[Image2Video] Generated video HTML length={len(video_html)}; has_file_src={has_file_src}")
+    except Exception:
+        pass
+    if video_html.startswith("Error"):
+        print("[Image2Video] Video generation returned error; aborting replacement")
+        return ""
+
+    if placeholder_images:
+        placeholder = placeholder_images[0]
+        placeholder_clean = re.sub(r'\s+', ' ', placeholder.strip())
+        print("[Image2Video] Replacing first image placeholder with video")
+        placeholder_variations = [
+            # Try the exact string first to maximize replacement success
+            placeholder,
+            placeholder_clean,
+            placeholder_clean.replace('"', "'"),
+            placeholder_clean.replace("'", '"'),
+            re.sub(r'\s+', ' ', placeholder_clean),
+            placeholder_clean.replace(' ', ' '),
+        ]
+        blocks = []
+        for variation in placeholder_variations:
+            blocks.append(f"""{SEARCH_START}
+{variation}
+{DIVIDER}
+{video_html}
+{REPLACE_END}""")
+        return '\n\n'.join(blocks)
+
+    if '<body' in html_content:
+        body_start = html_content.find('<body')
+        body_end = html_content.find('>', body_start) + 1
+        opening_body_tag = html_content[body_start:body_end]
+        print("[Image2Video] No <img> found; inserting video right after the opening <body> tag")
+        print(f"[Image2Video] Opening <body> tag snippet: {opening_body_tag[:120]}")
+        return f"""{SEARCH_START}
+{opening_body_tag}
+{DIVIDER}
+{opening_body_tag}
+{video_html}
+{REPLACE_END}"""
+
+    print("[Image2Video] No <body> tag; appending video via replacement block")
+    return f"{SEARCH_START}\n\n{DIVIDER}\n{video_html}\n{REPLACE_END}"
+
-def apply_generated_images_to_html(html_content: str, user_prompt: str, enable_text_to_image: bool, enable_image_to_image: bool, input_image_data, image_to_image_prompt: str | None = None, text_to_image_prompt: str | None = None) -> str:
+def apply_generated_images_to_html(html_content: str, user_prompt: str, enable_text_to_image: bool, enable_image_to_image: bool, input_image_data, image_to_image_prompt: str | None = None, text_to_image_prompt: str | None = None, enable_image_to_video: bool = False, image_to_video_prompt: str | None = None, session_id: Optional[str] = None) -> str:
     """Apply text-to-image and/or image-to-image replacements to HTML content.
 
     If both toggles are enabled, text-to-image replacements run first, then image-to-image.
     """
     result = html_content
     try:
+        print(
+            f"[MediaApply] enable_i2v={enable_image_to_video}, enable_i2i={enable_image_to_image}, "
+            f"enable_t2i={enable_text_to_image}, has_image={input_image_data is not None}"
+        )
+        # If image-to-video is enabled, replace the first image with a generated video and return.
+        if enable_image_to_video and input_image_data is not None and (result.strip().startswith('<!DOCTYPE html>') or result.strip().startswith('<html')):
+            i2v_prompt = (image_to_video_prompt or user_prompt or "").strip()
+            print(f"[MediaApply] Running image-to-video with prompt len={len(i2v_prompt)}")
+            blocks_v = create_video_replacement_blocks_from_input_image(result, i2v_prompt, input_image_data, session_id=session_id)
+            if blocks_v:
+                print("[MediaApply] Applying image-to-video replacement blocks")
+                before_len = len(result)
+                result_after = apply_search_replace_changes(result, blocks_v)
+                after_len = len(result_after)
+                changed = (result_after != result)
+                print(f"[MediaApply] i2v blocks length={len(blocks_v)}; html before={before_len}, after={after_len}, changed={changed}")
+                if not changed:
+                    print("[MediaApply] DEBUG: Replacement did not change content. Dumping first block:")
+                    try:
+                        first_block = blocks_v.split(REPLACE_END)[0][:1000]
+                        print(first_block)
+                    except Exception:
+                        pass
+                result = result_after
+            else:
+                print("[MediaApply] No i2v replacement blocks generated")
+            return result
+
         # If an input image is provided and image-to-image is enabled, we only replace one image
         # and skip text-to-image to satisfy the requirement to replace exactly the number of uploaded images.
         if enable_image_to_image and input_image_data is not None and (result.strip().startswith('<!DOCTYPE html>') or result.strip().startswith('<html')):
-            # Prefer the dedicated image-to-image prompt if provided
             i2i_prompt = (image_to_image_prompt or user_prompt or "").strip()
             blocks2 = create_image_replacement_blocks_from_input_image(result, i2i_prompt, input_image_data, max_images=1)
             if blocks2:
@@ -1562,11 +1856,16 @@ def apply_generated_images_to_html(html_content: str, user_prompt: str, enable_t
 
         if enable_text_to_image and (result.strip().startswith('<!DOCTYPE html>') or result.strip().startswith('<html')):
             t2i_prompt = (text_to_image_prompt or user_prompt or "").strip()
+            print(f"[MediaApply] Running text-to-image with prompt len={len(t2i_prompt)}")
            # Single-image flow for text-to-image
             blocks = create_image_replacement_blocks_text_to_image_single(result, t2i_prompt)
             if blocks:
+                print("[MediaApply] Applying text-to-image replacement blocks")
                 result = apply_search_replace_changes(result, blocks)
     except Exception:
+        import traceback
+        print("[MediaApply] Exception during media application:")
+        traceback.print_exc()
         return html_content
     return result
 
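
Note the precedence this creates: when image-to-video is enabled and an image was uploaded, the branch applies its replacement and returns immediately, so image-to-image and text-to-image are skipped for that run. An illustrative call, assuming the function above is in scope:

with open("cat.png", "rb") as f:  # illustrative upload
    image_bytes = f.read()

out = apply_generated_images_to_html(
    "<!DOCTYPE html><html><body><img src='x.png'></body></html>",
    "landing page",
    enable_text_to_image=True,   # ignored on this run ...
    enable_image_to_image=True,  # ... because the i2v branch returns early
    input_image_data=image_bytes,
    enable_image_to_video=True,
    image_to_video_prompt="The cat starts to dance",
    session_id="demo-session",
)
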
@@ -1856,6 +2155,39 @@ Please use the search results above to help create the requested application wit
 def send_to_sandbox(code):
     """Render HTML in a sandboxed iframe. Assumes full HTML is provided by prompts."""
     html_doc = (code or "").strip()
+    # For preview only: inline local file URLs (e.g., file:///.../video.mp4) as data URIs so the
+    # data: iframe can load them. The original code (shown to the user) still contains file URLs.
+    try:
+        import re
+        import base64 as _b64
+        import mimetypes as _mtypes
+        import urllib.parse as _uparse
+        def _file_url_to_data_uri(file_url: str) -> str | None:
+            try:
+                parsed = _uparse.urlparse(file_url)
+                path = _uparse.unquote(parsed.path)
+                if not path:
+                    return None
+                with open(path, 'rb') as _f:
+                    raw = _f.read()
+                mime = _mtypes.guess_type(path)[0] or 'application/octet-stream'
+                b64 = _b64.b64encode(raw).decode()
+                return f"data:{mime};base64,{b64}"
+            except Exception:
+                return None
+        def _repl_double(m):
+            url = m.group(1)
+            data_uri = _file_url_to_data_uri(url)
+            return f'src="{data_uri}"' if data_uri else m.group(0)
+        def _repl_single(m):
+            url = m.group(1)
+            data_uri = _file_url_to_data_uri(url)
+            return f"src='{data_uri}'" if data_uri else m.group(0)
+        html_doc = re.sub(r'src="(file:[^"]+)"', _repl_double, html_doc)
+        html_doc = re.sub(r"src='(file:[^']+)'", _repl_single, html_doc)
+    except Exception:
+        # Best-effort; continue without inlining
+        pass
     encoded_html = base64.b64encode(html_doc.encode('utf-8')).decode('utf-8')
     data_uri = f"data:text/html;charset=utf-8;base64,{encoded_html}"
     iframe = f'<iframe src="{data_uri}" width="100%" height="920px" sandbox="allow-scripts allow-same-origin allow-forms allow-popups allow-modals allow-presentation" allow="display-capture"></iframe>'
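
The inlining exists because the preview iframe is itself a `data:` URL, and browsers refuse to fetch `file://` subresources from such a page, so the video has to be embedded. The helper's behavior as a standalone sketch (path illustrative):

import base64, mimetypes, urllib.parse

def file_url_to_data_uri(file_url: str) -> str:
    path = urllib.parse.unquote(urllib.parse.urlparse(file_url).path)
    mime = mimetypes.guess_type(path)[0] or "application/octet-stream"
    with open(path, "rb") as f:
        return f"data:{mime};base64,{base64.b64encode(f.read()).decode()}"

print(file_url_to_data_uri("file:///tmp/anycoder_videos/clip.mp4")[:40])
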
@@ -2361,7 +2693,7 @@ The HTML code above contains the complete original website structure with all im
 stop_generation = False
 
 
-def generation_code(query: Optional[str], image: Optional[gr.Image], file: Optional[str], website_url: Optional[str], _setting: Dict[str, str], _history: Optional[History], _current_model: Dict, enable_search: bool = False, language: str = "html", provider: str = "auto", enable_image_generation: bool = False, enable_image_to_image: bool = False, image_to_image_prompt: Optional[str] = None, text_to_image_prompt: Optional[str] = None):
+def generation_code(query: Optional[str], vlm_image: Optional[gr.Image], gen_image: Optional[gr.Image], file: Optional[str], website_url: Optional[str], _setting: Dict[str, str], _history: Optional[History], _current_model: Dict, enable_search: bool = False, language: str = "html", provider: str = "auto", enable_image_generation: bool = False, enable_image_to_image: bool = False, image_to_image_prompt: Optional[str] = None, text_to_image_prompt: Optional[str] = None, enable_image_to_video: bool = False, image_to_video_prompt: Optional[str] = None):
     if query is None:
         query = ''
     if _history is None:
@@ -2389,6 +2721,22 @@ def generation_code(query: Optional[str], image: Optional[gr.Image], file: Optio
          '=== src/App.svelte ===' in last_assistant_msg):
         has_existing_content = True
 
+    # Create/lookup a session id for temp-file tracking and cleanup
+    if _setting is not None and isinstance(_setting, dict):
+        session_id = _setting.get("__session_id__")
+        if not session_id:
+            session_id = str(uuid.uuid4())
+            _setting["__session_id__"] = session_id
+    else:
+        session_id = str(uuid.uuid4())
+
+    # On each generate, reap old global files and cleanup previous session files
+    try:
+        cleanup_session_videos(session_id)
+        reap_old_videos()
+    except Exception:
+        pass
+
     # Choose system prompt based on context
     if has_existing_content:
         # Use follow-up prompt for modifying existing content
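
Storing the id on the existing `setting` state dict lets it survive across Generate clicks without adding a component. The get-or-create logic above is equivalent to this sketch (dict contents illustrative):

import uuid

_setting = {"system": "..."}  # stand-in for the gr.State settings dict
session_id = _setting.setdefault("__session_id__", str(uuid.uuid4()))
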
@@ -2444,8 +2792,8 @@ This will help me create a better design for you."""
 
     # Check if this is GLM-4.5 model and handle with simple HuggingFace InferenceClient
     if _current_model["id"] == "zai-org/GLM-4.5":
-        if image is not None:
-            messages.append(create_multimodal_message(enhanced_query, image))
+        if vlm_image is not None:
+            messages.append(create_multimodal_message(enhanced_query, vlm_image))
         else:
             messages.append({'role': 'user', 'content': enhanced_query})
 
@@ -2486,13 +2834,17 @@ This will help me create a better design for you."""
                 clean_code = remove_code_block(content)
 
                 # Apply image generation (text→image and/or image→image)
+                print("[Generate] Applying post-generation media to GLM-4.5 HTML output")
                 final_content = apply_generated_images_to_html(
                     content,
                     query,
                     enable_text_to_image=enable_image_generation,
                     enable_image_to_image=enable_image_to_image,
-                    input_image_data=image,
+                    input_image_data=gen_image,
                     image_to_image_prompt=image_to_image_prompt,
+                    enable_image_to_video=enable_image_to_video,
+                    image_to_video_prompt=image_to_video_prompt,
+                    session_id=session_id,
                 )
 
                 _history.append([query, final_content])
@@ -2647,13 +2999,17 @@ This will help me create a better design for you."""
                 clean_content = remove_code_block(modified_content)
 
                 # Apply image generation (text→image and/or image→image)
+                print("[Generate] Applying post-generation media to modified HTML content")
                 clean_content = apply_generated_images_to_html(
                     clean_content,
                     query,
                     enable_text_to_image=enable_image_generation,
                     enable_image_to_image=enable_image_to_image,
-                    input_image_data=image,
+                    input_image_data=gen_image,
                     image_to_image_prompt=image_to_image_prompt,
+                    enable_image_to_video=enable_image_to_video,
+                    image_to_video_prompt=image_to_video_prompt,
+                    session_id=session_id,
                 )
 
                 yield {
@@ -2664,14 +3020,18 @@ This will help me create a better design for you."""
                 }
             else:
                 # Apply image generation (text→image and/or image→image)
+                print("[Generate] Applying post-generation media to new HTML content")
                 final_content = apply_generated_images_to_html(
                     clean_code,
                     query,
                     enable_text_to_image=enable_image_generation,
                     enable_image_to_image=enable_image_to_image,
-                    input_image_data=image,
+                    input_image_data=gen_image,
                     image_to_image_prompt=image_to_image_prompt,
                     text_to_image_prompt=text_to_image_prompt,
+                    enable_image_to_video=enable_image_to_video,
+                    image_to_video_prompt=image_to_video_prompt,
+                    session_id=session_id,
                 )
 
                 preview_val = None
@@ -2693,7 +3053,7 @@ This will help me create a better design for you."""
             structured = [
                 {"role": "system", "content": GLM45V_HTML_SYSTEM_PROMPT}
             ]
-            if image is not None:
+            if vlm_image is not None:
                 user_msg = {
                     "role": "user",
                     "content": [
@@ -2704,10 +3064,10 @@ This will help me create a better design for you."""
                 import io, base64
                 from PIL import Image
                 import numpy as np
-                if isinstance(image, np.ndarray):
-                    image = Image.fromarray(image)
+                if isinstance(vlm_image, np.ndarray):
+                    vlm_image = Image.fromarray(vlm_image)
                 buf = io.BytesIO()
-                image.save(buf, format="PNG")
+                vlm_image.save(buf, format="PNG")
                 b64 = base64.b64encode(buf.getvalue()).decode()
                 user_msg["content"].append({
                     "type": "image_url",
@@ -2775,8 +3135,8 @@ This will help me create a better design for you."""
         # Use dynamic client based on selected model (for non-GLM-4.5 models)
         client = get_inference_client(_current_model["id"], provider)
 
-        if image is not None:
-            messages.append(create_multimodal_message(enhanced_query, image))
+        if vlm_image is not None:
+            messages.append(create_multimodal_message(enhanced_query, vlm_image))
         else:
             messages.append({'role': 'user', 'content': enhanced_query})
         try:
@@ -3060,13 +3420,17 @@ This will help me create a better design for you."""
                 clean_content = remove_code_block(modified_content)
 
                 # Apply image generation (text→image and/or image→image)
+                print("[Generate] Applying post-generation media to follow-up HTML content")
                 clean_content = apply_generated_images_to_html(
                     clean_content,
                     query,
                     enable_text_to_image=enable_image_generation,
                     enable_image_to_image=enable_image_to_image,
-                    input_image_data=image,
+                    input_image_data=gen_image,
                     image_to_image_prompt=image_to_image_prompt,
+                    enable_image_to_video=enable_image_to_video,
+                    image_to_video_prompt=image_to_video_prompt,
+                    session_id=session_id,
                     text_to_image_prompt=text_to_image_prompt,
                 )
 
@@ -3083,14 +3447,18 @@ This will help me create a better design for you."""
             final_content = remove_code_block(content)
 
             # Apply image generation (text→image and/or image→image)
+            print("[Generate] Applying post-generation media to final HTML content")
             final_content = apply_generated_images_to_html(
                 final_content,
                 query,
                 enable_text_to_image=enable_image_generation,
                 enable_image_to_image=enable_image_to_image,
-                input_image_data=image,
+                input_image_data=gen_image,
                 image_to_image_prompt=image_to_image_prompt,
                 text_to_image_prompt=text_to_image_prompt,
+                enable_image_to_video=enable_image_to_video,
+                image_to_video_prompt=image_to_video_prompt,
+                session_id=session_id,
             )
 
             _history.append([query, final_content])
@@ -4138,6 +4506,11 @@ with gr.Blocks(
                     label="UI design image",
                     visible=False
                 )
+                # New hidden image input used for VLMs, image-to-image, and image-to-video
+                generation_image_input = gr.Image(
+                    label="image for generation",
+                    visible=False
+                )
                 image_to_image_prompt = gr.Textbox(
                     label="Image-to-Image Prompt",
                     placeholder="Describe how to transform the uploaded image (e.g., 'Turn the cat into a tiger.')",
@@ -4194,9 +4567,21 @@ with gr.Blocks(
                     visible=True,
                     info="Transform your uploaded image using FLUX.1-Kontext-dev"
                 )
+                image_to_video_toggle = gr.Checkbox(
+                    label="🎞️ Image to Video (uses input image)",
+                    value=False,
+                    visible=True,
+                    info="Generate a short video from your uploaded image using Lightricks LTX-Video"
+                )
+                image_to_video_prompt = gr.Textbox(
+                    label="Image-to-Video Prompt",
+                    placeholder="Describe the motion (e.g., 'The cat starts to dance')",
+                    lines=2,
+                    visible=False
+                )
 
                 def on_image_to_image_toggle(toggled):
-                    # Show image input and its prompt when image-to-image is enabled
+                    # Show generation image input and its prompt when image-to-image is enabled
                     return gr.update(visible=bool(toggled)), gr.update(visible=bool(toggled))
 
                 def on_text_to_image_toggle(toggled):
@@ -4205,7 +4590,15 @@ with gr.Blocks(
                 image_to_image_toggle.change(
                     on_image_to_image_toggle,
                     inputs=[image_to_image_toggle],
-                    outputs=[image_input, image_to_image_prompt]
+                    outputs=[generation_image_input, image_to_image_prompt]
+                )
+                def on_image_to_video_toggle(toggled):
+                    return gr.update(visible=bool(toggled)), gr.update(visible=bool(toggled))
+
+                image_to_video_toggle.change(
+                    on_image_to_video_toggle,
+                    inputs=[image_to_video_toggle],
+                    outputs=[generation_image_input, image_to_video_prompt]
                 )
                 image_generation_toggle.change(
                     on_text_to_image_toggle,
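
Both toggles use the same Gradio pattern: a checkbox's `.change` event flips the visibility of the shared image input and its matching prompt box. A self-contained sketch of that wiring (component names illustrative):

import gradio as gr

with gr.Blocks() as demo:
    toggle = gr.Checkbox(label="Image to Video")
    image = gr.Image(visible=False)
    prompt = gr.Textbox(visible=False)
    toggle.change(
        lambda on: (gr.update(visible=bool(on)), gr.update(visible=bool(on))),
        inputs=[toggle],
        outputs=[image, prompt],
    )

Because both toggles target the same `generation_image_input`, unchecking either one hides that input even while the other toggle is still checked.
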
@@ -4462,7 +4855,7 @@ with gr.Blocks(
             show_progress="hidden",
         ).then(
             generation_code,
-            inputs=[input, image_input, file_input, website_url_input, setting, history, current_model, search_toggle, language_dropdown, provider_state, image_generation_toggle, image_to_image_toggle, image_to_image_prompt, text_to_image_prompt],
+            inputs=[input, image_input, generation_image_input, file_input, website_url_input, setting, history, current_model, search_toggle, language_dropdown, provider_state, image_generation_toggle, image_to_image_toggle, image_to_image_prompt, text_to_image_prompt, image_to_video_toggle, image_to_video_prompt],
             outputs=[code_output, history, sandbox, history_output]
         ).then(
             end_generation_ui,