Spaces:

start3406
/

work

Sleeping

App Files Files Community

start3406 committed on Apr 19

Commit

a63d56e

verified ·

1 Parent(s): ec797fe

Update app.py

Browse files

Files changed (1) hide show

app.py +255 -183

app.py CHANGED Viewed

@@ -2,280 +2,352 @@ import gradio as gr
 import torch
 from transformers import pipeline, set_seed
 from diffusers import StableDiffusionPipeline
 import os
 import time
-# ---- 配置与模型加载 (在应用启动时加载一次) ----
-# 检查是否有可用的GPU，否则使用CPU
-device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 # 1. 语音转文本模型 (Whisper) - 加分项
 asr_pipeline = None
 try:
-    print("Loading ASR pipeline (Whisper)...")
-    # 使用较小的模型以节省资源，可根据需要替换 openai/whisper-medium 或 large
-    # 在不需要GPU的应用部分可以强制使用CPU
-    asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device if device == "cuda" else -1) # whisper在CPU上也可以运行
-    print("ASR pipeline loaded.")
 except Exception as e:
     print(f"Could not load ASR pipeline: {e}. Voice input will be disabled.")
-# 2. 提示词增强模型 (LLM) - Step 1
-prompt_enhancer_pipeline = None
-try:
-    print("Loading Prompt Enhancer pipeline (GPT-2)...")
-    # 使用 GPT-2 作为示例，实际应用中建议使用更强大的指令微调模型如 Mistral 或 Llama
-    # 注意：GPT-2 可能不会生成特别高质量的SD提示词，这里仅作结构演示
-    # 如果资源允许，可以替换为 'mistralai/Mistral-7B-Instruct-v0.1' 等，但需要更多内存/GPU
-    prompt_enhancer_pipeline = pipeline("text-generation", model="gpt2", device=device if device == "cuda" else -1) # text-generation在CPU上也可以运行
-    print("Prompt Enhancer pipeline loaded.")
-except Exception as e:
-    print(f"Could not load Prompt Enhancer pipeline: {e}. Prompt enhancement might fail.")
-# 3. 文本到图像模型 (Stable Diffusion) - Step 2
 image_generator_pipe = None
 try:
-    print("Loading Stable Diffusion pipeline (v1.5)...")
     model_id = "runwayml/stable-diffusion-v1-5"
-    image_generator_pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16 if device == "cuda" else torch.float32)
     image_generator_pipe = image_generator_pipe.to(device)
-    # 如果内存不足，可以启用CPU offloading (需要 accelerate库)
-    # image_generator_pipe.enable_model_cpu_offload()
-    print("Stable Diffusion pipeline loaded.")
 except Exception as e:
-    print(f"Could not load Stable Diffusion pipeline: {e}. Image generation will fail.")
-    # 如果模型加载失败，创建一个虚拟对象以避免后续代码出错
     class DummyPipe:
         def __call__(self, *args, **kwargs):
-            # 返回一个占位符错误信息或图像
-            raise RuntimeError(f"Stable Diffusion model failed to load: {e}")
     image_generator_pipe = DummyPipe()
-# ---- 核心功能函数 ----
-# Step 1: Prompt-to-Prompt
-def enhance_prompt(short_prompt, style_modifier="cinematic", quality_boost="photorealistic, highly detailed"):
-    """使用LLM增强简短描述"""
-    if not prompt_enhancer_pipeline:
-        return f"[Error: LLM not loaded] Original prompt: {short_prompt}"
     if not short_prompt:
-        return "[Error: Input description is empty]"
-    # 构建给LLM的指令
-    # 注意：这个指令对GPT-2来说可能太复杂，对Mistral等更有效
-    input_text = (
-        f"Generate a detailed and vivid prompt for an AI image generator based on the following description. "
-        f"Incorporate the style '{style_modifier}' and quality boost '{quality_boost}'. "
-        f"Focus on visual details, lighting, composition, and mood. "
-        f"Description: \"{short_prompt}\"\n\n"
-        f"Detailed Prompt:"
     )
     try:
-        # 设置种子以获得可复现的（某种程度上的）结果
-        set_seed(int(time.time()))
-        # max_length 控制生成文本的总长度 (包括输入)
-        # num_return_sequences 返回多少个结果
-        # temperature 控制随机性，较低的值更保守
-        # no_repeat_ngram_size 避免重复短语
-        outputs = prompt_enhancer_pipeline(
-            input_text,
-            max_length=150, # 限制输出长度，避免过长
-            num_return_sequences=1,
-            temperature=0.7,
-            no_repeat_ngram_size=2,
-            pad_token_id=prompt_enhancer_pipeline.tokenizer.eos_token_id # 避免padding warning
         )
-        generated_text = outputs[0]['generated_text']
-        # 从LLM的完整输出中提取增强后的提示词部分
-        # 简单方法：取 "Detailed Prompt:" 之后的内容
-        enhanced = generated_text.split("Detailed Prompt:")[-1].strip()
-        # 进一步清理可能包含的原始输入或指令痕迹
-        if short_prompt in enhanced[:len(short_prompt)+5]: # 如果开头包含原始输入
-             enhanced = enhanced.replace(short_prompt, "", 1).strip(' ,"')
-        # 添加基础的风格和质量词，如果LLM没有包含的话
-        if style_modifier not in enhanced:
-            enhanced += f", {style_modifier}"
-        if quality_boost not in enhanced:
-             enhanced += f", {quality_boost}"
-        return enhanced
     except Exception as e:
-        print(f"Error during prompt enhancement: {e}")
-        return f"[Error: Prompt enhancement failed] Original prompt: {short_prompt}"
-# Step 2: Prompt-to-Image
-def generate_image(prompt, negative_prompt, guidance_scale, num_inference_steps):
-    """使用Stable Diffusion生成图像"""
     if not isinstance(image_generator_pipe, StableDiffusionPipeline):
-         raise gr.Error(f"Stable Diffusion model is not available. Load error: {image_generator_pipe}") # 使用gr.Error在UI上显示错误
-    if not prompt or "[Error:" in prompt:
         raise gr.Error("Cannot generate image due to invalid or missing prompt.")
-    print(f"Generating image for prompt: {prompt}")
     print(f"Negative prompt: {negative_prompt}")
     print(f"Guidance scale: {guidance_scale}, Steps: {num_inference_steps}")
     try:
-        # 设置随机种子
-        generator = torch.Generator(device=device).manual_seed(int(time.time()))
-        # 执行推理
-        with torch.inference_mode(): # 节省内存
-            image = image_generator_pipe(
-                prompt=prompt,
-                negative_prompt=negative_prompt,
-                guidance_scale=float(guidance_scale),
-                num_inference_steps=int(num_inference_steps),
-                generator=generator
-            ).images[0]
-        print("Image generated successfully.")
         return image
     except Exception as e:
-        print(f"Error during image generation: {e}")
-        # 将底层错误传递给 Gradio，使其能在 UI 中显示
-        raise gr.Error(f"Image generation failed: {e}")
-# Bonus: Voice-to-Text
 def transcribe_audio(audio_file_path):
-    """将音频文件转录为文本"""
     if not asr_pipeline:
-        return "[Error: ASR model not loaded]", "" # 返回错误信息和空路径
     if audio_file_path is None:
-        return "", "" # 没有音频输入
-    print(f"Transcribing audio file: {audio_file_path}")
     try:
-        # 转录音频
         transcription = asr_pipeline(audio_file_path)["text"]
         print(f"Transcription result: {transcription}")
-        return transcription, audio_file_path # 返回文本和路径（可能用于显示）
     except Exception as e:
-        print(f"Error during audio transcription: {e}")
         return f"[Error: Transcription failed: {e}]", audio_file_path
-# ---- Gradio 应用流程 ----
 def process_input(input_text, audio_file, style_choice, quality_choice, neg_prompt, guidance, steps):
-    """处理输入（文本或语音），生成提示词和图像"""
     final_text_input = ""
-    transcription_source = "" # 用于标记来源
-    # 优先使用文本框输入
     if input_text and input_text.strip():
         final_text_input = input_text.strip()
-        transcription_source = " (from text input)"
-    # 如果文本框为空，且有音频文件，则使用语音输入
     elif audio_file is not None:
         transcribed_text, _ = transcribe_audio(audio_file)
-        if transcribed_text and "[Error:" not in transcribed_text:
             final_text_input = transcribed_text
-            transcription_source = " (from audio input)"
-        elif "[Error:" in transcribed_text:
-             # 如果语音识别出错，直接返回错误信息
-             return transcribed_text, None # 返回错误提示，不生成图像
         else:
-             # 音频为空或识别为空
-             return "[Error: Please provide input via text or voice]", None
     else:
-        # 没有有效输入
-        return "[Error: Please provide input via text or voice]", None
-    print(f"Using input: '{final_text_input}'{transcription_source}")
-    # Step 1: Enhance prompt
-    enhanced_prompt = enhance_prompt(final_text_input, style_modifier=style_choice, quality_boost=quality_choice)
-    print(f"Enhanced prompt: {enhanced_prompt}")
-    # Step 2: Generate image (如果提示词增强成功)
-    generated_image = None
-    if "[Error:" not in enhanced_prompt:
         try:
-            generated_image = generate_image(enhanced_prompt, neg_prompt, guidance, steps)
         except gr.Error as e:
-            # 如果 generate_image 抛出 gr.Error，将其信息作为 enhanced_prompt 返回给UI
-            enhanced_prompt = f"{enhanced_prompt}\n\n[Image Generation Error: {e}]"
-            # 不再尝试显示图片
         except Exception as e:
-            # 捕获其他意外错误
-             enhanced_prompt = f"{enhanced_prompt}\n\n[Unexpected Image Generation Error: {e}]"
-    # 返回结果给Gradio界面
-    return enhanced_prompt, generated_image
-# ---- Gradio 界面构建 (Step 3: Controls & Step 4: Layout) ----
-# 定义可选的风格和质量提升选项 (用于Dropdown/Radio)
-style_options = ["cinematic", "photorealistic", "anime", "fantasy art", "cyberpunk", "steampunk", "watercolor"]
-quality_options = ["highly detailed", "sharp focus", "intricate details", "4k", "masterpiece", "best quality"]
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# AI Image Generator: From Idea to Image")
-    gr.Markdown("Enter a short description (or use voice input), and the app will enhance it into a detailed prompt and generate an image using Stable Diffusion.")
     with gr.Row():
         with gr.Column(scale=1):
-            # 输入区域
-            inp_text = gr.Textbox(label="Enter short description here", placeholder="e.g., A magical treehouse in the sky")
-            # 加分项：语音输入控件
-            inp_audio = gr.Audio(sources=["microphone"], type="filepath", label="Or record your idea (clears text box if used)", visible=asr_pipeline is not None) # 只有ASR加载成功才显示
-            # Step 3: 使用不同控件
-            # 控件1: Dropdown (下拉菜单)
-            inp_style = gr.Dropdown(label="Choose Base Style", choices=style_options, value="cinematic")
-            # 控件2: Radio (单选框) - 也可以用 CheckboxGroup 实现多选
             inp_quality = gr.Radio(label="Quality Boost", choices=quality_options, value="highly detailed")
-            # 控件3: Textbox (用于Negative Prompt)
-            inp_neg_prompt = gr.Textbox(label="Negative Prompt (optional)", placeholder="e.g., blurry, low quality, text, watermark")
-            # 控件4: Slider (滑块)
-            inp_guidance = gr.Slider(minimum=1.0, maximum=20.0, step=0.5, value=7.5, label="Guidance Scale (CFG)")
-            # 控件5: Slider (滑块)
-            inp_steps = gr.Slider(minimum=10, maximum=100, step=1, value=30, label="Inference Steps")
-            # 提交按钮
             btn_generate = gr.Button("Generate Image", variant="primary")
         with gr.Column(scale=1):
-            # 输出区域
-            out_prompt = gr.Textbox(label="Generated Prompt", interactive=False) # 输出文本框不可编辑
-            out_image = gr.Image(label="Generated Image", type="pil") # 输出图像
-    # 设置按钮点击事件
     btn_generate.click(
         fn=process_input,
-        inputs=[inp_text, inp_audio, inp_style, inp_quality, inp_neg_prompt, inp_guidance, inp_steps],
         outputs=[out_prompt, out_image]
     )
-    # (可选) 当用户录音后，可以自动清空文本框，以明确优先使用语音
     if asr_pipeline:
         def clear_text_on_audio(audio_data):
             if audio_data is not None:
-                return "" # 返回空字符串清空文本框
-            return gr.update() # 否则不改变文本框内容 (gr.update()是占位符)
         inp_audio.change(fn=clear_text_on_audio, inputs=inp_audio, outputs=inp_text)
-# ---- 启动应用 ----
 if __name__ == "__main__":
-    # 设置Hugging Face Hub Token (如果需要从私有仓库加载模型)
-    # from huggingface_hub import login
-    # login("YOUR_HF_TOKEN") # 在本地运行时取消注释并替换
-    # 在Hugging Face Spaces上运行时，端口通常由平台管理
-    # share=True 会创建一个公共链接 (如果在本地运行需要)
-    demo.launch(share=False)

 import torch
 from transformers import pipeline, set_seed
 from diffusers import StableDiffusionPipeline
+import openai
 import os
 import time
+import traceback # For detailed error logging
+# ---- Configuration & API Key ----
+# Read the OpenAI API key from the OPENAI_API_KEY environment variable
+# (on Hugging Face Spaces this is expected to come from the Space's secrets).
+api_key = os.environ.get("OPENAI_API_KEY")
+# Client handle and availability flag read later by enhance_prompt_openai;
+# both keep these "disabled" defaults unless the initialization below succeeds.
+openai_client = None
+openai_available = False
+if api_key:
+    try:
+        openai.api_key = api_key
+        # Starting with openai v1, client instantiation is preferred
+        openai_client = openai.OpenAI(api_key=api_key)
+        # Simple test to check if the key is valid (optional, but good)
+        # openai_client.models.list() # This call might incur small cost/quota usage
+        # NOTE: with the check above disabled, this flag only means the client
+        # object was constructed; the key itself has not been validated.
+        openai_available = True
+        print("OpenAI API key found and client initialized.")
+    except Exception as e:
+        # Best-effort: a failed client setup disables enhancement, it does not abort startup.
+        print(f"Error initializing OpenAI client: {e}")
+        print("Proceeding without OpenAI features.")
+else:
+    print("WARNING: OPENAI_API_KEY secret not found. Prompt enhancement via OpenAI is disabled.")
+# Force CPU usage (no CUDA detection is performed; every model below is placed on this device)
+device = "cpu"
 print(f"Using device: {device}")
+# ---- Model Loading (CPU Focused) ----
 # 1. Speech-to-text model (Whisper) - bonus feature
 # Stays None when loading fails; the rest of the file treats a falsy value as "voice input disabled".
 asr_pipeline = None
 try:
+    print("Loading ASR pipeline (Whisper) on CPU...")
+    # Force CPU usage with device=-1 or device="cpu"
+    asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
+    print("ASR pipeline loaded successfully on CPU.")
 except Exception as e:
     # Best-effort load: report the failure and continue without voice input.
     print(f"Could not load ASR pipeline: {e}. Voice input will be disabled.")
+    traceback.print_exc() # Print full traceback for debugging
+# 2. 文本到图像模型 (Stable Diffusion) - Step 2 (CPU)
 image_generator_pipe = None
 try:
+    print("Loading Stable Diffusion pipeline (v1.5) on CPU...")
+    print("WARNING: Stable Diffusion on CPU is VERY SLOW (expect minutes per image).")
     model_id = "runwayml/stable-diffusion-v1-5"
+    # Use float32 for CPU
+    image_generator_pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float32)
     image_generator_pipe = image_generator_pipe.to(device)
+    print("Stable Diffusion pipeline loaded successfully on CPU.")
 except Exception as e:
+    print(f"CRITICAL: Could not load Stable Diffusion pipeline: {e}. Image generation will fail.")
+    traceback.print_exc() # Print full traceback for debugging
+    # Define a dummy object to prevent crashes later if loading failed
     class DummyPipe:
         def __call__(self, *args, **kwargs):
+             raise RuntimeError(f"Stable Diffusion model failed to load: {e}")
     image_generator_pipe = DummyPipe()
+# ---- Core Function Definitions ----
+# Step 1: Prompt-to-Prompt (using OpenAI API)
+def enhance_prompt_openai(short_prompt, style_modifier="cinematic", quality_boost="photorealistic, highly detailed"):
+    """Uses OpenAI API to enhance the short description."""
+    if not openai_available or not openai_client:
+        # Fallback or error if OpenAI key is missing/invalid
+        print("OpenAI not available. Returning original prompt with modifiers.")
+        return f"{short_prompt}, {style_modifier}, {quality_boost}"
     if not short_prompt:
+        # Return an error message formatted for Gradio output
+        raise gr.Error("Input description cannot be empty.")
+    # Construct the prompt for the OpenAI model
+    system_message = (
+        "You are an expert prompt engineer for AI image generation models like Stable Diffusion. "
+        "Expand the user's short description into a detailed, vivid, and coherent prompt. "
+        "Focus on visual details: subjects, objects, environment, lighting, atmosphere, composition. "
+        "Incorporate the requested style and quality keywords naturally. Avoid conversational text."
     )
+    user_message = (
+        f"Enhance this description: \"{short_prompt}\". "
+        f"Style: '{style_modifier}'. Quality: '{quality_boost}'."
+    )
+    print(f"Sending request to OpenAI for prompt enhancement: {short_prompt}")
     try:
+        response = openai_client.chat.completions.create(
+            model="gpt-3.5-turbo", # Cost-effective choice, can use gpt-4 if needed/key allows
+            messages=[
+                {"role": "system", "content": system_message},
+                {"role": "user", "content": user_message},
+            ],
+            temperature=0.7, # Controls creativity vs predictability
+            max_tokens=150, # Limit output length
+            n=1, # Generate one response
+            stop=None # Let the model decide when to stop
         )
+        enhanced_prompt = response.choices[0].message.content.strip()
+        print("OpenAI enhancement successful.")
+        # Basic cleanup: remove potential quotes around the whole response
+        if enhanced_prompt.startswith('"') and enhanced_prompt.endswith('"'):
+            enhanced_prompt = enhanced_prompt[1:-1]
+        return enhanced_prompt
+    except openai.AuthenticationError:
+        print("OpenAI Authentication Error: Invalid API key?")
+        raise gr.Error("OpenAI Authentication Error: Check your API key.")
+    except openai.RateLimitError:
+         print("OpenAI Rate Limit Error: You've exceeded your quota or rate limit.")
+         raise gr.Error("OpenAI Error: Rate limit exceeded.")
+    except openai.APIError as e:
+        print(f"OpenAI API Error: {e}")
+        raise gr.Error(f"OpenAI API Error: {e}")
     except Exception as e:
+        print(f"An unexpected error occurred during OpenAI call: {e}")
+        traceback.print_exc()
+        raise gr.Error(f"Prompt enhancement failed: {e}")
+# Step 2: Prompt-to-Image (CPU)
+def generate_image_cpu(prompt, negative_prompt, guidance_scale, num_inference_steps):
+    """Generates image using Stable Diffusion on CPU."""
     if not isinstance(image_generator_pipe, StableDiffusionPipeline):
+         raise gr.Error("Stable Diffusion model is not available (failed to load).")
+    if not prompt or "[Error:" in prompt or "Error:" in prompt:
+        # Check if the prompt itself is an error message from the previous step
         raise gr.Error("Cannot generate image due to invalid or missing prompt.")
+    print(f"Generating image on CPU for prompt: {prompt[:100]}...") # Log truncated prompt
     print(f"Negative prompt: {negative_prompt}")
     print(f"Guidance scale: {guidance_scale}, Steps: {num_inference_steps}")
+    start_time = time.time()
     try:
+        # Use torch.inference_mode() or torch.no_grad() for efficiency
+        with torch.no_grad():
+             # Seed for reproducibility (optional, but good practice)
+             generator = torch.Generator(device=device).manual_seed(int(time.time()))
+             image = image_generator_pipe(
+                 prompt=prompt,
+                 negative_prompt=negative_prompt,
+                 guidance_scale=float(guidance_scale),
+                 num_inference_steps=int(num_inference_steps),
+                 generator=generator,
+             ).images[0]
+        end_time = time.time()
+        print(f"Image generated successfully on CPU in {end_time - start_time:.2f} seconds.")
         return image
     except Exception as e:
+        print(f"Error during image generation on CPU: {e}")
+        traceback.print_exc()
+        # Propagate error to Gradio UI
+        raise gr.Error(f"Image generation failed on CPU: {e}")
+# Bonus: Voice-to-Text (CPU)
 def transcribe_audio(audio_file_path):
+    """Transcribe an audio file to text with the Whisper ASR pipeline.
+
+    Failures are reported in-band rather than raised: the text element is an
+    "[Error: ...]" string when the model is not loaded or transcription fails.
+
+    Args:
+        audio_file_path: Path of the audio file to transcribe, or None.
+
+    Returns:
+        A (text, audio_file_path) tuple. The text is "" when no audio path
+        was given; the path is always returned unchanged.
+    """
     if not asr_pipeline:
+        # This case should ideally be handled by hiding the control, but double-check
+        return "[Error: ASR model not loaded]", audio_file_path
     if audio_file_path is None:
+        return "", audio_file_path # No audio input
+    print(f"Transcribing audio file: {audio_file_path} on CPU...")
+    start_time = time.time()
     try:
+        # Ensure the pipeline uses the correct device (should be CPU based on loading)
         transcription = asr_pipeline(audio_file_path)["text"]
+        end_time = time.time()
+        print(f"Transcription successful in {end_time - start_time:.2f} seconds.")
         print(f"Transcription result: {transcription}")
+        return transcription, audio_file_path
     except Exception as e:
+        print(f"Error during audio transcription on CPU: {e}")
+        traceback.print_exc()
+        # Return error message in the expected tuple format
         return f"[Error: Transcription failed: {e}]", audio_file_path
+# ---- Gradio Application Flow ----
 def process_input(input_text, audio_file, style_choice, quality_choice, neg_prompt, guidance, steps):
+    """Run the full text/voice -> prompt -> image flow for the Generate button.
+
+    Typed text takes priority; audio is transcribed only when the text box is
+    empty. Errors are not raised to the caller: they are returned as a
+    bracketed message in the first element of the result.
+
+    Args:
+        input_text: Contents of the description text box.
+        audio_file: Audio input handed to transcribe_audio, or None.
+        style_choice: Style keyword forwarded to enhance_prompt_openai.
+        quality_choice: Quality keyword forwarded to enhance_prompt_openai.
+        neg_prompt: Negative prompt forwarded to generate_image_cpu.
+        guidance: Guidance scale forwarded to generate_image_cpu.
+        steps: Inference step count forwarded to generate_image_cpu.
+
+    Returns:
+        A (status_message, generated_image) tuple: the enhanced prompt or an
+        error message, and the image or None when no image was produced.
+    """
     final_text_input = ""
+    enhanced_prompt = ""
+    generated_image = None
+    status_message = "" # To gather status/errors for the prompt box
+    # 1. Determine Input (Text or Audio)
     if input_text and input_text.strip():
         final_text_input = input_text.strip()
+        print(f"Using text input: '{final_text_input}'")
     elif audio_file is not None:
+        print("Processing audio input...")
         transcribed_text, _ = transcribe_audio(audio_file)
+        if "[Error:" in transcribed_text:
+            # Display transcription error clearly
+            status_message = transcribed_text
+            print(status_message)
+            # Return error in prompt field, no image
+            return status_message, None
+        elif transcribed_text:
             final_text_input = transcribed_text
+            print(f"Using transcribed audio input: '{final_text_input}'")
         else:
+            status_message = "[Error: Audio input received but transcription was empty.]"
+            print(status_message)
+            return status_message, None # Return error
     else:
+        status_message = "[Error: No input provided. Please enter text or record audio.]"
+        print(status_message)
+        return status_message, None # Return error
+    # 2. Enhance Prompt (using OpenAI if available)
+    # final_text_input is always non-empty here: every branch above that leaves it empty returns early.
+    if final_text_input:
         try:
+            enhanced_prompt = enhance_prompt_openai(final_text_input, style_choice, quality_choice)
+            status_message = enhanced_prompt # Display the prompt
+            print(f"Enhanced prompt: {enhanced_prompt}")
         except gr.Error as e:
+            # Catch Gradio-specific errors from enhancement function
+            status_message = f"[Prompt Enhancement Error: {e}]"
+            print(status_message)
+            # Return the error, no image generation attempt
+            return status_message, None
         except Exception as e:
+             # Catch any other unexpected errors
+             status_message = f"[Unexpected Prompt Enhancement Error: {e}]"
+             print(status_message)
+             traceback.print_exc()
+             return status_message, None
+    # 3. Generate Image (if prompt is valid)
+    # status_message equals enhanced_prompt at this point, so the prefix checks guard
+    # against a prompt that itself begins with one of the error markers.
+    if enhanced_prompt and not status_message.startswith("[Error:") and not status_message.startswith("[Prompt Enhancement Error:"):
+        try:
+            # Show "Generating..." message while waiting
+            gr.Info("Starting image generation on CPU... This will take a while (possibly several minutes).")
+            generated_image = generate_image_cpu(enhanced_prompt, neg_prompt, guidance, steps)
+            gr.Info("Image generation complete!")
+        except gr.Error as e:
+            # Catch Gradio errors from generation function
+            status_message = f"{enhanced_prompt}\n\n[Image Generation Error: {e}]" # Append error to prompt
+            print(f"Image Generation Error: {e}")
+        except Exception as e:
+             status_message = f"{enhanced_prompt}\n\n[Unexpected Image Generation Error: {e}]"
+             print(f"Unexpected Image Generation Error: {e}")
+             traceback.print_exc()
+             # Set image to None explicitly on error
+             generated_image = None
+    # 4. Return results to Gradio UI
+    # Return the status message (enhanced prompt or error) and the image (or None if error)
+    return status_message, generated_image
+# ---- Gradio Interface Construction ----
+# Choices offered by the style Dropdown and the quality Radio below.
+style_options = ["cinematic", "photorealistic", "anime", "fantasy art", "cyberpunk", "steampunk", "watercolor", "illustration", "low poly"]
+quality_options = ["highly detailed", "sharp focus", "intricate details", "4k", "masterpiece", "best quality", "professional lighting"]
+# Reduced steps for faster CPU generation attempt
+default_steps = 20
+max_steps = 50 # Limit max steps on CPU
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# AI Image Generator (CPU Version)")
+    gr.Markdown(
+        "**Enter a short description or use voice input.** The app uses OpenAI (if API key is provided) "
+        "to create a detailed prompt, then generates an image using Stable Diffusion v1.5 **on the CPU**."
+    )
+    # Add specific warning about CPU speed
+    gr.HTML("<p style='color:orange;font-weight:bold;'>⚠️ Warning: Image generation on CPU is very slow! Expect several minutes per image.</p>")
+    # Display OpenAI availability status
+    if not openai_available:
+        gr.Markdown("**Note:** OpenAI API key not found or invalid. Prompt enhancement will use a basic fallback.")
     with gr.Row():
         with gr.Column(scale=1):
+            # --- Inputs ---
+            inp_text = gr.Textbox(label="Enter short description", placeholder="e.g., A cute robot drinking coffee on Mars")
+            # Only show Audio input if ASR model loaded successfully
+            if asr_pipeline:
+                inp_audio = gr.Audio(sources=["microphone"], type="filepath", label="Or record your idea (clears text box if used)")
+            else:
+                gr.Markdown("**Voice input disabled:** Whisper model failed to load.")
+                # NOTE: this placeholder is not wired into the click handler; a gr.State(None) is passed in its place below.
+                inp_audio = gr.Textbox(visible=False) # Hidden placeholder
+            # --- Controls (Step 3 requirements met) ---
+            # Control 1: Dropdown
+            inp_style = gr.Dropdown(label="Base Style", choices=style_options, value="cinematic")
+            # Control 2: Radio
             inp_quality = gr.Radio(label="Quality Boost", choices=quality_options, value="highly detailed")
+            # Control 3: Textbox (Negative Prompt)
+            inp_neg_prompt = gr.Textbox(label="Negative Prompt (optional)", placeholder="e.g., blurry, low quality, text, watermark, signature, deformed")
+            # Control 4: Slider (Guidance Scale)
+            inp_guidance = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, value=7.0, label="Guidance Scale (CFG)") # Slightly lower max maybe better for CPU
+            # Control 5: Slider (Inference Steps) - Reduced max/default
+            inp_steps = gr.Slider(minimum=10, maximum=max_steps, step=1, value=default_steps, label=f"Inference Steps (lower = faster but less detail, max {max_steps})")
+            # --- Action Button ---
             btn_generate = gr.Button("Generate Image", variant="primary")
         with gr.Column(scale=1):
+            # --- Outputs ---
+            out_prompt = gr.Textbox(label="Generated Prompt / Status", interactive=False, lines=5) # Show prompt or error status here
+            out_image = gr.Image(label="Generated Image", type="pil")
+    # --- Event Handling ---
+    # Define inputs list carefully, handling potentially invisible audio input.
+    # The order must match process_input's parameters: text, audio, style, quality, negative prompt, guidance, steps.
+    inputs_list = [inp_text]
+    if asr_pipeline:
+        inputs_list.append(inp_audio)
+    else:
+         inputs_list.append(gr.State(None)) # Pass None if audio control doesn't exist
+    inputs_list.extend([inp_style, inp_quality, inp_neg_prompt, inp_guidance, inp_steps])
     btn_generate.click(
         fn=process_input,
+        inputs=inputs_list,
         outputs=[out_prompt, out_image]
     )
+    # Clear text input if audio is used
     if asr_pipeline:
         def clear_text_on_audio(audio_data):
             if audio_data is not None:
+                return "" # Clear text box
+            return gr.update() # No change if no audio data
         inp_audio.change(fn=clear_text_on_audio, inputs=inp_audio, outputs=inp_text)
+# ---- Application Launch ----
 if __name__ == "__main__":
+    # Check again if SD loaded, maybe prevent launch? Or let it run and fail gracefully in UI.
+    # A non-StableDiffusionPipeline value here means the load above fell back to the DummyPipe placeholder.
+    if not isinstance(image_generator_pipe, StableDiffusionPipeline):
+        print("CRITICAL FAILURE: Stable Diffusion pipeline did not load. The application UI will load, but image generation WILL NOT WORK.")
+        # Optionally, you could raise an error here to stop the script if SD is essential
+        # raise RuntimeError("Failed to load Stable Diffusion pipeline, cannot start application.")
+    # Launch the Gradio app
+    demo.launch(share=False) # share=True generates a public link if run locally