Spaces:

start3406
/

work

Sleeping

App Files Files Community

start3406 commited on Apr 19

Commit

f8eb849

verified ·

1 Parent(s): 2b1fd8f

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -56

app.py CHANGED Viewed

@@ -46,13 +46,13 @@ except Exception as e:
     print(f"Could not load ASR pipeline: {e}. Voice input will be disabled.")
     traceback.print_exc() # Print full traceback for debugging
-# 2. 文本到图像模型 (Tiny Text-to-Image) - 资源友好模型
 image_generator_pipe = None
-# 使用资源需求极低的 Tiny Text-to-Image 模型
-model_id = "hf-internal-testing/tiny-text-to-image"
 try:
     print(f"Loading Text-to-Image pipeline ({model_id}) on CPU...")
-    print("NOTE: Using a very small model for resource efficiency. Image quality will be lower than Stable Diffusion.")
     # 使用 AutoPipelineForText2Image 自动识别模型类型
     image_generator_pipe = AutoPipelineForText2Image.from_pretrained(model_id, torch_dtype=torch.float32)
     image_generator_pipe = image_generator_pipe.to(device)
@@ -155,7 +155,7 @@ def generate_image_cpu(prompt, negative_prompt, guidance_scale, num_inference_st
     print(f"Generating image on CPU for prompt: {prompt[:100]}...") # Log truncated prompt
     # Note: Negative prompt and guidance scale might have less impact or behave differently
-    # on very small models like tiny-text-to-image.
     print(f"Negative prompt: {negative_prompt}") # Will likely be ignored by tiny model
     print(f"Guidance scale: {guidance_scale}, Steps: {num_inference_steps}") # Steps might be fixed internally by tiny model
@@ -166,23 +166,25 @@ def generate_image_cpu(prompt, negative_prompt, guidance_scale, num_inference_st
         with torch.no_grad():
              # Seed for reproducibility (optional, but good practice)
              # generator = torch.Generator(device=device).manual_seed(int(time.time())) # Tiny model might not use generator param
-             # Tiny Text-to-Image pipeline call structure might be simpler
-             # Check model specific documentation if parameters like guidance_scale, num_inference_steps, negative_prompt
-             # are actually supported. They might be ignored.
-             # Using a simple call that is generally compatible
-             output = image_generator_pipe(prompt=prompt) # Tiny model might only take prompt
-             # The output structure varies between pipelines, assuming it has .images
-             # if hasattr(output, 'images') and isinstance(output.images, list) and len(output.images) > 0:
-             #    image = output.images[0] # Access the first image
-             # else:
-             #    # Handle cases where output format is different
-             #    print("Warning: Pipeline output format unexpected. Assuming the output itself is the image.")
-             #    image = output # Assume output is the image if no .images
-             # Based on tiny-text-to-image, the output is likely a tuple where the first element is a list of images
-             image = output[0][0] # Access the first image in the first list of the tuple output structure
         end_time = time.time()
         print(f"Image generated successfully on CPU in {end_time - start_time:.2f} seconds (using {model_id}).")
@@ -208,7 +210,19 @@ def transcribe_audio(audio_file_path):
     try:
         # Ensure the pipeline uses the correct device (should be CPU based on loading)
         # Ensure input is in expected format for Whisper pipeline (filepath or audio array)
-        transcription = asr_pipeline(audio_file_path)["text"]
         end_time = time.time()
         print(f"Transcription successful in {end_time - start_time:.2f} seconds.")
         print(f"Transcription result: {transcription}")
@@ -236,17 +250,8 @@ def process_input(input_text, audio_file, style_choice, quality_choice, neg_prom
     elif audio_file is not None:
         print("Processing audio input...")
         try:
-            # Gradio might pass a tuple (samplerate, audio_data) or a filepath depending on type="filepath" vs "numpy"
-            # transcribe_audio expects a filepath based on the Gradio component config
-            if isinstance(audio_file, tuple):
-                 # If Gradio gives tuple for some reason, try to save to temp file or adjust transcribe_audio
-                 # Assuming type="filepath" works as expected and passes filepath
-                 audio_filepath_to_transcribe = audio_file[0] # This might be incorrect depending on Gradio version/config
-                 print(f"Warning: Gradio audio input was tuple, attempting to use first element as path: {audio_filepath_to_transcribe}")
-            else:
-                 audio_filepath_to_transcribe = audio_file # This is expected for type="filepath"
-            transcribed_text, _ = transcribe_audio(audio_filepath_to_transcribe)
             if "[Error:" in transcribed_text:
                 # Display transcription error clearly
@@ -295,7 +300,7 @@ def process_input(input_text, audio_file, style_choice, quality_choice, neg_prom
     if enhanced_prompt and not status_message.startswith("[Error:") and not status_message.startswith("[Prompt Enhancement Error:"):
         try:
             # Show "Generating..." message while waiting
-            gr.Info(f"Starting image generation on CPU using {model_id}. This should be fast but quality is low.")
             generated_image = generate_image_cpu(enhanced_prompt, neg_prompt, guidance, steps)
             gr.Info("Image generation complete!")
         except gr.Error as e:
@@ -327,19 +332,21 @@ def process_input(input_text, audio_file, style_choice, quality_choice, neg_prom
 style_options = ["cinematic", "photorealistic", "anime", "fantasy art", "cyberpunk", "steampunk", "watercolor", "illustration", "low poly"]
 quality_options = ["highly detailed", "sharp focus", "intricate details", "4k", "masterpiece", "best quality", "professional lighting"]
-# Tiny model is very fast, steps/guidance might be ignored or have less effect
-# Keep sliders but note their limited impact on this specific model
-default_steps = 10 # Tiny model often uses few steps internally
-max_steps = 20    # Limit max steps as they might not matter much
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# AI Image Generator (Resource-Friendly CPU Version)")
     gr.Markdown(
         "**Enter a short description or use voice input.** The app uses OpenAI (if API key is provided) "
-        f"to create a detailed prompt, then generates an image using a **small, fast model ({model_id}) on the CPU**."
     )
-    # Add specific warning about image quality for the tiny model
-    gr.HTML("<p style='color:orange;font-weight:bold;'>⚠️ Note: Using a small model for compatibility. Image quality and resolution will be significantly lower than models like Stable Diffusion.</p>")
     # Display OpenAI availability status
     if not openai_available:
@@ -347,10 +354,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     else:
          gr.Markdown("**Note:** OpenAI API key found. Prompt will be enhanced using OpenAI.")
     # Display Model loading status
     if not isinstance(image_generator_pipe, AutoPipelineForText2Image):
-         gr.Markdown(f"**CRITICAL:** Image generation model ({model_id}) failed to load. Image generation is disabled. Check logs.")
     with gr.Row():
@@ -366,19 +373,19 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 # Using gr.State as a placeholder that holds None
                 inp_audio = gr.State(None)
-            # --- Controls (Step 3 requirements met) ---
-            # Note: These controls might have limited effect on the small model
-            gr.Markdown("*(Optional controls - Note: These may have limited or no effect on the small model used)*")
             # Control 1: Dropdown
-            inp_style = gr.Dropdown(label="Base Style", choices=style_options, value="cinematic", interactive=True)
             # Control 2: Radio
-            inp_quality = gr.Radio(label="Quality Boost", choices=quality_options, value="highly detailed", interactive=True)
             # Control 3: Textbox (Negative Prompt)
-            inp_neg_prompt = gr.Textbox(label="Negative Prompt (optional)", placeholder="e.g., blurry, low quality, text, watermark", interactive=True)
             # Control 4: Slider (Guidance Scale)
-            inp_guidance = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, value=3.0, label="Guidance Scale (CFG)", interactive=True) # Lower default for small model
-            # Control 5: Slider (Inference Steps) - Reduced max/default
-            inp_steps = gr.Slider(minimum=1, maximum=max_steps, step=1, value=default_steps, label=f"Inference Steps (lower = faster but less detail, max {max_steps})", interactive=True)
             # --- Action Button ---
             # Disable button if model failed to load
@@ -397,7 +404,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     else:
          inputs_list.append(inp_audio) # Pass the gr.State(None) placeholder
     inputs_list.extend([inp_style, inp_quality, inp_neg_prompt, inp_guidance, inp_steps])
     # Link button click to processing function
@@ -424,12 +430,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 # ---- Application Launch ----
 if __name__ == "__main__":
     # Final check before launch
     if not isinstance(image_generator_pipe, AutoPipelineForText2Image):
         print("\n" + "="*50)
         print("CRITICAL WARNING:")
         print(f"Image generation model ({model_id}) failed to load during startup.")
         print("The Gradio UI will launch, but the 'Generate Image' button will be disabled.")
-        print("Check the logs above for the specific model loading error.")
         print("="*50 + "\n")

     print(f"Could not load ASR pipeline: {e}. Voice input will be disabled.")
     traceback.print_exc() # Print full traceback for debugging
+# 2. 文本到图像模型 (nota-ai/bk-sdm-tiny) - 资源友好模型
 image_generator_pipe = None
+# 使用 nota-ai/bk-sdm-tiny 模型
+model_id = "nota-ai/bk-sdm-tiny"
 try:
     print(f"Loading Text-to-Image pipeline ({model_id}) on CPU...")
+    print("NOTE: Using a small model for resource efficiency. Image quality and details may differ from larger models.")
     # 使用 AutoPipelineForText2Image 自动识别模型类型
     image_generator_pipe = AutoPipelineForText2Image.from_pretrained(model_id, torch_dtype=torch.float32)
     image_generator_pipe = image_generator_pipe.to(device)
     print(f"Generating image on CPU for prompt: {prompt[:100]}...") # Log truncated prompt
     # Note: Negative prompt and guidance scale might have less impact or behave differently
+    # on very small models.
     print(f"Negative prompt: {negative_prompt}") # Will likely be ignored by tiny model
     print(f"Guidance scale: {guidance_scale}, Steps: {num_inference_steps}") # Steps might be fixed internally by tiny model
         with torch.no_grad():
              # Seed for reproducibility (optional, but good practice)
              # generator = torch.Generator(device=device).manual_seed(int(time.time())) # Tiny model might not use generator param
+             # Call the pipeline - assuming standard parameters are accepted
+             output = image_generator_pipe(
+                 prompt=prompt,
+                 # It's possible tiny models ignore some parameters, but passing them is safer
+                 negative_prompt=negative_prompt,
+                 guidance_scale=float(guidance_scale),
+                 num_inference_steps=int(num_inference_steps),
+                 # generator=generator, # Omit if tiny model pipeline doesn't accept it
+                 # height and width might need to be specified or limited for tiny models
+                 # height=..., width=...
+             )
+             # Access the generated image(s). Assuming standard diffusers output structure (.images[0])
+             if hasattr(output, 'images') and isinstance(output.images, list) and len(output.images) > 0:
+                image = output.images[0] # Access the first image
+             else:
+                # Handle cases where output format is different (less common for AutoPipelines)
+                print("Warning: Pipeline output format unexpected. Attempting to use the output directly.")
+                image = output # Assume output is the image
         end_time = time.time()
         print(f"Image generated successfully on CPU in {end_time - start_time:.2f} seconds (using {model_id}).")
     try:
         # Ensure the pipeline uses the correct device (should be CPU based on loading)
         # Ensure input is in expected format for Whisper pipeline (filepath or audio array)
+        if isinstance(audio_file_path, tuple): # Handle case where Gradio might pass tuple
+             # Assuming tuple is (samplerate, numpy_array), need to save to temp file or process directly
+             # For simplicity with type="filepath", assume it passes path directly
+             print("Warning: Audio input was tuple, expecting filepath. This might fail.")
+             # Attempting to process numpy array if it's the second element
+             if isinstance(audio_file_path[1], torch.Tensor) or isinstance(audio_file_path[1], list) or isinstance(audio_file_path[1], (int, float)):
+                  # This path is complex, sticking to filepath assumption for now
+                  pass # Let the pipeline call below handle potential error
+             audio_input_for_pipeline = audio_file_path # Pass original tuple, let pipeline handle
+        else:
+             audio_input_for_pipeline = audio_file_path # Expected filepath
+        transcription = asr_pipeline(audio_input_for_pipeline)["text"]
         end_time = time.time()
         print(f"Transcription successful in {end_time - start_time:.2f} seconds.")
         print(f"Transcription result: {transcription}")
     elif audio_file is not None:
         print("Processing audio input...")
         try:
+            # transcribe_audio handles different Gradio audio output types potentially
+            transcribed_text, _ = transcribe_audio(audio_file)
             if "[Error:" in transcribed_text:
                 # Display transcription error clearly
     if enhanced_prompt and not status_message.startswith("[Error:") and not status_message.startswith("[Prompt Enhancement Error:"):
         try:
             # Show "Generating..." message while waiting
+            gr.Info(f"Starting image generation on CPU using {model_id}. This should be faster than full SD, but might still take time.")
             generated_image = generate_image_cpu(enhanced_prompt, neg_prompt, guidance, steps)
             gr.Info("Image generation complete!")
         except gr.Error as e:
 style_options = ["cinematic", "photorealistic", "anime", "fantasy art", "cyberpunk", "steampunk", "watercolor", "illustration", "low poly"]
 quality_options = ["highly detailed", "sharp focus", "intricate details", "4k", "masterpiece", "best quality", "professional lighting"]
+# Adjust steps/guidance defaults for a smaller model, still might be ignored by some pipelines
+default_steps = 20
+max_steps = 40 # Adjusted max steps
+default_guidance = 5.0 # Adjusted default guidance
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# AI Image Generator (CPU Version - Using Small Model)")
     gr.Markdown(
         "**Enter a short description or use voice input.** The app uses OpenAI (if API key is provided) "
+        f"to create a detailed prompt, then generates an image using a **small model ({model_id}) on the CPU**."
     )
+    # Add specific warning about CPU speed and potential resource issues for this specific model
+    gr.HTML("<p style='color:orange;font-weight:bold;'>⚠️ Note: Using a small model for better compatibility on CPU. Generation should be faster than full Stable Diffusion, but quality/details may differ.</p>")
+    gr.HTML("<p style='color:red;font-weight:bold;'>⏰ CPU generation can still take 1-5 minutes per image depending on load and model specifics.</p>")
     # Display OpenAI availability status
     if not openai_available:
     else:
          gr.Markdown("**Note:** OpenAI API key found. Prompt will be enhanced using OpenAI.")
     # Display Model loading status
+    # Check against AutoPipelineForText2Image type
     if not isinstance(image_generator_pipe, AutoPipelineForText2Image):
+         gr.Markdown(f"**CRITICAL:** Image generation model ({model_id}) failed to load. Image generation is disabled. Check Space logs for details.")
     with gr.Row():
                 # Using gr.State as a placeholder that holds None
                 inp_audio = gr.State(None)
+            # --- Controls ---
+            # Note: These controls might have less impact than on larger models
+            gr.Markdown("*(Optional controls - Note: Their impact might vary on this small model)*")
             # Control 1: Dropdown
+            inp_style = gr.Dropdown(label="Base Style", choices=style_options, value="cinematic")
             # Control 2: Radio
+            inp_quality = gr.Radio(label="Quality Boost", choices=quality_options, value="highly detailed")
             # Control 3: Textbox (Negative Prompt)
+            inp_neg_prompt = gr.Textbox(label="Negative Prompt (optional)", placeholder="e.g., blurry, low quality, text, watermark, signature, deformed")
             # Control 4: Slider (Guidance Scale)
+            inp_guidance = gr.Slider(minimum=1.0, maximum=10.0, step=0.5, value=default_guidance, label="Guidance Scale (CFG)") # Lower max guidance
+            # Control 5: Slider (Inference Steps) - Adjusted max/default
+            inp_steps = gr.Slider(minimum=5, maximum=max_steps, step=1, value=default_steps, label=f"Inference Steps (lower = faster but less detail, max {max_steps})") # Lower min steps
             # --- Action Button ---
             # Disable button if model failed to load
     else:
          inputs_list.append(inp_audio) # Pass the gr.State(None) placeholder
     inputs_list.extend([inp_style, inp_quality, inp_neg_prompt, inp_guidance, inp_steps])
     # Link button click to processing function
 # ---- Application Launch ----
 if __name__ == "__main__":
     # Final check before launch
+    # Check against AutoPipelineForText2Image type
     if not isinstance(image_generator_pipe, AutoPipelineForText2Image):
         print("\n" + "="*50)
         print("CRITICAL WARNING:")
         print(f"Image generation model ({model_id}) failed to load during startup.")
         print("The Gradio UI will launch, but the 'Generate Image' button will be disabled.")
+        print("Check the Space logs above for the specific model loading error.")
         print("="*50 + "\n")