programmersd committed on
Commit 24ee00a · verified · 1 Parent(s): f7912b7

Update app.py

Files changed (1)
  1. app.py +236 -109
app.py CHANGED
@@ -6,10 +6,12 @@ import torch
  import gradio as gr

  # =====================================================
- # 🔥 EXTREME CPU + RAM CONTROL
  # =====================================================

- CPU_THREADS = 2 # Ultra survival safe value

  os.environ["CUDA_VISIBLE_DEVICES"] = ""
  os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
@@ -17,139 +19,225 @@ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
  os.environ["OMP_NUM_THREADS"] = str(CPU_THREADS)
  os.environ["MKL_NUM_THREADS"] = str(CPU_THREADS)
  os.environ["TOKENIZERS_PARALLELISM"] = "false"

  torch.set_num_threads(CPU_THREADS)
  torch.set_grad_enabled(False)

  DEVICE = "cpu"
- DTYPE = torch.float32
  CACHE_DIR = "./hf_cache"
  os.makedirs(CACHE_DIR, exist_ok=True)

  # =====================================================
- # 📦 IMPORTS
  # =====================================================

- from huggingface_hub import hf_hub_download
- from diffusers import (
-     ZImagePipeline,
-     ZImageTransformer2DModel,
-     GGUFQuantizationConfig,
-     AutoencoderKL,
-     FlowMatchEulerDiscreteScheduler
- )
- from transformers import AutoTokenizer, AutoModel

  # =====================================================
- # 🧠 MODEL REFERENCES
  # =====================================================

- BASE_MODEL_ID = "Tongyi-MAI/Z-Image-Turbo"
- TEXT_ENCODER_ID = "Qwen/Qwen3-4B"
- GGUF_REPO_ID = "unsloth/Z-Image-Turbo-GGUF"
- GGUF_FILENAME = "z-image-turbo-Q2_K.gguf"
-
- print("⚡ Initializing Z-Image Turbo ULTRA CPU Engine...")

  # =====================================================
- # 🧠 LOAD PIPELINE (MEMORY SAFE)
  # =====================================================

- def load_pipeline():
-
-     scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
-         BASE_MODEL_ID,
-         subfolder="scheduler",
-         cache_dir=CACHE_DIR,
-         low_cpu_mem_usage=True
-     )
-
-     vae = AutoencoderKL.from_pretrained(
-         BASE_MODEL_ID,
-         subfolder="vae",
-         torch_dtype=DTYPE,
-         low_cpu_mem_usage=True,
-         cache_dir=CACHE_DIR
-     )

-     tokenizer = AutoTokenizer.from_pretrained(
-         TEXT_ENCODER_ID,
-         cache_dir=CACHE_DIR
-     )

-     text_encoder = AutoModel.from_pretrained(
-         TEXT_ENCODER_ID,
-         torch_dtype=DTYPE,
-         low_cpu_mem_usage=True,
-         cache_dir=CACHE_DIR
-     )

-     gguf_path = hf_hub_download(
-         repo_id=GGUF_REPO_ID,
-         filename=GGUF_FILENAME,
-         cache_dir=CACHE_DIR,
-         resume_download=True
-     )

-     transformer = ZImageTransformer2DModel.from_single_file(
-         gguf_path,
-         quantization_config=GGUFQuantizationConfig(compute_dtype=DTYPE),
-         torch_dtype=DTYPE,
-         low_cpu_mem_usage=True
-     )

-     pipe = ZImagePipeline(
-         vae=vae,
-         text_encoder=text_encoder,
-         tokenizer=tokenizer,
-         transformer=transformer,
-         scheduler=scheduler
-     ).to(DEVICE)

-     # 🔥 MAX SAFE MEMORY STACK
-     pipe.enable_attention_slicing()
-     pipe.enable_vae_slicing()
-     pipe.enable_vae_tiling()
-     pipe.set_progress_bar_config(disable=True)

-     print("✅ Engine Ready")
-     return pipe


- pipe = load_pipeline()

  # =====================================================
- # 🚀 GENERATION CORE WITH ETA
  # =====================================================

  @torch.inference_mode()
  def generate(prompt, width, height, steps, seed, progress=gr.Progress()):

-     if not prompt:
-         raise gr.Error("Prompt required")

-     # HARD OOM PROTECTION
-     width = max(256, min(width, 640))
-     height = max(256, min(height, 640))
-     steps = max(1, min(steps, 6))

-     if seed < 0:
          seed = random.randint(0, 2**31 - 1)

-     generator = torch.Generator(device=DEVICE).manual_seed(seed)
-
-     start_time = time.time()
-
-     def callback(step, timestep, latents=None):
-         elapsed = time.time() - start_time
-         avg = elapsed / (step + 1)
-         remaining = avg * (steps - step - 1)
-         progress(
-             (step + 1) / steps,
-             desc=f"Step {step+1}/{steps} | ETA: {remaining:.1f}s"
-         )

      try:
          result = pipe(
              prompt=prompt,
              negative_prompt=None,
@@ -159,37 +247,59 @@ def generate(prompt, width, height, steps, seed, progress=gr.Progress()):
              guidance_scale=1.0,
              generator=generator,
              callback=callback,
-             callback_steps=1
          )

          image = result.images[0]
          gc.collect()
          return image, seed

      except Exception as e:
          gc.collect()
-         raise gr.Error(f"Generation error: {e}")

  # =====================================================
- # 🎛 UI
  # =====================================================

- with gr.Blocks(title="Z-Image Turbo ULTRA CPU") as demo:
-     gr.Markdown("# ⚡ Z-Image Turbo — MAX CPU SURVIVAL MODE")

-     prompt = gr.Textbox(label="Prompt", lines=2)

      with gr.Row():
-         width = gr.Slider(256, 640, 512, step=64)
-         height = gr.Slider(256, 640, 512, step=64)

-     steps = gr.Slider(1, 6, value=4, step=1)
-     seed = gr.Number(value=-1, precision=0)

-     btn = gr.Button("🚀 Generate")

-     output = gr.Image()
-     used_seed = gr.Number(label="Seed Used")

      btn.click(
          generate,
@@ -197,5 +307,22 @@ with gr.Blocks(title="Z-Image Turbo ULTRA CPU") as demo:
          outputs=[output, used_seed]
      )

- demo.queue(concurrency_count=1, max_size=4)
- demo.launch()
app.py after the change (the new side of the same hunks):

  import gradio as gr

  # =====================================================
+ # 🔥 EXTREME CPU + RAM CONTROL - ULTIMATE OPTIMIZATION
  # =====================================================

+ CPU_THREADS = 1 # Minimum safe value for HF Spaces
+ MAX_RESOLUTION = 512
+ MAX_STEPS = 4

  os.environ["CUDA_VISIBLE_DEVICES"] = ""
  os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
  os.environ["OMP_NUM_THREADS"] = str(CPU_THREADS)
  os.environ["MKL_NUM_THREADS"] = str(CPU_THREADS)
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ os.environ["TRANSFORMERS_CACHE"] = "./hf_cache"
+ os.environ["HF_DATASETS_CACHE"] = "./hf_cache"

  torch.set_num_threads(CPU_THREADS)
  torch.set_grad_enabled(False)
+ torch.set_float32_matmul_precision('lowest')

  DEVICE = "cpu"
+ DTYPE = torch.float16 # CRITICAL: Use float16 to save 50% memory
  CACHE_DIR = "./hf_cache"
  os.makedirs(CACHE_DIR, exist_ok=True)

+ print("⚡ Z-Image Turbo ULTRA CPU - EXTREME MODE (HF Spaces 16GB)")
+
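Two details in this configuration block deserve a second look. Thread caps such as OMP_NUM_THREADS and MKL_NUM_THREADS are only honored when they are set before torch initializes its thread pools, and recent PyTorch releases accept only "highest", "high", or "medium" for torch.set_float32_matmul_precision, so 'lowest' is likely to be rejected. torch.float16 is also patchily supported by CPU kernels; bfloat16 tends to be the safer reduced-precision choice on CPU. A minimal standalone sketch of the same thread-capping idea with a supported precision value (illustrative, not part of the commit):

    # Standalone sketch: cap CPU threads (env vars must precede the torch import)
    # and pick a supported matmul precision value.
    import os

    CPU_THREADS = 1
    os.environ["OMP_NUM_THREADS"] = str(CPU_THREADS)
    os.environ["MKL_NUM_THREADS"] = str(CPU_THREADS)

    import torch

    torch.set_num_threads(CPU_THREADS)
    torch.set_float32_matmul_precision("medium")   # accepted: "highest", "high", "medium"
    print("threads in use:", torch.get_num_threads())  # expect 1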
  # =====================================================
+ # 📦 MINIMAL IMPORTS
  # =====================================================

+ try:
+     from huggingface_hub import hf_hub_download
+     from diffusers import (
+         ZImagePipeline,
+         ZImageTransformer2DModel,
+         GGUFQuantizationConfig,
+         AutoencoderKL,
+         FlowMatchEulerDiscreteScheduler
+     )
+     from transformers import (
+         AutoTokenizer,
+         CLIPTextModel,
+         BertModel,
+         BertTokenizer
+     )
+ except ImportError as e:
+     print(f"⚠️ Import error (models may not load): {e}")

  # =====================================================
+ # 🧠 GLOBAL PIPELINE STATE (Lazy Loading)
  # =====================================================

+ pipe = None
+ _pipe_lock = False
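Because the imports above sit in a try/except, a missing or too-old diffusers build only prints a warning here, and the absent names resurface later as NameError inside load_pipeline(). A hypothetical fail-fast variant (it mirrors the commit's import names but is not part of the commit):

    # Hypothetical fail-fast import guard.
    import sys

    try:
        from diffusers import ZImagePipeline, ZImageTransformer2DModel  # noqa: F401
    except ImportError as exc:
        # Exit immediately with a readable message instead of failing mid-request.
        sys.exit(f"This diffusers build does not provide Z-Image support: {exc}")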
  # =====================================================
+ # 🎯 LIGHTWEIGHT TEXT ENCODER LOADER
  # =====================================================

+ def load_text_encoder_lightweight():
+     """Load absolute minimum text encoder"""
+     print("📝 Loading lightweight text encoder...")
+     try:
+         # Try tiny CLIP first
+         from transformers import CLIPTokenizer, CLIPTextModel
+         tokenizer = CLIPTokenizer.from_pretrained(
+             "openai/clip-vit-base-patch32",
+             cache_dir=CACHE_DIR,
+             local_files_only=False
+         )
+         text_encoder = CLIPTextModel.from_pretrained(
+             "openai/clip-vit-base-patch32",
+             torch_dtype=DTYPE,
+             low_cpu_mem_usage=True,
+             cache_dir=CACHE_DIR,
+             local_files_only=False
+         )
+         return tokenizer, text_encoder
+     except Exception as e:
+         print(f"⚠️ CLIP failed: {e}, using fallback...")
+         # Fallback: Use BERT-tiny (much smaller)
+         from transformers import AutoTokenizer, AutoModel
+         try:
+             tokenizer = AutoTokenizer.from_pretrained(
+                 "prajjwal1/bert-tiny",
+                 cache_dir=CACHE_DIR
+             )
+             text_encoder = AutoModel.from_pretrained(
+                 "prajjwal1/bert-tiny",
+                 torch_dtype=DTYPE,
+                 low_cpu_mem_usage=True,
+                 cache_dir=CACHE_DIR
+             )
+             return tokenizer, text_encoder
+         except Exception as e2:
+             print(f"❌ Both encoders failed: {e2}")
+             raise
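Worth flagging: the original file encoded prompts with Qwen/Qwen3-4B, and most diffusion transformers expect text embeddings of one fixed width, so swapping in CLIP or bert-tiny changes the embedding dimension the denoiser receives. A small, hypothetical check of those widths (it assumes the model ids are reachable and is not part of the commit):

    # Compare the hidden size of the original encoder with the two fallbacks.
    from transformers import AutoConfig

    for name in ["Qwen/Qwen3-4B", "openai/clip-vit-base-patch32", "prajjwal1/bert-tiny"]:
        cfg = AutoConfig.from_pretrained(name)
        # CLIP nests its text settings under text_config; the others expose hidden_size directly.
        hidden = getattr(cfg, "hidden_size", None) or cfg.text_config.hidden_size
        print(f"{name}: text hidden size = {hidden}")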
+ # =====================================================
+ # 🚀 LAZY-LOADED PIPELINE WITH MEMORY CONTROL
+ # =====================================================

+ def load_pipeline():
+     """Load pipeline once, keep in memory"""
+     global pipe, _pipe_lock
+
+     if pipe is not None:
+         return pipe
+
+     if _pipe_lock:
+         raise gr.Error("Pipeline already loading. Please wait...")
+
+     _pipe_lock = True
+
+     try:
+         print("⚡ Loading scheduler...")
+         scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+             "Tongyi-MAI/Z-Image-Turbo",
+             subfolder="scheduler",
+             cache_dir=CACHE_DIR,
+             low_cpu_mem_usage=True
+         )

+         print("⚡ Loading VAE (memory-optimized)...")
+         vae = AutoencoderKL.from_pretrained(
+             "Tongyi-MAI/Z-Image-Turbo",
+             subfolder="vae",
+             torch_dtype=DTYPE,
+             low_cpu_mem_usage=True,
+             cache_dir=CACHE_DIR,
+             variant="fp16" # Force fp16 variant
+         )

+         print("⚡ Loading text encoder (lightweight)...")
+         tokenizer, text_encoder = load_text_encoder_lightweight()

+         print("⚡ Loading transformer (GGUF quantized)...")
+         gguf_path = hf_hub_download(
+             repo_id="unsloth/Z-Image-Turbo-GGUF",
+             filename="z-image-turbo-Q2_K.gguf",
+             cache_dir=CACHE_DIR,
+             resume_download=True,
+             local_files_only=False
+         )

+         transformer = ZImageTransformer2DModel.from_single_file(
+             gguf_path,
+             quantization_config=GGUFQuantizationConfig(compute_dtype=DTYPE),
+             torch_dtype=DTYPE,
+             low_cpu_mem_usage=True
+         )

+         # Build pipeline
+         pipe = ZImagePipeline(
+             vae=vae,
+             text_encoder=text_encoder,
+             tokenizer=tokenizer,
+             transformer=transformer,
+             scheduler=scheduler
+         ).to(DEVICE)
+
+         # EXTREME memory optimization
+         pipe.enable_attention_slicing()
+         pipe.enable_vae_slicing()
+         pipe.enable_vae_tiling()
+         pipe.set_progress_bar_config(disable=True)
+
+         # Explicitly set to eval mode and disable gradients
+         pipe.vae.eval()
+         pipe.text_encoder.eval()
+         pipe.transformer.eval()
+
+         print("✅ Pipeline loaded successfully")
+         return pipe

+     except Exception as e:
+         print(f"❌ Pipeline load failed: {e}")
+         raise gr.Error(f"Failed to load model: {str(e)}")
+     finally:
+         _pipe_lock = False

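A note on the _pipe_lock flag: a plain module-level boolean only rejects a second caller, and it is not atomic if Gradio ever dispatches two requests on different threads. A minimal sketch of the same lazy-load pattern with a real lock (hypothetical helper, not part of the commit; build stands in for the loading code above):

    # Lazy, thread-safe pipeline cache using double-checked locking.
    import threading

    _pipe = None
    _lock = threading.Lock()

    def get_pipeline(build):
        """Return the cached pipeline, constructing it at most once.

        `build` is any zero-argument callable that creates the pipeline.
        """
        global _pipe
        if _pipe is not None:          # fast path once loaded, no locking
            return _pipe
        with _lock:                    # a second caller waits here instead of erroring
            if _pipe is None:          # re-check after acquiring the lock
                _pipe = build()
            return _pipe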
  # =====================================================
+ # 🎨 ULTRA-OPTIMIZED GENERATION
  # =====================================================

  @torch.inference_mode()
  def generate(prompt, width, height, steps, seed, progress=gr.Progress()):
+     """Generate image with aggressive memory management"""
+
+     if not prompt or not prompt.strip():
+         raise gr.Error("❌ Prompt is required")

+     # HARD safety limits for HF Spaces
+     width = max(256, min(int(width), 512))
+     height = max(256, min(int(height), 512))
+     steps = max(1, min(int(steps), 4))

+     # Reduce to multiple of 64
+     width = (width // 64) * 64
+     height = (height // 64) * 64

+     if seed < 0 or seed == "":
          seed = random.randint(0, 2**31 - 1)
+     else:
+         seed = int(seed)

+     # Pre-generation cleanup
+     gc.collect()
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()

      try:
+         # Load pipeline on first use
+         pipe = load_pipeline()
+
+         generator = torch.Generator(device=DEVICE).manual_seed(seed)
+
+         start_time = time.time()
+
+         def callback(step, timestep, latents=None):
+             elapsed = time.time() - start_time
+             avg = elapsed / (step + 1) if step > 0 else 0
+             remaining = avg * (steps - step - 1) if step < steps - 1 else 0
+             progress(
+                 (step + 1) / steps,
+                 desc=f"Step {step+1}/{steps} | ETA: {remaining:.1f}s"
+             )
+
+         print(f"🎨 Generating {width}x{height} in {steps} steps...")
+
          result = pipe(
              prompt=prompt,
              negative_prompt=None,
              [lines 244-246 unchanged, not shown in the diff]
              guidance_scale=1.0,
              generator=generator,
              callback=callback,
+             callback_steps=1,
+             output_type="pil"
          )

          image = result.images[0]
+
+         # Post-generation cleanup
+         del result
          gc.collect()
+
          return image, seed

+     except torch.cuda.OutOfMemoryError:
+         gc.collect()
+         raise gr.Error("❌ Out of memory! Try smaller size or fewer steps")
      except Exception as e:
          gc.collect()
+         raise gr.Error(f"❌ Generation error: {str(e)}")
+
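For reference, the ETA arithmetic in the progress callback: after finishing 0-indexed step, elapsed / (step + 1) is the average time per completed step and avg * (steps - step - 1) projects the time left. The new guard "if step > 0 else 0" therefore reports an ETA of 0.0s on the first step, whereas the original unconditional division already gave an estimate there. A tiny self-contained check with made-up timings:

    # Illustrative numbers only: verify the ETA formula used in the callback.
    steps = 4
    elapsed = 45.0                        # seconds spent after finishing step index 1
    step = 1
    avg = elapsed / (step + 1)            # 22.5 s per completed step
    remaining = avg * (steps - step - 1)  # 2 steps left -> 45.0 s
    print(f"Step {step + 1}/{steps} | ETA: {remaining:.1f}s")  # Step 2/4 | ETA: 45.0s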
  # =====================================================
+ # 🎛️ MINIMAL GRADIO UI
  # =====================================================

+ with gr.Blocks(title="Z-Image Turbo CPU") as demo:
+     gr.Markdown("""
+     # ⚡ Z-Image Turbo — CPU ULTRA MODE
+     **HF Spaces Optimized | 16GB RAM | No GPU**

+     ⚠️ Slow generation expected on CPU. Start with 256x256 and low steps.
+     """)

      with gr.Row():
+         with gr.Column(scale=2):
+             prompt = gr.Textbox(
+                 label="Prompt",
+                 placeholder="Describe what you want...",
+                 lines=3
+             )
+
+             with gr.Row():
+                 width = gr.Slider(256, 512, 256, step=64, label="Width")
+                 height = gr.Slider(256, 512, 256, step=64, label="Height")

+             with gr.Row():
+                 steps = gr.Slider(1, 4, 2, step=1, label="Steps")
+                 seed = gr.Number(value=-1, precision=0, label="Seed (-1=random)")

+             btn = gr.Button("🚀 Generate", variant="primary", scale=2)

+         with gr.Column(scale=1):
+             output = gr.Image(label="Output")
+             used_seed = gr.Number(label="Seed Used", interactive=False)

      btn.click(
          generate,
          [line 306 unchanged, not shown in the diff]
          outputs=[output, used_seed]
      )

+     gr.Markdown("""
+     ### ⚡ Performance Tips
+     - Start with **256x256** resolution
+     - Use **1-2 steps** for fast results
+     - Each step takes ~30-60s on CPU
+     - Results improve with more steps
+     - Negative seeds auto-randomize
+
+     ### 💾 Memory Strategy
+     - Models loaded on first request only
+     - Aggressive garbage collection after each run
+     - float16 reduces memory by 50%
+     - VAE tiling saves additional ~2GB
+     """)
+
+ demo.queue(concurrency_count=1, max_size=2)
+
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860)
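One compatibility note on the final lines: concurrency_count is the Gradio 3.x parameter name, while Gradio 4.x expects default_concurrency_limit in Blocks.queue() (max_size is unchanged), so on a current Spaces image this call may raise a TypeError. A version-tolerant sketch (hypothetical helper, not part of the commit):

    # Apply queue settings under either Gradio 3.x or 4.x parameter names.
    import gradio as gr

    def queue_compat(demo: gr.Blocks, limit: int = 1, max_size: int = 2) -> gr.Blocks:
        try:
            return demo.queue(default_concurrency_limit=limit, max_size=max_size)  # Gradio 4.x
        except TypeError:
            return demo.queue(concurrency_count=limit, max_size=max_size)          # Gradio 3.x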