Update app_quant_latent.py
app_quant_latent.py CHANGED (+107 -66)
@@ -245,82 +245,123 @@ log_system_stats("AFTER PIPELINE BUILD")
 
 
 
+import torch
+from PIL import Image
+import io
+
 
+logs = []
+latent_gallery = []
 
 
 @spaces.GPU
 def generate_image(prompt, height, width, steps, seed):
+    try:
+        device = pipe._execution_device
+        generator = torch.Generator(device).manual_seed(int(seed))
+
+        # 1. Encode prompt
+        prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
+            prompt=prompt,
+            negative_prompt=None,
+            do_classifier_free_guidance=True,
+            device=device,
+        )
+
+        batch_size = 1
+        num_images_per_prompt = 1
+        actual_batch_size = batch_size * num_images_per_prompt
+        num_channels_latents = pipe.transformer.in_channels
+
+        # 2. Prepare latents
+        latents = pipe.prepare_latents(
+            actual_batch_size,
+            num_channels_latents,
+            height,
+            width,
+            torch.float32,
+            device,
+            generator,
+            latents=None,
+        )
+
+        # Repeat prompt embeddings for multiple images
+        prompt_embeds = [pe for pe in prompt_embeds for _ in range(num_images_per_prompt)]
+        negative_prompt_embeds = [npe for npe in negative_prompt_embeds for _ in range(num_images_per_prompt)]
+
+        # 3. Prepare timesteps
+        image_seq_len = (latents.shape[2] // 2) * (latents.shape[3] // 2)
+        mu = calculate_shift(
+            image_seq_len,
+            pipe.scheduler.config.get("base_image_seq_len", 256),
+            pipe.scheduler.config.get("max_image_seq_len", 4096),
+            pipe.scheduler.config.get("base_shift", 0.5),
+            pipe.scheduler.config.get("max_shift", 1.15),
+        )
+        pipe.scheduler.sigma_min = 0.0
+        timesteps, num_inference_steps = retrieve_timesteps(pipe.scheduler, steps, device, sigmas=None, mu=mu)
+
+        # 4. Denoising loop
+        with pipe.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                timestep = t.expand(latents.shape[0])
+                timestep = (1000 - timestep) / 1000
+
+                # CFG
+                latents_typed = latents.to(pipe.transformer.dtype)
+                latent_model_input = latents_typed.repeat(2, 1, 1, 1).unsqueeze(2)
+                prompt_embeds_model_input = prompt_embeds + negative_prompt_embeds
+                timestep_model_input = timestep.repeat(2)
+
+                latent_model_input_list = list(latent_model_input.unbind(dim=0))
+                model_out_list = pipe.transformer(
+                    latent_model_input_list, timestep_model_input, prompt_embeds_model_input, return_dict=False
+                )[0]
+
+                # Perform CFG
+                pos_out = model_out_list[:actual_batch_size]
+                neg_out = model_out_list[actual_batch_size:]
+                noise_pred = torch.stack([p + pipe.guidance_scale * (p - n) for p, n in zip(pos_out, neg_out)], dim=0)
+
+                noise_pred = noise_pred.squeeze(2)
+                noise_pred = -noise_pred
+                latents = pipe.scheduler.step(noise_pred.to(torch.float32), t, latents, return_dict=False)[0]
+
+                # Store each latent step for gallery
+                latent_gallery.append(latents.clone().detach().cpu())
+                progress_bar.update()
+
+        # 5. Decode final image
+        latents_dec = latents.to(pipe.vae.dtype)
+        latents_dec = (latents_dec / pipe.vae.config.scaling_factor) + getattr(pipe.vae.config, "shift_factor", 0.0)
+
+        # Squeeze extra dim if present
+        if latents_dec.dim() == 5 and latents_dec.shape[2] == 1:
+            latents_dec = latents_dec.squeeze(2)
+
+        image = pipe.vae.decode(latents_dec, return_dict=False)[0]
+        final_image = pipe.image_processor.postprocess(image, output_type="pil")
+
+        # Decode latent gallery steps to images (optional)
+        gallery_images = []
+        for idx, lat in enumerate(latent_gallery):
             try:
-                try:
-                    # move to vae device and dtype
-                    lat = lat_cpu.to(vae_device).to(vae.dtype)
-
-                    # pipeline used this transform before decoding:
-                    lat = (lat / vae.config.scaling_factor) + getattr(vae.config, "shift_factor", 0.0)
-
-                    # decode: vae.decode returns (batch, C, H, W)
-                    img_tensor = vae.decode(lat, return_dict=False)[0]
-
-                    # postprocess with pipeline's image processor to PIL
-                    pil = img_proc.postprocess(img_tensor.unsqueeze(0), output_type="pil")[0]
-                    latent_images.append(pil)
-                except Exception as e:
-                    log(f"⚠️ Failed to decode latent step {i}: {e}")
+                lat = lat.to(pipe.vae.dtype)
+                if lat.dim() == 5 and lat.shape[2] == 1:
+                    lat = lat.squeeze(2)
+                img = pipe.vae.decode(lat, return_dict=False)[0]
+                img = pipe.image_processor.postprocess(img, output_type="pil")
+                gallery_images.append(img[0])
             except Exception as e:
+                logs.append(f"⚠️ Failed to decode latent step {idx}: {e}")
 
-    log_system_stats("AFTER INFERENCE")
+        return final_image[0], gallery_images, "\n".join(logs)
 
-    except Exception as e:
-        log(f"❌ Inference error: {e}")
-        return None, [], LOGS
+    except Exception as e:
+        return None, [], f"❌ Inference error: {e}"
 
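The calculate_shift and retrieve_timesteps helpers used in step 3 are not part of this hunk. For reference, a sketch of calculate_shift consistent with the defaults passed above, assuming the app mirrors the helper of the same name in diffusers' Flux-style pipelines (the actual body in app_quant_latent.py may differ):

def calculate_shift(
    image_seq_len: int,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
) -> float:
    # Linearly interpolate the schedule shift mu with resolution, anchored so
    # that mu(base_seq_len) == base_shift and mu(max_seq_len) == max_shift.
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    return image_seq_len * m + b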
|
|
|
|
|
|
|
|
|
|
| 365 |
|
| 366 |
|
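One detail of the denoising loop worth noting: guidance is applied per sample as p + guidance_scale * (p - n), which is algebraically equal to the textbook n + (guidance_scale + 1) * (p - n), so the effective scale is offset by one relative to the usual formulation. A minimal batched equivalent of the per-sample loop, assuming the positive and negative outputs are plain tensors of equal shape (the function name is illustrative):

import torch

def cfg_combine(pos_out: torch.Tensor, neg_out: torch.Tensor, scale: float) -> torch.Tensor:
    # Same result as the per-sample loop in the diff:
    #   torch.stack([p + scale * (p - n) for p, n in zip(pos_out, neg_out)], dim=0)
    return pos_out + scale * (pos_out - neg_out)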
| 367 |
|
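The new return signature is (final PIL image, list of per-step latent previews, joined log string), which maps one-to-one onto an Image / Gallery / Textbox trio in Gradio. Note that logs and latent_gallery live at module level and are never reset inside generate_image, so repeated calls accumulate entries from earlier runs. A minimal wiring sketch, assuming the rest of app_quant_latent.py builds a Blocks UI; all component names and value ranges here are illustrative, not from the commit:

import gradio as gr

# Illustrative UI wiring for the updated generate_image signature.
with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    with gr.Row():
        height = gr.Slider(256, 2048, value=1024, step=64, label="Height")
        width = gr.Slider(256, 2048, value=1024, step=64, label="Width")
    steps = gr.Slider(1, 50, value=28, step=1, label="Steps")
    seed = gr.Number(value=0, precision=0, label="Seed")
    run = gr.Button("Generate")

    image_out = gr.Image(label="Final image")           # final_image[0]
    gallery_out = gr.Gallery(label="Per-step latents")  # gallery_images
    logs_out = gr.Textbox(label="Logs")                 # "\n".join(logs)

    run.click(generate_image, [prompt, height, width, steps, seed],
              [image_out, gallery_out, logs_out])

demo.launch()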