Spaces: Running on Zero
Elea Zhong committed
Commit · 92d8df6
1 Parent(s): 1b9d6c7
add 2step pipe and app
app.py
CHANGED
@@ -12,6 +12,9 @@ import gradio as gr
 import spaces

 import subprocess
+
+from qwenimage.models.attention_processors import QwenDoubleStreamAttnProcessorFA3
+from qwenimage.optimization import optimize_pipeline_
 GIT_TOKEN = os.environ.get("GIT_TOKEN")
 import subprocess

@@ -34,6 +37,7 @@ import subprocess
 from qwenimage.debug import ctimed
 from qwenimage.models.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
 from qwenimage.models.transformer_qwenimage import QwenImageTransformer2DModel
+from qwenimage.experiments.quantize_experiments import conf_fp8darow_nolast, quantize_transformer_fp8darow_nolast

 # --- Model Loading ---

@@ -64,7 +68,26 @@ pipe.load_lora_weights(
     "checkpoints/distill_5k_lora.safetensors",
     adapter_name="fast_5k",
 )
-
+pipe.set_adapters(["fast_5k"], adapter_weights=[1.0])
+pipe.fuse_lora(adapter_names=["fast_5k"], lora_scale=1.0)
+pipe.unload_lora_weights()
+
+pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
+pipe.transformer.fuse_qkv_projections()
+pipe.transformer.check_fused_qkv()
+
+optimize_pipeline_(
+    pipe,
+    cache_compiled=True,
+    quantize=True,
+    suffix="_fp8darow_nolast_fa3_fast5k",
+    quantize_config=conf_fp8darow_nolast(),
+    pipe_kwargs={
+        "image": [Image.new("RGB", (1024, 1024))],
+        "prompt":"prompt",
+        "num_inference_steps":2,
+    }
+)

 MAX_SEED = np.iinfo(np.int32).max

@@ -73,12 +96,12 @@ MAX_SEED = np.iinfo(np.int32).max
 def run_pipe(
     image,
     prompt,
+    num_runs,
     seed,
     randomize_seed,
     num_inference_steps,
     shift,
-
-    progress=gr.Progress(track_tqdm=True)
+    prompt_cached,
 ):
     with ctimed("pre pipe"):

@@ -90,35 +113,40 @@ def run_pipe(

         # Choose input image (prefer uploaded, else last output)
         pil_images = []
-        if image is
-            if isinstance(image, Image.Image):
-                pil_images.append(image.convert("RGB"))
-            elif hasattr(image, "name"):
-                pil_images.append(Image.open(image.name).convert("RGB"))
-        elif prev_output:
-            pil_images.append(prev_output.convert("RGB"))
-
-        if len(pil_images) == 0:
+        if image is None:
             raise gr.Error("Please upload an image first.")
-
-
+        if isinstance(image, Image.Image):
+            pil_images.append(image.convert("RGB"))
+        elif hasattr(image, "name"):
+            pil_images.append(Image.open(image.name).convert("RGB"))

         # finetuner.enable()
         pipe.scheduler.config["base_shift"] = shift
         pipe.scheduler.config["max_shift"] = shift

-
-
-
-
-
-
-
-
+    gallery_images = []
+
+    for i in range(num_runs):
+        result = pipe(
+            image=pil_images,
+            prompt=prompt,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+            vae_image_override=1024 * 1024, #512 * 512,
+            latent_size_override=1024 * 1024,
+            prompt_cached=prompt_cached,
+            return_dict=True,
+        ).images[0]
+        prompt_cached = True
+        gallery_images.append(result)
+
+        yield gallery_images, seed, prompt_cached


 # --- UI ---

+def reset_prompt_cache():
+    return False

 with gr.Blocks(theme=gr.themes.Citrus()) as demo:

@@ -127,32 +155,40 @@ with gr.Blocks(theme=gr.themes.Citrus()) as demo:
     with gr.Row():
         with gr.Column():
             image = gr.Image(label="Input Image", type="pil")
-            prev_output = gr.Image(value=None, visible=False)
-            is_reset = gr.Checkbox(value=False, visible=False)
             prompt = gr.Textbox(label="Prompt", placeholder="Prompt", lines=2)

+            num_runs = gr.Slider(label="Run Consecutively", minimum=0, maximum=100, step=1, value=16)

             run_btn = gr.Button("Generate", variant="primary")

             with gr.Accordion("Advanced Settings", open=False):
+                prompt_cached = gr.Checkbox(label="Auto-Cached embeds", value=False)
                 seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                 randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                 num_inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=40, step=1, value=2)
                 shift = gr.Slider(label="Timestep Shift", minimum=0.0, maximum=4.0, step=0.1, value=2.0)

         with gr.Column():
-            result = gr.
+            result = gr.Gallery(
+                label="Output Image",
+                interactive=False,
+                # type="filepath",
+                columns=4,
+                height=800,
+                object_fit="scale-down",
+            )

     inputs = [
         image,
         prompt,
+        num_runs,
         seed,
         randomize_seed,
         num_inference_steps,
         shift,
-
+        prompt_cached,
     ]
-    outputs = [result, seed]
+    outputs = [result, seed, prompt_cached]


     run_event = run_btn.click(
@@ -161,6 +197,17 @@ with gr.Blocks(theme=gr.themes.Citrus()) as demo:
         outputs=outputs
     )

-
+
+    image.upload(
+        fn=reset_prompt_cache,
+        inputs=[],
+        outputs=[prompt_cached],
+    )
+
+    prompt.input(
+        fn=reset_prompt_cache,
+        inputs=[],
+        outputs=[prompt_cached],
+    )

 demo.launch()
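Note on the app.py changes above: run_pipe is now a generator, so each loop iteration streams the growing gallery_images list into the gr.Gallery, and the prompt_cached checkbox is wired as both an input and an output so the flag flips to True after the first call and is reset whenever the image or prompt changes. A minimal, self-contained sketch of that wiring, assuming only gradio and Pillow; fake_pipe is a hypothetical stand-in for the real QwenImageEditPlusPipeline and is not part of this commit:

# Sketch of the cached-embedding / streaming-gallery wiring, under stated assumptions.
import gradio as gr
from PIL import ImageOps

def fake_pipe(image, prompt, prompt_cached):
    # Stub: the real pipe would skip prompt/image encoding when prompt_cached=True.
    return ImageOps.invert(image.convert("RGB"))

def run_pipe(image, prompt, num_runs, prompt_cached):
    if image is None:
        raise gr.Error("Please upload an image first.")
    gallery = []
    for _ in range(int(num_runs)):
        gallery.append(fake_pipe(image, prompt, prompt_cached))
        prompt_cached = True          # embeddings count as cached after the first call
        yield gallery, prompt_cached  # stream partial results into the Gallery

def reset_prompt_cache():
    return False

with gr.Blocks() as demo:
    image = gr.Image(type="pil")
    prompt = gr.Textbox()
    num_runs = gr.Slider(1, 8, value=2, step=1)
    prompt_cached = gr.Checkbox(label="Auto-Cached embeds", value=False)
    result = gr.Gallery(columns=4)
    run_btn = gr.Button("Generate")

    run_btn.click(run_pipe, [image, prompt, num_runs, prompt_cached], [result, prompt_cached])
    # A new image or an edited prompt invalidates the cached embeddings.
    image.upload(reset_prompt_cache, [], [prompt_cached])
    prompt.input(reset_prompt_cache, [], [prompt_cached])

if __name__ == "__main__":
    demo.launch()

Returning prompt_cached from the click handler is what keeps the UI state in sync with the pipeline-side cache between consecutive runs.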
qwenimage/experiments/quantize_experiments.py
CHANGED
@@ -224,11 +224,11 @@ class Qwen_FA3_AoT_fp8darow_nolast(QwenBaseExperiment):
             }
         )

-
+
+def quantize_transformer_fp8da_nolast(model):
     module_fqn_to_config = ModuleFqnToConfig(
         OrderedDict([
             (ATTN_LAST_LAYER, None),
-            # ("_default",Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),),
             ("_default",Float8DynamicActivationFloat8WeightConfig(),),
         ])
     )
@@ -237,6 +237,26 @@ def quantize_transformer_fp8darow_nolast(model):
     print_first_param(model)
     print(f"quantized model size: {get_model_size_in_bytes(model) / 1024 / 1024} MB")

+def quantize_transformer_fp8darow_nolast(model):
+    module_fqn_to_config = ModuleFqnToConfig(
+        OrderedDict([
+            (ATTN_LAST_LAYER, None),
+            ("_default",Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),),
+        ])
+    )
+    print(f"original model size: {get_model_size_in_bytes(model) / 1024 / 1024} MB")
+    quantize_(model, module_fqn_to_config)
+    print_first_param(model)
+    print(f"quantized model size: {get_model_size_in_bytes(model) / 1024 / 1024} MB")
+
+def conf_fp8darow_nolast():
+    module_fqn_to_config = ModuleFqnToConfig(
+        OrderedDict([
+            (ATTN_LAST_LAYER, None),
+            ("_default",Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),),
+        ])
+    )
+    return module_fqn_to_config

 @ExperimentRegistry.register(name="qwen_fa3_aot_fp8darow_nofirstlast")
 class Qwen_FA3_AoT_fp8darow_nofirstlast(QwenBaseExperiment):
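Note on conf_fp8darow_nolast above: the ModuleFqnToConfig maps one fully qualified module name (ATTN_LAST_LAYER) to None so that module is left unquantized, and every other matching module falls through to the "_default" FP8 dynamic-activation, per-row-weight config. A rough, self-contained sketch of the same pattern on a toy model, assuming a recent torchao that exports ModuleFqnToConfig, Float8DynamicActivationFloat8WeightConfig, PerRow, and quantize_ from torchao.quantization, plus FP8-capable GPU hardware; the module names here are illustrative, not the real transformer layout:

# Per-module FP8 config sketch: quantize every Linear except one named module.
from collections import OrderedDict

import torch
from torch import nn
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    ModuleFqnToConfig,
    PerRow,
    quantize_,
)

# Toy two-layer model; "proj_out" plays the role of ATTN_LAST_LAYER.
model = nn.Sequential(OrderedDict([
    ("proj_in", nn.Linear(64, 64)),
    ("proj_out", nn.Linear(64, 64)),
])).to(torch.bfloat16).cuda()  # FP8 kernels need a suitable GPU

config = ModuleFqnToConfig(OrderedDict([
    ("proj_out", None),  # None = skip quantization for this module
    ("_default", Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())),
]))

quantize_(model, config)
print(model)  # proj_in carries Float8 weights, proj_out stays bf16

Skipping the last attention layer mirrors the "nolast" suffix in the experiment names; the earlier fp8da variant drops granularity=PerRow() and keeps the config's default scaling granularity.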
qwenimage/foundation.py
CHANGED
@@ -15,7 +15,7 @@ from einops import rearrange
 from qwenimage.datamodels import QwenConfig, QwenInputs
 from qwenimage.debug import clear_cuda_memory, ctimed, ftimed, print_gpu_memory, texam
 from qwenimage.experiments.quantize_text_encoder_experiments import quantize_text_encoder_int4wo_linear
-from qwenimage.experiments.quantize_experiments import
+from qwenimage.experiments.quantize_experiments import quantize_transformer_fp8da_nolast
 from qwenimage.loss import LossAccumulator
 from qwenimage.models.pipeline_qwenimage_edit_plus import CONDITION_IMAGE_SIZE, QwenImageEditPlusPipeline, calculate_dimensions
 from qwenimage.models.pipeline_qwenimage_edit_save_interm import QwenImageEditSaveIntermPipeline
@@ -110,7 +110,7 @@ class QwenImageFoundation(WandModel):
             quantize_text_encoder_int4wo_linear(self.text_encoder)

         if self.config.quantize_transformer:
-
+            quantize_transformer_fp8da_nolast(self.transformer)


     def load(self, load_path):
qwenimage/models/autoencoder_kl_qwenimage.py
CHANGED
@@ -33,6 +33,8 @@ from diffusers.models.modeling_outputs import AutoencoderKLOutput
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.models.autoencoders.vae import AutoencoderMixin, DecoderOutput, DiagonalGaussianDistribution

+from qwenimage.debug import texam
+

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

@@ -870,11 +872,14 @@ class AutoencoderKLQwenImage(ModelMixin, AutoencoderMixin, ConfigMixin, FromOrig
                 If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                 returned.
         """
+        texam(z, "z")
         if self.use_slicing and z.shape[0] > 1:
             decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
             decoded = torch.cat(decoded_slices)
         else:
             decoded = self._decode(z).sample
+
+        texam(decoded, "decoded")

         if not return_dict:
             return (decoded,)
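Note on the texam(z, "z") / texam(decoded, "decoded") calls above: texam lives in qwenimage.debug and is not shown in this commit. A hypothetical stand-in for what such a tensor-examination helper presumably prints (shape, dtype, device, value range), for readers without the repo:

# Hypothetical stand-in for qwenimage.debug.texam: one-line tensor summary.
import torch

def texam(t: torch.Tensor, name: str) -> None:
    # Cast to float so integer and bf16 tensors print min/max/mean cleanly.
    tf = t.detach().float()
    print(
        f"[texam] {name}: shape={tuple(t.shape)} dtype={t.dtype} device={t.device} "
        f"min={tf.min().item():.4f} max={tf.max().item():.4f} mean={tf.mean().item():.4f}"
    )

texam(torch.randn(1, 16, 1, 64, 64), "z")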
qwenimage/models/pipeline_qwenimage_edit_plus.py
CHANGED
@@ -15,6 +15,7 @@
 import inspect
 import math
 from typing import Any, Callable, Dict, List, Optional, Union
+import uuid
 import warnings

 from PIL import Image
@@ -24,6 +25,7 @@ import torch
 from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from transformers.models.qwen2 import Qwen2Tokenizer
 from transformers.models.qwen2_vl import Qwen2VLProcessor
+from torchvision.io import encode_jpeg, write_file, write_jpeg

 from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
 from diffusers.loaders import QwenImageLoraLoaderMixin
@@ -226,6 +228,10 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
         self.prompt_template_encode_start_idx = 64
         self.default_sample_size = 128

+        self.prompt_embeds, self.prompt_embeds_mask = None, None
+        self.image_latents = None
+        self.latents_mean, self.latents_std = None, None
+
     # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._extract_masked_hidden
     def _extract_masked_hidden(self, hidden_states: torch.Tensor, mask: torch.Tensor):
         bool_mask = mask.bool()
@@ -571,6 +577,7 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
         channels_last_format: bool = False,
         vae_image_override: int | None = None,
         latent_size_override: int | None = None,
+        prompt_cached: bool = False,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -708,23 +715,24 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):

         device = self._execution_device
         # 3. Preprocess image
-        if
-            if not isinstance(image,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not prompt_cached:
+            if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
+                if not isinstance(image, list):
+                    image = [image]
+                condition_image_sizes = []
+                condition_images = []
+                vae_image_sizes = []
+                vae_images = []
+                for img in image:
+                    image_width, image_height = img.size
+                    condition_width, condition_height = calculate_dimensions(
+                        CONDITION_IMAGE_SIZE, image_width / image_height
+                    )
+                    vae_width, vae_height = calculate_dimensions(vae_image_size, image_width / image_height)
+                    condition_image_sizes.append((condition_width, condition_height))
+                    vae_image_sizes.append((vae_width, vae_height))
+                    condition_images.append(self.image_processor.resize(img, condition_height, condition_width))
+                    vae_images.append(self.image_processor.preprocess(img, vae_height, vae_width).unsqueeze(2))

         has_neg_prompt = negative_prompt is not None or (
             negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None
@@ -741,15 +749,19 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):

         with ctimed("Encode Prompt"):
             do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
-
-
-
-            prompt_embeds=
-
-
-
-
-
+            if prompt_cached:
+                prompt_embeds, prompt_embeds_mask = self.prompt_embeds, self.prompt_embeds_mask
+            else:
+                prompt_embeds, prompt_embeds_mask = self.encode_prompt(
+                    image=condition_images,
+                    prompt=prompt,
+                    prompt_embeds=prompt_embeds,
+                    prompt_embeds_mask=prompt_embeds_mask,
+                    device=device,
+                    num_images_per_prompt=num_images_per_prompt,
+                    max_sequence_length=max_sequence_length,
+                )
+                self.prompt_embeds, self.prompt_embeds_mask = prompt_embeds, prompt_embeds_mask
             if do_true_cfg:
                 negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt(
                     image=condition_images,
@@ -764,26 +776,37 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
         with ctimed("Prep gen"):
             # 4. Prepare latent variables
             num_channels_latents = self.transformer.config.in_channels // 4
-
-
-
-
-
-
-
-
-
-                latents,
-
-
-
-
-
-
-
-
-
+            if prompt_cached:
+                image_latents = self.image_latents
+                _height = 2 * (int(height) // (self.vae_scale_factor * 2))
+                _width = 2 * (int(width) // (self.vae_scale_factor * 2))
+                shape = (batch_size * num_images_per_prompt, 1, num_channels_latents, _height, _width)
+                latents = randn_tensor(shape, generator=generator, device=device, dtype=image_latents.dtype)
+                latents = self._pack_latents(latents, batch_size * num_images_per_prompt, num_channels_latents, _height, _width)
+                img_shapes = self.img_shapes
+            else:
+                latents, image_latents = self.prepare_latents(
+                    vae_images,
+                    batch_size * num_images_per_prompt,
+                    num_channels_latents,
+                    height,
+                    width,
+                    prompt_embeds.dtype,
+                    device,
+                    generator,
+                    latents,
+                )
+                self.image_latents = image_latents
+                img_shapes = [
+                    [
+                        (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2),
+                        *[
+                            (1, vae_height // self.vae_scale_factor // 2, vae_width // self.vae_scale_factor // 2)
+                            for vae_width, vae_height in vae_image_sizes
+                        ],
+                    ]
+                ] * batch_size
+                self.img_shapes = img_shapes

             # 5. Prepare timesteps
             # print(f"{num_inference_steps=}")
@@ -857,18 +880,23 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
             for i in range(len(ts)-1):
                 t = ts[i]
                 with ctimed(f"loop {i}"):
-
-
+
+                    with ctimed("pre trans"):
+                        if self.interrupt:
+                            continue

-
+                    # self._current_timestep = t

-
-
-
+                    with ctimed("cat lats"):
+                        latent_model_input = latents
+                        if image_latents is not None:
+                            latent_model_input = torch.cat([latents, image_latents], dim=1)

-
-
-
+                    # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                    with ctimed("broadcast lats"):
+                        in_t = t.expand(latents.shape[0]).to(latents.dtype)
+
+                    with ctimed("transformer proper"):
                         noise_pred = self.transformer(
                             hidden_states=latent_model_input,
                             timestep=in_t,
@@ -882,7 +910,7 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
                     noise_pred = noise_pred[:, : latents.size(1)]

                     if do_true_cfg:
-
+                        raise NotImplementedError()
                         with self.transformer.cache_context("uncond"):
                             neg_noise_pred = self.transformer(
                                 hidden_states=latent_model_input,
@@ -907,29 +935,29 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):

                     latents = t_utils.inference_ode_step(noise_pred, latents, i, ts)

+                    with ctimed("dtype stuff"):
+                        if latents.dtype != latents_dtype:
+                            if torch.backends.mps.is_available():
+                                # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                                latents = latents.to(latents_dtype)

-
-                    if
-
-
-
-
-                        callback_kwargs = {}
-                        for k in callback_on_step_end_tensor_inputs:
-                            callback_kwargs[k] = locals()[k]
-                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                    with ctimed("callback and shenanagans"):
+                        if callback_on_step_end is not None:
+                            callback_kwargs = {}
+                            for k in callback_on_step_end_tensor_inputs:
+                                callback_kwargs[k] = locals()[k]
+                            callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

-
-
+                            latents = callback_outputs.pop("latents", latents)
+                            prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)

-
-
-
+                        # call the callback, if provided
+                        # if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()

-
-
+                    if XLA_AVAILABLE:
+                        xm.mark_step()

-        # with ctimed("Post (vae)"):
         self._current_timestep = None
         if output_type == "latent":
             image = latents
@@ -940,16 +968,51 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
                 latents_mean = (
                     torch.tensor(self.vae.config.latents_mean)
                     .view(1, self.vae.config.z_dim, 1, 1, 1)
-                    .to(
+                    .to(device, self.vae.dtype)
                 )
                 latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
-
+                    device, self.vae.dtype
                 )
                 latents = latents / latents_std + latents_mean
-                with ctimed("vae.decode"):
                 image = self.vae.decode(latents, return_dict=False)[0][:, :, 0]
-
-                image =
+
+                image = image.squeeze(0).add(1).mul(127.5).to(torch.uint8).cpu()
+
+                image_path = f"/tmp/{str(uuid.uuid4())[:8]}.jpg"
+                write_jpeg(image, image_path)
+                image = (image_path,)
+
+
+        # with ctimed("Post (vae)"):
+        #     self._current_timestep = None
+        #     if output_type == "latent":
+        #         image = latents
+        #     else:
+        #         with ctimed("pre decode"):
+        #             latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+
+        #         if prompt_cached:
+        #             latents_mean, latents_std = self.latents_mean, self.latents_std
+        #         else:
+        #             latents_mean = torch.tensor(self.vae.config.latents_mean, device=device, dtype=latents.dtype).view(1, self.vae.config.z_dim, 1, 1, 1)
+        #             latents_std = 1.0 / torch.tensor(self.vae.config.latents_std, device=device, dtype=latents.dtype).view(1, self.vae.config.z_dim, 1, 1, 1)
+
+        #             self.latents_mean, self.latents_std = latents_mean, latents_std
+
+        #         latents = latents / latents_std + latents_mean
+        #         with ctimed("todtype"):
+        #             latents = latents.to(self.vae.dtype)
+        #         with ctimed("vae.decode"):
+        #             image = self.vae.decode(latents, return_dict=False)[0][:, :, 0] # [B,C,H,W]
+
+        #         with ctimed("post process"):
+        #             with ctimed("convert"):
+        #                 image = image.squeeze(0).add(1).mul(127.5).to(torch.uint8).cpu()
+        #             with ctimed("write"):
+        #                 image_path = f"/tmp/{str(uuid.uuid4())[:8]}.jpg"
+        #                 write_jpeg(image, image_path)
+        #                 image = (image_path,)
+


         # Offload all models
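Note on the prompt_cached fast path above: the first call stores prompt_embeds, prompt_embeds_mask, image_latents, and img_shapes on the pipeline instance; a later call with prompt_cached=True skips image preprocessing, prompt encoding, and VAE encoding and only draws fresh noise. A minimal sketch of that memoization pattern, with hypothetical encode_prompt / encode_image stand-ins rather than the real pipeline internals:

# Sketch of the prompt_cached idea: heavy encoders run once, later calls reuse
# the stored tensors and only re-sample the starting noise.
import torch

class CachedEditPipeline:
    def __init__(self):
        self.prompt_embeds = None
        self.image_latents = None

    def encode_prompt(self, prompt: str) -> torch.Tensor:
        return torch.randn(1, 77, 64)      # placeholder for the VL text encoder

    def encode_image(self, image: torch.Tensor) -> torch.Tensor:
        return torch.randn(1, 16, 32, 32)  # placeholder for the VAE encoder

    def __call__(self, image, prompt, prompt_cached=False, generator=None):
        if prompt_cached and self.prompt_embeds is not None:
            prompt_embeds, image_latents = self.prompt_embeds, self.image_latents
        else:
            prompt_embeds = self.encode_prompt(prompt)
            image_latents = self.encode_image(image)
            self.prompt_embeds, self.image_latents = prompt_embeds, image_latents
        # Only the initial noise changes between cached calls.
        latents = torch.randn(image_latents.shape, generator=generator)
        return latents, prompt_embeds

pipe = CachedEditPipeline()
pipe(torch.zeros(1, 3, 256, 256), "make it snowy")                      # encodes everything
pipe(torch.zeros(1, 3, 256, 256), "make it snowy", prompt_cached=True)  # reuses the cache

As in app.py, the caller is responsible for clearing the flag when the input image or prompt changes, otherwise stale embeddings are reused.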
qwenimage/models/transformer_qwenimage.py
CHANGED
@@ -35,6 +35,7 @@ from diffusers.models.modeling_utils import ModelMixin
 from diffusers.models.normalization import AdaLayerNormContinuous, RMSNorm

 from qwenimage.activation_record import ActivationReport
+from qwenimage.debug import ctimed
 from qwenimage.models.attention_processors import QwenDoubleStreamAttnProcessor2_0


@@ -511,61 +512,66 @@ class QwenImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Fro
         else:
             lora_scale = 1.0

-
-
-
-
-        if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
-            logger.warning(
-                "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
-            )
-
-        hidden_states = self.img_in(hidden_states)
-
-        timestep = timestep.to(hidden_states.dtype)
-        encoder_hidden_states = self.txt_norm(encoder_hidden_states)
-        encoder_hidden_states = self.txt_in(encoder_hidden_states)
-
-        if guidance is not None:
-            guidance = guidance.to(hidden_states.dtype) * 1000
-
-        temb = (
-            self.time_text_embed(timestep, hidden_states)
-            if guidance is None
-            else self.time_text_embed(timestep, guidance, hidden_states)
-        )
-
-        for index_block, block in enumerate(self.transformer_blocks):
-            if torch.is_grad_enabled() and self.gradient_checkpointing:
-                warnings.warn("Gradient ckpt?")
-                encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
-                    block,
-                    hidden_states,
-                    encoder_hidden_states,
-                    encoder_hidden_states_mask,
-                    temb,
-                    image_rotary_emb,
-                )
-
-            else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with ctimed("scale lora"):
+            if USE_PEFT_BACKEND:
+                # weight the lora layers by setting `lora_scale` for each PEFT layer
+                scale_lora_layers(self, lora_scale)
+            else:
+                if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+                    logger.warning(
+                        "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
+                    )
+
+        with ctimed("pre blocks"):
+            hidden_states = self.img_in(hidden_states)
+
+            timestep = timestep.to(hidden_states.dtype)
+            encoder_hidden_states = self.txt_norm(encoder_hidden_states)
+            encoder_hidden_states = self.txt_in(encoder_hidden_states)
+
+            if guidance is not None:
+                guidance = guidance.to(hidden_states.dtype) * 1000
+
+            temb = (
+                self.time_text_embed(timestep, hidden_states)
+                if guidance is None
+                else self.time_text_embed(timestep, guidance, hidden_states)
+            )
+
+        with ctimed("blocks"):
+            for index_block, block in enumerate(self.transformer_blocks):
+                if torch.is_grad_enabled() and self.gradient_checkpointing:
+                    warnings.warn("Gradient ckpt?")
+                    encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
+                        block,
+                        hidden_states,
+                        encoder_hidden_states,
+                        encoder_hidden_states_mask,
+                        temb,
+                        image_rotary_emb,
+                    )
+
+                else:
+                    encoder_hidden_states, hidden_states = block(
+                        hidden_states=hidden_states,
+                        encoder_hidden_states=encoder_hidden_states,
+                        encoder_hidden_states_mask=encoder_hidden_states_mask,
+                        temb=temb,
+                        image_rotary_emb=image_rotary_emb,
+                        joint_attention_kwargs=attention_kwargs,
+                    )
+                self.arec(f"encoder_hidden_states.{index_block}", encoder_hidden_states)
+                self.arec(f"hidden_states.{index_block}", hidden_states)
+
+        with ctimed("post blocks"):
+            # Use only the image part (hidden_states) from the dual-stream blocks
+            hidden_states = self.norm_out(hidden_states, temb)
+            output = self.proj_out(hidden_states)
+
+        with ctimed("lora"):
+            if USE_PEFT_BACKEND:
+                # remove `lora_scale` from each PEFT layer
+                unscale_lora_layers(self, lora_scale)

         if not return_dict:
             return (output,)
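Note on the timing wrappers above: the forward pass is now split into ctimed("scale lora"), ctimed("pre blocks"), ctimed("blocks"), ctimed("post blocks"), and ctimed("lora") sections. ctimed comes from qwenimage.debug and is not part of this diff; a hypothetical stand-in for a named timing context manager of this kind:

# Hypothetical stand-in for qwenimage.debug.ctimed: time a named block of code.
# For GPU-heavy sections the real helper would presumably synchronize CUDA first,
# otherwise the measured time only covers kernel launches.
import time
from contextlib import contextmanager

@contextmanager
def ctimed(name: str):
    start = time.perf_counter()
    try:
        yield
    finally:
        # Report wall-clock time for the wrapped section.
        print(f"[ctimed] {name}: {(time.perf_counter() - start) * 1000:.2f} ms")

with ctimed("blocks"):
    total = sum(i * i for i in range(100_000))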