import torch
import time
import os

# Import the final, self-contained tiling pipeline
from pipeline_z_image_mod import ZImageMoDTilingPipeline

from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, GGUFQuantizationConfig
from diffusers.models import ZImageTransformer2DModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import hf_hub_download


def main():
    # 1. Load Components Manually, using GGUF for the Transformer
    print("--- 1. Loading Model Components (with GGUF) ---")
    BASE_MODEL_ID = "Tongyi-MAI/Z-Image-Turbo"
    GGUF_REPO_ID = "jayn7/Z-Image-Turbo-GGUF"
    GGUF_FILENAME = "z_image_turbo-Q4_K_M.gguf"
    GGUF_LOCAL_DIR = "F:\\models\\Z-Image-Turbo"

    print("Loading VAE...")
    vae = AutoencoderKL.from_pretrained(
        BASE_MODEL_ID, subfolder="vae", torch_dtype=torch.bfloat16
    )

    print("Loading Text Encoder and Tokenizer...")
    text_encoder = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID, subfolder="text_encoder", torch_dtype=torch.bfloat16
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, subfolder="tokenizer")

    print(f"Loading Transformer from GGUF file: {GGUF_FILENAME}...")
    transformer = ZImageTransformer2DModel.from_single_file(
        hf_hub_download(
            GGUF_REPO_ID,
            GGUF_FILENAME,
            local_dir=GGUF_LOCAL_DIR,
            local_dir_use_symlinks=False,
        ),
        quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
        torch_dtype=torch.bfloat16,
    )

    scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=3.0)

    # 2. Initialize our Tiling Pipeline with Pre-loaded Components
    print("\n--- 2. Assembling the ZImageMoDTilingPipeline ---")
    pipe = ZImageMoDTilingPipeline(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        scheduler=scheduler,
        transformer=transformer,
    )
    print("Enabling model CPU offload...")
    pipe.enable_model_cpu_offload()

    # 3. Set Up Tiling and Inference Parameters
    print("\n--- 3. Setting Up Inference Parameters ---")
    # A 1x3 prompt grid: one row of three regional prompts (left, center,
    # right) spanning the 3072-px-wide canvas.
    prompt_grid = [[
        "On the left side of a wide, wet stone courtyard at night, a traditional wooden temple building has a soft, warm glow from its illuminated windows, the environment is a deep, dark night sky.",
        "Young Chinese woman in red Hanfu, standing gracefully in the center of the stone-paved temple courtyard, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights.",
        "On the right side of the frame, a wooden temple corridor with a row of glowing paper lanterns recedes into the background. The lantern light reflects on the wet stone path below, and the distant pagoda is silhouetted against the same deep, dark night sky.",
    ]]
    target_height, canvas_width = 1024, 3072
    num_inference_steps = 8
    seed = 109320357
    generator = torch.Generator("cuda").manual_seed(seed)

    # 4. Start Inference
    print("\n--- 4. Starting Inference ---")
    start_inference_time = time.time()
    # A single, clean call to our self-contained pipeline. No guidance_scale
    # or negative_prompt is needed for the Turbo model.
    image = pipe(
        prompt=prompt_grid,
        height=target_height,
        width=canvas_width,
        num_inference_steps=num_inference_steps,
        generator=generator,
    ).images[0]
    end_inference_time = time.time()
    print(f"\nInference finished in {end_inference_time - start_inference_time:.2f} seconds.")
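    # Optional diagnostic (a small sketch, not required for the run): report
    # peak VRAM use, which is handy when tuning the canvas width or the
    # CPU-offload setup.
    if torch.cuda.is_available():
        peak_vram_gb = torch.cuda.max_memory_allocated() / 1024**3
        print(f"Peak VRAM usage: {peak_vram_gb:.2f} GB")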
    # 5. Save Output
    os.makedirs("outputs", exist_ok=True)
    output_filename = "outputs/z_image_panorama_final.png"
    image.save(output_filename)
    print(f"Image successfully saved as '{output_filename}'")


if __name__ == "__main__":
    main()
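# Example invocation, assuming this script is saved as run_z_image_mod.py (a
# hypothetical name) in the same directory as pipeline_z_image_mod.py:
#   python run_z_image_mod.py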