import torch
import time
import os

# Import the final, self-contained tiling pipeline
from pipeline_z_image_mod import ZImageMoDTilingPipeline

from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, GGUFQuantizationConfig
from diffusers.models import ZImageTransformer2DModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import hf_hub_download


def main():
    # 1. Load Components Manually, using GGUF for the Transformer
    print("--- 1. Loading Model Components (with GGUF) ---")
    BASE_MODEL_ID = "Tongyi-MAI/Z-Image-Turbo"
    GGUF_REPO_ID = "jayn7/Z-Image-Turbo-GGUF"
    GGUF_FILENAME = "z_image_turbo-Q4_K_M.gguf"
    GGUF_LOCAL_DIR = "F:\\models\\Z-Image-Turbo"

    print("Loading VAE...")
    vae = AutoencoderKL.from_pretrained(
        BASE_MODEL_ID, subfolder="vae", torch_dtype=torch.bfloat16
    )

    print("Loading Text Encoder and Tokenizer...")
    text_encoder = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID, subfolder="text_encoder", torch_dtype=torch.bfloat16
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, subfolder="tokenizer")

    print(f"Loading Transformer from GGUF file: {GGUF_FILENAME}...")
    transformer = ZImageTransformer2DModel.from_single_file(
        hf_hub_download(
            GGUF_REPO_ID,
            GGUF_FILENAME,
            local_dir=GGUF_LOCAL_DIR,
            local_dir_use_symlinks=False,
        ),
        quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
        torch_dtype=torch.bfloat16,
    )

    scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=3.0)

    # 2. Initialize our Tiling Pipeline with Pre-loaded Components
    print("\n--- 2. Assembling the ZImageMoDTilingPipeline ---")
    pipe = ZImageMoDTilingPipeline(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        scheduler=scheduler,
        transformer=transformer,
    )
    print("Enabling model CPU offload...")
    pipe.enable_model_cpu_offload()

    # 3. Set Up Tiling and Inference Parameters
    print("\n--- 3. Setting Up Inference Parameters ---")
    # A 1x3 prompt grid: one row of three regional prompts (left, center,
    # right) spanning the 3072-px-wide canvas.
    prompt_grid = [[
        "On the left side of a wide, wet stone courtyard at night, a traditional wooden temple building has a soft, warm glow from its illuminated windows, the environment is a deep, dark night sky.",
        "Young Chinese woman in red Hanfu, standing gracefully in the center of the stone-paved temple courtyard, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights.",
        "On the right side of the frame, a wooden temple corridor with a row of glowing paper lanterns recedes into the background. The lantern light reflects on the wet stone path below, and the distant pagoda is silhouetted against the same deep, dark night sky.",
    ]]
    target_height, canvas_width = 1024, 3072
    num_inference_steps = 8
    seed = 109320357
    generator = torch.Generator("cuda").manual_seed(seed)

    # 4. Start Inference
    print("\n--- 4. Starting Inference ---")
    start_inference_time = time.time()
    # A single, clean call to our self-contained pipeline. No guidance_scale
    # or negative_prompt is needed for the Turbo model.
    image = pipe(
        prompt=prompt_grid,
        height=target_height,
        width=canvas_width,
        num_inference_steps=num_inference_steps,
        generator=generator,
    ).images[0]
    end_inference_time = time.time()
    print(f"\nInference finished in {end_inference_time - start_inference_time:.2f} seconds.")
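    # Optional diagnostic (a small sketch, not required for the run): report
    # peak VRAM use, which is handy when tuning the canvas width or the
    # CPU-offload setup.
    if torch.cuda.is_available():
        peak_vram_gb = torch.cuda.max_memory_allocated() / 1024**3
        print(f"Peak VRAM usage: {peak_vram_gb:.2f} GB")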
    # 5. Save Output
    os.makedirs("outputs", exist_ok=True)
    output_filename = "outputs/z_image_panorama_final.png"
    image.save(output_filename)
    print(f"Image successfully saved as '{output_filename}'")


if __name__ == "__main__":
    main()
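# Example invocation, assuming this script is saved as run_z_image_mod.py (a
# hypothetical name) in the same directory as pipeline_z_image_mod.py:
#   python run_z_image_mod.py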