File size: 5,923 Bytes
1b8645f
 
 
 
dc103ee
1b8645f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ecec1f
dc103ee
5ecec1f
dc103ee
1b8645f
 
dc103ee
1b8645f
 
 
 
 
dc103ee
 
 
6857708
 
1b8645f
 
5ecec1f
1b8645f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6857708
1b8645f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc103ee
1b8645f
 
dc103ee
 
 
 
 
 
1b8645f
 
 
 
dc103ee
1b8645f
 
 
 
 
 
 
 
 
 
 
 
6857708
1b8645f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc103ee
6857708
1b8645f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6857708
1b8645f
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import torch
import spaces
import gradio as gr
from diffusers import DiffusionPipeline
from transformers import pipeline
import diffusers
import io

# ------------------------
# GLOBAL LOG BUFFER
# ------------------------
# In-memory transcript of log lines; generate_image() clears it at the start of
# a request and returns its contents for display in the UI.
log_buffer = io.StringIO()

def log(msg):
    """Echo *msg* to stdout and append it to ``log_buffer`` as one line.

    Args:
        msg: The message to record. Non-string values are converted with
            ``str()`` so that stdout and the buffer always receive the same
            text (previously a non-string was printed and then raised
            ``TypeError`` on the buffer write).
    """
    text = str(msg)
    print(text)
    log_buffer.write(text + "\n")

# Set the diffusers library logger to INFO verbosity (set_verbosity_info),
# so loader/pipeline progress messages are emitted.
diffusers.utils.logging.set_verbosity_info()

# ------------------------
# LOAD PIPELINES
# ------------------------
# Both models are loaded once at import time and shared by every request.
log("Loading Z-Image-Turbo pipeline...")
# NOTE(review): confirm that the installed diffusers release accepts the
# `dtype` and `attn_implementation` keyword arguments on from_pretrained();
# older releases spell the first one `torch_dtype` — TODO verify.
pipe = DiffusionPipeline.from_pretrained(
    "Tongyi-MAI/Z-Image-Turbo",
    dtype=torch.bfloat16,
    low_cpu_mem_usage=False,
    attn_implementation="kernels-community/vllm-flash-attn3",
)
pipe.to("cuda")

# NOTE(review): this builds a transformers "text-generation" pipeline and
# passes no dtype or quantization argument, so nothing in this call makes the
# model FP8 — confirm whether the "FP8 text encoder" label is accurate.
log("Loading FP8 text encoder: Qwen/Qwen3-4B...")
fp8_encoder = pipeline("text-generation", model="Qwen/Qwen3-4B", device=0)  # device=0 → CUDA

# ------------------------
# PIPELINE DEBUG INFO
# ------------------------
def pipeline_debug_info(pipe):
    """Build a multi-line, human-readable summary of *pipe*'s components.

    The transformer section and the VAE section are gathered independently
    and on a best-effort basis: if either one raises, the lines collected so
    far are kept and a "... diagnostics failed" line is appended instead of
    propagating the error.

    Args:
        pipe: Object exposing ``transformer.config`` (queried with ``.get``)
            and ``vae.config`` (queried by attribute).

    Returns:
        The report lines joined with newlines.
    """
    report = ["=== PIPELINE DEBUG INFO ==="]

    # (label, config key) pairs, in the order they appear in the report.
    transformer_fields = (
        ("Hidden dim", "hidden_dim"),
        ("Attention heads", "num_heads"),
        ("Depth (layers)", "depth"),
        ("Patch size", "patch_size"),
        ("MLP ratio", "mlp_ratio"),
        ("Attention backend", "attn_implementation"),
    )
    try:
        transformer_cfg = pipe.transformer.config
        report.append(f"Transformer Class: {pipe.transformer.__class__.__name__}")
        # Append one line at a time so a mid-way failure keeps earlier lines.
        for label, key in transformer_fields:
            report.append(f"{label}: {transformer_cfg.get(key)}")
    except Exception as e:
        report.append(f"Transformer diagnostics failed: {e}")

    # (label, attribute name) pairs read from the VAE config.
    vae_fields = (
        ("VAE latent channels", "latent_channels"),
        ("VAE scaling factor", "scaling_factor"),
    )
    try:
        vae_cfg = pipe.vae.config
        for label, attr in vae_fields:
            report.append(f"{label}: {getattr(vae_cfg, attr)}")
    except Exception as e:
        report.append(f"VAE diagnostics failed: {e}")

    return "\n".join(report)

def latent_shape_info(h, w, pipe):
    """Describe the latent grid shape for an ``h`` x ``w`` pixel image.

    The latent height/width are the pixel dimensions divided by the VAE's
    spatial downsampling factor. ``vae.config.scaling_factor`` is *not* that
    factor (per the diffusers autoencoder docs it rescales latent values), so
    multiplying the image size by it, as this function used to do, reported a
    meaningless shape.

    Args:
        h: Image height in pixels (coerced with ``int``).
        w: Image width in pixels (coerced with ``int``).
        pipe: Pipeline exposing ``vae.config.latent_channels`` and either a
            ``vae_scale_factor`` attribute or ``vae.config.block_out_channels``.

    Returns:
        ``"Latent shape → (C, H, W)"`` on success, otherwise a
        ``"Latent shape calc failed: ..."`` message; this helper is
        diagnostic-only and never raises.
    """
    try:
        vae_cfg = pipe.vae.config
        channels = vae_cfg.latent_channels
        # Prefer the factor the pipeline itself advertises; otherwise derive it
        # the way diffusers pipelines do: one 2x downsample per VAE block
        # after the first.
        downsample = getattr(pipe, "vae_scale_factor", None)
        if downsample is None:
            downsample = 2 ** (len(vae_cfg.block_out_channels) - 1)
        h_lat = int(h) // downsample
        w_lat = int(w) // downsample
        return f"Latent shape → ({channels}, {h_lat}, {w_lat})"
    except Exception as e:
        # Best-effort diagnostics: report the problem instead of failing the request.
        return f"Latent shape calc failed: {e}"

# ------------------------
# IMAGE GENERATION
# ------------------------
@spaces.GPU
def generate_image(prompt, height, width, num_inference_steps, seed, randomize_seed, num_images):
    """Run one generation request and return the images, seed and debug log.

    Args:
        prompt: Text prompt passed to the diffusion pipeline.
        height: Output height in pixels (coerced to ``int``).
        width: Output width in pixels (coerced to ``int``).
        num_inference_steps: Number of inference steps (coerced to ``int``).
        seed: Seed for the CUDA generator. Ignored when *randomize_seed* is
            true; a missing seed (``None``) is replaced by a random one
            instead of crashing on ``int(None)``.
        randomize_seed: When true, draw a fresh random seed.
        num_images: Requested image count, clamped to the range 1..3.

    Returns:
        A tuple ``(images, seed, log_text)``: the pipeline's ``images``, the
        integer seed actually used, and the text accumulated in
        ``log_buffer`` during this call.
    """
    # Start from an empty buffer so the returned log covers only this request.
    log_buffer.truncate(0)
    log_buffer.seek(0)

    # Normalise the numeric inputs once, up front, so the log reports the
    # values that are actually handed to the pipeline.
    height = int(height)
    width = int(width)
    num_inference_steps = int(num_inference_steps)
    # Clamp images to 1–3
    num_images = min(max(1, int(num_images)), 3)

    log("=== NEW GENERATION REQUEST ===")
    log(f"Prompt: {prompt}")
    log(f"Height: {height}, Width: {width}")
    log(f"Inference Steps: {num_inference_steps}")
    log(f"Num Images: {num_images}")

    if randomize_seed or seed is None:
        seed = torch.randint(0, 2**32 - 1, (1,)).item()
        log(f"Randomized Seed → {seed}")
    else:
        seed = int(seed)
        log(f"Seed: {seed}")

    # Run FP8 text encoder first
    # NOTE(review): the result is only logged; `pipe` below is still called
    # with the raw prompt string, so this step does not influence the images.
    log("Encoding prompt with FP8 text encoder...")
    encoded_prompt = fp8_encoder([{"role": "user", "content": prompt}])
    log(f"FP8 encoding output: {encoded_prompt}")

    # Debug pipeline info
    log(pipeline_debug_info(pipe))

    generator = torch.Generator("cuda").manual_seed(seed)

    log("Running Z-Image-Turbo pipeline forward()...")
    result = pipe(
        prompt=prompt,
        height=height,
        width=width,
        num_inference_steps=num_inference_steps,
        guidance_scale=0.0,
        generator=generator,
        max_sequence_length=1024,
        num_images_per_prompt=num_images,
        output_type="pil",
    )

    # Latent diagnostics (best-effort: never fail a finished generation over them)
    try:
        log(f"VAE latent channels: {pipe.vae.config.latent_channels}")
        log(f"VAE scaling factor: {pipe.vae.config.scaling_factor}")
        log(latent_shape_info(height, width, pipe))
    except Exception as e:
        log(f"Latent diagnostics error: {e}")

    log("Pipeline finished.")
    log("Returning images...")

    return result.images, seed, log_buffer.getvalue()

# ------------------------
# GRADIO UI
# ------------------------
# Example rows for gr.Examples: one single-element row per prompt, because the
# examples widget is bound to exactly one input (the prompt textbox).
examples = [
    [prompt_text]
    for prompt_text in (
        "Young Chinese woman in red Hanfu, intricate embroidery...",
        "A majestic dragon soaring through clouds at sunset...",
        "Cozy coffee shop interior, warm lighting, rain on windows...",
        "Astronaut riding a horse on Mars, cinematic lighting...",
        "Portrait of a wise old wizard...",
    )
]

# Build the UI. Components are created in layout order inside the nested
# Row/Column contexts, so the statement order below defines the page layout.
with gr.Blocks(title="Z-Image-Turbo Multi Image Demo") as demo:
    # Title text repaired from mis-decoded UTF-8 (emoji and em dash).
    gr.Markdown("# 🎨 Z-Image-Turbo — Multi Image (FP8 Text Encoder)")

    with gr.Row():
        # Left column: generation inputs.
        with gr.Column(scale=1):
            prompt = gr.Textbox(label="Prompt", lines=4)

            with gr.Row():
                height = gr.Slider(512, 2048, 1024, step=64, label="Height")
                width = gr.Slider(512, 2048, 1024, step=64, label="Width")

            # Range matches the 1..3 clamp applied inside generate_image().
            num_images = gr.Slider(1, 3, 2, step=1, label="Number of Images")

            num_inference_steps = gr.Slider(
                1, 20, 9, step=1, label="Inference Steps",
                info="9 steps = 8 DiT forward passes",
            )

            with gr.Row():
                seed = gr.Number(label="Seed", value=42, precision=0)
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=False)

            # Button label repaired from mis-decoded UTF-8 (rocket emoji).
            generate_btn = gr.Button("🚀 Generate", variant="primary")

        # Right column: results and diagnostics.
        with gr.Column(scale=1):
            output_images = gr.Gallery(label="Generated Images", type="pil")
            used_seed = gr.Number(label="Seed Used", interactive=False)
            debug_log = gr.Textbox(label="Debug Log Output", lines=25, interactive=False)

    # Clicking an example only fills the prompt box; nothing is pre-computed.
    gr.Examples(examples=examples, inputs=[prompt], cache_examples=False)

    # Input order must match generate_image()'s positional parameters, and the
    # outputs receive its (images, seed, log_text) return tuple in order.
    generate_btn.click(
        fn=generate_image,
        inputs=[prompt, height, width, num_inference_steps, seed, randomize_seed, num_images],
        outputs=[output_images, used_seed, debug_log],
    )

if __name__ == "__main__":
    demo.launch()