programmersd committed on
Commit
a0d0bbb
·
verified ·
1 Parent(s): f1d70fe

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +190 -0
app.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os

# =========================
# FORCE CPU ENV
# =========================
# BUG FIX: thread-count env vars (OMP/MKL) and CUDA visibility are read by
# native libraries when they are first imported, so they must be set BEFORE
# `import torch` — the original set them afterwards, where they had no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

cpu_cores = os.cpu_count() or 1
os.environ["OMP_NUM_THREADS"] = str(cpu_cores)
os.environ["MKL_NUM_THREADS"] = str(cpu_cores)

import time
import random
import gc

import torch
import gradio as gr

from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModel
from diffusers import (
    ZImagePipeline,
    ZImageTransformer2DModel,
    GGUFQuantizationConfig,
    AutoencoderKL,
    FlowMatchEulerDiscreteScheduler,
)

# Use every available core for both intra-op and inter-op parallelism.
torch.set_num_threads(cpu_cores)
torch.set_num_interop_threads(cpu_cores)

torch.backends.mkldnn.enabled = True        # oneDNN kernels for CPU matmul/conv
torch.backends.quantized.engine = "fbgemm"  # x86 quantized backend
torch.backends.cudnn.enabled = False        # CPU-only deployment: no cuDNN
torch.set_float32_matmul_precision("high")

dtype = torch.float32
device = torch.device("cpu")

# =========================
# MODEL CONFIG
# =========================
BASE_MODEL_ID = "Tongyi-MAI/Z-Image-Turbo"
TEXT_ENCODER_ID = "Qwen/Qwen3-4B"
GGUF_REPO_ID = "unsloth/Z-Image-Turbo-GGUF"
GGUF_FILENAME = "z-image-turbo-Q2_K.gguf"
CACHE_DIR = "models"

os.makedirs(CACHE_DIR, exist_ok=True)
def download_if_needed(repo_id, filename):
    """Return a local path for *filename* from *repo_id*, downloading once.

    A file already present directly under ``CACHE_DIR`` is reused as-is;
    otherwise it is fetched from the Hugging Face Hub into ``CACHE_DIR``
    and the hub-cache path is returned.
    """
    local_path = os.path.join(CACHE_DIR, filename)
    if os.path.exists(local_path):
        print("Model cached locally.")
        return local_path

    print("Downloading model (first run)...")
    # NOTE: `resume_download` is deprecated (a warning-emitting no-op) in
    # current huggingface_hub releases — downloads always resume — so the
    # original `resume_download=True` argument is omitted.
    path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        cache_dir=CACHE_DIR,
    )
    print("Download finished.")
    return path
# =========================
# LOAD PIPELINE CPU ONLY
# =========================
def load_pipeline():
    """Assemble the Z-Image Turbo pipeline entirely on CPU.

    Loads the scheduler and VAE from the base repo, the Qwen text encoder,
    and a Q2_K GGUF-quantized transformer, then compiles the two heavy
    modules with torch.compile.

    Returns:
        The fully constructed ``ZImagePipeline`` on CPU.
    """
    scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
        BASE_MODEL_ID,
        subfolder="scheduler",
        cache_dir=CACHE_DIR,
    )

    vae = AutoencoderKL.from_pretrained(
        BASE_MODEL_ID,
        subfolder="vae",
        torch_dtype=dtype,
        cache_dir=CACHE_DIR,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        TEXT_ENCODER_ID,
        cache_dir=CACHE_DIR,
    )

    text_encoder = AutoModel.from_pretrained(
        TEXT_ENCODER_ID,
        torch_dtype=dtype,
        cache_dir=CACHE_DIR,
    ).to(device)

    gguf_path = download_if_needed(GGUF_REPO_ID, GGUF_FILENAME)

    transformer = ZImageTransformer2DModel.from_single_file(
        gguf_path,
        quantization_config=GGUFQuantizationConfig(compute_dtype=dtype),
        torch_dtype=dtype,
    ).to(device)

    pipe = ZImagePipeline(
        vae=vae.to(device),
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        transformer=transformer,
        scheduler=scheduler,
    ).to(device)

    # BUG FIX: ZImagePipeline has no `unet` attribute — its denoiser is
    # `transformer`. The original `pipe.unet` references raised
    # AttributeError before the first image could be generated.
    # (channels_last is largely a no-op for non-conv modules, but harmless.)
    pipe.transformer.to(memory_format=torch.channels_last)
    pipe.text_encoder.to(memory_format=torch.channels_last)

    pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
    pipe.text_encoder = torch.compile(pipe.text_encoder, mode="max-autotune", fullgraph=True)

    return pipe
pipe = load_pipeline()

# Run one tiny throwaway generation at startup so torch.compile's slow first
# compilation happens now rather than during the first user request.
with torch.inference_mode():
    pipe(
        prompt="warmup",
        width=256,
        height=256,
        num_inference_steps=1,
        guidance_scale=1.0,
    )
# =========================
# GENERATION WITH PROGRESS
# =========================
def generate(prompt, seed, progress=gr.Progress()):
    """Generate one 256x256 image for *prompt*.

    Args:
        prompt: Text prompt; an empty prompt raises ``gr.Error``.
        seed: Integer seed; any negative value picks a random seed.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        ``(image, seed)`` — the PIL image and the seed actually used,
        so random runs can be reproduced.
    """
    if not prompt:
        raise gr.Error("Prompt required")

    seed = int(seed)
    if seed < 0:
        seed = random.randint(0, 2**31 - 1)

    generator = torch.Generator(device=device).manual_seed(seed)

    total_steps = 4
    start_time = time.time()

    # BUG FIX: modern diffusers pipelines (ZImagePipeline included) no longer
    # accept the legacy `callback=` / `callback_steps=` kwargs — passing them
    # raises TypeError. Progress is now reported via `callback_on_step_end`,
    # whose callback must return the callback-kwargs dict.
    def on_step_end(pipeline, step, timestep, callback_kwargs):
        done = step + 1
        elapsed = time.time() - start_time
        eta = (elapsed / done) * (total_steps - done)
        progress(done / total_steps, desc=f"Step {done}/{total_steps} | ETA {eta:.1f}s")
        return callback_kwargs

    with torch.inference_mode():
        # GC pauses are suppressed only for the duration of the denoise loop.
        gc.disable()
        try:
            image = pipe(
                prompt=prompt,
                width=256,
                height=256,
                num_inference_steps=total_steps,
                guidance_scale=1.0,
                generator=generator,
                callback_on_step_end=on_step_end,
            ).images[0]
        finally:
            gc.enable()

    return image, seed
# =========================
# UI + QUEUE
# =========================
with gr.Blocks(title="Z-Image Turbo Q2_K CPU MAX") as demo:
    gr.Markdown("# Z-Image Turbo Q2_K — FULL CPU MAX MODE")

    prompt = gr.Textbox(label="Prompt", lines=3)
    seed = gr.Number(label="Seed (-1 random)", value=-1, precision=0)
    btn = gr.Button("Generate")

    image_out = gr.Image()
    seed_out = gr.Number(interactive=False)

    btn.click(generate, inputs=[prompt, seed], outputs=[image_out, seed_out])

# BUG FIX: `concurrency_count` was removed in Gradio 4; the equivalent is
# `default_concurrency_limit`. One request at a time keeps this CPU-bound
# model from thrashing.
demo.queue(max_size=10, default_concurrency_limit=1)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)