snake7gun committed
Commit a5309b5 · verified · 1 parent: c9b74f8

Delete README.md

Files changed (1):
  1. README.md +0 -184
README.md DELETED
---
library_name: diffusers
pipeline_tag: text-to-image
inference: true
base_model:
- Qwen/Qwen-Image
---

This tiny model is for debugging. It is randomly initialized, using a config adapted from [Qwen/Qwen-Image](https://huggingface.co/Qwen/Qwen-Image).

File sizes:
- text_encoder/model.safetensors: ~10 MB
- transformer/diffusion_pytorch_model.safetensors: ~200 KB
- vae/diffusion_pytorch_model.safetensors: ~5 MB

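These sizes follow directly from the shrunken configs. To double-check them, a quick sketch like the one below (assuming the `tiny-random/Qwen-Image` repo id used in the example that follows) prints the parameter count of each component:

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("tiny-random/Qwen-Image", torch_dtype=torch.bfloat16)

# Sum parameter counts per component; at ~2 bytes per bf16 parameter this
# should roughly match the .safetensors sizes listed above.
for name in ("text_encoder", "transformer", "vae"):
    module = getattr(pipe, name)
    n_params = sum(p.numel() for p in module.parameters())
    print(f"{name}: {n_params:,} params (~{n_params * 2 / 1e6:.1f} MB in bf16)")
```
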
### Example usage:

```python
import torch
from diffusers import DiffusionPipeline

model_id = "tiny-random/Qwen-Image"
torch_dtype = torch.bfloat16
device = "cuda"
pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
pipe = pipe.to(device)

positive_magic = {
    "en": "Ultra HD, 4K, cinematic composition.",  # for English prompts
    "zh": "超清,4K,电影级构图",  # for Chinese prompts
}
prompt = '''A coffee shop entrance features a chalkboard sign reading "Qwen Coffee 😊 $2 per cup," with a neon light beside it displaying "通义千问". Next to it hangs a poster showing a beautiful Chinese woman, and beneath the poster is written "π≈3.1415926-53589793-23846264-33832795-02384197". Ultra HD, 4K, cinematic composition.'''
prompt += 'Some dummy random texts to make prompt long enough ' * 10
negative_prompt = " "  # an effectively empty negative prompt

# Generate with different aspect ratios
aspect_ratios = {
    "1:1": (1328, 1328),
    "16:9": (1664, 928),
    "9:16": (928, 1664),
    "4:3": (1472, 1140),
    "3:4": (1140, 1472),
}

for width, height in aspect_ratios.values():
    image = pipe(
        prompt=prompt + positive_magic["en"],
        negative_prompt=negative_prompt,
        width=width,
        height=height,
        num_inference_steps=4,
        true_cfg_scale=4.0,
        generator=torch.Generator(device=device).manual_seed(42),
    ).images[0]
    print(image)
```

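Since the weights are random, the generated images are pure noise; the loop above only verifies that the full pipeline runs end to end. If you want to keep the outputs anyway, a small variation of the same loop (file names here are illustrative) saves each image instead of printing it:

```python
# Same settings as above, but save one PNG per aspect ratio.
for ratio, (width, height) in aspect_ratios.items():
    image = pipe(
        prompt=prompt + positive_magic["en"],
        negative_prompt=negative_prompt,
        width=width,
        height=height,
        num_inference_steps=4,
        true_cfg_scale=4.0,
        generator=torch.Generator(device=device).manual_seed(42),
    ).images[0]
    image.save(f"debug_{ratio.replace(':', 'x')}.png")  # e.g. debug_16x9.png
```
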
### Code used to create this repo:

```python
import json
from pathlib import Path

import torch
from diffusers import (
    AutoencoderKLQwenImage,
    DiffusionPipeline,
    FlowMatchEulerDiscreteScheduler,
    QwenImagePipeline,
    QwenImageTransformer2DModel,
)
from huggingface_hub import hf_hub_download
from transformers import AutoConfig, AutoTokenizer, Qwen2_5_VLForConditionalGeneration
from transformers.generation import GenerationConfig

source_model_id = "Qwen/Qwen-Image"
save_folder = "/tmp/tiny-random/Qwen-Image"

torch.set_default_dtype(torch.bfloat16)
# The scheduler and tokenizer are reused from the source repo unchanged.
scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(source_model_id, subfolder='scheduler')
tokenizer = AutoTokenizer.from_pretrained(source_model_id, subfolder='tokenizer')

def save_json(path, obj):
    # Write a config dict as pretty-printed JSON, creating parent dirs as needed.
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)

def init_weights(model):
    # Deterministically re-initialize every parameter with small random values.
    torch.manual_seed(42)
    with torch.no_grad():
        for name, p in sorted(model.named_parameters()):
            torch.nn.init.normal_(p, 0, 0.1)
            print(name, p.shape, p.dtype, p.device)

# Text encoder: shrink the Qwen2.5-VL config, then randomly initialize.
with open(hf_hub_download(source_model_id, filename='text_encoder/config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config = json.load(f)
config.update({
    'hidden_size': 32,
    'intermediate_size': 64,
    'max_window_layers': 1,
    'num_attention_heads': 2,
    'num_hidden_layers': 2,
    'num_key_value_heads': 1,
    'sliding_window': 64,
    'tie_word_embeddings': True,
    'use_sliding_window': True,
})
del config['torch_dtype']
config['rope_scaling']['mrope_section'] = [4, 2, 2]
config['text_config'].update({
    'hidden_size': 32,
    'intermediate_size': 64,
    'num_attention_heads': 2,
    'num_hidden_layers': 2,
    'num_key_value_heads': 1,
    'sliding_window': 64,
    'tie_word_embeddings': True,
    'max_window_layers': 1,
    'use_sliding_window': True,
    'layer_types': ['full_attention', 'sliding_attention'],
})
del config['text_config']['torch_dtype']
config['text_config']['rope_scaling']['mrope_section'] = [4, 2, 2]
config['vision_config'].update({
    'depth': 2,
    'fullatt_block_indexes': [0],
    'hidden_size': 32,
    'intermediate_size': 64,
    'num_heads': 2,
    'out_hidden_size': 32,
})
del config['vision_config']['torch_dtype']
save_json(f'{save_folder}/text_encoder/config.json', config)
text_encoder_config = AutoConfig.from_pretrained(f'{save_folder}/text_encoder')
text_encoder = Qwen2_5_VLForConditionalGeneration(text_encoder_config).to(torch.bfloat16)
generation_config = GenerationConfig.from_pretrained(source_model_id, subfolder='text_encoder')
text_encoder.generation_config = generation_config
init_weights(text_encoder)

# Transformer: shrink the DiT config, then randomly initialize.
with open(hf_hub_download(source_model_id, filename='transformer/config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config = json.load(f)
config.update({
    'attention_head_dim': 32,
    'axes_dims_rope': [8, 12, 12],
    'joint_attention_dim': 32,
    'num_attention_heads': 1,
    'num_layers': 2,
})
del config['pooled_projection_dim']  # not used
save_json(f'{save_folder}/transformer/config.json', config)
transformer_config = QwenImageTransformer2DModel.load_config(f'{save_folder}/transformer')
transformer = QwenImageTransformer2DModel.from_config(transformer_config)
init_weights(transformer)

# VAE: shrink the autoencoder config, then randomly initialize.
with open(hf_hub_download(source_model_id, filename='vae/config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config = json.load(f)
config.update({
    'num_res_blocks': 1,
    'base_dim': 16,
    'dim_mult': [1, 2, 4, 4],
})
del config['latents_mean']  # not used
del config['latents_std']  # not used
save_json(f'{save_folder}/vae/config.json', config)
vae_config = AutoencoderKLQwenImage.load_config(f'{save_folder}/vae')
vae = AutoencoderKLQwenImage.from_config(vae_config)
init_weights(vae)

# Assemble the pipeline and save everything to the target folder.
pipeline = QwenImagePipeline(
    scheduler=scheduler,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    transformer=transformer,
    vae=vae,
)
pipeline = pipeline.to(torch.bfloat16)
pipeline.save_pretrained(save_folder, safe_serialization=True)
print(pipeline)
```
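
After saving, it is worth loading the folder back through the public API as a sanity check. A minimal sketch, assuming the `/tmp` save path from the script above (the 64×64 resolution and single step are arbitrary choices for speed):

```python
import torch
from diffusers import DiffusionPipeline

# Reload through the generic entry point, exactly as a downstream user would.
pipe = DiffusionPipeline.from_pretrained("/tmp/tiny-random/Qwen-Image", torch_dtype=torch.bfloat16)

# One tiny generation confirms that all components wire together.
image = pipe(
    prompt="sanity check",
    negative_prompt=" ",
    width=64,
    height=64,
    num_inference_steps=1,
    true_cfg_scale=4.0,
).images[0]
print(image.size)
```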