aleclyu committed
Commit acfce9f · Parent: 1efad72

fix zerogpu error

Files changed (1)
  1. app.py +40 -61
app.py CHANGED
@@ -48,40 +48,18 @@ def _get_args():
 
 
 def _load_model_processor(args):
-    # Optimization: try flash_attention_2 first, then fall back to sdpa
-    try:
-        attn_impl = "flash_attention_2"
-        print(f"[INFO] Trying {attn_impl}")
-        model = HunYuanVLForConditionalGeneration.from_pretrained(
-            args.checkpoint_path,
-            attn_implementation=attn_impl,
-            torch_dtype=torch.bfloat16,
-            device_map="cuda",
-            token=os.environ.get('HF_TOKEN')
-        )
-    except Exception as e:
-        print(f"[WARNING] flash_attention_2 unavailable: {e}")
-        print(f"[INFO] Falling back to sdpa")
-        try:
-            model = HunYuanVLForConditionalGeneration.from_pretrained(
-                args.checkpoint_path,
-                attn_implementation="sdpa",
-                torch_dtype=torch.bfloat16,
-                device_map="cuda",
-                token=os.environ.get('HF_TOKEN')
-            )
-        except Exception as e2:
-            print(f"[WARNING] sdpa unavailable: {e2}")
-            print(f"[INFO] Using eager (slowest)")
-            model = HunYuanVLForConditionalGeneration.from_pretrained(
-                args.checkpoint_path,
-                attn_implementation="eager",
-                torch_dtype=torch.bfloat16,
-                device_map="cuda",
-                token=os.environ.get('HF_TOKEN')
-            )
-
+    # ZeroGPU environment: load the model on CPU in eager mode;
+    # it is moved to the GPU automatically inside the @spaces.GPU decorator.
+    print(f"[INFO] Loading model (eager mode for the ZeroGPU environment)")
+    model = HunYuanVLForConditionalGeneration.from_pretrained(
+        args.checkpoint_path,
+        attn_implementation="eager",  # ZeroGPU must use eager, since the model starts on CPU
+        torch_dtype=torch.bfloat16,
+        device_map="auto",  # back to "auto" so ZeroGPU manages device placement
+        token=os.environ.get('HF_TOKEN')
+    )
     processor = AutoProcessor.from_pretrained(args.checkpoint_path, use_fast=False, trust_remote_code=True)
+    print(f"[INFO] Model loaded")
     return model, processor
 
 
@@ -112,28 +90,25 @@ def _gc():
 
 
 def _launch_demo(args, model, processor):
-    # Key fix: reduce the duration and add debug logging
-    @spaces.GPU(duration=60)
+    # Key: cut duration to 30 seconds; a timeout then indicates a real problem
+    @spaces.GPU(duration=30)
     def call_local_model(model, processor, messages):
         import time
         start_time = time.time()
-        print(f"[DEBUG] Inference started at: {start_time}")
-        print(f"[DEBUG] Messages: {messages}")
+        print(f"[DEBUG] ========== inference started ==========")
+        print(f"[DEBUG] Time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
 
         messages = [messages]
+
         # Build the inputs with the processor
         texts = [
             processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
             for msg in messages
         ]
-
-        prep_time = time.time()
-        print(f"[DEBUG] Chat template time: {prep_time - start_time:.2f}s")
+        print(f"[DEBUG] Chat template built, elapsed: {time.time() - start_time:.2f}s")
 
         image_inputs, video_inputs = process_vision_info(messages)
-
-        vision_time = time.time()
-        print(f"[DEBUG] Vision processing time: {vision_time - prep_time:.2f}s")
+        print(f"[DEBUG] Vision inputs processed, elapsed: {time.time() - start_time:.2f}s")
 
         inputs = processor(
             text=texts,
@@ -143,51 +118,55 @@ def _launch_demo(args, model, processor):
             return_tensors="pt",
         )
         inputs = inputs.to(model.device)
+        print(f"[DEBUG] Inputs ready, elapsed: {time.time() - start_time:.2f}s")
+        print(f"[DEBUG] Input IDs shape: {inputs.input_ids.shape}")
+        print(f"[DEBUG] Device: {model.device}")
 
-        input_time = time.time()
-        print(f"[DEBUG] Input prep time: {input_time - vision_time:.2f}s")
-        print(f"[DEBUG] Input shape: {inputs.input_ids.shape if 'input_ids' in inputs else 'N/A'}")
-
-        # Key fix 1: drastically reduce max_new_tokens
-        # Key fix 2: add the EOS token and stop conditions
-        # Key fix 3: add timeout protection
+        # Key optimization: aggressively trimmed generation parameters
+        gen_start = time.time()
         with torch.no_grad():
             generated_ids = model.generate(
                 **inputs,
                 max_new_tokens=512,  # down from 8192 to avoid runaway generation
                 repetition_penalty=1.03,
                 do_sample=False,
-                # Key: set the EOS token so generation can actually stop
                 eos_token_id=processor.tokenizer.eos_token_id,
                 pad_token_id=processor.tokenizer.pad_token_id,
-                # Add an early-stop condition
                 use_cache=True,
+                # Key: add a length penalty to encourage short outputs
+                length_penalty=0.8,
+                # Add early stopping
+                early_stopping=True,
             )
 
-        gen_time = time.time()
-        print(f"[DEBUG] Generation time: {gen_time - input_time:.2f}s")
-        print(f"[DEBUG] Generated shape: {generated_ids.shape}")
+        gen_time = time.time() - gen_start
+        print(f"[DEBUG] ========== generation finished ==========")
+        print(f"[DEBUG] Generation time: {gen_time:.2f}s")
+        print(f"[DEBUG] Output shape: {generated_ids.shape}")
 
         # Decode the outputs
         if "input_ids" in inputs:
             input_ids = inputs.input_ids
         else:
-            input_ids = inputs.inputs  # fallback
+            input_ids = inputs.inputs
 
         generated_ids_trimmed = [
             out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
         ]
 
-        print(f"[DEBUG] Trimmed token counts: {[len(ids) for ids in generated_ids_trimmed]}")
+        actual_tokens = len(generated_ids_trimmed[0])
+        print(f"[DEBUG] Tokens actually generated: {actual_tokens}")
+        print(f"[DEBUG] Time per token: {gen_time/actual_tokens if actual_tokens > 0 else 0:.3f}s")
 
         output_texts = processor.batch_decode(
             generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
 
-        decode_time = time.time()
-        print(f"[DEBUG] Decode time: {decode_time - gen_time:.2f}s")
-        print(f"[DEBUG] Total time: {decode_time - start_time:.2f}s")
-        print(f"[DEBUG] Output: {output_texts[0][:200]}...")  # only the first 200 characters
+        total_time = time.time() - start_time
+        print(f"[DEBUG] ========== all done ==========")
+        print(f"[DEBUG] Total time: {total_time:.2f}s")
+        print(f"[DEBUG] Output length: {len(output_texts[0])} characters")
+        print(f"[DEBUG] Output preview: {output_texts[0][:100]}...")
 
         return output_texts
 
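For context: on a ZeroGPU Space the process starts with no CUDA device attached, and a GPU is leased only while a function decorated with @spaces.GPU is running. That is why the load-time flash_attention_2/sdpa probing removed above could fail, and why the new code loads on CPU with eager attention. Below is a minimal sketch of the pattern this commit adopts; the checkpoint id and the generate wrapper are illustrative placeholders, not this app's actual code.

import os

import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

CHECKPOINT = "org/model"  # hypothetical checkpoint id

# Import-time load: ZeroGPU has no CUDA yet, so the model comes up on CPU;
# "eager" attention avoids backends that expect a GPU at load time.
model = AutoModelForCausalLM.from_pretrained(
    CHECKPOINT,
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
    device_map="auto",  # let ZeroGPU manage device placement
    token=os.environ.get("HF_TOKEN"),
)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

@spaces.GPU(duration=30)  # a GPU is attached only for the lifetime of this call
def generate(prompt: str) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=512, do_sample=False)
    # Trim the prompt tokens before decoding, as the diff above does.
    return tokenizer.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)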
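One caveat on the new generation flags: in transformers, length_penalty and early_stopping only influence beam search, so with do_sample=False and the default num_beams=1 they should be no-ops here; the effective stops remain max_new_tokens and the EOS token. If a wall-clock bound is also wanted, so decoding cannot outlive the @spaces.GPU(duration=30) lease, generate() accepts max_time. A hedged variant of the call above; the 25-second budget is illustrative, not a value from this commit.

# Illustrative only: bound generation by wall-clock time as well as token count,
# so the call cannot outlive the 30-second ZeroGPU lease.
with torch.no_grad():
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=512,  # hard cap on new tokens
        max_time=25.0,  # stop decoding after ~25 s (checked between steps)
        repetition_penalty=1.03,
        do_sample=False,
        eos_token_id=processor.tokenizer.eos_token_id,
        pad_token_id=processor.tokenizer.pad_token_id,
        use_cache=True,
    )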