Update app.py
app.py
CHANGED
@@ -75,7 +75,7 @@ def format_messages(system, history, user_text, audio_data_list=None):
             continue

         # Check for Audio
-        is_audio = not isinstance(content, list) and content
+        is_audio = not isinstance(content, list) and content.get("component", None) == "audio"

         if is_audio:
             audio_path = content["value"]["path"]
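The new check only treats a history entry as audio when its `component` field is `"audio"`, instead of any truthy non-list content. A minimal sketch of the entry shapes this implies, with hypothetical values (the real Gradio history objects may carry more fields):

```python
# Sketch only: dict shapes assumed from the fields the diff reads
def is_audio_content(content):
    return not isinstance(content, list) and content.get("component", None) == "audio"

audio_entry = {"component": "audio", "value": {"path": "/tmp/clip.wav"}}  # hypothetical path
text_entry = {"component": "text", "value": "hello"}                      # hypothetical entry

print(is_audio_content(audio_entry))  # True
print(is_audio_content(text_entry))   # False
```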
@@ -241,7 +241,8 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,

     try:
         with httpx.Client(base_url=API_BASE_URL, timeout=120) as client:
-
+            # Use client.stream for better streaming control
+            with client.stream("POST", "/chat/completions", json={
                 "model": model_name,
                 "messages": messages,
                 "max_tokens": max_tokens,
@@ -250,75 +251,77 @@ def chat(system_prompt, user_text, audio_file, history, max_tokens, temperature,
                 "stream": True,
                 "repetition_penalty": 1.07,
                 "stop_token_ids": [151665]
-            })
+            }) as response:
+
+                if response.status_code != 200:
+                    error_msg = f"❌ API Error {response.status_code}"
+                    if response.status_code == 404:
+                        error_msg += " - vLLM service not ready"
+                    elif response.status_code == 400:
+                        error_msg += " - Bad request"
+                    elif response.status_code == 500:
+                        error_msg += " - Model error"
+                    yield history, error_msg
+                    return
+
+                # Process streaming response
+                buffer = ""
+                is_thinking = True
+
+                print("[DEBUG] Start receiving stream...")
+                for line in response.iter_lines():
+                    if not line:
+                        continue
+                    # Ensure line is string format
+                    if isinstance(line, bytes):
+                        line = line.decode('utf-8')
+                    else:
+                        line = str(line)
+
+                    if line.startswith('data: '):
+                        data_str = line[6:]
+                        if data_str.strip() == '[DONE]':
+                            print("[DEBUG] Stream finished [DONE]")
+                            break
+                        try:
+                            data = json.loads(data_str)
+                            if 'choices' in data and len(data['choices']) > 0:
+                                delta = data['choices'][0].get('delta', {})
+                                if 'content' in delta:
+                                    content = delta['content']
+                                    buffer += content
+
+                                    if is_thinking:
+                                        if "</think>" in buffer:
+                                            is_thinking = False
+                                            parts = buffer.split("</think>", 1)
+                                            think_content = parts[0]
+                                            response_content = parts[1]
+
+                                            if think_content.startswith("<think>"):
+                                                think_content = think_content[len("<think>"):].strip()
+
+                                            # Update thinking message
+                                            history[-1].content = think_content
+
+                                            # Add response message
+                                            history.append({"role": "assistant", "content": response_content})
+                                        else:
+                                            # Update thinking message
+                                            current_think = buffer
+                                            if current_think.startswith("<think>"):
+                                                current_think = current_think[len("<think>"):]
+                                            history[-1].content = current_think
+                                    else:
+                                        # Already split, just update response message
                                         parts = buffer.split("</think>", 1)
                                         response_content = parts[1]
+                                        history[-1]["content"] = response_content
+
+                                    yield history, ""
+
+                        except json.JSONDecodeError:
+                            continue

     except httpx.ConnectError:
         yield history, "❌ Cannot connect to vLLM API"
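The main change above swaps a buffered request for `httpx.Client.stream`, so the status code can be checked as soon as headers arrive and the SSE body can be consumed line by line. A standalone sketch of that consumption pattern, with assumed endpoint and model names (the real app's `API_BASE_URL` and payload fields come from its config):

```python
# Sketch of streaming from an OpenAI-compatible chat endpoint; names are assumptions.
import json
import httpx

API_BASE_URL = "http://localhost:8000/v1"  # assumption: local vLLM server
MODEL_NAME = "my-model"                    # assumption: placeholder model id

def stream_completion(messages):
    with httpx.Client(base_url=API_BASE_URL, timeout=120) as client:
        # client.stream() returns once headers are received, before the body is read
        with client.stream("POST", "/chat/completions", json={
            "model": MODEL_NAME,
            "messages": messages,
            "stream": True,
        }) as response:
            if response.status_code != 200:
                raise RuntimeError(f"API error {response.status_code}")
            for line in response.iter_lines():
                if not line or not line.startswith("data: "):
                    continue
                data_str = line[len("data: "):]
                if data_str.strip() == "[DONE]":  # SSE terminator
                    break
                try:
                    chunk = json.loads(data_str)
                except json.JSONDecodeError:
                    continue
                delta = chunk["choices"][0].get("delta", {})
                if "content" in delta:
                    yield delta["content"]

# Usage (assumes the server above is running):
# for token in stream_completion([{"role": "user", "content": "Hi"}]):
#     print(token, end="", flush=True)
```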
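The handler keeps a single growing buffer and treats everything before `</think>` as the model's thinking phase and everything after it as the visible answer. The same splitting can be written as a small pure function; this is a hypothetical helper for illustration, not part of app.py:

```python
# Hypothetical helper: split an accumulated buffer into (thinking, response_or_None)
def split_think(buffer: str):
    if buffer.startswith("<think>"):
        buffer = buffer[len("<think>"):]
    if "</think>" in buffer:
        think, response = buffer.split("</think>", 1)
        return think.strip(), response
    return buffer, None  # still thinking, no response yet

assert split_think("<think>reason") == ("reason", None)
assert split_think("<think>reason</think>Answer") == ("reason", "Answer")
```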