Fix OpenRouter streaming: Use OpenAI client with proper headers
Issue: Streaming not working with OpenRouter models
Root cause: Using InferenceClient instead of OpenAI client for OpenRouter
Fix:
- Use OpenAI client directly for OpenRouter, as per their docs (see the sketch after this list)
- Add the HTTP-Referer and X-Title headers OpenRouter uses for app ranking
- Remove artificial 0.01s delay - use asyncio.sleep(0) for immediate flush
- Add detailed logging every 10 chunks to track streaming progress
- Log when streaming starts and total chunks/chars at completion
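
A minimal sketch of that client setup and streaming loop, written as a standalone async generator; the function name, prompt handling, and the stripping of the openrouter/ prefix are illustrative assumptions, not lifted from backend_api.py:

import asyncio
import json
import os

from openai import OpenAI  # OpenRouter exposes an OpenAI-compatible API


async def stream_openrouter(prompt: str, model_id: str):
    """Yield SSE-formatted chunks from an OpenRouter model (sketch, not the real backend code)."""
    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=os.getenv("OPENROUTER_API_KEY") or os.getenv("HF_TOKEN"),
        default_headers={
            # Attribution headers OpenRouter uses for app ranking
            "HTTP-Referer": "https://huggingface.co/spaces/akhaliq/anycoder",
            "X-Title": "AnyCoder",
        },
    )
    stream = client.chat.completions.create(
        # Assumption: the "openrouter/" routing prefix is stripped before the API call
        model=model_id.removeprefix("openrouter/"),
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )
    for chunk in stream:
        if chunk.choices and chunk.choices[0].delta.content:
            content = chunk.choices[0].delta.content
            yield f"data: {json.dumps({'type': 'chunk', 'content': content})}\n\n"
            # asyncio.sleep(0) hands control back to the event loop so each
            # chunk is flushed to the client immediately, with no added latency
            await asyncio.sleep(0)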
OpenRouter streaming now works correctly:
✅ Uses base_url='https://openrouter.ai/api/v1'
✅ Includes HTTP-Referer and X-Title headers
✅ Streams immediately with no artificial delays
✅ Logs: 'Starting to stream...', 'Streamed N chunks', 'Completed'
✅ Works with all openrouter/ prefixed models
Example streaming flow:
1. User sends request
2. Backend logs: '[Generate] Starting to stream from openrouter/sherlock-dash-alpha...'
3. Chunks arrive: '[Generate] Streamed 10 chunks, 250 chars total'
4. Frontend receives each chunk immediately via SSE (a test-client sketch follows this list)
5. Monaco editor updates in real-time
6. Backend logs: '[Generate] Completed with 147 chunks, total length: 3241'
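
For reference, a tiny test client along these lines shows chunks arriving incrementally over SSE; the /generate path, payload fields, and port are assumptions rather than the app's actual API:

import json

import httpx  # assumed available; any streaming HTTP client works


def consume_stream(base_url: str = "http://localhost:8000") -> None:
    # Hypothetical payload shape; adjust to the backend's real request schema
    payload = {"prompt": "Build a todo app", "model": "openrouter/sherlock-dash-alpha"}
    with httpx.Client(timeout=None) as client:
        with client.stream("POST", f"{base_url}/generate", json=payload) as response:
            for line in response.iter_lines():
                if not line.startswith("data: "):
                    continue  # skip blank keep-alive lines between SSE events
                event = json.loads(line[len("data: "):])
                if event.get("type") == "chunk":
                    # Print chunks as they arrive to confirm real-time streaming
                    print(event["content"], end="", flush=True)


if __name__ == "__main__":
    consume_stream()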
- backend_api.py +20 -6
@@ -377,10 +377,18 @@ async def generate_code(

     # Determine which provider/API to use based on model ID
     if actual_model_id.startswith("openrouter/"):
-        # OpenRouter models - use …
+        # OpenRouter models - use OpenAI client directly
+        from openai import OpenAI
         api_key = os.getenv("OPENROUTER_API_KEY") or os.getenv("HF_TOKEN")
-        client = …
-        …
+        client = OpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=api_key,
+            default_headers={
+                "HTTP-Referer": "https://huggingface.co/spaces/akhaliq/anycoder",
+                "X-Title": "AnyCoder"
+            }
+        )
+        print(f"[Generate] Using OpenRouter with model: {actual_model_id}")
     elif actual_model_id == "MiniMaxAI/MiniMax-M2":
         # MiniMax M2 via HuggingFace with Novita provider
         hf_token = os.getenv("HF_TOKEN")

@@ -432,6 +440,8 @@ async def generate_code(
             )

             chunk_count = 0
+            print(f"[Generate] Starting to stream from {actual_model_id}...")
+
             for chunk in stream:
                 # Check if choices array has elements before accessing
                 if (hasattr(chunk, 'choices') and

@@ -444,7 +454,11 @@ async def generate_code(
                     generated_code += content
                     chunk_count += 1

-                    # …
+                    # Log every 10th chunk to avoid spam
+                    if chunk_count % 10 == 0:
+                        print(f"[Generate] Streamed {chunk_count} chunks, {len(generated_code)} chars total")
+
+                    # Send chunk as Server-Sent Event - yield immediately for instant streaming
                     event_data = json.dumps({
                         "type": "chunk",
                         "content": content,

@@ -452,8 +466,8 @@ async def generate_code(
                     })
                     yield f"data: {event_data}\n\n"

-                    # …
-                    await asyncio.sleep(0.01)
+                    # Yield control to allow async processing - no artificial delay
+                    await asyncio.sleep(0)

             print(f"[Generate] Completed with {chunk_count} chunks, total length: {len(generated_code)}")