akhaliq (HF Staff) committed
Commit c77d732 · 1 Parent(s): f26c666

Fix OpenRouter streaming: Use OpenAI client with proper headers


Issue: Streaming not working with OpenRouter models
Root cause: Using InferenceClient instead of OpenAI client for OpenRouter

Fix:
- Use OpenAI client directly for OpenRouter (as per their docs)
- Add required headers: HTTP-Referer and X-Title for OpenRouter ranking
- Remove artificial 0.01s delay - use asyncio.sleep(0) for immediate flush
- Add detailed logging every 10 chunks to track streaming progress
- Log when streaming starts and total chunks/chars at completion
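
For reference, a minimal sketch of the resulting client setup, assuming OPENROUTER_API_KEY (or HF_TOKEN as a fallback) is set and using the model ID from the example log below as a placeholder:

```python
import os
from openai import OpenAI

# OpenAI client pointed at OpenRouter's OpenAI-compatible endpoint
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY") or os.getenv("HF_TOKEN"),
    default_headers={
        # Attribution headers used for OpenRouter ranking
        "HTTP-Referer": "https://huggingface.co/spaces/akhaliq/anycoder",
        "X-Title": "AnyCoder",
    },
)

# Streaming works the same as against the OpenAI API: iterate over delta chunks
stream = client.chat.completions.create(
    model="openrouter/sherlock-dash-alpha",  # placeholder model ID
    messages=[{"role": "user", "content": "Write a hello-world page"}],
    stream=True,
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```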

OpenRouter streaming now works correctly:
✅ Uses base_url='https://openrouter.ai/api/v1'
✅ Includes HTTP-Referer and X-Title headers
✅ Streams immediately with no artificial delays
✅ Logs: 'Starting to stream...', 'Streamed N chunks', 'Completed'
✅ Works with all openrouter/ prefixed models
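
A sketch of how the generator is typically wired into an SSE response; this assumes the endpoint is served with FastAPI's StreamingResponse and uses a hypothetical /generate route, not the exact code in backend_api.py:

```python
import asyncio
import json
import os

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from openai import OpenAI

app = FastAPI()

@app.post("/generate")  # hypothetical route for illustration
async def generate(prompt: str):
    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=os.getenv("OPENROUTER_API_KEY") or os.getenv("HF_TOKEN"),
    )

    async def event_stream():
        stream = client.chat.completions.create(
            model="openrouter/sherlock-dash-alpha",  # placeholder model ID
            messages=[{"role": "user", "content": prompt}],
            stream=True,
        )
        chunk_count = 0
        generated_code = ""
        for chunk in stream:
            if chunk.choices and chunk.choices[0].delta.content:
                content = chunk.choices[0].delta.content
                generated_code += content
                chunk_count += 1
                if chunk_count % 10 == 0:
                    print(f"[Generate] Streamed {chunk_count} chunks, {len(generated_code)} chars total")
                # Emit the chunk as a Server-Sent Event, then yield control without delay
                yield f"data: {json.dumps({'type': 'chunk', 'content': content})}\n\n"
                await asyncio.sleep(0)
        print(f"[Generate] Completed with {chunk_count} chunks, total length: {len(generated_code)}")

    return StreamingResponse(event_stream(), media_type="text/event-stream")
```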

Example streaming flow:
1. User sends request
2. Backend logs: '[Generate] Starting to stream from openrouter/sherlock-dash-alpha...'
3. Chunks arrive: '[Generate] Streamed 10 chunks, 250 chars total'
4. Frontend receives each chunk immediately via SSE
5. Monaco editor updates in real-time
6. Backend logs: '[Generate] Completed with 147 chunks, total length: 3241'
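
To verify the flow end to end, the SSE stream can be consumed directly; a sketch assuming a locally running backend and the same hypothetical /generate route (adjust URL and payload to match backend_api.py):

```python
import json
import httpx

# Stream the response and print chunks as they arrive
with httpx.stream("POST", "http://localhost:8000/generate",
                  params={"prompt": "Build a todo app"}, timeout=None) as response:
    for line in response.iter_lines():
        # SSE events arrive as lines of the form "data: {...}"
        if line.startswith("data: "):
            event = json.loads(line[len("data: "):])
            if event.get("type") == "chunk":
                print(event["content"], end="", flush=True)
```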

Files changed (1)
backend_api.py  +20 -6
backend_api.py CHANGED
@@ -377,10 +377,18 @@ async def generate_code(
 
     # Determine which provider/API to use based on model ID
     if actual_model_id.startswith("openrouter/"):
-        # OpenRouter models - use via OpenAI-compatible API
+        # OpenRouter models - use OpenAI client directly
+        from openai import OpenAI
         api_key = os.getenv("OPENROUTER_API_KEY") or os.getenv("HF_TOKEN")
-        client = InferenceClient(api_key=api_key, provider="openai", base_url="https://openrouter.ai/api/v1")
-        # Keep the model_id as-is for OpenRouter
+        client = OpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=api_key,
+            default_headers={
+                "HTTP-Referer": "https://huggingface.co/spaces/akhaliq/anycoder",
+                "X-Title": "AnyCoder"
+            }
+        )
+        print(f"[Generate] Using OpenRouter with model: {actual_model_id}")
     elif actual_model_id == "MiniMaxAI/MiniMax-M2":
         # MiniMax M2 via HuggingFace with Novita provider
         hf_token = os.getenv("HF_TOKEN")
@@ -432,6 +440,8 @@ async def generate_code(
         )
 
         chunk_count = 0
+        print(f"[Generate] Starting to stream from {actual_model_id}...")
+
         for chunk in stream:
             # Check if choices array has elements before accessing
             if (hasattr(chunk, 'choices') and
@@ -444,7 +454,11 @@
                 generated_code += content
                 chunk_count += 1
 
-                # Send chunk as Server-Sent Event
+                # Log every 10th chunk to avoid spam
+                if chunk_count % 10 == 0:
+                    print(f"[Generate] Streamed {chunk_count} chunks, {len(generated_code)} chars total")
+
+                # Send chunk as Server-Sent Event - yield immediately for instant streaming
                 event_data = json.dumps({
                     "type": "chunk",
                     "content": content,
@@ -452,8 +466,8 @@
                 })
                 yield f"data: {event_data}\n\n"
 
-                # Ensure immediate flush to client
-                await asyncio.sleep(0.01)  # Small delay to ensure flushing
+                # Yield control to allow async processing - no artificial delay
+                await asyncio.sleep(0)
 
         print(f"[Generate] Completed with {chunk_count} chunks, total length: {len(generated_code)}")
 