Bi Yoo committed
Commit 588ba9b · 1 Parent(s): 57dc0e4
Files changed (2)
  1. app.py +57 -2
  2. config.py +5 -3
app.py CHANGED
@@ -77,6 +77,8 @@ import faiss
 # Import configuration
 from config import (
     LLM_PROVIDER,
+    BEAM_API_URL,
+    BEAM_API_TOKEN,
     HUGGINGFACE_API_KEY,
     HUGGINGFACE_MODEL,
     LOCAL_MODEL_REPO,
@@ -392,7 +394,13 @@ def initialize_llm():
     """Initialize LLM client based on provider"""
     global llm_client, local_model_path
 
-    if LLM_PROVIDER == "huggingface":
+    if LLM_PROVIDER == "beam":
+        # Beam uses external vLLM API endpoint
+        if not BEAM_API_URL:
+            print("WARNING: BEAM_API_URL not set - Beam provider will fail at runtime")
+        else:
+            print(f"Initialized Beam vLLM API at: {BEAM_API_URL}")
+    elif LLM_PROVIDER == "huggingface":
         # Will use requests for HF Inference API
         if not HUGGINGFACE_API_KEY:
             print("WARNING: HUGGINGFACE_API_KEY not set - HuggingFace provider will fail at runtime")
@@ -469,6 +477,50 @@ def retrieve_relevant_chunks(query: str, top_k: int = TOP_K_RESULTS) -> List[str
     return relevant_chunks
 
 
+def generate_response_beam(system_prompt: str, user_prompt: str) -> str:
+    """Generate response using Beam vLLM service (Qwen3 4B Instruct on GPU)."""
+    import requests
+
+    if not BEAM_API_URL:
+        raise HTTPException(status_code=500, detail="BEAM_API_URL is not set")
+
+    api_url = f"{BEAM_API_URL.rstrip('/')}/v1/chat/completions"
+
+    payload = {
+        "model": "Qwen/Qwen3-4B-Instruct-2507",
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+        "max_tokens": 100,
+        "temperature": 0.3,
+        "top_p": 0.7,
+    }
+
+    headers = {
+        "Authorization": f"Bearer {BEAM_API_TOKEN}",
+        "Content-Type": "application/json"
+    }
+
+    try:
+        response = requests.post(api_url, headers=headers, json=payload, timeout=120)
+        print("Beam status:", response.status_code)
+        response.raise_for_status()
+
+        result = response.json()
+        if isinstance(result, dict):
+            choices = result.get("choices")
+            if isinstance(choices, list) and choices:
+                message = choices[0].get("message", {})
+                content = message.get("content")
+                if content:
+                    return content.strip()
+        return str(result)
+    except Exception as e:
+        print("Beam API error occurred:", repr(e))
+        raise HTTPException(status_code=500, detail=f"Beam API error: {str(e)}")
+
+
 def generate_response_huggingface(prompt: str) -> str:
     """Generate response using HuggingFace Inference API (OpenAI-compatible endpoint)."""
     import requests
@@ -528,6 +580,7 @@ def generate_response_local(system_prompt: str, user_prompt: str) -> str:
         max_tokens=LOCAL_MODEL_MAX_OUTPUT_TOKENS,
         temperature=0.3,
         top_p=0.7,
+        top_k=20, # Qwen-recommended sampling parameter
         repeat_penalty=1.3,
         stop=["<|im_end|>", "<|endoftext|>", "<think>"], # Qwen3 stop tokens + thinking
     )
@@ -565,7 +618,9 @@ Answer:"""
 
     combined_prompt = f"{system_prompt}\n\n{user_prompt}"
 
-    if LLM_PROVIDER == "huggingface":
+    if LLM_PROVIDER == "beam":
+        return generate_response_beam(system_prompt, user_prompt)
+    elif LLM_PROVIDER == "huggingface":
        return generate_response_huggingface(combined_prompt)
    elif LLM_PROVIDER == "local":
        return generate_response_local(system_prompt, user_prompt)
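
For reference, the new generate_response_beam() path is an OpenAI-compatible chat-completions request against the Beam-hosted vLLM server. Below is a minimal standalone sketch of the same request, assuming BEAM_API_URL and BEAM_API_TOKEN point at a reachable deployment; the prompt strings are placeholders, not part of this commit.

# Standalone sketch of the request that generate_response_beam() sends.
# Assumes BEAM_API_URL / BEAM_API_TOKEN are set to a live vLLM deployment;
# the prompts below are placeholders.
import os
import requests

base_url = os.environ["BEAM_API_URL"].rstrip("/")
token = os.environ["BEAM_API_TOKEN"]

resp = requests.post(
    f"{base_url}/v1/chat/completions",
    headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
    json={
        "model": "Qwen/Qwen3-4B-Instruct-2507",
        "messages": [
            {"role": "system", "content": "You answer questions about the indexed documents."},
            {"role": "user", "content": "What does this app do?"},
        ],
        "max_tokens": 100,
        "temperature": 0.3,
        "top_p": 0.7,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"].strip())
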
config.py CHANGED
@@ -6,9 +6,11 @@ Change LLM_PROVIDER to switch between different models
 import os
 
 # Swappable LLM provider (environment configurable)
-LLM_PROVIDER = os.getenv("LLM_PROVIDER", "local") # Options: "huggingface", "local"
+LLM_PROVIDER = os.getenv("LLM_PROVIDER", "beam") # Options: "beam", "huggingface", "local"
 
 # API Keys (set these as environment variables in HuggingFace Space secrets)
+BEAM_API_URL = os.getenv("BEAM_API_URL", "")
+BEAM_API_TOKEN = os.getenv("BEAM_API_TOKEN", "")
 HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "")
 
 # Model configurations
@@ -19,8 +21,8 @@ LOCAL_MODEL_REPO = os.getenv("LOCAL_MODEL_REPO", "bartowski/Qwen_Qwen3-4B-Instru
 LOCAL_MODEL_FILENAME = os.getenv("LOCAL_MODEL_FILENAME", "Qwen_Qwen3-4B-Instruct-2507-Q4_K_M.gguf") # Q4_K_M (2.50GB, recommended)
 LOCAL_MODEL_CONTEXT_LENGTH = int(os.getenv("LOCAL_MODEL_CONTEXT_LENGTH", "2048"))
 LOCAL_MODEL_THREADS = int(os.getenv("LOCAL_MODEL_THREADS", str(os.cpu_count() or 2))) # HF Spaces has 2 vCPUs
-LOCAL_MODEL_BATCH_SIZE = int(os.getenv("LOCAL_MODEL_BATCH_SIZE", "512")) # Increased for better throughput
-LOCAL_MODEL_MAX_OUTPUT_TOKENS = int(os.getenv("LOCAL_MODEL_MAX_OUTPUT_TOKENS", "200"))
+LOCAL_MODEL_BATCH_SIZE = int(os.getenv("LOCAL_MODEL_BATCH_SIZE", "1024")) # Optimal for CPU throughput
+LOCAL_MODEL_MAX_OUTPUT_TOKENS = int(os.getenv("LOCAL_MODEL_MAX_OUTPUT_TOKENS", "100")) # Shorter responses for faster UX
 LOCAL_MODEL_HF_TOKEN = os.getenv("LOCAL_MODEL_HF_TOKEN", HUGGINGFACE_API_KEY or "")
 
 # Access control configuration
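
All of these values are read from the environment when config.py is imported, so switching providers only requires setting the right variables before the app starts (in a HuggingFace Space, as secrets). A minimal sketch, with placeholder URL and token values:

# Placeholder values; in a HuggingFace Space these would be configured as secrets.
import os

os.environ["LLM_PROVIDER"] = "beam" # or "huggingface" / "local"
os.environ["BEAM_API_URL"] = "https://your-beam-deployment.example" # placeholder
os.environ["BEAM_API_TOKEN"] = "your-beam-token" # placeholder

# config.py reads the environment at import time, so import it afterwards.
import config
print(config.LLM_PROVIDER) # -> "beam"
print(bool(config.BEAM_API_URL)) # -> True once the secret is set
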