Bi Yoo committed
Commit 588ba9b · 1 parent: 57dc0e4
try w ias
app.py
CHANGED
@@ -77,6 +77,8 @@ import faiss
 # Import configuration
 from config import (
     LLM_PROVIDER,
+    BEAM_API_URL,
+    BEAM_API_TOKEN,
     HUGGINGFACE_API_KEY,
     HUGGINGFACE_MODEL,
     LOCAL_MODEL_REPO,

@@ -392,7 +394,13 @@ def initialize_llm():
     """Initialize LLM client based on provider"""
     global llm_client, local_model_path

-    if LLM_PROVIDER == "huggingface":
+    if LLM_PROVIDER == "beam":
+        # Beam uses external vLLM API endpoint
+        if not BEAM_API_URL:
+            print("WARNING: BEAM_API_URL not set - Beam provider will fail at runtime")
+        else:
+            print(f"Initialized Beam vLLM API at: {BEAM_API_URL}")
+    elif LLM_PROVIDER == "huggingface":
         # Will use requests for HF Inference API
         if not HUGGINGFACE_API_KEY:
             print("WARNING: HUGGINGFACE_API_KEY not set - HuggingFace provider will fail at runtime")

@@ -469,6 +477,50 @@ def retrieve_relevant_chunks(query: str, top_k: int = TOP_K_RESULTS) -> List[str]:
     return relevant_chunks


+def generate_response_beam(system_prompt: str, user_prompt: str) -> str:
+    """Generate response using Beam vLLM service (Qwen3 4B Instruct on GPU)."""
+    import requests
+
+    if not BEAM_API_URL:
+        raise HTTPException(status_code=500, detail="BEAM_API_URL is not set")
+
+    api_url = f"{BEAM_API_URL.rstrip('/')}/v1/chat/completions"
+
+    payload = {
+        "model": "Qwen/Qwen3-4B-Instruct-2507",
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+        "max_tokens": 100,
+        "temperature": 0.3,
+        "top_p": 0.7,
+    }
+
+    headers = {
+        "Authorization": f"Bearer {BEAM_API_TOKEN}",
+        "Content-Type": "application/json"
+    }
+
+    try:
+        response = requests.post(api_url, headers=headers, json=payload, timeout=120)
+        print("Beam status:", response.status_code)
+        response.raise_for_status()
+
+        result = response.json()
+        if isinstance(result, dict):
+            choices = result.get("choices")
+            if isinstance(choices, list) and choices:
+                message = choices[0].get("message", {})
+                content = message.get("content")
+                if content:
+                    return content.strip()
+        return str(result)
+    except Exception as e:
+        print("Beam API error occurred:", repr(e))
+        raise HTTPException(status_code=500, detail=f"Beam API error: {str(e)}")
+
+
 def generate_response_huggingface(prompt: str) -> str:
     """Generate response using HuggingFace Inference API (OpenAI-compatible endpoint)."""
     import requests

@@ -528,6 +580,7 @@ def generate_response_local(system_prompt: str, user_prompt: str) -> str:
         max_tokens=LOCAL_MODEL_MAX_OUTPUT_TOKENS,
         temperature=0.3,
         top_p=0.7,
+        top_k=20,  # Qwen-recommended sampling parameter
         repeat_penalty=1.3,
         stop=["<|im_end|>", "<|endoftext|>", "<think>"],  # Qwen3 stop tokens + thinking
     )

@@ -565,7 +618,9 @@ Answer:"""

     combined_prompt = f"{system_prompt}\n\n{user_prompt}"

-    if LLM_PROVIDER == "huggingface":
+    if LLM_PROVIDER == "beam":
+        return generate_response_beam(system_prompt, user_prompt)
+    elif LLM_PROVIDER == "huggingface":
         return generate_response_huggingface(combined_prompt)
     elif LLM_PROVIDER == "local":
         return generate_response_local(system_prompt, user_prompt)
config.py
CHANGED
@@ -6,9 +6,11 @@ Change LLM_PROVIDER to switch between different models
 import os

 # Swappable LLM provider (environment configurable)
-LLM_PROVIDER = os.getenv("LLM_PROVIDER", "
+LLM_PROVIDER = os.getenv("LLM_PROVIDER", "beam")  # Options: "beam", "huggingface", "local"

 # API Keys (set these as environment variables in HuggingFace Space secrets)
+BEAM_API_URL = os.getenv("BEAM_API_URL", "")
+BEAM_API_TOKEN = os.getenv("BEAM_API_TOKEN", "")
 HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "")

 # Model configurations

@@ -19,8 +21,8 @@ LOCAL_MODEL_REPO = os.getenv("LOCAL_MODEL_REPO", "bartowski/Qwen_Qwen3-4B-Instru
 LOCAL_MODEL_FILENAME = os.getenv("LOCAL_MODEL_FILENAME", "Qwen_Qwen3-4B-Instruct-2507-Q4_K_M.gguf")  # Q4_K_M (2.50GB, recommended)
 LOCAL_MODEL_CONTEXT_LENGTH = int(os.getenv("LOCAL_MODEL_CONTEXT_LENGTH", "2048"))
 LOCAL_MODEL_THREADS = int(os.getenv("LOCAL_MODEL_THREADS", str(os.cpu_count() or 2)))  # HF Spaces has 2 vCPUs
-LOCAL_MODEL_BATCH_SIZE = int(os.getenv("LOCAL_MODEL_BATCH_SIZE", "
-LOCAL_MODEL_MAX_OUTPUT_TOKENS = int(os.getenv("LOCAL_MODEL_MAX_OUTPUT_TOKENS", "
+LOCAL_MODEL_BATCH_SIZE = int(os.getenv("LOCAL_MODEL_BATCH_SIZE", "1024"))  # Optimal for CPU throughput
+LOCAL_MODEL_MAX_OUTPUT_TOKENS = int(os.getenv("LOCAL_MODEL_MAX_OUTPUT_TOKENS", "100"))  # Shorter responses for faster UX
 LOCAL_MODEL_HF_TOKEN = os.getenv("LOCAL_MODEL_HF_TOKEN", HUGGINGFACE_API_KEY or "")

 # Access control configuration