Commit 0eae068 · adjusted params
Parent(s): bbb8c10
app.py
CHANGED
@@ -479,10 +479,10 @@ def generate_response_local(system_prompt: str, user_prompt: str) -> str:
                 {"role": "user", "content": user_prompt},
             ],
             max_tokens=LOCAL_MODEL_MAX_OUTPUT_TOKENS,
-            temperature=0.
-            top_p=0.
-            repeat_penalty=1.
-            stop=["<end_of_turn>", "</s>"],
+            temperature=0.3,
+            top_p=0.7,
+            repeat_penalty=1.3,
+            stop=["<end_of_turn>", "</s>", "Question:", "Context:"],
         )
     except Exception as err:
         raise HTTPException(status_code=500, detail=f"Local model error: {err}") from err
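For context, these are the sampling knobs of llama-cpp-python's create_chat_completion call. A minimal sketch of how the adjusted values are typically passed, assuming a llama_cpp.Llama instance configured from the config.py defaults; the model path, system message, and question below are placeholders, not taken from this repo:

# Minimal sketch; the repo's actual Llama setup and call site may differ.
from llama_cpp import Llama

llm = Llama(
    model_path="models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",  # hypothetical path
    n_ctx=2048,      # matches LOCAL_MODEL_CONTEXT_LENGTH default
    n_threads=4,     # matches LOCAL_MODEL_THREADS fallback
    n_batch=256,     # matches LOCAL_MODEL_BATCH_SIZE default
)

completion = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "Answer questions about Bi using the provided context."},
        {"role": "user", "content": "What does Bi specialize in?"},
    ],
    max_tokens=150,        # LOCAL_MODEL_MAX_OUTPUT_TOKENS after this commit
    temperature=0.3,       # lower randomness for factual answers
    top_p=0.7,             # nucleus sampling cutoff
    repeat_penalty=1.3,    # discourage the small model from looping
    stop=["<end_of_turn>", "</s>", "Question:", "Context:"],  # cut off before the model echoes the prompt scaffold
)
print(completion["choices"][0]["message"]["content"])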
@@ -507,34 +507,14 @@ def generate_response(
     assistant_query: bool = False,
 ) -> str:
     """Generate response using configured LLM provider"""
-
-        persona_instruction = (
-            "Respond in first person as Bi's AI assistant. Mention you run locally on a "
-            "quantized TinyLlama 1.1B Chat model (Q4_K_M via llama.cpp with MiniLM embeddings and FAISS)."
-        )
-    else:
-        persona_instruction = (
-            "Speak directly about Bi by name in a professional, supportive manner - like a knowledgeable secretary. "
-            "Use direct references such as 'Bi has experience in...', 'Bi specializes in...', 'Bi worked on...'. "
-            "Rely only on the provided context."
-        )
+    system_prompt = SYSTEM_PROMPT.strip()

-
-        [
-            SYSTEM_PROMPT.strip(),
-            persona_instruction,
-            "Provide a direct, concise answer without repeating the context.",
-            "If the context lacks the answer, state that politely.",
-            "Do not echo or list the context - synthesize it into a clear response.",
-        ]
-    )
-
-    user_prompt = f"""Context:
+    user_prompt = f"""Context about Bi:
 {context}

 Question: {original_question or question}

-
+Answer:"""

     combined_prompt = f"{system_prompt}\n\n{user_prompt}"
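With the persona branches removed, the prompt sent to the local model is just the trimmed SYSTEM_PROMPT plus a short context/question/answer template. A sketch of the resulting combined_prompt, assuming illustrative values for context and question (not taken from the repo's data):

# Sketch of the simplified prompt assembly after this commit (illustrative values).
SYSTEM_PROMPT = """Answer questions about Bi using the provided context. Keep answers short and direct. Always refer to Bi by name."""

context = "Bi has experience building ML pipelines and RAG chatbots."  # hypothetical retrieved chunks
question = "What does Bi work on?"
original_question = None

system_prompt = SYSTEM_PROMPT.strip()
user_prompt = f"""Context about Bi:
{context}

Question: {original_question or question}

Answer:"""

combined_prompt = f"{system_prompt}\n\n{user_prompt}"
print(combined_prompt)

The trailing "Answer:" cue pairs with the new "Question:" and "Context:" stop strings in app.py: the model is prompted to continue after "Answer:" and is cut off if it starts regenerating the prompt scaffold.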
config.py
CHANGED
@@ -20,7 +20,7 @@ LOCAL_MODEL_FILENAME = os.getenv("LOCAL_MODEL_FILENAME", "tinyllama-1.1b-chat-v1
 LOCAL_MODEL_CONTEXT_LENGTH = int(os.getenv("LOCAL_MODEL_CONTEXT_LENGTH", "2048"))
 LOCAL_MODEL_THREADS = int(os.getenv("LOCAL_MODEL_THREADS", str(os.cpu_count() or 4)))
 LOCAL_MODEL_BATCH_SIZE = int(os.getenv("LOCAL_MODEL_BATCH_SIZE", "256"))
-LOCAL_MODEL_MAX_OUTPUT_TOKENS = int(os.getenv("LOCAL_MODEL_MAX_OUTPUT_TOKENS", "
+LOCAL_MODEL_MAX_OUTPUT_TOKENS = int(os.getenv("LOCAL_MODEL_MAX_OUTPUT_TOKENS", "150"))
 LOCAL_MODEL_HF_TOKEN = os.getenv("LOCAL_MODEL_HF_TOKEN", HUGGINGFACE_API_KEY or "")

 # Access control configuration

@@ -37,18 +37,9 @@ SESSION_TOKEN_TTL_SECONDS = int(os.getenv("SESSION_TOKEN_TTL_SECONDS", "600"))

 # RAG Configuration
 EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Fast, lightweight
-CHUNK_SIZE =
-CHUNK_OVERLAP =
-TOP_K_RESULTS =
+CHUNK_SIZE = 300  # Characters per chunk (reduced for faster inference)
+CHUNK_OVERLAP = 30  # Overlap between chunks
+TOP_K_RESULTS = 2  # Number of relevant chunks to retrieve (reduced to minimize context)

 # System prompt for the chatbot
-SYSTEM_PROMPT = """
-
-Instructions:
-- Refer to Bi directly by name (e.g., "Bi has experience in...", "Bi worked on...")
-- Answer questions based ONLY on the provided context about Bi
-- Be conversational, friendly, and professional - like a knowledgeable secretary
-- If information is not in the context, politely say you don't have that information about Bi
-- Keep responses concise but informative
-- Speak on Bi's behalf in a supportive, professional manner
-"""
+SYSTEM_PROMPT = """Answer questions about Bi using the provided context. Keep answers short and direct. Always refer to Bi by name."""
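The chunking code itself is not part of this commit, so the following is only an illustrative sketch of what CHUNK_SIZE = 300 characters with CHUNK_OVERLAP = 30 implies; the function name and logic are assumptions, not the app's actual splitter:

# Illustrative character-based chunker; not the repo's real implementation.
def chunk_text(text: str, chunk_size: int = 300, overlap: int = 30) -> list[str]:
    chunks = []
    step = chunk_size - overlap  # 270-character stride between chunk starts
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        if chunk:
            chunks.append(chunk)
        if start + chunk_size >= len(text):
            break
    return chunks

# With TOP_K_RESULTS = 2, at most 2 * 300 = 600 characters of context are retrieved,
# which keeps the prompt small relative to the 2048-token context window and the
# 150-token output cap set above.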