biyootiful committed
Commit 0eae068 · Parent(s): bbb8c10

adjusted params

Files changed (2):
  1. app.py (+7 -27)
  2. config.py (+5 -14)
app.py CHANGED
@@ -479,10 +479,10 @@ def generate_response_local(system_prompt: str, user_prompt: str) -> str:
                 {"role": "user", "content": user_prompt},
             ],
             max_tokens=LOCAL_MODEL_MAX_OUTPUT_TOKENS,
-            temperature=0.5,
-            top_p=0.9,
-            repeat_penalty=1.2,
-            stop=["<end_of_turn>", "</s>"],
+            temperature=0.3,
+            top_p=0.7,
+            repeat_penalty=1.3,
+            stop=["<end_of_turn>", "</s>", "Question:", "Context:"],
         )
     except Exception as err:
         raise HTTPException(status_code=500, detail=f"Local model error: {err}") from err
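
Note: the tightened sampling targets TinyLlama 1.1B's tendency to ramble and to echo the RAG scaffold. A minimal sketch, assuming a llama-cpp-python `Llama` built from the repo's TinyLlama GGUF; only the keyword arguments below are taken from this hunk, the construction and messages are illustrative:

```python
from llama_cpp import Llama  # llama-cpp-python

# Assumed construction -- model path and n_ctx mirror the config.py defaults.
llm = Llama(model_path="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", n_ctx=2048)

out = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "Answer briefly."},
        {"role": "user", "content": "Say hello."},
    ],
    max_tokens=150,
    temperature=0.3,     # lower temperature: more deterministic wording
    top_p=0.7,           # narrower nucleus: drops low-probability tokens sooner
    repeat_penalty=1.3,  # stronger penalty against repetition loops
    stop=["<end_of_turn>", "</s>", "Question:", "Context:"],
)
print(out["choices"][0]["message"]["content"])
```

Adding "Question:" and "Context:" as stop strings cuts generation off the moment the model starts echoing the prompt template back.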
@@ -507,34 +507,14 @@ def generate_response(
     assistant_query: bool = False,
 ) -> str:
     """Generate response using configured LLM provider"""
-    if assistant_query:
-        persona_instruction = (
-            "Respond in first person as Bi's AI assistant. Mention you run locally on a "
-            "quantized TinyLlama 1.1B Chat model (Q4_K_M via llama.cpp with MiniLM embeddings and FAISS)."
-        )
-    else:
-        persona_instruction = (
-            "Speak directly about Bi by name in a professional, supportive manner - like a knowledgeable secretary. "
-            "Use direct references such as 'Bi has experience in...', 'Bi specializes in...', 'Bi worked on...'. "
-            "Rely only on the provided context."
-        )
+    system_prompt = SYSTEM_PROMPT.strip()
 
-    system_prompt = "\n".join(
-        [
-            SYSTEM_PROMPT.strip(),
-            persona_instruction,
-            "Provide a direct, concise answer without repeating the context.",
-            "If the context lacks the answer, state that politely.",
-            "Do not echo or list the context - synthesize it into a clear response.",
-        ]
-    )
-
-    user_prompt = f"""Context:
+    user_prompt = f"""Context about Bi:
 {context}
 
 Question: {original_question or question}
 
-Provide a concise, professional answer based only on the context above."""
+Answer:"""
 
     combined_prompt = f"{system_prompt}\n\n{user_prompt}"
 
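Note: with the persona branching removed, `assistant_query` no longer affects the prompt; everything hangs off the single `SYSTEM_PROMPT` in config.py. A self-contained sketch with toy values, showing the string `combined_prompt` ends up holding:

```python
# SYSTEM_PROMPT copied from the new config.py below; context/question are toy values.
SYSTEM_PROMPT = """Answer questions about Bi using the provided context. Keep answers short and direct. Always refer to Bi by name."""
context = "Bi has experience in Python and FastAPI."  # illustrative only
question = "What does Bi have experience in?"

system_prompt = SYSTEM_PROMPT.strip()
user_prompt = f"""Context about Bi:
{context}

Question: {question}

Answer:"""

combined_prompt = f"{system_prompt}\n\n{user_prompt}"
print(combined_prompt)
```

Ending the user prompt with a bare `Answer:` pairs with the new "Question:"/"Context:" stop strings: the model completes the answer and is cut off if it tries to restart the template.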
config.py CHANGED
@@ -20,7 +20,7 @@ LOCAL_MODEL_FILENAME = os.getenv("LOCAL_MODEL_FILENAME", "tinyllama-1.1b-chat-v1
 LOCAL_MODEL_CONTEXT_LENGTH = int(os.getenv("LOCAL_MODEL_CONTEXT_LENGTH", "2048"))
 LOCAL_MODEL_THREADS = int(os.getenv("LOCAL_MODEL_THREADS", str(os.cpu_count() or 4)))
 LOCAL_MODEL_BATCH_SIZE = int(os.getenv("LOCAL_MODEL_BATCH_SIZE", "256"))
-LOCAL_MODEL_MAX_OUTPUT_TOKENS = int(os.getenv("LOCAL_MODEL_MAX_OUTPUT_TOKENS", "512"))
+LOCAL_MODEL_MAX_OUTPUT_TOKENS = int(os.getenv("LOCAL_MODEL_MAX_OUTPUT_TOKENS", "150"))
 LOCAL_MODEL_HF_TOKEN = os.getenv("LOCAL_MODEL_HF_TOKEN", HUGGINGFACE_API_KEY or "")
 
 # Access control configuration
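
Note: the default output cap drops from 512 to 150 tokens but stays overridable per deployment. A minimal sketch of the override path (the value 256 is an arbitrary example, and `import config` assumes this repo's module name):

```python
import os

# os.getenv is evaluated at import time, so set the variable before importing.
os.environ["LOCAL_MODEL_MAX_OUTPUT_TOKENS"] = "256"  # example value only

import config
print(config.LOCAL_MODEL_MAX_OUTPUT_TOKENS)  # -> 256
```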
@@ -37,18 +37,9 @@ SESSION_TOKEN_TTL_SECONDS = int(os.getenv("SESSION_TOKEN_TTL_SECONDS", "600"))
 
 # RAG Configuration
 EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Fast, lightweight
-CHUNK_SIZE = 500  # Characters per chunk
-CHUNK_OVERLAP = 50  # Overlap between chunks
-TOP_K_RESULTS = 3  # Number of relevant chunks to retrieve
+CHUNK_SIZE = 300  # Characters per chunk (reduced for faster inference)
+CHUNK_OVERLAP = 30  # Overlap between chunks
+TOP_K_RESULTS = 2  # Number of relevant chunks to retrieve (reduced to minimize context)
 
 # System prompt for the chatbot
-SYSTEM_PROMPT = """You are Bi's professional assistant, helping visitors learn about his background, skills, and experience.
-
-Instructions:
-- Refer to Bi directly by name (e.g., "Bi has experience in...", "Bi worked on...")
-- Answer questions based ONLY on the provided context about Bi
-- Be conversational, friendly, and professional - like a knowledgeable secretary
-- If information is not in the context, politely say you don't have that information about Bi
-- Keep responses concise but informative
-- Speak on Bi's behalf in a supportive, professional manner
-"""
+SYSTEM_PROMPT = """Answer questions about Bi using the provided context. Keep answers short and direct. Always refer to Bi by name."""
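
Note: with CHUNK_SIZE=300 and TOP_K_RESULTS=2, retrieved context is now at most roughly 600 characters, a small fraction of the 2048-token context window, which is what buys the faster CPU inference the new comments mention. A minimal sketch of character-window chunking under these settings (`chunk_text` is hypothetical; the repo's actual chunker is not part of this diff):

```python
def chunk_text(text: str, size: int = 300, overlap: int = 30) -> list[str]:
    """Split text into overlapping character windows (hypothetical helper)."""
    step = size - overlap  # each window starts 270 characters after the last
    return [text[i:i + size] for i in range(0, max(len(text) - overlap, 1), step)]

chunks = chunk_text("x" * 700)
print([len(c) for c in chunks])  # [300, 300, 160]
```

The 30-character overlap keeps a sentence that straddles a chunk boundary retrievable from either side.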