Kalpokoch committed (verified)
Commit ec3f347 · 1 Parent(s): 2a5a6a3

Update app/app.py

Files changed (1):
1. app/app.py  +68 -14
app/app.py CHANGED
@@ -1,25 +1,79 @@
-from fastapi import FastAPI
+from fastapi import FastAPI, Request
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
 from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
 import os
+import json
+import numpy as np
+from typing import List
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
 
-app = FastAPI()
+# Load processed chunks (RAG context source)
+with open("processed_chunks.json", "r") as f:
+    chunks = json.load(f)
+
+# Load embeddings model (use a lightweight one for Docker CPU)
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
 
-@app.on_event("startup")
-def download_model():
-    print("🔄 Downloading TinyLlama model...")
-    token = os.getenv("HF_TOKEN")
-    if not token:
-        raise EnvironmentError("HF_TOKEN not found in environment")
+# Precompute embeddings
+chunk_texts = [chunk["text"] for chunk in chunks]
+chunk_embeddings = embedder.encode(chunk_texts, convert_to_tensor=False)
 
-    model_path = hf_hub_download(
+# Download model file
+model_path = hf_hub_download(
     repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
     filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-    local_dir="/app/models",  # ✅ Absolute path in container
+    local_dir="/app/models",
     token=os.getenv("HF_TOKEN")
+)
+
+# Load TinyLlama model
+llm = Llama(
+    model_path=model_path,
+    n_ctx=2048,
+    n_threads=4  # adjust depending on CPU cores
+)
+
+# FastAPI app
+app = FastAPI()
+
+# Allow Netlify frontend to access the backend
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # or specify your Netlify URL for more security
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+class ChatRequest(BaseModel):
+    question: str
+
+@app.post("/chat")
+def chat(request: ChatRequest):
+    question = request.question.strip()
+    if not question:
+        return {"response": "Please ask a question."}
+
+    # Embed the user's question
+    q_embedding = embedder.encode([question])[0]
+
+    # Find top 3 most similar chunks
+    similarities = cosine_similarity([q_embedding], chunk_embeddings)[0]
+    top_indices = similarities.argsort()[-3:][::-1]
+    retrieved = "\n\n".join(chunk_texts[i] for i in top_indices)
+
+    # Build the prompt
+    prompt = (
+        f"Context:\n{retrieved}\n\n"
+        f"User: {question}\n"
+        f"Assistant:"
     )
 
-    print(f"✅ Model downloaded to: {model_path}")
+    # Generate a response from the model
+    output = llm(prompt, max_tokens=256)
+    reply = output["choices"][0]["text"].strip()
 
-@app.get("/")
-def root():
-    return {"message": "TinyLlama FastAPI app is running"}
+    return {"response": reply}
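The updated module expects a processed_chunks.json in the working directory and only ever reads chunk["text"], so any JSON array of objects carrying a text field will satisfy it. A minimal sketch of a compatible file, written from Python; the example strings and file contents are illustrative assumptions, not part of this commit:

import json

# Hypothetical example data: app.py only requires the "text" key on each chunk.
example_chunks = [
    {"text": "TinyLlama-1.1B-Chat is served through llama-cpp-python in this app."},
    {"text": "The /chat endpoint retrieves the three most similar chunks before prompting."},
]

with open("processed_chunks.json", "w") as f:
    json.dump(example_chunks, f, indent=2)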
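Once the app is running, the new POST /chat route takes a JSON body matching ChatRequest ({"question": ...}) and returns {"response": ...}. A small client sketch; the base URL is an assumption (use whatever host/port uvicorn is bound to), and the question text is just an example:

import requests

# Hypothetical base URL; adjust to the deployed host/port.
resp = requests.post(
    "http://localhost:8000/chat",
    json={"question": "What is TinyLlama?"},
)
resp.raise_for_status()
print(resp.json()["response"])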