Spaces:

Kalpokoch
/

ChatbotDemo

Sleeping

App Files

Kalpokoch committed on Jul 16

Commit

ec3f347

verified ·

1 Parent(s): 2a5a6a3

Update app/app.py

Browse files

Files changed (1) hide show

app/app.py +68 -14

app/app.py CHANGED Viewed

@@ -1,25 +1,79 @@
-from fastapi import FastAPI
 from huggingface_hub import hf_hub_download
 import os
-app = FastAPI()
-@app.on_event("startup")
-def download_model():
-    print("🔄 Downloading TinyLlama model...")
-    token = os.getenv("HF_TOKEN")
-    if not token:
-        raise EnvironmentError("HF_TOKEN not found in environment")
-    model_path = hf_hub_download(
     repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
     filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-    local_dir="/app/models",  # ✅ Absolute path in container
     token=os.getenv("HF_TOKEN")
     )
-    print(f"✅ Model downloaded to: {model_path}")
-@app.get("/")
-def root():
-    return {"message": "TinyLlama FastAPI app is running"}

+from fastapi import FastAPI, Request
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
 from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
 import os
+import json
+import numpy as np
+from typing import List
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+# Load processed chunks (RAG context source)
+with open("processed_chunks.json", "r") as f:
+    chunks = json.load(f)
+# Load embeddings model (use a lightweight one for Docker CPU)
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+# Precompute embeddings
+chunk_texts = [chunk["text"] for chunk in chunks]
+chunk_embeddings = embedder.encode(chunk_texts, convert_to_tensor=False)
+# Download model file
+model_path = hf_hub_download(
     repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
     filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+    local_dir="/app/models",
     token=os.getenv("HF_TOKEN")
+)
+# Load TinyLlama model
+llm = Llama(
+    model_path=model_path,
+    n_ctx=2048,
+    n_threads=4  # adjust depending on CPU cores
+)
+# FastAPI app
+app = FastAPI()
+# Allow Netlify frontend to access the backend
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # or specify your Netlify URL for more security
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+class ChatRequest(BaseModel):
+    question: str
+@app.post("/chat")
+def chat(request: ChatRequest):
+    question = request.question.strip()
+    if not question:
+        return {"response": "Please ask a question."}
+    # Embed the user's question
+    q_embedding = embedder.encode([question])[0]
+    # Find top 3 most similar chunks
+    similarities = cosine_similarity([q_embedding], chunk_embeddings)[0]
+    top_indices = similarities.argsort()[-3:][::-1]
+    retrieved = "\n\n".join(chunk_texts[i] for i in top_indices)
+    # Build the prompt
+    prompt = (
+        f"Context:\n{retrieved}\n\n"
+        f"User: {question}\n"
+        f"Assistant:"
     )
+    # Generate a response from the model
+    output = llm(prompt, max_tokens=256)
+    reply = output["choices"][0]["text"].strip()
+    return {"response": reply}