omaryasserhassan committed (verified)
Commit 33f19bf · 1 Parent(s): b5dfa0f

Update app.py

Files changed (1)
  1. app.py +77 -26
app.py CHANGED
@@ -1,56 +1,63 @@
 import os
+import time
 from fastapi import FastAPI, HTTPException
+from fastapi.responses import StreamingResponse, JSONResponse
 from pydantic import BaseModel
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 
-REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
-FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
-CACHE_DIR = "/app/models"  # matches Dockerfile pre-download
-os.makedirs(CACHE_DIR, exist_ok=True)
+# ---------------- Config ----------------
+REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
+FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
+CACHE_DIR = "/app/models"  # match your Dockerfile prefetch if you use it
 
-app = FastAPI()
+# Conservative CPU settings for Spaces (prevents stalls)
+N_THREADS = min(4, (os.cpu_count() or 2))  # don't over-thread on tiny CPUs
+N_BATCH = 64    # modest batch to avoid RAM thrash
+N_CTX = 2048    # enough for short prompts
+
+# --------------- FastAPI App ---------------
+app = FastAPI(title="Llama 3.2 3B Instruct (llama.cpp) API")
 _model = None
 
-def get_model():
+# --------------- Load Model ---------------
+def get_model() -> Llama:
     global _model
     if _model is not None:
         return _model
 
+    os.makedirs(CACHE_DIR, exist_ok=True)
     local_path = hf_hub_download(
         repo_id=REPO_ID,
         filename=FILENAME,
         cache_dir=CACHE_DIR,
         local_dir_use_symlinks=False,
     )
+
+    # IMPORTANT: use Llama-3 chat template
     _model = Llama(
         model_path=local_path,
-        n_ctx=2048,
-        n_threads=os.cpu_count() or 2,
-        n_batch=256,
+        chat_format="llama-3",  # <- ensures proper prompt templating
+        n_ctx=N_CTX,
+        n_threads=N_THREADS,
+        n_batch=N_BATCH,
         verbose=False
     )
     return _model
 
-class PromptRequest(BaseModel):
-    prompt: str
-    max_tokens: int = 256
-    temperature: float = 0.7
+# --------------- Schemas ----------------
+class ChatMessage(BaseModel):
+    role: str    # "system" | "user" | "assistant"
+    content: str
 
-@app.post("/generate")
-def generate_text(req: PromptRequest):
-    try:
-        model = get_model()
-        output = model(
-            req.prompt,
-            max_tokens=req.max_tokens,
-            temperature=req.temperature,
-            stop=["</s>"]
-        )
-        return {"ok": True, "response": output["choices"][0]["text"]}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
+class ChatRequest(BaseModel):
+    messages: list[ChatMessage]
+    max_tokens: int = 128
+    temperature: float = 0.7
+    top_p: float = 0.9
+    stream: bool = False
 
+# --------------- Endpoints ---------------
 @app.get("/health")
 def health():
     try:
@@ -58,3 +65,47 @@ def health():
         return {"ok": True}
     except Exception as e:
         return {"ok": False, "error": str(e)}
+
+@app.post("/generate")
+def generate(req: ChatRequest):
+    """
+    Chat-completion endpoint with optional server-side streaming.
+    Uses Llama-3 chat template via chat_format="llama-3".
+    """
+    try:
+        model = get_model()
+
+        # Convert to llama.cpp message format
+        msgs = [{"role": m.role, "content": m.content} for m in req.messages]
+
+        if not req.stream:
+            out = model.create_chat_completion(
+                messages=msgs,
+                max_tokens=req.max_tokens,
+                temperature=req.temperature,
+                top_p=req.top_p,
+            )
+            text = out["choices"][0]["message"]["content"]
+            return JSONResponse({"ok": True, "response": text})
+
+        # --- Streaming mode ---
+        def token_stream():
+            start = time.time()
+            for chunk in model.create_chat_completion(
+                messages=msgs,
+                max_tokens=req.max_tokens,
+                temperature=req.temperature,
+                top_p=req.top_p,
+                stream=True,
+            ):
+                if "choices" in chunk and chunk["choices"]:
+                    delta = chunk["choices"][0]["delta"].get("content", "")
+                    if delta:
+                        yield delta
+            # small trailer to mark end (optional)
+            yield f"\n\n[done in {time.time()-start:.2f}s]"
+
+        return StreamingResponse(token_stream(), media_type="text/plain")
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
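
For quick testing of the new /generate endpoint, a minimal client sketch (not part of this commit): it assumes the app is reachable at http://localhost:7860 (the usual Spaces port; adjust BASE_URL for your deployment) and uses the third-party requests library.

# client_example.py -- hypothetical client for the /generate endpoint above.
# BASE_URL and the requests dependency are assumptions, not part of this commit.
import requests

BASE_URL = "http://localhost:7860"  # assumption: change to your Space / server URL

payload = {
    "messages": [
        {"role": "system", "content": "You are a concise assistant."},
        {"role": "user", "content": "Say hello in one sentence."},
    ],
    "max_tokens": 64,
    "temperature": 0.7,
    "top_p": 0.9,
    "stream": False,
}

# Non-streaming: the endpoint returns {"ok": true, "response": "..."}
r = requests.post(f"{BASE_URL}/generate", json=payload, timeout=300)
r.raise_for_status()
print(r.json()["response"])

# Streaming: set "stream": true and read the plain-text chunks as they arrive
payload["stream"] = True
with requests.post(f"{BASE_URL}/generate", json=payload, stream=True, timeout=300) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        if chunk:
            print(chunk, end="", flush=True)

Since the streaming branch returns plain text (media_type="text/plain"), reading the raw response body incrementally is enough; no SSE parsing is needed.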