Create app.py
app.py ADDED
@@ -0,0 +1,62 @@
+import os
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from huggingface_hub import hf_hub_download
+from ctransformers import AutoModelForCausalLM
+
+# --- Config ---
+REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
+FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"  # 4-bit quantization (Q4_K_M), small CPU footprint
+MODEL_TYPE = "llama"
+
+# --- Cache dir ---
+CACHE_DIR = os.environ.get("HF_HOME", "/tmp/hf_cache")
+os.makedirs(CACHE_DIR, exist_ok=True)
+
+# --- FastAPI App ---
+app = FastAPI(title="Llama 3.2 3B Instruct API")
+_model = None
+
+# --- Load Model ---
+def get_model():
+    global _model
+    if _model is not None:
+        return _model
+
+    print("📥 Downloading model...")
+    local_path = hf_hub_download(
+        repo_id=REPO_ID,
+        filename=FILENAME,
+        cache_dir=CACHE_DIR,
+        local_dir_use_symlinks=False,  # no effect without local_dir; deprecated in newer huggingface_hub
+    )
+    print("✅ Model downloaded at", local_path)
+
+    print("🔄 Loading model into memory...")
+    _model = AutoModelForCausalLM.from_pretrained(
+        local_path,
+        model_type=MODEL_TYPE,
+        gpu_layers=0  # 0 = CPU only
+    )
+    print("✅ Model loaded")
+    return _model
+
+# --- Request Schema ---
+class PromptRequest(BaseModel):
+    prompt: str
+    max_new_tokens: int = 256
+    temperature: float = 0.7
+
+# --- API Endpoint ---
+@app.post("/generate")
+def generate_text(req: PromptRequest):
+    try:
+        model = get_model()
+        output = model(
+            req.prompt,
+            max_new_tokens=req.max_new_tokens,
+            temperature=req.temperature
+        )
+        return {"response": output}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
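
For reference, a minimal client sketch for the /generate endpoint added above. It assumes the app is served on localhost port 7860 (the usual Spaces port), e.g. via uvicorn app:app --host 0.0.0.0 --port 7860; the base URL, prompt, and timeout are illustrative assumptions, not part of the commit.

import requests

resp = requests.post(
    "http://localhost:7860/generate",  # assumed base URL; point at the deployed Space instead
    json={
        "prompt": "Say hello in one sentence.",
        "max_new_tokens": 64,
        "temperature": 0.7,
    },
    timeout=600,  # the model loads lazily, so the first call also downloads and loads it
)
resp.raise_for_status()
print(resp.json()["response"])

Because get_model() is lazy, the first request pays the full download-and-load cost; later requests reuse the cached _model instance.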