omaryasserhassan committed
Commit 0f0af70 · verified · Parent: 794b6d3

Create app.py

Files changed (1):
  1. app.py +62 -0
app.py ADDED
@@ -0,0 +1,62 @@
+ import os
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from huggingface_hub import hf_hub_download
+ from ctransformers import AutoModelForCausalLM
+
+ # --- Config ---
+ REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
+ FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"  # 4-bit Q4_K_M quantization: small, CPU-friendly
+ MODEL_TYPE = "llama"
+
+ # --- Cache dir ---
+ CACHE_DIR = os.environ.get("HF_HOME", "/tmp/hf_cache")
+ os.makedirs(CACHE_DIR, exist_ok=True)
+
+ # --- FastAPI App ---
+ app = FastAPI(title="Llama 3.2 3B Instruct API")
+ _model = None
+
+ # --- Load Model (lazily, on first use) ---
+ def get_model():
+     global _model
+     if _model is not None:
+         return _model
+
+     print("📥 Downloading model...")
+     local_path = hf_hub_download(
+         repo_id=REPO_ID,
+         filename=FILENAME,
+         cache_dir=CACHE_DIR,
+         local_dir_use_symlinks=False,  # deprecated no-op on recent huggingface_hub; harmless here
+     )
+     print("✅ Model downloaded at", local_path)
+
+     print("🔄 Loading model into memory...")
+     _model = AutoModelForCausalLM.from_pretrained(
+         local_path,
+         model_type=MODEL_TYPE,
+         gpu_layers=0,  # 0 = CPU only
+     )
+     print("✅ Model loaded")
+     return _model
+
+ # --- Request Schema ---
+ class PromptRequest(BaseModel):
+     prompt: str
+     max_new_tokens: int = 256
+     temperature: float = 0.7
+
+ # --- API Endpoint ---
+ @app.post("/generate")
+ def generate_text(req: PromptRequest):
+     try:
+         model = get_model()
+         output = model(
+             req.prompt,
+             max_new_tokens=req.max_new_tokens,
+             temperature=req.temperature,
+         )
+         return {"response": output}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
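
For quick verification of the new endpoint, a minimal client sketch follows. It assumes the app is served locally, e.g. via "uvicorn app:app --host 0.0.0.0 --port 7860" (7860 is the default port on Hugging Face Spaces); the URL, prompt, and timeout values are illustrative assumptions, not part of the commit.

import requests

# Hypothetical endpoint URL; adjust host/port to wherever app.py is served.
API_URL = "http://localhost:7860/generate"

# Fields mirror the PromptRequest schema; max_new_tokens and temperature are optional.
payload = {
    "prompt": "Explain what a GGUF file is in one sentence.",
    "max_new_tokens": 128,
    "temperature": 0.7,
}

# The first request triggers the model download and load inside get_model(),
# so allow a generous timeout; later requests reuse the cached _model.
resp = requests.post(API_URL, json=payload, timeout=600)
resp.raise_for_status()
print(resp.json()["response"])

Because get_model() loads lazily, the first call pays the full download-and-load cost; sending a warm-up request right after deployment keeps user-facing latency predictable.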