zach9111 committed
Commit 8788df8 · 1 Parent(s): 5949242
Files changed (3)
  1. Dockerfile +21 -0
  2. app.py +50 -0
  3. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,21 @@
+ # Use Python 3.10 as the base image (change to 3.9 if needed)
+ FROM python:3.10
+
+ # Create a non-root user
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy and install dependencies first (helps with caching)
+ COPY --chown=user requirements.txt .
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ # Copy the rest of the files
+ COPY --chown=user . .
+
+ # Serve the API on port 7860 (the port Hugging Face Spaces expects)
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+
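For local testing outside of Hugging Face Spaces, the image can be built and run with standard Docker commands; the image tag below is only an illustration, and Spaces itself builds and runs the container automatically:

docker build -t llama-startup-api .
docker run --rm -p 7860:7860 llama-startup-api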
app.py ADDED
@@ -0,0 +1,50 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from peft import PeftModel
+ import torch
+
+ app = FastAPI()
+
+ # Define model paths
+ base_model_path = "NousResearch/Hermes-3-Llama-3.2-3B"
+ adapter_path = "zach9111/llama_startup_adapter"
+
+ # Check if a GPU is available
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Load the base model with device_map="auto" so devices are assigned automatically
+ base_model = AutoModelForCausalLM.from_pretrained(
+     base_model_path, torch_dtype=torch.float16, device_map="auto"
+ )
+
+ # Load the adapter and make sure it is on the correct device
+ model = PeftModel.from_pretrained(base_model, adapter_path).to(device)
+
+ # Load the tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(base_model_path)
+
+
+ class GenerateRequest(BaseModel):
+     prompt: str
+
+ # Use model.generate() directly instead of a pipeline()
+ def generate_text_from_model(prompt: str):
+     try:
+         input_ids = tokenizer(f"<s>[INST] {prompt} [/INST]", return_tensors="pt").input_ids.to(device)
+         output_ids = model.generate(input_ids, max_length=512)
+         generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+         return generated_text
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ # Root endpoint for testing
+ @app.get("/")
+ async def root():
+     return {"message": "Model is running! Use /generate/ for text generation."}
+
+ # Text generation endpoint
+ @app.post("/generate/")
+ async def generate_text(request: GenerateRequest):
+     response = generate_text_from_model(request.prompt)
+     return {"response": response}
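Once the container is running, the /generate/ endpoint can be exercised with a small client script; this is a minimal sketch that assumes the server is reachable at http://localhost:7860 (replace with the deployed Space URL as needed):

import requests

# Hypothetical local endpoint; swap in the Space URL when deployed
API_URL = "http://localhost:7860"

resp = requests.post(f"{API_URL}/generate/", json={"prompt": "Pitch a startup idea in one sentence."})
resp.raise_for_status()
print(resp.json()["response"])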
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ fastapi
+ uvicorn[standard]
+ transformers
+ torch
+ peft