saeid1999 committed
Commit 31433e6 (verified)
1 Parent(s): 882458d

Create app.py

Files changed (1)
  1. app.py +78 -0
app.py ADDED
@@ -0,0 +1,78 @@
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

app = FastAPI()

MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and model once at startup so every request reuses them.
print(f"Loading model on {device}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",
    trust_remote_code=True
)
print("Model loaded!")

class ChatRequest(BaseModel):
    # Single-turn request: one user message plus sampling settings.
    message: str
    max_tokens: int = 512
    temperature: float = 0.7

class CompletionRequest(BaseModel):
    # OpenAI-style request: a list of {"role": ..., "content": ...} messages.
    messages: list
    max_tokens: int = 512
    temperature: float = 0.7
    stream: bool = False  # accepted but not implemented; responses are returned whole

@app.post("/chat")
def chat(req: ChatRequest):
    # Wrap the single user message in the format expected by the chat template.
    messages = [{"role": "user", "content": req.message}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=req.max_tokens,
        temperature=req.temperature,
        do_sample=True
    )

    # The decoded output still contains the prompt, so keep only the text
    # after the final "assistant" marker emitted by the chat template.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.split("assistant\n")[-1].strip()

    return {"response": response}

@app.post("/v1/chat/completions")
def completions(req: CompletionRequest):
    # OpenAI-compatible endpoint: the client supplies the full message history.
    text = tokenizer.apply_chat_template(req.messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=req.max_tokens,
        temperature=req.temperature,
        do_sample=True
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.split("assistant\n")[-1].strip()

    # Minimal OpenAI-style response envelope.
    return {
        "choices": [{
            "message": {"role": "assistant", "content": response},
            "finish_reason": "stop"
        }]
    }

@app.get("/health")
def health():
    return {"status": "ok"}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
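Once the server is running, the two endpoints can be exercised with a small client. The sketch below is not part of this commit; it assumes the app is reachable at http://localhost:8000 (as in the __main__ block above) and that the requests library is installed alongside fastapi, uvicorn, transformers, and torch.

# Hypothetical client sketch; names and the base URL are assumptions, not part of app.py.
import requests

BASE_URL = "http://localhost:8000"

# Simple single-turn endpoint: send one message, get one reply.
r = requests.post(f"{BASE_URL}/chat", json={"message": "Hello! Who are you?"})
print(r.json()["response"])

# OpenAI-style endpoint: the client supplies the full message list.
r = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Give me one fun fact about space."}],
        "max_tokens": 128,
        "temperature": 0.7,
    },
)
print(r.json()["choices"][0]["message"]["content"])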