aguitauwu committed on
Commit
08540f7
·
1 Parent(s): f419ee9
Files changed (3) hide show
  1. Dockerfile +0 -7
  2. app.py +14 -5
  3. requirements.txt +0 -1
Dockerfile CHANGED
@@ -2,21 +2,16 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /app
4
 
5
- # Instalar dependencias del sistema
6
  RUN apt-get update && apt-get install -y \
7
  git \
8
  curl \
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
- # Copiar e instalar dependencias Python primero (cache de Docker)
12
  COPY requirements.txt .
13
  RUN pip install --no-cache-dir -r requirements.txt
14
 
15
- # Copiar código
16
  COPY app.py .
17
 
18
- # Pre-descargar el modelo durante el build
19
- # (no en runtime, así el container arranca rápido)
20
  RUN python -c "\
21
  from transformers import AutoTokenizer, AutoModelForCausalLM; \
22
  print('Downloading tokenizer...'); \
@@ -27,9 +22,7 @@ print('Done!')"
27
 
28
  EXPOSE 7860
29
 
30
- # Healthcheck para saber cuando el servidor está listo
31
  HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
32
  CMD curl -f http://localhost:7860/health || exit 1
33
 
34
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
35
-
 
2
 
3
  WORKDIR /app
4
 
 
5
  RUN apt-get update && apt-get install -y \
6
  git \
7
  curl \
8
  && rm -rf /var/lib/apt/lists/*
9
 
 
10
  COPY requirements.txt .
11
  RUN pip install --no-cache-dir -r requirements.txt
12
 
 
13
  COPY app.py .
14
 
 
 
15
  RUN python -c "\
16
  from transformers import AutoTokenizer, AutoModelForCausalLM; \
17
  print('Downloading tokenizer...'); \
 
22
 
23
  EXPOSE 7860
24
 
 
25
  HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
26
  CMD curl -f http://localhost:7860/health || exit 1
27
 
28
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
app.py CHANGED
@@ -13,7 +13,6 @@ app = FastAPI(
13
  version="1.0.0"
14
  )
15
 
16
- # CORS para que Yuuki-chat pueda llamar desde el browser
17
  app.add_middleware(
18
  CORSMiddleware,
19
  allow_origins=["*"],
@@ -21,7 +20,6 @@ app.add_middleware(
21
  allow_headers=["*"],
22
  )
23
 
24
- # Cargar modelo una sola vez al arrancar
25
  print(f"Loading tokenizer from {MODEL_ID}...")
26
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
27
 
@@ -31,7 +29,7 @@ model = AutoModelForCausalLM.from_pretrained(
31
  torch_dtype=torch.float32
32
  ).to("cpu")
33
 
34
- model.eval() # Modo inferencia (más rápido, menos memoria)
35
  print("Model ready!")
36
 
37
 
@@ -48,6 +46,19 @@ class GenerateResponse(BaseModel):
48
  time_ms: int
49
 
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  @app.get("/health")
52
  def health():
53
  return {"status": "ok", "model": MODEL_ID}
@@ -78,7 +89,6 @@ def generate(req: GenerateRequest):
78
  repetition_penalty=1.1,
79
  )
80
 
81
- # Solo devolver tokens NUEVOS (no el prompt)
82
  new_tokens = output[0][input_length:]
83
  response_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
84
 
@@ -92,4 +102,3 @@ def generate(req: GenerateRequest):
92
 
93
  except Exception as e:
94
  raise HTTPException(status_code=500, detail=str(e))
95
-
 
13
  version="1.0.0"
14
  )
15
 
 
16
  app.add_middleware(
17
  CORSMiddleware,
18
  allow_origins=["*"],
 
20
  allow_headers=["*"],
21
  )
22
 
 
23
  print(f"Loading tokenizer from {MODEL_ID}...")
24
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
25
 
 
29
  torch_dtype=torch.float32
30
  ).to("cpu")
31
 
32
+ model.eval()
33
  print("Model ready!")
34
 
35
 
 
46
  time_ms: int
47
 
48
 
49
+ @app.get("/")
50
+ def root():
51
+ return {
52
+ "message": "Yuuki Local Inference API",
53
+ "model": MODEL_ID,
54
+ "endpoints": {
55
+ "health": "GET /health",
56
+ "generate": "POST /generate",
57
+ "docs": "GET /docs"
58
+ }
59
+ }
60
+
61
+
62
  @app.get("/health")
63
  def health():
64
  return {"status": "ok", "model": MODEL_ID}
 
89
  repetition_penalty=1.1,
90
  )
91
 
 
92
  new_tokens = output[0][input_length:]
93
  response_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
94
 
 
102
 
103
  except Exception as e:
104
  raise HTTPException(status_code=500, detail=str(e))
 
requirements.txt CHANGED
@@ -4,4 +4,3 @@ transformers==4.45.0
4
  torch==2.4.1
5
  pydantic==2.9.0
6
  accelerate==0.34.2
7
-
 
4
  torch==2.4.1
5
  pydantic==2.9.0
6
  accelerate==0.34.2