Leonardo0711 committed
Commit 5b5f5ea · verified · 1 Parent(s): b6a3f56

Update app.py

Files changed (1):
  app.py +65 -77
app.py CHANGED
@@ -3,25 +3,24 @@
 
 import os, glob, textwrap
 from pathlib import Path
+from threading import Lock
+
+from fastapi import FastAPI, Body
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import HTMLResponse, JSONResponse
 
-import gradio as gr
 from huggingface_hub import snapshot_download
 from llama_cpp import Llama
 import requests
 from bs4 import BeautifulSoup
 
-from fastapi import FastAPI, Body
-from fastapi.middleware.cors import CORSMiddleware
-from gradio.routes import mount_gradio_app
-
-# ===== Where to store the model (do NOT use /app) =====
-# Free/ephemeral: /tmp/models | Persistent (if you purchase storage): /data/models
+# ===== Folder for the model (do NOT use /app) =====
 MODELS_DIR = Path(os.getenv("MODELS_DIR", "/tmp/models"))
 MODELS_DIR.mkdir(parents=True, exist_ok=True)
 
 # ===== Model (GGUF) =====
 MODEL_REPO = os.getenv("MODEL_REPO", "Qwen/Qwen2.5-7B-Instruct-GGUF")
-# If RAM is tight or you want a faster download: export MODEL_PATTERN=qwen2.5-7b-instruct-q3_k_m-*.gguf
+# For CPU basic you can set in Variables: MODEL_PATTERN=qwen2.5-7b-instruct-q3_k_m-*.gguf
 MODEL_PATTERN = os.getenv("MODEL_PATTERN", "qwen2.5-7b-instruct-q4_k_m-*.gguf")
 
 print(f"[Boot] Descargando {MODEL_REPO} patrón {MODEL_PATTERN} en {MODELS_DIR} ...")
@@ -47,6 +46,7 @@ llm = Llama(
     n_gpu_layers=0,
     verbose=False,
 )
+_llm_lock = Lock()
 
 SYSTEM_DEFAULT = textwrap.dedent("""\
 Eres Astrohunters-Guide, un asistente en español.
@@ -60,73 +60,29 @@ def fetch_url_text(url: str, max_chars: int = 6000) -> str:
         r = requests.get(url, timeout=15)
         r.raise_for_status()
         soup = BeautifulSoup(r.text, "html.parser")
-        for t in soup(["script", "style", "noscript"]): t.decompose()
+        for t in soup(["script", "style", "noscript"]): t.remove()
         txt = " ".join(soup.get_text(separator=" ").split())
         return txt[:max_chars]
     except Exception as e:
         return f"[No se pudo cargar {url}: {e}]"
 
-def run_llm(messages, temperature=0.6, top_p=0.95, max_tokens=768):
-    out = llm.create_chat_completion(
-        messages=messages,
-        temperature=temperature,
-        top_p=top_p,
-        max_tokens=max_tokens,
-        stream=False,
-    )
+def run_llm(messages, temperature=0.6, top_p=0.95, max_tokens=768) -> str:
+    with _llm_lock:
+        out = llm.create_chat_completion(
+            messages=messages,
+            temperature=temperature,
+            top_p=top_p,
+            max_tokens=max_tokens,
+            stream=False,
+        )
     return out["choices"][0]["message"]["content"].strip()
 
-# ====== API logic ======
-def api_run_predict(prompt: str, system: str = "") -> str:
-    messages = [
-        {"role": "system", "content": system or SYSTEM_DEFAULT},
-        {"role": "user", "content": prompt},
-    ]
-    return run_llm(messages, max_tokens=512)
+# ===== FastAPI =====
+app = FastAPI(title="Astrohunters LLM API", version="1.0.0")
 
-def api_run_predict_with_url(prompt: str, url: str = "", system: str = "") -> str:
-    web_ctx = fetch_url_text(url) if url else ""
-    user_msg = prompt if not web_ctx else f"{prompt}\n\n[CONTEXTO_WEB]\n{web_ctx}"
-    messages = [
-        {"role": "system", "content": system or SYSTEM_DEFAULT},
-        {"role": "user", "content": user_msg},
-    ]
-    return run_llm(messages, max_tokens=700)
-
-# ====== Chat UI (Gradio) ======
-with gr.Blocks(title="Astrohunters LLM (Qwen2.5 7B)") as chat_ui:
-    gr.Markdown("## 🛰️ Astrohunters LLM (Qwen2.5 7B Instruct, GGUF — CPU Basic)")
-    with gr.Row():
-        with gr.Column(scale=3):
-            chat = gr.Chatbot(height=420, type="tuples")
-            with gr.Row():
-                txt = gr.Textbox(placeholder="Escribe tu pregunta...", scale=4)
-                btn = gr.Button("Enviar", scale=1, variant="primary")
-        with gr.Column(scale=2):
-            system_tb = gr.Textbox(label="System prompt", value=SYSTEM_DEFAULT, lines=10)
-            url_tb = gr.Textbox(label="URL (opcional): Cargar contenido web", placeholder="https://...")
-
-    def chat_infer(history, system_prompt, user, url_to_load):
-        web_ctx = fetch_url_text(url_to_load.strip()) if url_to_load and url_to_load.strip() else ""
-        messages = [{"role": "system", "content": system_prompt or SYSTEM_DEFAULT}]
-        for u, a in history:
-            if u: messages.append({"role": "user", "content": u})
-            if a: messages.append({"role": "assistant", "content": a})
-        user_msg = user or ""
-        if web_ctx:
-            user_msg = f"{user_msg}\n\n[CONTEXTO_WEB]\n{web_ctx}"
-        messages.append({"role": "user", "content": user_msg})
-        reply = run_llm(messages, max_tokens=700)
-        history.append((user, reply))
-        return history, ""
-
-    btn.click(chat_infer, inputs=[chat, system_tb, txt, url_tb], outputs=[chat, txt])
-    txt.submit(chat_infer, inputs=[chat, system_tb, txt, url_tb], outputs=[chat, txt])
-
-# ====== FastAPI + CORS + REST endpoints ======
-api = FastAPI()
+# CORS (adjust ALLOWED_ORIGINS in Settings Variables if you want to restrict it to your domain)
 ALLOWED_ORIGINS = os.getenv("ALLOWED_ORIGINS", "*").split(",")
-api.add_middleware(
+app.add_middleware(
     CORSMiddleware,
     allow_origins=ALLOWED_ORIGINS,
     allow_credentials=True,
@@ -134,25 +90,57 @@ api.add_middleware(
     allow_headers=["*"],
 )
 
-@api.get("/healthz")
+@app.get("/healthz")
 def healthz():
     return {"ok": True}
 
-@api.post("/run_predict")
+@app.post("/run_predict")
 def run_predict(body: dict = Body(...)):
     prompt = body.get("prompt", "")
     system = body.get("system", "")
-    return {"reply": api_run_predict(prompt, system)}
+    messages = [
+        {"role": "system", "content": system or SYSTEM_DEFAULT},
+        {"role": "user", "content": prompt},
+    ]
+    reply = run_llm(messages, max_tokens=512)
+    return {"reply": reply}
 
-@api.post("/run_predict_with_url")
+@app.post("/run_predict_with_url")
 def run_predict_with_url(body: dict = Body(...)):
     prompt = body.get("prompt", "")
     url = body.get("url", "")
     system = body.get("system", "")
-    return {"reply": api_run_predict_with_url(prompt, url, system)}
-
-# Mount the Gradio UI at "/"
-app = mount_gradio_app(api, chat_ui, path="/")
-
-if __name__ == "__main__":
-    chat_ui.queue(max_size=16).launch(server_name="0.0.0.0", server_port=7860)
+    web_ctx = fetch_url_text(url) if url else ""
+    user_msg = prompt if not web_ctx else f"{prompt}\n\n[CONTEXTO_WEB]\n{web_ctx}"
+    messages = [
+        {"role": "system", "content": system or SYSTEM_DEFAULT},
+        {"role": "user", "content": user_msg},
+    ]
+    reply = run_llm(messages, max_tokens=700)
+    return {"reply": reply}
+
+# Minimal test page
+@app.get("/", response_class=HTMLResponse)
+def home():
+    return """
+    <!doctype html>
+    <html>
+    <head><meta charset="utf-8"><title>Astrohunters LLM API</title></head>
+    <body style="font-family:system-ui;max-width:800px;margin:40px auto">
+    <h2>🛰️ Astrohunters LLM API</h2>
+    <p>Endpoints: <code>/healthz</code>, <code>/run_predict</code>, <code>/run_predict_with_url</code>, y <a href="/docs">/docs</a> (Swagger).</p>
+    <textarea id="q" rows="4" style="width:100%" placeholder="Escribe tu pregunta..."></textarea>
+    <button id="btn">Preguntar</button>
+    <pre id="out"></pre>
+    <script>
+    document.getElementById('btn').onclick = async () => {
+      const r = await fetch('/run_predict', {
+        method:'POST', headers:{'Content-Type':'application/json'},
+        body: JSON.stringify({prompt: document.getElementById('q').value})
+      });
+      const j = await r.json();
+      document.getElementById('out').textContent = j.reply || JSON.stringify(j,null,2);
+    };
+    </script>
+    </body></html>
+    """