Leonardo0711 committed on
Commit 9124adb · verified · 1 Parent(s): 83e6608

Update app.py

Files changed (1)
  1. app.py +154 -136
app.py CHANGED
@@ -1,136 +1,154 @@
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
-
- import os, glob, textwrap
- from pathlib import Path
-
- import gradio as gr
- from huggingface_hub import snapshot_download
- from llama_cpp import Llama
- import requests
- from bs4 import BeautifulSoup
-
- # ===== Model (GGUF) =====
- MODEL_REPO = os.getenv("MODEL_REPO", "Qwen/Qwen2.5-7B-Instruct-GGUF")
- # For CPU Basic you can drop to q3_k_m if RAM is tight
- MODEL_PATTERN = os.getenv("MODEL_PATTERN", "qwen2.5-7b-instruct-q4_k_m-*.gguf")
-
- LOCAL_DIR = Path("models"); LOCAL_DIR.mkdir(parents=True, exist_ok=True)
- print(f"[Boot] Descargando {MODEL_REPO} patrón {MODEL_PATTERN} ...")
- snapshot_dir = snapshot_download(repo_id=MODEL_REPO, local_dir=str(LOCAL_DIR),
-                                  allow_patterns=[MODEL_PATTERN])
- candidates = sorted(glob.glob(str(Path(snapshot_dir) / MODEL_PATTERN)))
- if not candidates:
-     raise FileNotFoundError(f"No hay shards para {MODEL_PATTERN} en {snapshot_dir}")
- MODEL_PATH = candidates[0]
- print(f"[Boot] Usando shard: {MODEL_PATH}")
-
- N_THREADS = max(1, (os.cpu_count() or 2) - 1)
-
- llm = Llama(
-     model_path=MODEL_PATH,
-     n_ctx=4096,
-     n_threads=N_THREADS,
-     n_batch=256,
-     n_gpu_layers=0,
-     verbose=False,
- )
-
- SYSTEM_DEFAULT = textwrap.dedent("""\
- Eres Astrohunters-Guide, un asistente en español.
- - Respondes con precisión y sin inventar datos.
- - Sabes explicar resultados de exoplanetas (período, duración, profundidad, SNR, radio).
- - Si te paso una URL, lees su contenido y lo usas como contexto.
- """)
-
- def fetch_url_text(url: str, max_chars: int = 6000) -> str:
-     try:
-         r = requests.get(url, timeout=15)
-         r.raise_for_status()
-         soup = BeautifulSoup(r.text, "html.parser")
-         for t in soup(["script", "style", "noscript"]): t.decompose()
-         txt = " ".join(soup.get_text(separator=" ").split())
-         return txt[:max_chars]
-     except Exception as e:
-         return f"[No se pudo cargar {url}: {e}]"
-
- def run_llm(messages, temperature=0.6, top_p=0.95, max_tokens=768):
-     out = llm.create_chat_completion(
-         messages=messages,
-         temperature=temperature,
-         top_p=top_p,
-         max_tokens=max_tokens,
-         stream=False,
-     )
-     return out["choices"][0]["message"]["content"].strip()
-
- # ====== API functions ======
- def api_run_predict(prompt: str, system: str = "") -> str:
-     messages = [
-         {"role": "system", "content": system or SYSTEM_DEFAULT},
-         {"role": "user", "content": prompt},
-     ]
-     return run_llm(messages, max_tokens=512)
-
- def api_run_predict_with_url(prompt: str, url: str = "", system: str = "") -> str:
-     web_ctx = fetch_url_text(url) if url else ""
-     user_msg = prompt if not web_ctx else f"{prompt}\n\n[CONTEXTO_WEB]\n{web_ctx}"
-     messages = [
-         {"role": "system", "content": system or SYSTEM_DEFAULT},
-         {"role": "user", "content": user_msg},
-     ]
-     return run_llm(messages, max_tokens=700)
-
- # ===== Chat UI =====
- with gr.Blocks(title="Astrohunters LLM (Qwen2.5 7B)") as chat_ui:
-     gr.Markdown("## 🛰️ Astrohunters LLM (Qwen2.5 7B Instruct, GGUF — CPU Basic)")
-     with gr.Row():
-         with gr.Column(scale=3):
-             chat = gr.Chatbot(height=420, type="tuples")
-             with gr.Row():
-                 txt = gr.Textbox(placeholder="Escribe tu pregunta...", scale=4)
-                 btn = gr.Button("Enviar", scale=1, variant="primary")
-         with gr.Column(scale=2):
-             system_tb = gr.Textbox(label="System prompt", value=SYSTEM_DEFAULT, lines=10)
-             url_tb = gr.Textbox(label="URL (opcional): Cargar contenido web", placeholder="https://...")
-
-     def chat_infer(history, system_prompt, user, url_to_load):
-         web_ctx = fetch_url_text(url_to_load.strip()) if url_to_load and url_to_load.strip() else ""
-         messages = [{"role": "system", "content": system_prompt or SYSTEM_DEFAULT}]
-         for u, a in history:
-             if u: messages.append({"role": "user", "content": u})
-             if a: messages.append({"role": "assistant", "content": a})
-         user_msg = user or ""
-         if web_ctx:
-             user_msg = f"{user_msg}\n\n[CONTEXTO_WEB]\n{web_ctx}"
-         messages.append({"role": "user", "content": user_msg})
-         reply = run_llm(messages, max_tokens=700)
-         history.append((user, reply))
-         return history, ""
-
-     btn.click(chat_infer, inputs=[chat, system_tb, txt, url_tb], outputs=[chat, txt])
-     txt.submit(chat_infer, inputs=[chat, system_tb, txt, url_tb], outputs=[chat, txt])
-
- # ====== Named APIs (outside the Blocks) ======
- api1 = gr.Interface(
-     fn=api_run_predict,
-     inputs=[gr.Textbox(label="prompt"), gr.Textbox(label="system")],
-     outputs=gr.Textbox(label="reply"),
-     api_name="run_predict",
- )
-
- api2 = gr.Interface(
-     fn=api_run_predict_with_url,
-     inputs=[gr.Textbox(label="prompt"), gr.Textbox(label="url"), gr.Textbox(label="system")],
-     outputs=gr.Textbox(label="reply"),
-     api_name="run_predict_with_url",
- )
-
- # Combine everything into a single demo so Gradio registers the routes
- demo = gr.TabbedInterface(
-     [chat_ui, api1, api2],
-     tab_names=["Chat", "API: run_predict", "API: run_predict_with_url"],
- )
-
- if __name__ == "__main__":
-     demo.queue(max_size=16).launch(server_name="0.0.0.0", server_port=7860)
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+
+ import os, glob, textwrap
+ from pathlib import Path
+
+ import gradio as gr
+ from huggingface_hub import snapshot_download
+ from llama_cpp import Llama
+ import requests
+ from bs4 import BeautifulSoup
+
+ # FastAPI + CORS + mounting Gradio
+ from fastapi import FastAPI, Body
+ from fastapi.middleware.cors import CORSMiddleware
+ from gradio.routes import mount_gradio_app
+
+ # ===== Model config (override via environment variables if desired) =====
+ MODEL_REPO = os.getenv("MODEL_REPO", "Qwen/Qwen2.5-7B-Instruct-GGUF")
+ # On CPU Basic, if RAM or inference time is tight: qwen2.5-7b-instruct-q3_k_m-*.gguf
+ MODEL_PATTERN = os.getenv("MODEL_PATTERN", "qwen2.5-7b-instruct-q4_k_m-*.gguf")
+
+ LOCAL_DIR = Path("models"); LOCAL_DIR.mkdir(parents=True, exist_ok=True)
+ print(f"[Boot] Descargando {MODEL_REPO} patrón {MODEL_PATTERN} ...")
+ snapshot_dir = snapshot_download(repo_id=MODEL_REPO, local_dir=str(LOCAL_DIR),
+                                  allow_patterns=[MODEL_PATTERN])
+ candidates = sorted(glob.glob(str(Path(snapshot_dir) / MODEL_PATTERN)))
+ if not candidates:
+     raise FileNotFoundError(f"No hay shards para {MODEL_PATTERN} en {snapshot_dir}")
+ MODEL_PATH = candidates[0]
+ print(f"[Boot] Usando shard: {MODEL_PATH}")
+
+ # Safe thread count for CPU Basic
+ N_THREADS = max(1, (os.cpu_count() or 2) - 1)
+
+ llm = Llama(
+     model_path=MODEL_PATH,
+     n_ctx=4096,
+     n_threads=N_THREADS,
+     n_batch=256,
+     n_gpu_layers=0,
+     verbose=False,
+ )
+
+ SYSTEM_DEFAULT = textwrap.dedent("""\
+ Eres Astrohunters-Guide, un asistente en español.
+ - Respondes con precisión y sin inventar datos.
+ - Sabes explicar resultados de exoplanetas (período, duración, profundidad, SNR, radio).
+ - Si te paso una URL, lees su contenido y lo usas como contexto.
+ """)
+
+ def fetch_url_text(url: str, max_chars: int = 6000) -> str:
+     try:
+         r = requests.get(url, timeout=15)
+         r.raise_for_status()
+         soup = BeautifulSoup(r.text, "html.parser")
+         for t in soup(["script", "style", "noscript"]): t.decompose()
+         txt = " ".join(soup.get_text(separator=" ").split())
+         return txt[:max_chars]
+     except Exception as e:
+         return f"[No se pudo cargar {url}: {e}]"
+
+ def run_llm(messages, temperature=0.6, top_p=0.95, max_tokens=768):
+     out = llm.create_chat_completion(
+         messages=messages,
+         temperature=temperature,
+         top_p=top_p,
+         max_tokens=max_tokens,
+         stream=False,
+     )
+     return out["choices"][0]["message"]["content"].strip()
+
+ # ====== API logic ======
+ def api_run_predict(prompt: str, system: str = "") -> str:
+     messages = [
+         {"role": "system", "content": system or SYSTEM_DEFAULT},
+         {"role": "user", "content": prompt},
+     ]
+     return run_llm(messages, max_tokens=512)
+
+ def api_run_predict_with_url(prompt: str, url: str = "", system: str = "") -> str:
+     web_ctx = fetch_url_text(url) if url else ""
+     user_msg = prompt if not web_ctx else f"{prompt}\n\n[CONTEXTO_WEB]\n{web_ctx}"
+     messages = [
+         {"role": "system", "content": system or SYSTEM_DEFAULT},
+         {"role": "user", "content": user_msg},
+     ]
+     return run_llm(messages, max_tokens=700)
+
+ # ====== Chat UI (Gradio) ======
+ with gr.Blocks(title="Astrohunters LLM (Qwen2.5 7B)") as chat_ui:
+     gr.Markdown("## 🛰️ Astrohunters LLM (Qwen2.5 7B Instruct, GGUF — CPU Basic)")
+     with gr.Row():
+         with gr.Column(scale=3):
+             chat = gr.Chatbot(height=420, type="tuples")
+             with gr.Row():
+                 txt = gr.Textbox(placeholder="Escribe tu pregunta...", scale=4)
+                 btn = gr.Button("Enviar", scale=1, variant="primary")
+         with gr.Column(scale=2):
+             system_tb = gr.Textbox(label="System prompt", value=SYSTEM_DEFAULT, lines=10)
+             url_tb = gr.Textbox(label="URL (opcional): Cargar contenido web", placeholder="https://...")
+
+     def chat_infer(history, system_prompt, user, url_to_load):
+         web_ctx = fetch_url_text(url_to_load.strip()) if url_to_load and url_to_load.strip() else ""
+         messages = [{"role": "system", "content": system_prompt or SYSTEM_DEFAULT}]
+         for u, a in history:
+             if u: messages.append({"role": "user", "content": u})
+             if a: messages.append({"role": "assistant", "content": a})
+         user_msg = user or ""
+         if web_ctx:
+             user_msg = f"{user_msg}\n\n[CONTEXTO_WEB]\n{web_ctx}"
+         messages.append({"role": "user", "content": user_msg})
+         reply = run_llm(messages, max_tokens=700)
+         history.append((user, reply))
+         return history, ""
+
+     btn.click(chat_infer, inputs=[chat, system_tb, txt, url_tb], outputs=[chat, txt])
+     txt.submit(chat_infer, inputs=[chat, system_tb, txt, url_tb], outputs=[chat, txt])
+
+ # ====== FastAPI + CORS + REST endpoints ======
+ api = FastAPI()
+ # CORS (optionally set ALLOWED_ORIGINS to your domain in Settings > Variables)
+ ALLOWED_ORIGINS = os.getenv("ALLOWED_ORIGINS", "*").split(",")
+ api.add_middleware(
+     CORSMiddleware,
+     allow_origins=ALLOWED_ORIGINS,
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ @api.get("/healthz")
+ def healthz():
+     return {"ok": True}
+
+ @api.post("/run_predict")
+ def run_predict(body: dict = Body(...)):
+     prompt = body.get("prompt", "")
+     system = body.get("system", "")
+     return {"reply": api_run_predict(prompt, system)}
+
+ @api.post("/run_predict_with_url")
+ def run_predict_with_url(body: dict = Body(...)):
+     prompt = body.get("prompt", "")
+     url = body.get("url", "")
+     system = body.get("system", "")
+     return {"reply": api_run_predict_with_url(prompt, url, system)}
+
+ # Mount the Gradio UI at the root path
+ app = mount_gradio_app(api, chat_ui, path="/")
+
+ if __name__ == "__main__":
+     # For local runs (in Docker, Uvicorn is launched from the CMD)
+     chat_ui.queue(max_size=16).launch(server_name="0.0.0.0", server_port=7860)
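
After this commit the Space exposes plain REST routes (/healthz, /run_predict, /run_predict_with_url) alongside the Gradio UI mounted at "/". A minimal client sketch in Python, assuming the Space is reachable at a hypothetical BASE_URL (replace it with your own Space endpoint); the JSON fields mirror the body.get(...) keys in the new handlers, and the example prompt and URL are arbitrary placeholders:

import requests

BASE_URL = "https://your-space.hf.space"  # hypothetical; replace with your Space's public URL

# Health check added in this commit.
print(requests.get(f"{BASE_URL}/healthz", timeout=30).json())  # expected: {"ok": true}

# /run_predict takes a JSON body with "prompt" and an optional "system" override.
r = requests.post(f"{BASE_URL}/run_predict",
                  json={"prompt": "Explica la profundidad de tránsito de un exoplaneta."},
                  timeout=300)
print(r.json()["reply"])

# /run_predict_with_url also accepts "url"; the server fetches the page and
# appends its text to the prompt as [CONTEXTO_WEB].
r = requests.post(f"{BASE_URL}/run_predict_with_url",
                  json={"prompt": "Resume esta página.",
                        "url": "https://en.wikipedia.org/wiki/Transit_photometry"},
                  timeout=300)
print(r.json()["reply"])

Since the ASGI object exported by app.py is app (the mount_gradio_app result), the Docker CMD referenced in the final comment would presumably be something like uvicorn app:app --host 0.0.0.0 --port 7860; that Dockerfile is not part of this diff.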