#!/usr/bin/env python3 # -*- coding: utf-8 -*- import os, glob, textwrap from pathlib import Path import gradio as gr from huggingface_hub import snapshot_download from llama_cpp import Llama import requests from bs4 import BeautifulSoup # ===== Modelo (GGUF) ===== MODEL_REPO = os.getenv("MODEL_REPO", "Qwen/Qwen2.5-7B-Instruct-GGUF") # Para CPU Basic puedes bajar a q3_k_m si te falta RAM MODEL_PATTERN = os.getenv("MODEL_PATTERN", "qwen2.5-7b-instruct-q4_k_m-*.gguf") LOCAL_DIR = Path("models"); LOCAL_DIR.mkdir(parents=True, exist_ok=True) print(f"[Boot] Descargando {MODEL_REPO} patrón {MODEL_PATTERN} ...") snapshot_dir = snapshot_download(repo_id=MODEL_REPO, local_dir=str(LOCAL_DIR), allow_patterns=[MODEL_PATTERN]) candidates = sorted(glob.glob(str(Path(snapshot_dir) / MODEL_PATTERN))) if not candidates: raise FileNotFoundError(f"No hay shards para {MODEL_PATTERN} en {snapshot_dir}") MODEL_PATH = candidates[0] print(f"[Boot] Usando shard: {MODEL_PATH}") N_THREADS = max(1, (os.cpu_count() or 2) - 1) llm = Llama( model_path=MODEL_PATH, n_ctx=4096, n_threads=N_THREADS, n_batch=256, n_gpu_layers=0, verbose=False, ) SYSTEM_DEFAULT = textwrap.dedent("""\ Eres Astrohunters-Guide, un asistente en español. - Respondes con precisión y sin inventar datos. - Sabes explicar resultados de exoplanetas (período, duración, profundidad, SNR, radio). - Si te paso una URL, lees su contenido y lo usas como contexto. """) def fetch_url_text(url: str, max_chars: int = 6000) -> str: try: r = requests.get(url, timeout=15) r.raise_for_status() soup = BeautifulSoup(r.text, "html.parser") for t in soup(["script", "style", "noscript"]): t.decompose() txt = " ".join(soup.get_text(separator=" ").split()) return txt[:max_chars] except Exception as e: return f"[No se pudo cargar {url}: {e}]" def run_llm(messages, temperature=0.6, top_p=0.95, max_tokens=768): out = llm.create_chat_completion( messages=messages, temperature=temperature, top_p=top_p, max_tokens=max_tokens, stream=False, ) return out["choices"][0]["message"]["content"].strip() # ====== Funciones API ====== def api_run_predict(prompt: str, system: str = "") -> str: messages = [ {"role": "system", "content": system or SYSTEM_DEFAULT}, {"role": "user", "content": prompt}, ] return run_llm(messages, max_tokens=512) def api_run_predict_with_url(prompt: str, url: str = "", system: str = "") -> str: web_ctx = fetch_url_text(url) if url else "" user_msg = prompt if not web_ctx else f"{prompt}\n\n[CONTEXTO_WEB]\n{web_ctx}" messages = [ {"role": "system", "content": system or SYSTEM_DEFAULT}, {"role": "user", "content": user_msg}, ] return run_llm(messages, max_tokens=700) # ===== UI de chat ===== with gr.Blocks(title="Astrohunters LLM (Qwen2.5 7B)") as chat_ui: gr.Markdown("## 🛰️ Astrohunters LLM (Qwen2.5 7B Instruct, GGUF — CPU Basic)") with gr.Row(): with gr.Column(scale=3): chat = gr.Chatbot(height=420, type="tuples") with gr.Row(): txt = gr.Textbox(placeholder="Escribe tu pregunta...", scale=4) btn = gr.Button("Enviar", scale=1, variant="primary") with gr.Column(scale=2): system_tb = gr.Textbox(label="System prompt", value=SYSTEM_DEFAULT, lines=10) url_tb = gr.Textbox(label="URL (opcional): Cargar contenido web", placeholder="https://...") def chat_infer(history, system_prompt, user, url_to_load): web_ctx = fetch_url_text(url_to_load.strip()) if url_to_load and url_to_load.strip() else "" messages = [{"role": "system", "content": system_prompt or SYSTEM_DEFAULT}] for u, a in history: if u: messages.append({"role": "user", "content": u}) if a: messages.append({"role": "assistant", "content": a}) user_msg = user or "" if web_ctx: user_msg = f"{user_msg}\n\n[CONTEXTO_WEB]\n{web_ctx}" messages.append({"role": "user", "content": user_msg}) reply = run_llm(messages, max_tokens=700) history.append((user, reply)) return history, "" btn.click(chat_infer, inputs=[chat, system_tb, txt, url_tb], outputs=[chat, txt]) txt.submit(chat_infer, inputs=[chat, system_tb, txt, url_tb], outputs=[chat, txt]) # ====== APIs nombradas (fuera del Blocks) ====== api1 = gr.Interface( fn=api_run_predict, inputs=[gr.Textbox(label="prompt"), gr.Textbox(label="system")], outputs=gr.Textbox(label="reply"), api_name="run_predict", ) api2 = gr.Interface( fn=api_run_predict_with_url, inputs=[gr.Textbox(label="prompt"), gr.Textbox(label="url"), gr.Textbox(label="system")], outputs=gr.Textbox(label="reply"), api_name="run_predict_with_url", ) # Unimos todo en un solo demo para que Gradio registre las rutas demo = gr.TabbedInterface( [chat_ui, api1, api2], tab_names=["Chat", "API: run_predict", "API: run_predict_with_url"], ) if __name__ == "__main__": demo.queue(max_size=16).launch(server_name="0.0.0.0", server_port=7860)