# modules/local_llm.py
import os
import threading

from loguru import logger
from llama_cpp import Llama

MODEL_PATH = "/home/user/models/openhermes-2.5-mistral-7b.Q4_K_M.gguf"
FINETUNED_PATH = "/home/user/data/finetuned_hermes"

_llm_global = None
_lock = threading.Lock()
def _get_llm():
    """Load the GGUF model once; thread-safe via double-checked locking."""
    global _llm_global
    if _llm_global is not None:
        return _llm_global
    with _lock:
        # Re-check inside the lock: another thread may have finished loading.
        if _llm_global is not None:
            return _llm_global
        logger.info("Loading Hermes 7B Turbo (8-12 seconds max)")
        if not os.path.exists(MODEL_PATH):
            logger.error(f"GGUF not found: {MODEL_PATH}")
            return None
        try:
            # llama-cpp-python takes the LoRA adapter at construction time via
            # the lora_path argument; Llama has no load_lora() method, so the
            # original post-construction call would have raised AttributeError.
            lora_path = f"{FINETUNED_PATH}/adapter_model.bin"
            has_lora = os.path.exists(lora_path)
            llm = Llama(
                model_path=MODEL_PATH,
                n_ctx=2048,       # less context = faster prompt processing
                n_threads=2,      # only 2 vCPUs on the HF free tier
                n_batch=256,      # smaller batch = less memory
                n_gpu_layers=0,   # CPU-only Space
                verbose=False,
                use_mlock=True,   # pin weights in RAM to avoid swap (big speedup)
                seed=-1,
                lora_path=lora_path if has_lora else None,
            )
            # logits_all=True was dropped: it stores logits for every position,
            # which wastes memory and is only needed for per-token scoring.
            if has_lora:
                logger.info("Angolan LoRA loaded (Luanda accent)")
            _llm_global = llm
            logger.info("Hermes 7B Turbo online (~8-12 s per reply)")
            return llm
        except Exception as e:
            logger.error(f"Hermes load error: {e}")
            import traceback
            logger.error(traceback.format_exc())
            return None
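
# Because the first call to _get_llm() blocks for the full load time, a small
# warm-up helper can hide that latency behind app startup. This helper is not
# part of the original module; it is a minimal sketch reusing the same
# lock-guarded loader above.
def preload_llm_async() -> threading.Thread:
    """Start loading the model in a background thread; join() to wait for it."""
    t = threading.Thread(target=_get_llm, daemon=True)
    t.start()
    return t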
class LocalLLM:
    def __init__(self):
        self.llm = _get_llm()

    def is_available(self) -> bool:
        return self.llm is not None

    def generate(self, prompt: str, max_tokens: int = 35) -> str:
        # ~35 tokens keeps a CPU-only reply near the 8-second target.
        if not self.is_available():
            return "Akira off, kota."
        try:
            # ChatML prompt format, as used by OpenHermes 2.5.
            # System prompt (Angolan Portuguese): "Akira is a cool elder from
            # Luanda. Answer very fast, short, and with Angolan slang."
            full_prompt = (
                "<|im_start|>system\n"
                "Akira é kota fixe de Luanda. Responde bué rápido, curto e com gírias angolanas.\n"
                "<|im_end|>\n"
                "<|im_start|>user\n"
                f"{prompt}\n"
                "<|im_end|>\n"
                "<|im_start|>assistant\n"
            )
            logger.info(f"[HERMES TURBO] Generating up to {max_tokens} tokens (8-12 s max)")
            output = self.llm(
                full_prompt,
                max_tokens=max_tokens,
                temperature=0.9,
                top_p=0.95,
                repeat_penalty=1.12,
                stop=["<|im_end|>", "User:", "Assistant:"],
                echo=False,
            )
            text = output["choices"][0]["text"].strip()
            # "Continue" button: nudge the user to ask for more after long replies.
            if len(text.split()) > 12:
                text += "\n\n*continua* pra mais, kota! 😎"
            logger.info(f"[HERMES] {text[:100]}...")
            return text
        except Exception as e:
            logger.error(f"Hermes generation error: {e}")
            return "Buguei, tenta de novo."


# Module-level singleton. Note this loads the model at import time, not lazily
# on first use, so importing this module blocks for the full load.
HermesLLM = LocalLLM()
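
# Minimal usage sketch, runnable as `python -m modules.local_llm` (assumes the
# GGUF file exists at MODEL_PATH; the prompt below is only illustrative):
if __name__ == "__main__":
    if HermesLLM.is_available():
        print(HermesLLM.generate("Diz olá numa frase."))
    else:
        print("Model unavailable; check MODEL_PATH.")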