# modules/local_llm.py
import os
import threading
import traceback

from loguru import logger
from llama_cpp import Llama
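# Base GGUF model and optional fine-tuned LoRA adapter location.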
MODEL_PATH = "/home/user/models/openhermes-2.5-mistral-7b.Q4_K_M.gguf"
FINETUNED_PATH = "/home/user/data/finetuned_hermes"
_llm_global = None
_lock = threading.Lock()
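# The GGUF model is loaded once and shared by every LocalLLM instance;
# double-checked locking keeps concurrent callers from loading it twice.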
def _get_llm():
global _llm_global
if _llm_global is not None:
return _llm_global
with _lock:
if _llm_global is not None:
return _llm_global
logger.info("CARREGANDO HERMES 7B TURBO → 8-12 SEGUNDOS MÁXIMO!")
if not os.path.exists(MODEL_PATH):
logger.error("GGUF NÃO ENCONTRADO!")
return None
try:
llm = Llama(
model_path=MODEL_PATH,
n_ctx=2048, # ← MENOS CONTEXTO = MAIS RÁPIDO
n_threads=2, # ← SÓ 2 vCPU NO HF FREE
n_batch=256, # ← MENOR BATCH = MENOS MEMÓRIA
n_gpu_layers=0,
verbose=False,
logits_all=True,
use_mlock=True, # ← EVITA SWAP (ACELERA MUITO)
seed=-1,
)
lora_path = f"{FINETUNED_PATH}/adapter_model.bin"
if os.path.exists(lora_path):
logger.info("LORA ANGOLANO CARREGADO → SOTAQUE LUANDA TURBO!")
llm.load_lora(lora_path)
_llm_global = llm
logger.info("HERMES 7B TURBO ONLINE → 8-12s POR RESPOSTA!")
return llm
except Exception as e:
logger.error(f"ERRO HERMES: {e}")
import traceback
logger.error(traceback.format_exc())
return None
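# Thin wrapper used by the rest of the app: it exposes an availability check
# and short, persona-flavoured generation on top of the shared Llama instance.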
class LocalLLM:
def __init__(self):
self.llm = _get_llm()
def is_available(self) -> bool:
return self.llm is not None
    def generate(self, prompt: str, max_tokens: int = 35) -> str:  # 35 tokens ≈ 8 seconds
if not self.is_available():
return "Akira off, kota."
try:
full_prompt = (
"<|im_start|>system\n"
"Akira é kota fixe de Luanda. Responde bué rápido, curto e com gírias angolanas.\n"
"<|im_end|>\n"
"<|im_start|>user\n"
f"{prompt}\n"
"<|im_end|>\n"
"<|im_start|>assistant\n"
)
logger.info(f"[HERMES TURBO] Gerando {max_tokens} tokens → 8-12s MÁXIMO!")
output = self.llm(
full_prompt,
max_tokens=max_tokens,
temperature=0.9,
top_p=0.95,
repeat_penalty=1.12,
stop=["<|im_end|>", "User:", "Assistant:"],
echo=False
)
text = output["choices"][0]["text"].strip()
            # "Continue" hint: longer answers tell the user they can ask for more
            if len(text.split()) > 12:
                text += "\n\n*continua* pra mais, kota! 😎"
logger.info(f"[HERMES 8s] {text[:100]}...")
return text
except Exception as e:
logger.error(f"HERMES BUG: {e}")
return "Buguei, tenta de novo."
HermesLLM = LocalLLM()
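
# Minimal manual smoke test, assuming this module is run directly inside the
# Space and the paths above exist; application code would normally just import
# HermesLLM. The prompt below is only an illustrative placeholder.
if __name__ == "__main__":
    if HermesLLM.is_available():
        print(HermesLLM.generate("Say hi like a kota from Luanda."))
    else:
        print("Model unavailable; check MODEL_PATH.")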