# modules/local_llm.py
import os
from loguru import logger
from llama_cpp import Llama
import threading

MODEL_PATH = "/home/user/models/openhermes-2.5-mistral-7b.Q4_K_M.gguf"
FINETUNED_PATH = "/home/user/data/finetuned_hermes"
_llm_global = None
_lock = threading.Lock()
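
# Double-checked locking: the unguarded check below returns the cached model on
# the hot path; the lock only serializes the one-time load.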

def _get_llm():
    global _llm_global
    if _llm_global is not None:
        return _llm_global
    
    with _lock:
        if _llm_global is not None:
            return _llm_global
        
        logger.info("CARREGANDO HERMES 7B TURBO → 8-12 SEGUNDOS MÁXIMO!")
        if not os.path.exists(MODEL_PATH):
            logger.error("GGUF NÃO ENCONTRADO!")
            return None
        
        try:
            lora_path = f"{FINETUNED_PATH}/adapter_model.bin"
            has_lora = os.path.exists(lora_path)

            llm = Llama(
                model_path=MODEL_PATH,
                n_ctx=2048,           # smaller context window = faster prompt processing
                n_threads=2,          # HF free tier exposes only 2 vCPUs
                n_batch=256,          # smaller batch = lower memory footprint
                n_gpu_layers=0,       # CPU-only inference
                # llama-cpp-python applies a LoRA at load time via the constructor
                # (Llama has no load_lora() method); note llama.cpp expects the
                # adapter in GGUF/GGML format, not a raw PEFT .bin
                lora_path=lora_path if has_lora else None,
                verbose=False,
                use_mlock=True,       # lock weights in RAM to avoid swapping
                seed=-1,
            )
            if has_lora:
                logger.info("Angolan LoRA applied → Luanda-slang fine-tune active")

            _llm_global = llm
            logger.info("Hermes 7B online → expect roughly 8-12 s per response")
            return llm
            
        except Exception as e:
            logger.error(f"ERRO HERMES: {e}")
            import traceback
            logger.error(traceback.format_exc())
            return None

class LocalLLM:
    def __init__(self):
        self.llm = _get_llm()
    
    def is_available(self) -> bool:
        return self.llm is not None

    def generate(self, prompt: str, max_tokens: int = 35) -> str:  # ~35 tokens keeps CPU latency near 8 s
        if not self.is_available():
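            # slang fallback when the model never loaded ("Akira's offline, buddy")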
            return "Akira off, kota."
        
        try:
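            # ChatML prompt; the system message (in Angolan Portuguese) tells the
            # model to answer fast, short, and in Luanda slang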
            full_prompt = (
                "<|im_start|>system\n"
                "Akira é kota fixe de Luanda. Responde bué rápido, curto e com gírias angolanas.\n"
                "<|im_end|>\n"
                "<|im_start|>user\n"
                f"{prompt}\n"
                "<|im_end|>\n"
                "<|im_start|>assistant\n"
            )

            logger.info(f"[HERMES TURBO] Gerando {max_tokens} tokens → 8-12s MÁXIMO!")

            output = self.llm(
                full_prompt,
                max_tokens=max_tokens,
                temperature=0.9,
                top_p=0.95,
                repeat_penalty=1.12,
                stop=["<|im_end|>", "User:", "Assistant:"],
                echo=False
            )

            text = output["choices"][0]["text"].strip()
            
            # "Continue" teaser: replies longer than ~12 words get a follow-up
            # hint in slang ("*continues* for more, buddy!")
            if len(text.split()) > 12:
                text += "\n\n*continua* pra mais, kota! 😎"

            logger.info(f"[HERMES 8s] {text[:100]}...")
            return text

        except Exception as e:
            logger.error(f"HERMES BUG: {e}")
            return "Buguei, tenta de novo."

HermesLLM = LocalLLM()
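
# Minimal usage sketch. Assumes the GGUF file at MODEL_PATH exists; note that
# importing this module already triggers the model load via the HermesLLM
# singleton above.
if __name__ == "__main__":
    if HermesLLM.is_available():
        print(HermesLLM.generate("Como é que está o tempo em Luanda?"))
    else:
        print("Model not loaded; check MODEL_PATH.")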