akra35567 committed
Commit 7d5f64a · 1 Parent(s): 07fb462

Update modules/local_llm.py

Files changed (1):
  1. modules/local_llm.py  +44 -79
modules/local_llm.py CHANGED
@@ -1,107 +1,72 @@
  # modules/local_llm.py
  import os
- import threading
  from loguru import logger
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
- from peft import PeftModel
-
- MODEL_DIR = "/app/models/hermes-7b"
- FINETUNED_DIR = "/app/data/finetuned_hermes"
-
- _HERMES_GLOBAL = None
- _HERMES_LOCK = threading.Lock()
-

- def _get_hermes_singleton():
-     global _HERMES_GLOBAL
-     if _HERMES_GLOBAL is not None:
-         logger.debug("Reusando Hermes 7B FP16 global")
-         return _HERMES_GLOBAL

-     with _HERMES_LOCK:
-         if _HERMES_GLOBAL is not None:
-             return _HERMES_GLOBAL

-         logger.info("Carregando Hermes 7B FP16 + offload (8 GB RAM)...")

-         required = ["config.json", "model.safetensors.index.json"]
-         missing = [f for f in required if not os.path.exists(f"{MODEL_DIR}/{f}")]
-         if missing:
-             logger.error(f"FALTANDO: {missing}")
-             return None

-         shards = [f for f in os.listdir(MODEL_DIR) if f.endswith(".safetensors")]
-         if len(shards) != 4:
-             logger.error(f"SHARDS: {len(shards)}/4")
              return None

          try:
-             tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
-             if tokenizer.pad_token is None:
-                 tokenizer.pad_token = tokenizer.eos_token
-
-             model = AutoModelForCausalLM.from_pretrained(
-                 MODEL_DIR,
-                 torch_dtype="auto",  # fp16 se tiver GPU, senão bfloat16
-                 device_map="cpu",
-                 low_cpu_mem_usage=True,
-                 offload_folder="/tmp/offload",
-                 offload_state_dict=True
              )

-             if os.path.exists(f"{FINETUNED_DIR}/adapter_config.json"):
-                 logger.info("Carregando LoRA angolano...")
-                 model = PeftModel.from_pretrained(model, FINETUNED_DIR)
-                 logger.info("LoRA ATIVO!")
-
-             _HERMES_GLOBAL = (model, tokenizer)
-             logger.info("Hermes 7B FP16 GLOBAL carregado com sucesso!")
-             return _HERMES_GLOBAL

          except Exception as e:
-             logger.error(f"ERRO CRÍTICO: {e}")
-             import traceback
-             logger.error(traceback.format_exc())
              return None

-
  class LocalLLM:
      def __init__(self):
-         self.generator = None
-         self._load_pipeline()
-
-     def _load_pipeline(self):
-         result = _get_hermes_singleton()
-         if not result:
-             logger.error("Hermes off → usando API")
-             self.generator = None
-             return
-
-         model, tokenizer = result
-         self.generator = pipeline(
-             "text-generation",
-             model=model,
-             tokenizer=tokenizer,
-             max_new_tokens=256,
-             temperature=0.8,
-             do_sample=True,
-             repetition_penalty=1.1,
-             return_full_text=False
-         )
-         logger.info("Pipeline LOCAL FP16 + LoRA → ONLINE!")

      def is_available(self) -> bool:
-         return self.generator is not None

-     def generate(self, prompt: str, max_tokens: int = 256, temperature: float = 0.8) -> str:
          if not self.is_available():
-             return "Modelo local off, kota."
-
          try:
-             out = self.generator(prompt, max_new_tokens=max_tokens, temperature=temperature)[0]["generated_text"]
-             return out.strip()
          except Exception as e:
              logger.error(f"Geração falhou: {e}")
-             return "Buguei, puto."

  HermesLLM = LocalLLM
 
  # modules/local_llm.py
  import os
  from loguru import logger
+ from llama_cpp import Llama
+ import threading

+ MODEL_PATH = "/app/models/openhermes-2.5-mistral-7b.Q4_K_M.gguf"
+ FINETUNED_PATH = "/app/data/finetuned_hermes"

+ _llm_global = None
+ _lock = threading.Lock()

+ def _get_llm():
+     global _llm_global
+     if _llm_global is not None:
+         return _llm_global

+     with _lock:
+         if _llm_global is not None:
+             return _llm_global

+         logger.info("Carregando OpenHermes-2.5-Mistral-7B Q4_K_M (4.8 GB RAM)...")
+         if not os.path.exists(MODEL_PATH):
+             logger.error("GGUF não encontrado! Baixa com snapshot_download.")
              return None

          try:
+             # llama.cpp can only apply a LoRA that has already been converted to
+             # GGUF; it cannot load a PEFT adapter directory. The adapter file
+             # name used here is an assumption about that conversion step.
+             lora_file = f"{FINETUNED_PATH}/adapter_model.gguf"
+             if os.path.exists(lora_file):
+                 logger.info("LoRA angolano DETECTADO → aplicando...")
+
+             llm = Llama(
+                 model_path=MODEL_PATH,
+                 n_ctx=4096,
+                 n_threads=4,
+                 n_gpu_layers=0,  # CPU only
+                 n_batch=512,
+                 lora_path=lora_file if os.path.exists(lora_file) else None,
+                 verbose=False
              )

+             _llm_global = llm
+             logger.info("OpenHermes 2.5 Q4_K_M + LoRA → ONLINE EM 4.8 GB!")
+             return llm
          except Exception as e:
+             logger.error(f"ERRO GGUF: {e}")
              return None

  class LocalLLM:
      def __init__(self):
+         self.llm = _get_llm()

      def is_available(self) -> bool:
+         return self.llm is not None

+     def generate(self, prompt: str, max_tokens: int = 256) -> str:
          if not self.is_available():
+             return "Modelo local off, usando API."
          try:
+             output = self.llm(
+                 prompt,
+                 max_tokens=max_tokens,
+                 temperature=0.8,
+                 top_p=0.9,
+                 repeat_penalty=1.1,
+                 stop=["</s>", "User:", "Assistant:"]
+             )
+             return output["choices"][0]["text"].strip()
          except Exception as e:
              logger.error(f"Geração falhou: {e}")
+             return "Buguei, kota."

  HermesLLM = LocalLLM
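
If the GGUF is missing, the new loader only logs "GGUF não encontrado! Baixa com snapshot_download." and returns None. A minimal download sketch using huggingface_hub.snapshot_download, assuming the quantized weights come from the TheBloke/OpenHermes-2.5-Mistral-7B-GGUF repo (the source repo is not named in this commit):

    # Sketch only: fetch the Q4_K_M GGUF into the path the module expects.
    # The repo id is an assumption; the filename and local_dir match MODEL_PATH above.
    from huggingface_hub import snapshot_download

    snapshot_download(
        repo_id="TheBloke/OpenHermes-2.5-Mistral-7B-GGUF",
        allow_patterns=["openhermes-2.5-mistral-7b.Q4_K_M.gguf"],
        local_dir="/app/models",
    )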
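
And a short usage sketch of the rewritten class, assuming the weights above are in place (the prompt is illustrative; when the model is unavailable the caller receives the "Modelo local off, usando API." string rather than an exception):

    from modules.local_llm import HermesLLM

    llm = HermesLLM()  # first construction loads the GGUF once via the module-level singleton
    if llm.is_available():
        print(llm.generate("Summarise what this module does in one sentence.", max_tokens=64))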