akra35567 committed
Commit: d58f0d3 · Parent(s): 9d98256

Update modules/local_llm.py

Files changed (1)
  1. modules/local_llm.py +92 -22
modules/local_llm.py CHANGED
@@ -1,24 +1,54 @@
 # modules/local_llm.py
 import os
 from loguru import logger
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

 MODEL_DIR = "/app/models/hermes-7b"
-FINETUNED = "/app/data/finetuned_hermes"

-class LocalLLM:
-    def __init__(self):
-        self.generator = None
-        self._load_model()

-    def _load_model(self):
         try:
-            logger.info("Carregando Hermes 7B 4-bit (6 GB RAM)...")
             quantization = BitsAndBytesConfig(
                 load_in_4bit=True,
-                bnb_4bit_compute_dtype="float16"
             )
             tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_DIR,
                 device_map="cpu",
@@ -26,29 +56,69 @@ class LocalLLM:
                 low_cpu_mem_usage=True,
                 offload_folder="/tmp/offload"
             )
-            self.generator = pipeline(
-                "text-generation",
-                model=model,
-                tokenizer=tokenizer,
-                max_new_tokens=256,
-                temperature=0.8,
-                do_sample=True
-            )
-            logger.info("HERMES 7B 4-BIT CARREGADO!")
         except Exception as e:
             logger.error(f"ERRO: {e}")
             self.generator = None

-    def is_available(self):
         return self.generator is not None

-    def generate(self, prompt, max_tokens=256, temperature=0.8):
         if not self.is_available():
-            return "Off, kota."
         try:
-            output = self.generator(prompt, max_new_tokens=max_tokens)[0]["generated_text"]
-            return output.strip()
         except Exception as e:
             return "Buguei, puto."

 HermesLLM = LocalLLM
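The new version below replaces the per-instance `_load_model()` with a module-level cache guarded by a `threading.Lock`, so every `LocalLLM` instance shares one loaded model. As a minimal, standalone sketch of that double-checked-locking pattern, using hypothetical names (`_CACHE`, `get_resource`, `expensive_load`) that are not part of this repository:

import threading

_CACHE = None               # module-level singleton slot (hypothetical name)
_LOCK = threading.Lock()    # serializes the one-time initialization

def expensive_load():
    # stand-in for loading a multi-gigabyte model from disk
    return object()

def get_resource():
    """Return the shared resource, creating it at most once across threads."""
    global _CACHE
    if _CACHE is not None:  # fast path: already initialized, no lock needed
        return _CACHE
    with _LOCK:             # slow path: only one thread performs the load
        if _CACHE is None:  # re-check in case another thread finished first
            _CACHE = expensive_load()
        return _CACHE

The full new version of modules/local_llm.py follows.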
 
 # modules/local_llm.py
 import os
+import threading
 from loguru import logger
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
+from peft import PeftModel

 MODEL_DIR = "/app/models/hermes-7b"
+FINETUNED_DIR = "/app/data/finetuned_hermes"

+# SINGLETON + LOCK
+_HERMES_GLOBAL = None
+_HERMES_LOCK = threading.Lock()
+
+
+def _get_hermes_singleton():
+    """Retorna modelo + tokenizer (4-bit) → singleton global"""
+    global _HERMES_GLOBAL
+    if _HERMES_GLOBAL is not None:
+        logger.debug("Reusando Hermes 7B 4-bit global")
+        return _HERMES_GLOBAL
+
+    with _HERMES_LOCK:
+        if _HERMES_GLOBAL is not None:
+            return _HERMES_GLOBAL
+
+        logger.info("Carregando Hermes 7B 4-bit (6 GB RAM)...")
+
+        # Verifica arquivos
+        required = ["config.json", "model.safetensors.index.json"]
+        missing = [f for f in required if not os.path.exists(f"{MODEL_DIR}/{f}")]
+        if missing:
+            logger.error(f"ARQUIVOS FALTANDO: {missing}")
+            return None
+
+        shards = [f for f in os.listdir(MODEL_DIR) if f.endswith(".safetensors")]
+        if len(shards) != 4:
+            logger.error(f"APENAS {len(shards)} SHARDS! ESPERADO: 4")
+            return None

         try:
             quantization = BitsAndBytesConfig(
                 load_in_4bit=True,
+                bnb_4bit_compute_dtype="float16",
+                bnb_4bit_quant_type="nf4"
             )
+
             tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
+            if tokenizer.pad_token is None:
+                tokenizer.pad_token = tokenizer.eos_token
+
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_DIR,
                 device_map="cpu",
                 … (line 55 not shown in this view)
                 low_cpu_mem_usage=True,
                 offload_folder="/tmp/offload"
             )
+
+            # Carrega LoRA se existir
+            if os.path.exists(f"{FINETUNED_DIR}/adapter_config.json"):
+                logger.info("Aplicando LoRA finetuned...")
+                model = PeftModel.from_pretrained(model, FINETUNED_DIR)
+                logger.info("LoRA ANGOLANO ATIVO!")
+
+            _HERMES_GLOBAL = (model, tokenizer)
+            logger.info("Hermes 7B 4-bit GLOBAL carregado!")
+            return _HERMES_GLOBAL
+
         except Exception as e:
             logger.error(f"ERRO: {e}")
+            import traceback
+            logger.error(traceback.format_exc())
+            return None
+
+
+class LocalLLM:
+    def __init__(self):
+        self.generator = None
+        self._load_pipeline()
+
+    def _load_pipeline(self):
+        result = _get_hermes_singleton()
+        if not result:
+            logger.error("Pipeline off")
             self.generator = None
+            return

+        model, tokenizer = result
+        self.generator = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            max_new_tokens=256,
+            temperature=0.8,
+            do_sample=True,
+            repetition_penalty=1.1,
+            return_full_text=False
+        )
+        logger.info("Pipeline LOCAL com 4-bit + LoRA → ONLINE!")
+
+    def is_available(self) -> bool:
         return self.generator is not None

+    def generate(self, prompt: str, max_tokens: int = 256, temperature: float = 0.8) -> str:
         if not self.is_available():
+            return "Modelo off, kota."
+
         try:
+            output = self.generator(
+                prompt,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                do_sample=temperature > 0.0,
+                repetition_penalty=1.1,
+                return_full_text=False
+            )
+            return output[0]["generated_text"].strip()
         except Exception as e:
+            logger.error(f"Geração falhou: {e}")
             return "Buguei, puto."

+# EXPORTA
 HermesLLM = LocalLLM
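A hedged usage sketch, assuming the package is importable as `modules.local_llm` inside the container and the Hermes shards are present under `/app/models/hermes-7b`; the second instance should reuse the `(model, tokenizer)` pair cached by `_get_hermes_singleton()` rather than reloading the weights:

from modules.local_llm import HermesLLM

llm_a = HermesLLM()   # first construction triggers the 4-bit load (plus the LoRA adapter, if present)
llm_b = HermesLLM()   # second construction reuses the cached model and tokenizer

if llm_a.is_available():
    print(llm_a.generate("Summarize 4-bit quantization in one sentence.", max_tokens=64, temperature=0.7))
else:
    print("Model unavailable; check the shard/config checks reported in the logs.")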