CyberCoder225 committed
Commit 33ce563 · verified · 1 Parent(s): d5fba0e

Update brain.py

Files changed (1):
  1. brain.py +31 -25
brain.py CHANGED
@@ -1,57 +1,63 @@
 import os
 import gc
+import llama_cpp
 from llama_cpp import Llama

 class MairaBrain:
     def __init__(self, repo_id, filename):
         self.repo_id = repo_id
         self.filename = filename
-        self.llm = None  # Model starts "off" to save RAM
+        self.llm = None

     def load(self):
-        """Wakes the core only when needed"""
+        """Wakes the core with Turbo settings"""
         if self.llm is None:
-            print(f"🧠 WAKING CORE: {self.filename}")
-            # Ensure the path points to the root where Docker downloaded the files
+            print(f"🚀 TURBO LOADING: {self.filename}")
             model_path = os.path.join("/app", self.filename)

-            # Optimized for 16GB RAM limit
             self.llm = Llama(
                 model_path=model_path,
-                n_ctx=2048,      # Standard context window
-                n_threads=4,     # Good for Hugging Face CPUs
-                use_mmap=False,  # Set to False to allow full RAM release on unload
-                n_gpu_layers=0   # CPU only
+                # 🏎️ SPEED TRICK 1: A smaller context window (512) means less prompt to process, so the reply starts sooner
+                n_ctx=512,
+                # 🏎️ SPEED TRICK 2: Match Hugging Face's physical CPU cores (usually 4)
+                n_threads=4,
+                # 🏎️ SPEED TRICK 3: Batch size for prompt processing
+                n_batch=512,
+                # 🏎️ SPEED TRICK 4: Quantize the KV cache to Q8_0 (roughly half the size of FP16, so less data moves through RAM)
+                type_k=llama_cpp.GGML_TYPE_Q8_0,
+                type_v=llama_cpp.GGML_TYPE_Q8_0,
+                # 🏎️ SPEED TRICK 5: Flash Attention (if supported by the specific model)
+                flash_attn=True,
+                use_mmap=True,
+                use_mlock=False,
+                verbose=False
             )

     def unload(self):
-        """THE FIX: Puts the core to sleep and clears RAM"""
+        """Clears the tracks for the next runner"""
         if self.llm is not None:
-            print(f"💤 SLEEPING CORE: {self.filename}")
-            # 1. Close the internal C++ handles
+            print(f"🧹 CLEARING CACHE: {self.filename}")
             try:
                 self.llm.close()
-            except Exception as e:
-                print(f"Error closing llm: {e}")
-
-            # 2. Delete the object reference
+            except Exception:
+                pass
             del self.llm
             self.llm = None
-
-            # 3. Force Python's Garbage Collector to wipe the memory
             gc.collect()

     def get_response(self, user_id, user_input):
-        # Always ensure the model is loaded before inference
        self.load()

-        # Identity prompt to keep Maira consistent
-        prompt = f"System: You are Maira, a loyal AI created by CyberCoder225.\nUser: {user_input}\nAssistant:"
+        # Keep the prompt short: long prompts increase the time to first token
+        prompt = f"Maira: I am a high-speed AI core.\nUser: {user_input}\nMaira:"

+        # Generate the reply
         output = self.llm(
             prompt,
-            max_tokens=256,
-            stop=["User:", "\n"]
+            max_tokens=128,        # Short responses feel faster
+            stop=["User:", "\n"],
+            temperature=0.7,
+            repeat_penalty=1.1
         )

         return output["choices"][0]["text"].strip()
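
As a rough check on SPEED TRICK 4, the sketch below estimates KV-cache memory for a hypothetical 32-layer model with a 4096-wide hidden state and no grouped-query attention; the layer count, the width, and the 34-bytes-per-32-values Q8_0 block size are illustrative assumptions, not values read from the model in this repo:

# Back-of-the-envelope KV-cache size, f16 vs Q8_0 (illustrative numbers only)
n_layers = 32      # assumed layer count
n_embd   = 4096    # assumed hidden size (no grouped-query attention)
n_ctx    = 512     # matches the n_ctx set in brain.py above

def kv_cache_bytes(bytes_per_value):
    # K and V each hold n_ctx * n_embd values per layer
    return 2 * n_layers * n_ctx * n_embd * bytes_per_value

f16_size  = kv_cache_bytes(2.0)      # default f16 cache: 2 bytes per value
q8_0_size = kv_cache_bytes(34 / 32)  # Q8_0 block: 32 int8 values plus a 2-byte scale

print(f"f16  KV cache: {f16_size / 2**20:.0f} MiB")   # ~256 MiB
print(f"Q8_0 KV cache: {q8_0_size / 2**20:.0f} MiB")  # ~136 MiB, roughly half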
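
For context, here is a minimal sketch of how the updated class might be driven from the rest of the Space. The repo_id and filename values are placeholders, and it assumes the GGUF file has already been downloaded to /app, which is where load() looks for it:

from brain import MairaBrain

# Hypothetical repo/filename values -- substitute the GGUF actually shipped with the Space
brain = MairaBrain(repo_id="your-org/your-model-gguf", filename="model.Q4_K_M.gguf")

reply = brain.get_response(user_id="demo", user_input="Hello, Maira!")  # load() runs on the first call
print(reply)

brain.unload()  # release RAM before a different core is loaded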