maira-chaty

Sleeping

App Files Community

CyberCoder225 committed 13 days ago

Commit

33ce563

verified ·

1 Parent(s): d5fba0e

Update brain.py

Browse files

Files changed (1) hide show

brain.py +31 -25

brain.py CHANGED Viewed

@@ -1,57 +1,63 @@
 import os
 import gc
 from llama_cpp import Llama
 class MairaBrain:
     def __init__(self, repo_id, filename):
         self.repo_id = repo_id
         self.filename = filename
-        self.llm = None  # Model starts "off" to save RAM
     def load(self):
-        """Wakes the core only when needed"""
         if self.llm is None:
-            print(f"🧠 WAKING CORE: {self.filename}")
-            # Ensure the path points to the root where Docker downloaded the files
             model_path = os.path.join("/app", self.filename)
-            # Optimized for 16GB RAM limit
             self.llm = Llama(
                 model_path=model_path,
-                n_ctx=2048,      # Standard context window
-                n_threads=4,     # Good for Hugging Face CPUs
-                use_mmap=False,  # Set to False to allow full RAM release on unload
-                n_gpu_layers=0   # CPU only
             )
     def unload(self):
-        """THE FIX: Puts the core to sleep and clears RAM"""
         if self.llm is not None:
-            print(f"💤 SLEEPING CORE: {self.filename}")
-            # 1. Close the internal C++ handles
             try:
-                self.llm.close()
-            except Exception as e:
-                print(f"Error closing llm: {e}")
-            # 2. Delete the object reference
             del self.llm
             self.llm = None
-            # 3. Force Python's Garbage Collector to wipe the memory
             gc.collect()
     def get_response(self, user_id, user_input):
-        # Always ensure the model is loaded before inference
         self.load()
-        # Identity prompt to keep Maira consistent
-        prompt = f"System: You are Maira, a loyal AI created by CyberCoder225.\\nUser: {user_input}\\nAssistant:"
         output = self.llm(
-            prompt,
-            max_tokens=256,
-            stop=["User:", "\\n"]
         )
         return output["choices"][0]["text"].strip()

 import os
 import gc
+import llama_cpp
 from llama_cpp import Llama
 class MairaBrain:
     def __init__(self, repo_id, filename):
         self.repo_id = repo_id
         self.filename = filename
+        self.llm = None
     def load(self):
+        """Wakes the core with Turbo settings"""
         if self.llm is None:
+            print(f"🚀 TURBO LOADING: {self.filename}")
             model_path = os.path.join("/app", self.filename)
             self.llm = Llama(
                 model_path=model_path,
+                # 🏎️ SPEED TRICK 1: Smaller context (512) makes response start INSTANTLY
+                n_ctx=512,
+                # 🏎️ SPEED TRICK 2: Match HF's physical CPU cores (usually 4)
+                n_threads=4,
+                # 🏎️ SPEED TRICK 3: Batch processing size
+                n_batch=512,
+                # 🏎️ SPEED TRICK 4: Quantize the KV Cache (Moves 50% less data through RAM)
+                type_k=llama_cpp.GGML_TYPE_Q8_0,
+                type_v=llama_cpp.GGML_TYPE_Q8_0,
+                # 🏎️ SPEED TRICK 5: Flash Attention (if supported by the specific model)
+                flash_attn=True,
+                use_mmap=True,
+                use_mlock=False,
+                verbose=False
             )
     def unload(self):
+        """Clears the tracks for the next runner"""
         if self.llm is not None:
+            print(f"🧹 CLEARING CACHE: {self.filename}")
             try:
+                self.llm.close()
+            except:
+                pass
             del self.llm
             self.llm = None
             gc.collect()
     def get_response(self, user_id, user_input):
         self.load()
+        # Keep the prompt short. Long prompts slow down the "Time to First Token"
+        prompt = f"Maira: I am a high-speed AI core.\\nUser: {user_input}\\nMaira:"
+        # generate tokens
         output = self.llm(
+            prompt,
+            max_tokens=128, # Short responses feel faster
+            stop=["User:", "\\n"],
+            temperature=0.7,
+            repeat_penalty=1.1
         )
         return output["choices"][0]["text"].strip()