dungeon29 committed
Commit 3c22ddb · verified · 1 Parent(s): 4479af2

Update llm_client.py

Files changed (1)
  1. llm_client.py +104 -75
llm_client.py CHANGED
@@ -3,72 +3,69 @@ import requests
  import subprocess
  import tarfile
  import stat
+ import time
+ import atexit
  from huggingface_hub import hf_hub_download
  from langchain_core.language_models import LLM
  from langchain.chains import RetrievalQA
  from langchain_core.prompts import PromptTemplate
  from typing import Any, List, Optional, Mapping

- # --- Helper to Setup llama-cli ---
- def setup_llama_cli():
+ # --- Helper to Setup llama-server ---
+ def setup_llama_binaries():
      """
-     Download and extract llama-cli binary and libs from official releases
+     Download and extract llama-server binary and libs from official releases
      """
      # Latest release URL for Linux x64 (b4991 equivalent or newer)
-     # Using the one found: b7312
      CLI_URL = "https://github.com/ggml-org/llama.cpp/releases/download/b7312/llama-b7312-bin-ubuntu-x64.tar.gz"
      LOCAL_TAR = "llama-cli.tar.gz"
-     BIN_DIR = "./llama_bin" # Extract to a subdirectory
-     CLI_BIN = os.path.join(BIN_DIR, "bin/llama-cli") # Standard structure usually has bin/
+     BIN_DIR = "./llama_bin"
+     SERVER_BIN = os.path.join(BIN_DIR, "bin/llama-server") # Look for server binary

-     if os.path.exists(CLI_BIN):
-         return CLI_BIN, BIN_DIR
+     if os.path.exists(SERVER_BIN):
+         return SERVER_BIN, BIN_DIR

      try:
-         print("⬇️ Downloading llama-cli binary...")
+         print("⬇️ Downloading llama.cpp binaries...")
          response = requests.get(CLI_URL, stream=True)
          if response.status_code == 200:
              with open(LOCAL_TAR, 'wb') as f:
                  for chunk in response.iter_content(chunk_size=8192):
                      f.write(chunk)

-             print("πŸ“¦ Extracting llama-cli...")
-             # Create dir
+             print("πŸ“¦ Extracting binaries...")
              os.makedirs(BIN_DIR, exist_ok=True)

              with tarfile.open(LOCAL_TAR, "r:gz") as tar:
                  tar.extractall(path=BIN_DIR)

-             # Locate the binary (it might be in bin/ or root of tar)
-             # We search for it
+             # Locate llama-server
              found_bin = None
              for root, dirs, files in os.walk(BIN_DIR):
-                 if "llama-cli" in files:
-                     found_bin = os.path.join(root, "llama-cli")
+                 if "llama-server" in files:
+                     found_bin = os.path.join(root, "llama-server")
                      break

              if not found_bin:
-                 print("❌ Could not find llama-cli in extracted files.")
+                 print("❌ Could not find llama-server in extracted files.")
                  return None, None

              # Make executable
              st = os.stat(found_bin)
              os.chmod(found_bin, st.st_mode | stat.S_IEXEC)
-             print(f"βœ… llama-cli binary ready at {found_bin}!")
+             print(f"βœ… llama-server binary ready at {found_bin}!")
              return found_bin, BIN_DIR
          else:
-             print(f"❌ Failed to download binary: {response.status_code}")
+             print(f"❌ Failed to download binaries: {response.status_code}")
              return None, None
      except Exception as e:
-         print(f"❌ Error setting up llama-cli: {e}")
+         print(f"❌ Error setting up llama-server: {e}")
          return None, None

  # --- Custom LangChain LLM Wrapper for Hybrid Approach ---
  class HybridLLM(LLM):
      api_url: str = ""
-     model_path: str = ""
-     cli_path: str = ""
-     lib_path: str = "" # Path to folder containing .so files
+     local_server_url: str = "http://localhost:8080"

      @property
      def _llm_type(self) -> str:
@@ -91,70 +88,48 @@ class HybridLLM(LLM):
          except Exception as e:
              print(f"⚠️ API Connection Failed: {e}")

-         # 2. Fallback to Local llama-cli
-         if self.model_path and self.cli_path and os.path.exists(self.cli_path):
-             print("πŸ’» Using Local llama-cli Fallback...")
-             try:
-                 # Construct command
-                 cmd = [
-                     self.cli_path,
-                     "-m", self.model_path,
-                     "-p", prompt,
-                     "-n", "512",
-                     "--temp", "0.7",
-                     "--no-display-prompt", # Don't echo prompt
-                     "-c", "2048" # Context size
-                 ]
-
-                 # Setup Environment with LD_LIBRARY_PATH
-                 env = os.environ.copy()
-                 # Add the directory containing the binary (and likely libs) to LD_LIBRARY_PATH
-                 # Also check 'lib' subdir if it exists
-                 lib_paths = [os.path.dirname(self.cli_path)]
-                 lib_subdir = os.path.join(self.lib_path, "lib")
-                 if os.path.exists(lib_subdir):
-                     lib_paths.append(lib_subdir)
-
-                 env["LD_LIBRARY_PATH"] = ":".join(lib_paths) + ":" + env.get("LD_LIBRARY_PATH", "")
-
-                 # Run binary
-                 result = subprocess.run(
-                     cmd,
-                     capture_output=True,
-                     text=True,
-                     encoding='utf-8',
-                     errors='replace',
-                     env=env
-                 )
-
-                 if result.returncode == 0:
-                     return result.stdout.strip()
-                 else:
-                     return f"❌ llama-cli Error: {result.stderr}"
-             except Exception as e:
-                 return f"❌ Local Inference Failed: {e}"
+         # 2. Fallback to Local Server
+         print("πŸ’» Using Local llama-server Fallback...")
+         try:
+             # OpenAI-compatible completion endpoint
+             payload = {
+                 "prompt": prompt,
+                 "n_predict": 512,
+                 "temperature": 0.7,
+                 "stop": stop or []
+             }
+             response = requests.post(
+                 f"{self.local_server_url}/completion",
+                 json=payload,
+                 timeout=60
+             )
+             if response.status_code == 200:
+                 return response.json()["content"]
+             else:
+                 return f"❌ Local Server Error: {response.text}"
+         except Exception as e:
+             return f"❌ Local Inference Failed: {e}"

-         return "❌ Error: No working LLM available (API failed and no local model)."
+         return "❌ Error: No working LLM available."

      @property
      def _identifying_params(self) -> Mapping[str, Any]:
-         return {"api_url": self.api_url, "model_path": self.model_path}
+         return {"api_url": self.api_url, "local_server_url": self.local_server_url}

  class LLMClient:
      def __init__(self, vector_store=None):
          """
-         Initialize Hybrid LLM Client with Binary Wrapper
+         Initialize Hybrid LLM Client with Persistent Server
          """
          self.vector_store = vector_store
          self.api_url = os.environ.get("COLAB_API_URL", "")
-         self.model_path = None
-         self.cli_path = None
-         self.lib_path = None
+         self.server_process = None
+         self.server_port = 8080

          # Setup Local Fallback
          try:
              # 1. Setup Binary
-             self.cli_path, self.lib_path = setup_llama_cli()
+             self.server_bin, self.lib_path = setup_llama_binaries()

              # 2. Download Model (Qwen3-0.6B)
              print("πŸ“‚ Loading Local Qwen3-0.6B (GGUF)...")
@@ -167,17 +142,71 @@ class LLMClient:
              )
              print(f"βœ… Model downloaded to: {self.model_path}")

+             # 3. Start Server
+             self.start_local_server()
+
          except Exception as e:
              print(f"⚠️ Could not setup local fallback: {e}")

          # Create Hybrid LangChain Wrapper
          self.llm = HybridLLM(
-             api_url=self.api_url,
-             model_path=self.model_path,
-             cli_path=self.cli_path,
-             lib_path=self.lib_path
+             api_url=self.api_url,
+             local_server_url=f"http://localhost:{self.server_port}"
          )

+     def start_local_server(self):
+         """Start llama-server in background"""
+         if not self.server_bin or not self.model_path:
+             return
+
+         print("πŸš€ Starting llama-server...")
+
+         # Setup Env
+         env = os.environ.copy()
+         lib_paths = [os.path.dirname(self.server_bin)]
+         lib_subdir = os.path.join(self.lib_path, "lib")
+         if os.path.exists(lib_subdir):
+             lib_paths.append(lib_subdir)
+         env["LD_LIBRARY_PATH"] = ":".join(lib_paths) + ":" + env.get("LD_LIBRARY_PATH", "")
+
+         cmd = [
+             self.server_bin,
+             "-m", self.model_path,
+             "--port", str(self.server_port),
+             "-c", "2048",
+             "--host", "0.0.0.0" # Bind to all interfaces for container
+         ]
+
+         # Launch process
+         self.server_process = subprocess.Popen(
+             cmd,
+             stdout=subprocess.DEVNULL, # Suppress noisy logs
+             stderr=subprocess.DEVNULL,
+             env=env
+         )
+
+         # Register cleanup
+         atexit.register(self.stop_server)
+
+         # Wait for server to be ready
+         print("⏳ Waiting for server to be ready...")
+         for _ in range(20): # Wait up to 20s
+             try:
+                 requests.get(f"http://localhost:{self.server_port}/health", timeout=1)
+                 print("βœ… llama-server is ready!")
+                 return
+             except:
+                 time.sleep(1)
+
+         print("⚠️ Server start timed out (but might still be loading).")
+
+     def stop_server(self):
+         """Kill the server process"""
+         if self.server_process:
+             print("πŸ›‘ Stopping llama-server...")
+             self.server_process.terminate()
+             self.server_process = None
+
      def analyze(self, text, context_chunks=None):
          """
          Analyze text using LangChain RetrievalQA
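
For reference, a minimal smoke test of the local fallback path introduced in this commit might look like the sketch below. It is not part of the commit; it assumes a llama-server instance is already listening on localhost:8080 (the default used above) and exercises the same /health and /completion endpoints that the new HybridLLM._call and start_local_server code use.

# Smoke test for the llama-server fallback (illustrative, not part of this commit).
# Assumes llama-server is running on localhost:8080 with a model loaded.
import time
import requests

BASE_URL = "http://localhost:8080"

# Poll /health until the server responds, mirroring start_local_server's wait loop.
for _ in range(20):
    try:
        if requests.get(f"{BASE_URL}/health", timeout=1).status_code == 200:
            break
    except requests.RequestException:
        pass
    time.sleep(1)

# Send a completion request shaped like the payload built in HybridLLM._call.
payload = {"prompt": "Hello, world!", "n_predict": 64, "temperature": 0.7, "stop": []}
resp = requests.post(f"{BASE_URL}/completion", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["content"])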
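A short end-to-end usage sketch of the updated client follows, assuming the file is importable as llm_client and that COLAB_API_URL is unset so the call falls through to the local llama-server; the prompt text is illustrative only.

# Illustrative usage of the updated client (assumes this module is importable as llm_client).
from llm_client import LLMClient

# With COLAB_API_URL unset, HybridLLM._call falls back to the persistent local server
# started in LLMClient.__init__ via start_local_server().
client = LLMClient(vector_store=None)

# HybridLLM is a LangChain LLM, so it can be invoked directly.
print(client.llm.invoke("Summarize what llama-server does in one sentence."))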