dungeon29 committed on
Commit e0883f0 · verified · 1 Parent(s): eeb2022

Update llm_client.py

Files changed (1)
  1. llm_client.py +130 -208
llm_client.py CHANGED
@@ -1,225 +1,147 @@
  import os
- import requests
- import subprocess
- import tarfile
- import stat
- from huggingface_hub import hf_hub_download
- from langchain.llms.base import LLM
- from langchain.chains import RetrievalQA
- from langchain_core.prompts import PromptTemplate
- from typing import Any, List, Optional, Mapping

- # --- Helper to Setup llama-cli ---
- def setup_llama_cli():
-     """
-     Download and extract llama-cli binary and libs from official releases
-     """
-     # Latest release URL for Linux x64 (b4991 equivalent or newer)
-     # Using the one found: b7312
-     CLI_URL = "https://github.com/ggml-org/llama.cpp/releases/download/b7312/llama-b7312-bin-ubuntu-x64.tar.gz"
-     LOCAL_TAR = "llama-cli.tar.gz"
-     BIN_DIR = "./llama_bin"  # Extract to a subdirectory
-     CLI_BIN = os.path.join(BIN_DIR, "bin/llama-cli")  # Standard structure usually has bin/
-
-     if os.path.exists(CLI_BIN):
-         return CLI_BIN, BIN_DIR
-
-     try:
-         print("⬇️ Downloading llama-cli binary...")
-         response = requests.get(CLI_URL, stream=True)
-         if response.status_code == 200:
-             with open(LOCAL_TAR, 'wb') as f:
-                 for chunk in response.iter_content(chunk_size=8192):
-                     f.write(chunk)
-
-             print("📦 Extracting llama-cli...")
-             # Create dir
-             os.makedirs(BIN_DIR, exist_ok=True)
-
-             with tarfile.open(LOCAL_TAR, "r:gz") as tar:
-                 tar.extractall(path=BIN_DIR)
-
-             # Locate the binary (it might be in bin/ or root of tar)
-             # We search for it
-             found_bin = None
-             for root, dirs, files in os.walk(BIN_DIR):
-                 if "llama-cli" in files:
-                     found_bin = os.path.join(root, "llama-cli")
-                     break
-
-             if not found_bin:
-                 print("❌ Could not find llama-cli in extracted files.")
-                 return None, None

-             # Make executable
-             st = os.stat(found_bin)
-             os.chmod(found_bin, st.st_mode | stat.S_IEXEC)
-             print(f"✅ llama-cli binary ready at {found_bin}!")
-             return found_bin, BIN_DIR
-         else:
-             print(f"❌ Failed to download binary: {response.status_code}")
-             return None, None
-     except Exception as e:
-         print(f"❌ Error setting up llama-cli: {e}")
-         return None, None

- # --- Custom LangChain LLM Wrapper for Hybrid Approach ---
- class HybridLLM(LLM):
-     api_url: str = ""
-     model_path: str = ""
-     cli_path: str = ""
-     lib_path: str = ""  # Path to folder containing .so files
-
-     @property
-     def _llm_type(self) -> str:
-         return "hybrid_llm"
-
-     def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs: Any) -> str:
-         # 1. Try Colab API first
-         if self.api_url:
-             try:
-                 print(f"🌐 Calling Colab API: {self.api_url}")
-                 response = requests.post(
-                     f"{self.api_url}/generate",
-                     json={"prompt": prompt, "max_tokens": 512},
-                     timeout=30
-                 )
-                 if response.status_code == 200:
-                     return response.json()["response"]
-                 else:
-                     print(f"⚠️ API Error {response.status_code}: {response.text}")
-             except Exception as e:
-                 print(f"⚠️ API Connection Failed: {e}")
-
-         # 2. Fallback to Local llama-cli
-         if self.model_path and self.cli_path and os.path.exists(self.cli_path):
-             print("💻 Using Local llama-cli Fallback...")
              try:
-                 # Construct command
-                 cmd = [
-                     self.cli_path,
-                     "-m", self.model_path,
-                     "-p", prompt,
-                     "-n", "512",
-                     "--temp", "0.7",
-                     "--no-display-prompt",  # Don't echo prompt
-                     "-c", "2048"  # Context size
-                 ]
-
-                 # Setup Environment with LD_LIBRARY_PATH
-                 env = os.environ.copy()
-                 # Add the directory containing the binary (and likely libs) to LD_LIBRARY_PATH
-                 # Also check 'lib' subdir if it exists
-                 lib_paths = [os.path.dirname(self.cli_path)]
-                 lib_subdir = os.path.join(self.lib_path, "lib")
-                 if os.path.exists(lib_subdir):
-                     lib_paths.append(lib_subdir)
-
-                 env["LD_LIBRARY_PATH"] = ":".join(lib_paths) + ":" + env.get("LD_LIBRARY_PATH", "")
-
-                 # Run binary
-                 result = subprocess.run(
-                     cmd,
-                     capture_output=True,
-                     text=True,
-                     encoding='utf-8',
-                     errors='replace',
-                     env=env
-                 )
-
-                 if result.returncode == 0:
-                     return result.stdout.strip()
-                 else:
-                     return f"❌ llama-cli Error: {result.stderr}"
              except Exception as e:
-                 return f"❌ Local Inference Failed: {e}"
-
-         return "Error: No working LLM available (API failed and no local model)."
-
-     @property
-     def _identifying_params(self) -> Mapping[str, Any]:
-         return {"api_url": self.api_url, "model_path": self.model_path}

- class LLMClient:
-     def __init__(self, vector_store=None):
-         """
-         Initialize Hybrid LLM Client with Binary Wrapper
-         """
-         self.vector_store = vector_store
-         self.api_url = os.environ.get("COLAB_API_URL", "")
-         self.model_path = None
-         self.cli_path = None
-         self.lib_path = None

-         # Setup Local Fallback
          try:
-             # 1. Setup Binary
-             self.cli_path, self.lib_path = setup_llama_cli()
-
-             # 2. Download Model (Qwen3-0.6B)
-             print("📂 Loading Local Qwen3-0.6B (GGUF)...")
-             model_repo = "Qwen/Qwen3-0.6B-GGUF"
-             filename = "Qwen3-0.6B-Q8_0.gguf"
-
-             self.model_path = hf_hub_download(
-                 repo_id=model_repo,
-                 filename=filename
-             )
-             print(f"Model downloaded to: {self.model_path}")
-
          except Exception as e:
-             print(f"⚠️ Could not setup local fallback: {e}")

-         # Create Hybrid LangChain Wrapper
-         self.llm = HybridLLM(
-             api_url=self.api_url,
-             model_path=self.model_path,
-             cli_path=self.cli_path,
-             lib_path=self.lib_path
-         )

-     def analyze(self, text, context_chunks=None):
-         """
-         Analyze text using LangChain RetrievalQA
-         """
          if not self.vector_store:
-             return "❌ Vector Store not initialized."
-
-         # Custom Prompt Template
-         template = """<|im_start|>system
- You are a cybersecurity expert. Task: Determine whether the input is 'PHISHING' or 'BENIGN' (Safe).
- Respond in the following format:
- LABEL: [PHISHING or BENIGN]
- EXPLANATION: [A brief Vietnamese explanation]
-
- Context:
- {context}
- <|im_end|>
- <|im_start|>user
- Input:
- {question}
-
- Short Analysis:
- <|im_end|>
- <|im_start|>assistant
- """
-
-         PROMPT = PromptTemplate(
-             template=template,
-             input_variables=["context", "question"]
-         )
-
-         # Create QA Chain
-         qa_chain = RetrievalQA.from_chain_type(
-             llm=self.llm,
-             chain_type="stuff",
-             retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
-             chain_type_kwargs={"prompt": PROMPT}
-         )
-
          try:
-             print("🤖 Generating response...")
-             response = qa_chain.invoke(text)
-             return response['result']
          except Exception as e:
-             return f"Error: {str(e)}"

  import os
+ import glob
+ from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader
+ from langchain_qdrant import Qdrant
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from qdrant_client import QdrantClient

+ class RAGEngine:
+     def __init__(self, knowledge_base_dir="./knowledge_base"):
+         self.knowledge_base_dir = knowledge_base_dir
+
+         # Initialize Embeddings
+         self.embedding_fn = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+
+         # Qdrant Cloud Configuration
+         # Prioritize Env Vars, fallback to Hardcoded (User provided)
+         self.qdrant_url = os.environ.get("QDRANT_URL") or "https://abd29675-7fb9-4d95-8941-e6130b09bf7f.us-east4-0.gcp.cloud.qdrant.io"
+         self.qdrant_api_key = os.environ.get("QDRANT_API_KEY") or "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.L0aAAAbxRypLfBeGCtFr2xX06iveGb76NrA3BPJQiNM"
+         self.collection_name = "phishing_knowledge"
+
+         if not self.qdrant_url or not self.qdrant_api_key:
+             print("⚠️ QDRANT_URL or QDRANT_API_KEY not set. RAG will not function correctly.")
+             self.client = None  # keep the attribute defined so refresh_knowledge_base() can check it
+             self.vector_store = None
+             return

+         print(f"☁️ Connecting to Qdrant Cloud: {self.qdrant_url}...")
+
+         # Initialize Qdrant Client
+         self.client = QdrantClient(
+             url=self.qdrant_url,
+             api_key=self.qdrant_api_key
+         )
+
+         # Initialize Vector Store Wrapper
+         self.vector_store = Qdrant(
+             client=self.client,
+             collection_name=self.collection_name,
+             embeddings=self.embedding_fn
+         )
+
+         # Check if collection exists/is empty and build if needed
+         try:
+             count = self.client.count(collection_name=self.collection_name).count
+             if count == 0:
+                 self._build_index()
+             else:
+                 print(f"✅ Qdrant Collection '{self.collection_name}' ready with {count} vectors.")
+         except Exception as e:
+             print(f"⚠️ Collection check failed (might not exist): {e}")
+             self._build_index()

+     def _build_index(self):
+         """Load documents and build index"""
+         print("🔄 Building Knowledge Base Index on Qdrant Cloud...")
+
+         documents = self._load_documents()
+         if not documents:
+             print("⚠️ No documents found to index.")
+             return

+         # Split documents
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=500,
+             chunk_overlap=50,
+             separators=["\n\n", "\n", " ", ""]
+         )
+         chunks = text_splitter.split_documents(documents)

+         if chunks:
+             # Add to vector store (Qdrant handles persistence automatically)
              try:
+                 self.vector_store.add_documents(chunks)
+                 print(f"✅ Indexed {len(chunks)} chunks to Qdrant Cloud.")
              except Exception as e:
+                 print(f"❌ Error indexing to Qdrant: {e}")
+         else:
+             print("⚠️ No chunks created.")

+     def _load_documents(self):
+         """Load documents from directory or fallback file"""
+         documents = []

+         # Check for directory or fallback file
+         target_path = self.knowledge_base_dir
+         if not os.path.exists(target_path):
+             if os.path.exists("knowledge_base.txt"):
+                 target_path = "knowledge_base.txt"
+                 print("⚠️ Using fallback 'knowledge_base.txt' in root.")
+             else:
+                 print(f"❌ Knowledge base not found at {target_path}")
+                 return []
+
          try:
+             if os.path.isfile(target_path):
+                 # Load single file
+                 if target_path.endswith(".pdf"):
+                     loader = PyPDFLoader(target_path)
+                 else:
+                     loader = TextLoader(target_path, encoding="utf-8")
+                 documents.extend(loader.load())
+             else:
+                 # Load directory
+                 loaders = [
+                     DirectoryLoader(target_path, glob="**/*.txt", loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"}),
+                     DirectoryLoader(target_path, glob="**/*.md", loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"}),
+                     DirectoryLoader(target_path, glob="**/*.pdf", loader_cls=PyPDFLoader),
+                 ]
+
+                 for loader in loaders:
+                     try:
+                         docs = loader.load()
+                         documents.extend(docs)
+                     except Exception as e:
+                         print(f"⚠️ Error loading with {loader}: {e}")
+
          except Exception as e:
+             print(f"❌ Error loading documents: {e}")
+
+         return documents

+     def refresh_knowledge_base(self):
+         """Force rebuild of the index"""
+         print("♻️ Refreshing Knowledge Base...")
+         if self.client:
+             try:
+                 self.client.delete_collection(self.collection_name)
+                 self._build_index()
+                 return "✅ Knowledge Base Refreshed on Cloud!"
+             except Exception as e:
+                 return f"❌ Error refreshing: {e}"
+         return "❌ Qdrant Client not initialized."

+     def retrieve(self, query, n_results=3):
+         """Retrieve relevant context"""
          if not self.vector_store:
+             return []
+
+         # Search
          try:
+             results = self.vector_store.similarity_search(query, k=n_results)
+             if results:
+                 return [doc.page_content for doc in results]
          except Exception as e:
+             print(f"⚠️ Retrieval Error: {e}")
+
+         return []
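
A minimal usage sketch of the new RAGEngine, under two assumptions not stated in the commit: the module is importable as llm_client, and QDRANT_URL / QDRANT_API_KEY are exported so the hardcoded fallbacks above never apply.

import os
from llm_client import RAGEngine  # assumed import path for the module above

# Credentials are read from the environment first; the hardcoded fallbacks
# in __init__ are only used when these variables are missing.
assert os.environ.get("QDRANT_URL") and os.environ.get("QDRANT_API_KEY")

# Connects to Qdrant Cloud and indexes ./knowledge_base into the
# "phishing_knowledge" collection if it is empty or missing.
engine = RAGEngine(knowledge_base_dir="./knowledge_base")

# Fetch up to 3 relevant knowledge-base chunks for a suspicious message.
for chunk in engine.retrieve("Your account is locked. Verify at http://example.test", n_results=3):
    print(chunk)

# Drop and rebuild the collection after the knowledge-base files change.
print(engine.refresh_knowledge_base())

Note that retrieve() returns plain page_content strings: callers that previously relied on LLMClient.analyze() must now assemble the prompt and invoke an LLM themselves.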