PhishingTest

Paused

App Files Community

dungeon29 committed 19 days ago

Commit

f961bd3

verified ·

1 Parent(s): cfcd4e2

Update rag_engine.py

Browse files

Files changed (1) hide show

rag_engine.py +63 -33

rag_engine.py CHANGED Viewed

@@ -1,33 +1,58 @@
 import os
 import glob
-from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, JSONLoader
-from langchain_community.vectorstores import Chroma
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_core.documents import Document
 class RAGEngine:
-    def __init__(self, knowledge_base_dir="./knowledge_base", persist_directory="./chroma_db"):
         self.knowledge_base_dir = knowledge_base_dir
-        self.persist_directory = persist_directory
-        # Initialize Embeddings (using same model as before)
         self.embedding_fn = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-        # Initialize Vector Store
-        self.vector_store = Chroma(
-            persist_directory=self.persist_directory,
-            embedding_function=self.embedding_fn,
-            collection_name="phishing_knowledge"
         )
-        # Build index if empty or on init
-        if not self.vector_store.get()['ids']:
             self._build_index()
     def _build_index(self):
         """Load documents and build index"""
-        print("🔄 Building Knowledge Base Index...")
         documents = self._load_documents()
         if not documents:
@@ -43,10 +68,12 @@ class RAGEngine:
         chunks = text_splitter.split_documents(documents)
         if chunks:
-            # Add to vector store
-            self.vector_store.add_documents(chunks)
-            self.vector_store.persist()
-            print(f"✅ Indexed {len(chunks)} chunks from {len(documents)} documents.")
         else:
             print("⚠️ No chunks created.")
@@ -95,23 +122,26 @@ class RAGEngine:
     def refresh_knowledge_base(self):
         """Force rebuild of the index"""
         print("♻️ Refreshing Knowledge Base...")
-        # Clear existing collection
-        self.vector_store.delete_collection()
-        self.vector_store = Chroma(
-            persist_directory=self.persist_directory,
-            embedding_function=self.embedding_fn,
-            collection_name="phishing_knowledge"
-        )
-        # Rebuild
-        self._build_index()
-        return "✅ Knowledge Base Refreshed!"
     def retrieve(self, query, n_results=3):
         """Retrieve relevant context"""
         # Search
-        results = self.vector_store.similarity_search(query, k=n_results)
-        # Format results
-        if results:
-            return [doc.page_content for doc in results]
         return []

 import os
 import glob
+from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader
+from langchain_community.vectorstores import Qdrant
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
+from qdrant_client import QdrantClient
 class RAGEngine:
+    def __init__(self, knowledge_base_dir="./knowledge_base"):
         self.knowledge_base_dir = knowledge_base_dir
+        # Initialize Embeddings
         self.embedding_fn = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        # Qdrant Cloud Configuration
+        # Prioritize Env Vars, fallback to Hardcoded (User provided)
+        self.qdrant_url = os.environ.get("QDRANT_URL") or "https://abd29675-7fb9-4d95-8941-e6130b09bf7f.us-east4-0.gcp.cloud.qdrant.io"
+        self.qdrant_api_key = os.environ.get("QDRANT_API_KEY") or "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.L0aAAAbxRypLfBeGCtFr2xX06iveGb76NrA3BPJQiNM"
+        self.collection_name = "phishing_knowledge"
+        if not self.qdrant_url or not self.qdrant_api_key:
+            print("⚠️ QDRANT_URL or QDRANT_API_KEY not set. RAG will not function correctly.")
+            self.vector_store = None
+            return
+        print(f"☁️ Connecting to Qdrant Cloud: {self.qdrant_url}...")
+        # Initialize Qdrant Client
+        self.client = QdrantClient(
+            url=self.qdrant_url,
+            api_key=self.qdrant_api_key
         )
+        # Initialize Vector Store Wrapper
+        self.vector_store = Qdrant(
+            client=self.client,
+            collection_name=self.collection_name,
+            embeddings=self.embedding_fn
+        )
+        # Check if collection exists/is empty and build if needed
+        try:
+            count = self.client.count(collection_name=self.collection_name).count
+            if count == 0:
+                self._build_index()
+            else:
+                print(f"✅ Qdrant Collection '{self.collection_name}' ready with {count} vectors.")
+        except Exception as e:
+            print(f"⚠️ Collection check failed (might not exist): {e}")
             self._build_index()
     def _build_index(self):
         """Load documents and build index"""
+        print("🔄 Building Knowledge Base Index on Qdrant Cloud...")
         documents = self._load_documents()
         if not documents:
         chunks = text_splitter.split_documents(documents)
         if chunks:
+            # Add to vector store (Qdrant handles persistence automatically)
+            try:
+                self.vector_store.add_documents(chunks)
+                print(f"✅ Indexed {len(chunks)} chunks to Qdrant Cloud.")
+            except Exception as e:
+                print(f"❌ Error indexing to Qdrant: {e}")
         else:
             print("⚠️ No chunks created.")
     def refresh_knowledge_base(self):
         """Force rebuild of the index"""
         print("♻️ Refreshing Knowledge Base...")
+        if self.client:
+            try:
+                self.client.delete_collection(self.collection_name)
+                self._build_index()
+                return "✅ Knowledge Base Refreshed on Cloud!"
+            except Exception as e:
+                return f"❌ Error refreshing: {e}"
+        return "❌ Qdrant Client not initialized."
     def retrieve(self, query, n_results=3):
         """Retrieve relevant context"""
+        if not self.vector_store:
+            return []
         # Search
+        try:
+            results = self.vector_store.similarity_search(query, k=n_results)
+            if results:
+                return [doc.page_content for doc in results]
+        except Exception as e:
+            print(f"⚠️ Retrieval Error: {e}")
         return []