dungeon29 committed
Commit 6c05eaf · verified · 1 parent: 43b20ee

Update rag_engine.py

Files changed (1)
  1. rag_engine.py +33 -21
rag_engine.py CHANGED
@@ -1,33 +1,44 @@
 import os
 import glob
 from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, JSONLoader
-from langchain_community.vectorstores import Chroma
+from langchain_community.vectorstores import Qdrant
+from qdrant_client import QdrantClient
+from qdrant_client.http import models
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_core.documents import Document
 
 class RAGEngine:
-    def __init__(self, knowledge_base_dir="./knowledge_base", persist_directory="./chroma_db"):
+    def __init__(self, knowledge_base_dir="./knowledge_base", persist_directory="./qdrant_db"):
         self.knowledge_base_dir = knowledge_base_dir
         self.persist_directory = persist_directory
+        self.collection_name = "phishing_knowledge"
 
         # Initialize Embeddings (using same model as before)
         self.embedding_fn = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
 
-        # Initialize Vector Store
-        self.vector_store = Chroma(
-            persist_directory=self.persist_directory,
-            embedding_function=self.embedding_fn,
-            collection_name="phishing_knowledge"
+        # Initialize Qdrant client (local mode)
+        self.client = QdrantClient(path=self.persist_directory)
+
+        # Initialize Vector Store wrapper
+        self.vector_store = Qdrant(
+            client=self.client,
+            collection_name=self.collection_name,
+            embeddings=self.embedding_fn
         )
 
-        # Build index if empty or on init
-        if not self.vector_store.get()['ids']:
+        # Check whether the collection exists and has data
+        try:
+            count = self.client.count(collection_name=self.collection_name).count
+            if count == 0:
+                self._build_index()
+        except:
+            # Collection might not exist yet
             self._build_index()
 
     def _build_index(self):
         """Load documents and build index"""
-        print("🔄 Building Knowledge Base Index...")
+        print("🔄 Building Knowledge Base Index (Qdrant)...")
 
         documents = self._load_documents()
         if not documents:
@@ -43,9 +54,17 @@ class RAGEngine:
         chunks = text_splitter.split_documents(documents)
 
         if chunks:
-            # Add to vector store
-            self.vector_store.add_documents(chunks)
-            self.vector_store.persist()
+            # Re-create the collection for a clean slate
+            # For simplicity in local mode, Qdrant.from_documents creates/replaces the collection
+            self.vector_store = Qdrant.from_documents(
+                chunks,
+                self.embedding_fn,
+                path=self.persist_directory,
+                collection_name=self.collection_name,
+                force_recreate=True
+            )
+            # Update the client reference after recreation
+            self.client = self.vector_store.client
             print(f"✅ Indexed {len(chunks)} chunks from {len(documents)} documents.")
         else:
             print("⚠️ No chunks created.")
@@ -95,14 +114,7 @@ class RAGEngine:
     def refresh_knowledge_base(self):
         """Force rebuild of the index"""
         print("♻️ Refreshing Knowledge Base...")
-        # Clear existing collection
-        self.vector_store.delete_collection()
-        self.vector_store = Chroma(
-            persist_directory=self.persist_directory,
-            embedding_function=self.embedding_fn,
-            collection_name="phishing_knowledge"
-        )
-        # Rebuild
+        # With local Qdrant we can simply rebuild; _build_index uses force_recreate=True
         self._build_index()
         return "✅ Knowledge Base Refreshed!"
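The diff only touches the vector-store plumbing; the document loaders and query helpers live elsewhere in rag_engine.py. A rough usage sketch of the updated class follows. The knowledge-base layout, the query string, and the similarity_search call on the underlying LangChain store are assumptions for illustration, not part of this commit.

    # Usage sketch, assuming rag_engine.py is importable and ./knowledge_base
    # holds files handled by the loaders imported above (.txt/.pdf/.json).
    from rag_engine import RAGEngine

    engine = RAGEngine(
        knowledge_base_dir="./knowledge_base",
        persist_directory="./qdrant_db",
    )

    # Force a rebuild after dropping new files into ./knowledge_base.
    print(engine.refresh_knowledge_base())

    # Generic retrieval via the LangChain wrapper; the engine's own query
    # helpers are not shown in this diff.
    hits = engine.vector_store.similarity_search("credential harvesting email", k=3)
    for doc in hits:
        print(doc.page_content[:100])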
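The try/except in __init__ guards against counting points in a collection that has not been created yet, which raises an error in Qdrant local mode. Below is a standalone sketch of the same check using plain qdrant_client calls; the path and collection name mirror the defaults in this diff, and the single-process caveat is an assumption worth keeping in mind (local, path-based storage is locked by whichever process has it open).

    # Inspect the local Qdrant store created by the engine. Run this only
    # while the engine itself is not holding ./qdrant_db open.
    from qdrant_client import QdrantClient

    client = QdrantClient(path="./qdrant_db")

    names = [c.name for c in client.get_collections().collections]
    if "phishing_knowledge" in names:
        total = client.count(collection_name="phishing_knowledge").count
        print(f"phishing_knowledge holds {total} points")
    else:
        print("collection not created yet - RAGEngine will build it on init")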