# NOTE(review): removed non-Python extraction residue (Hugging Face Space page
# header, commit hashes, and a line-number gutter) that would break import.
import os
import glob
from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, JSONLoader
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.http import models
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
class RAGEngine:
    """Retrieval-augmented generation engine backed by a local (on-disk) Qdrant store.

    Loads .txt/.md/.pdf documents from a knowledge-base directory (or a
    fallback single file), chunks them, embeds them with a HuggingFace
    sentence-transformer, and serves similarity / MMR retrieval.
    """

    def __init__(self, knowledge_base_dir="./knowledge_base", persist_directory="./qdrant_db"):
        self.knowledge_base_dir = knowledge_base_dir
        self.persist_directory = persist_directory
        self.collection_name = "phishing_knowledge"
        # Embedding model — must stay the same model the index was built with.
        self.embedding_fn = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # Local-mode Qdrant client (no server process required).
        self.client = QdrantClient(path=self.persist_directory)
        # LangChain wrapper over the Qdrant collection.
        self.vector_store = Qdrant(
            client=self.client,
            collection_name=self.collection_name,
            embeddings=self.embedding_fn,
        )
        # Build the index on first run: either the collection is empty, or it
        # does not exist yet, in which case count() raises.
        try:
            count = self.client.count(collection_name=self.collection_name).count
            if count == 0:
                self._build_index()
        except Exception:
            # Collection missing — build it from scratch. (Was a bare
            # `except:`, which also swallowed KeyboardInterrupt/SystemExit.)
            self._build_index()

    def _build_index(self):
        """Load documents, split them into chunks, and (re)build the Qdrant index."""
        print("🔄 Building Knowledge Base Index (Qdrant)...")
        documents = self._load_documents()
        if not documents:
            print("⚠️ No documents found to index.")
            return
        # Chunking tuned for short phishing-awareness snippets.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            separators=["\n\n", "\n", " ", ""],
        )
        chunks = text_splitter.split_documents(documents)
        if not chunks:
            print("⚠️ No chunks created.")
            return
        # Local-mode Qdrant allows only ONE open handle per storage directory.
        # from_documents(path=...) opens the same directory again, so release
        # the current client first or the rebuild fails with
        # "Storage folder ... is already accessed by another instance".
        try:
            self.client.close()
        except Exception:
            # Best-effort close: an already-closed/invalid client must not
            # prevent the rebuild.
            pass
        # force_recreate=True guarantees a clean slate on every rebuild.
        self.vector_store = Qdrant.from_documents(
            chunks,
            self.embedding_fn,
            path=self.persist_directory,
            collection_name=self.collection_name,
            force_recreate=True,
        )
        # Re-point the client at the store from_documents() just created.
        self.client = self.vector_store.client
        print(f"✅ Indexed {len(chunks)} chunks from {len(documents)} documents.")

    def _load_documents(self):
        """Load documents from the knowledge-base directory or a fallback file.

        Returns a (possibly empty) list of LangChain Documents; loader errors
        are reported but never raised.
        """
        documents = []
        target_path = self.knowledge_base_dir
        if not os.path.exists(target_path):
            # Fall back to a single file in the working directory.
            if os.path.exists("knowledge_base.txt"):
                target_path = "knowledge_base.txt"
                print("⚠️ Using fallback 'knowledge_base.txt' in root.")
            else:
                print(f"❌ Knowledge base not found at {target_path}")
                return []
        try:
            if os.path.isfile(target_path):
                # Single file: pick the loader by extension.
                if target_path.endswith(".pdf"):
                    loader = PyPDFLoader(target_path)
                else:
                    loader = TextLoader(target_path, encoding="utf-8")
                documents.extend(loader.load())
            else:
                # Directory: load .txt, .md and .pdf files recursively.
                loaders = [
                    DirectoryLoader(target_path, glob="**/*.txt", loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"}),
                    DirectoryLoader(target_path, glob="**/*.md", loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"}),
                    DirectoryLoader(target_path, glob="**/*.pdf", loader_cls=PyPDFLoader),
                ]
                for loader in loaders:
                    # One failing loader (e.g. a corrupt PDF) must not abort
                    # the other file types.
                    try:
                        docs = loader.load()
                        documents.extend(docs)
                    except Exception as e:
                        print(f"⚠️ Error loading with {loader}: {e}")
        except Exception as e:
            print(f"❌ Error loading documents: {e}")
        return documents

    def refresh_knowledge_base(self):
        """Force a full rebuild of the index; returns a status message."""
        print("♻️ Refreshing Knowledge Base...")
        # _build_index() handles the clean-slate recreate (force_recreate=True).
        self._build_index()
        return "✅ Knowledge Base Refreshed!"

    def retrieve(self, query, n_results=3, use_mmr=True):
        """Retrieve relevant context chunks for a query.

        Args:
            query: The query string.
            n_results: Number of results to return.
            use_mmr: Use Maximal Marginal Relevance (True) for diversity, or
                plain similarity search (False).

        Returns:
            List of page-content strings (empty if nothing matched).
        """
        if use_mmr:
            # Fetch a wider candidate pool, then re-rank for diversity.
            results = self.vector_store.max_marginal_relevance_search(
                query,
                k=n_results,
                fetch_k=n_results * 3,
                lambda_mult=0.6,
            )
        else:
            results = self.vector_store.similarity_search(query, k=n_results)
        if results:
            return [doc.page_content for doc in results]
        return []