File size: 5,650 Bytes
fc74095
 
 
6c05eaf
 
 
fc74095
 
 
 
 
6c05eaf
fc74095
 
6c05eaf
fc74095
 
 
 
6c05eaf
 
 
 
 
 
 
 
fc74095
 
6c05eaf
 
 
 
 
 
 
fc74095
 
 
 
6c05eaf
fc74095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c05eaf
 
 
 
 
 
 
 
 
 
 
fc74095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c05eaf
fc74095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import os
import glob
from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, JSONLoader
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.http import models
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

class RAGEngine:
    """Retrieval-Augmented Generation engine backed by a local (embedded) Qdrant store.

    Loads .txt/.md/.pdf documents from `knowledge_base_dir` (or a fallback
    single `knowledge_base.txt`), splits them into overlapping chunks, embeds
    them with a HuggingFace sentence-transformer, and serves similarity / MMR
    retrieval over the resulting collection.
    """

    def __init__(self, knowledge_base_dir="./knowledge_base", persist_directory="./qdrant_db"):
        """Open (or build) the local Qdrant index.

        Args:
            knowledge_base_dir: Directory (or single file via fallback) holding source documents.
            persist_directory: On-disk path for the embedded Qdrant storage.
        """
        self.knowledge_base_dir = knowledge_base_dir
        self.persist_directory = persist_directory
        self.collection_name = "phishing_knowledge"

        # Embedding model (same model as the previous implementation for index compatibility).
        self.embedding_fn = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

        # Embedded Qdrant instance persisted on disk (no server required).
        self.client = QdrantClient(path=self.persist_directory)

        # LangChain wrapper around the Qdrant collection.
        self.vector_store = Qdrant(
            client=self.client,
            collection_name=self.collection_name,
            embeddings=self.embedding_fn
        )

        # Build the index if the collection is missing or empty.
        try:
            count = self.client.count(collection_name=self.collection_name).count
            if count == 0:
                self._build_index()
        except Exception:
            # count() raises when the collection does not exist yet — build it.
            self._build_index()

    def _build_index(self):
        """Load documents, chunk them, and (re)build the Qdrant collection from scratch."""
        print("🔄 Building Knowledge Base Index (Qdrant)...")

        documents = self._load_documents()
        if not documents:
            print("⚠️ No documents found to index.")
            return

        # Overlapping chunks sized for retrieval granularity.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            separators=["\n\n", "\n", " ", ""]
        )
        chunks = text_splitter.split_documents(documents)

        if not chunks:
            print("⚠️ No chunks created.")
            return

        # Qdrant.from_documents(path=...) opens its own client on the same
        # storage path. Embedded (local-mode) Qdrant allows only ONE client
        # per path at a time, so release ours first to avoid an
        # "already accessed by another instance" error on rebuild.
        try:
            self.client.close()
        except Exception:
            pass  # best-effort: a failed close must not block the rebuild

        # force_recreate=True drops any existing collection for a clean slate.
        self.vector_store = Qdrant.from_documents(
            chunks,
            self.embedding_fn,
            path=self.persist_directory,
            collection_name=self.collection_name,
            force_recreate=True
        )
        # Re-point our client at the one owned by the new vector store.
        self.client = self.vector_store.client
        print(f"✅ Indexed {len(chunks)} chunks from {len(documents)} documents.")

    def _load_documents(self):
        """Load documents from the knowledge-base directory or the fallback file.

        Returns:
            list: Loaded LangChain documents (empty list when nothing is found
            or loading fails entirely).
        """
        documents = []

        # Resolve the target: preferred directory, else the root fallback file.
        target_path = self.knowledge_base_dir
        if not os.path.exists(target_path):
            if os.path.exists("knowledge_base.txt"):
                target_path = "knowledge_base.txt"
                print("⚠️ Using fallback 'knowledge_base.txt' in root.")
            else:
                print(f"❌ Knowledge base not found at {target_path}")
                return []

        try:
            if os.path.isfile(target_path):
                # Single-file mode: pick the loader by extension.
                if target_path.endswith(".pdf"):
                    loader = PyPDFLoader(target_path)
                else:
                    loader = TextLoader(target_path, encoding="utf-8")
                documents.extend(loader.load())
            else:
                # Directory mode: recurse for each supported extension.
                loaders = [
                    DirectoryLoader(target_path, glob="**/*.txt", loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"}),
                    DirectoryLoader(target_path, glob="**/*.md", loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"}),
                    DirectoryLoader(target_path, glob="**/*.pdf", loader_cls=PyPDFLoader),
                ]

                # Best-effort per loader: one bad file type must not abort the rest.
                for loader in loaders:
                    try:
                        docs = loader.load()
                        documents.extend(docs)
                    except Exception as e:
                        print(f"⚠️ Error loading with {loader}: {e}")

        except Exception as e:
            print(f"❌ Error loading documents: {e}")

        return documents

    def refresh_knowledge_base(self):
        """Force a full rebuild of the index and return a status message."""
        print("♻️ Refreshing Knowledge Base...")
        # Local Qdrant rebuild is handled by force_recreate=True in _build_index.
        self._build_index()
        return "✅ Knowledge Base Refreshed!"

    def retrieve(self, query, n_results=3, use_mmr=True):
        """Retrieve relevant context chunks for a query.

        Args:
            query: The search query string.
            n_results: Number of results to return.
            use_mmr: Use Maximal Marginal Relevance (True) for diversity,
                or plain similarity search (False).

        Returns:
            list[str]: Page contents of the matching chunks (possibly empty).
        """
        if use_mmr:
            # MMR: fetch a wider candidate pool, then trade off relevance
            # vs. diversity (lambda_mult=0.6 leans toward relevance).
            results = self.vector_store.max_marginal_relevance_search(
                query,
                k=n_results,
                fetch_k=n_results*3,
                lambda_mult=0.6
            )
        else:
            # Standard similarity search.
            results = self.vector_store.similarity_search(query, k=n_results)

        return [doc.page_content for doc in results] if results else []