File size: 9,844 Bytes
fc74095
 
f961bd3
e96edf2
fc74095
 
fdd4830
e96edf2
fdd4830
fc74095
 
f961bd3
fc74095
 
f961bd3
fc74095
 
f961bd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc74095
 
f961bd3
 
 
 
 
 
 
 
 
e96edf2
 
 
 
 
 
 
f961bd3
 
35dcb10
 
 
 
 
 
 
 
 
 
 
 
 
 
e0a14a5
35dcb10
 
5202b5f
 
35dcb10
f961bd3
e96edf2
 
fc74095
 
 
 
f961bd3
fc74095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f961bd3
 
 
 
 
 
fc74095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad173f1
 
 
 
 
fdd4830
ad173f1
 
fdd4830
ad173f1
 
 
 
 
 
 
fdd4830
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad173f1
 
fdd4830
ad173f1
 
fdd4830
 
 
ad173f1
 
 
 
 
 
 
 
 
 
 
 
 
fdd4830
 
 
 
 
 
fc74095
 
 
f961bd3
 
 
 
35dcb10
f961bd3
 
 
 
fc74095
9e66bad
 
f961bd3
 
 
9e66bad
f961bd3
 
 
 
 
 
 
9e66bad
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import os
import glob
from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader
from langchain_qdrant import Qdrant
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from qdrant_client import QdrantClient, models
from datasets import load_dataset

class RAGEngine:
    """Retrieval-Augmented Generation engine backed by a Qdrant Cloud collection.

    Indexes local knowledge-base documents (txt/md/pdf) plus an optional
    Hugging Face phishing dataset, and serves similarity-search context
    strings for downstream prompts.
    """

    def __init__(self, knowledge_base_dir="./knowledge_base"):
        """Connect to Qdrant Cloud and ensure the collection exists.

        Args:
            knowledge_base_dir: Directory (or fallback file) holding the
                local documents to index.
        """
        self.knowledge_base_dir = knowledge_base_dir

        # MiniLM sentence-transformer: 384-dim embeddings. Must match the
        # VectorParams(size=384) used when the collection is created below.
        self.embedding_fn = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

        # Qdrant Cloud Configuration
        # Prioritize Env Vars, fallback to Hardcoded (User provided)
        # SECURITY: hardcoded cluster URL and API key committed in source.
        # Rotate this key and move both values to environment/secret storage.
        self.qdrant_url = os.environ.get("QDRANT_URL") or "https://abd29675-7fb9-4d95-8941-e6130b09bf7f.us-east4-0.gcp.cloud.qdrant.io"
        self.qdrant_api_key = os.environ.get("QDRANT_API_KEY") or "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.L0aAAAbxRypLfBeGCtFr2xX06iveGb76NrA3BPJQiNM"
        self.collection_name = "phishing_knowledge"

        if not self.qdrant_url or not self.qdrant_api_key:
            print("โš ๏ธ QDRANT_URL or QDRANT_API_KEY not set. RAG will not function correctly.")
            # BUG FIX: also initialize self.client here; refresh_knowledge_base()
            # reads self.client and previously raised AttributeError on this path.
            self.client = None
            self.vector_store = None
            return

        print(f"โ˜๏ธ Connecting to Qdrant Cloud: {self.qdrant_url}...")

        # Initialize Qdrant Client
        self.client = QdrantClient(
            url=self.qdrant_url,
            api_key=self.qdrant_api_key
        )

        # LangChain wrapper used for add_documents / similarity_search.
        self.vector_store = Qdrant(
            client=self.client,
            collection_name=self.collection_name,
            embeddings=self.embedding_fn
        )

        # Check if collection exists/is empty and build if needed
        try:
            if not self.client.collection_exists(self.collection_name):
                print(f"โš ๏ธ Collection '{self.collection_name}' not found. Creating...")
                self._create_collection()
                print(f"โœ… Collection '{self.collection_name}' created!")
                self._build_index()
            else:
                # Count only vectors tagged as coming from the HF dataset
                # (metadata.source == "hf_dataset") to decide whether the
                # dataset has already been indexed.
                dataset_filter = models.Filter(
                    must=[
                        models.FieldCondition(
                            key="metadata.source",
                            match=models.MatchValue(value="hf_dataset")
                        )
                    ]
                )
                dataset_count = self.client.count(
                    collection_name=self.collection_name,
                    count_filter=dataset_filter
                ).count

                print(f"โœ… Qdrant Collection '{self.collection_name}' ready with {dataset_count} vectors.")

                if dataset_count == 0:
                    print("โš ๏ธ Phishing dataset not found. Please run 'index_dataset_colab.ipynb' to populate.")
                    # self.load_from_huggingface() # Disabled to prevent timeout

        except Exception as e:
            print(f"โš ๏ธ Collection check/creation failed: {e}")
            # Try to build anyway, maybe wrapper handles it
            self._build_index()

    def _create_collection(self):
        """Create the Qdrant collection with the embedding model's geometry."""
        self.client.create_collection(
            collection_name=self.collection_name,
            # 384 dims = all-MiniLM-L6-v2 output; cosine distance for
            # sentence-embedding similarity.
            vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE)
        )

    def _build_index(self):
        """Load local documents, chunk them, and push them to Qdrant."""
        print("๐Ÿ”„ Building Knowledge Base Index on Qdrant Cloud...")

        documents = self._load_documents()
        if not documents:
            print("โš ๏ธ No documents found to index.")
            return

        # Small chunks with modest overlap keep retrieved context focused.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            separators=["\n\n", "\n", " ", ""]
        )
        chunks = text_splitter.split_documents(documents)

        if chunks:
            # Add to vector store (Qdrant handles persistence automatically)
            try:
                self.vector_store.add_documents(chunks)
                print(f"โœ… Indexed {len(chunks)} chunks to Qdrant Cloud.")
            except Exception as e:
                print(f"โŒ Error indexing to Qdrant: {e}")
        else:
            print("โš ๏ธ No chunks created.")

    def _load_documents(self):
        """Load documents from the knowledge-base directory or fallback file.

        Returns:
            list[Document]: loaded documents; empty list when nothing is found
            or loading fails (errors are printed, never raised).
        """
        documents = []

        # Prefer the configured directory; fall back to a single
        # 'knowledge_base.txt' file in the working directory.
        target_path = self.knowledge_base_dir
        if not os.path.exists(target_path):
            if os.path.exists("knowledge_base.txt"):
                target_path = "knowledge_base.txt"
                print("โš ๏ธ Using fallback 'knowledge_base.txt' in root.")
            else:
                print(f"โŒ Knowledge base not found at {target_path}")
                return []

        try:
            if os.path.isfile(target_path):
                # Single file: choose loader by extension.
                if target_path.endswith(".pdf"):
                    loader = PyPDFLoader(target_path)
                else:
                    loader = TextLoader(target_path, encoding="utf-8")
                documents.extend(loader.load())
            else:
                # Directory: recursively load txt/md/pdf files.
                loaders = [
                    DirectoryLoader(target_path, glob="**/*.txt", loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"}),
                    DirectoryLoader(target_path, glob="**/*.md", loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"}),
                    DirectoryLoader(target_path, glob="**/*.pdf", loader_cls=PyPDFLoader),
                ]

                for loader in loaders:
                    try:
                        docs = loader.load()
                        documents.extend(docs)
                    except Exception as e:
                        # One failing loader (e.g. no PDFs) shouldn't abort the rest.
                        print(f"โš ๏ธ Error loading with {loader}: {e}")

        except Exception as e:
            print(f"โŒ Error loading documents: {e}")

        return documents

    def load_from_huggingface(self):
        """Download the HF phishing dataset JSON and index it into Qdrant.

        Each row is stored as a Document tagged metadata.source='hf_dataset'
        so __init__ can later detect that the dataset is already indexed.
        """
        dataset_url = "https://huggingface.co/datasets/ealvaradob/phishing-dataset/resolve/main/combined_reduced.json"
        print(f"๐Ÿ“ฅ Downloading dataset from {dataset_url}...")

        try:
            import requests
            import json

            # BUG FIX: bound the request so a stalled download cannot hang
            # the process indefinitely.
            response = requests.get(dataset_url, timeout=60)
            if response.status_code != 200:
                print(f"โŒ Failed to download dataset: {response.status_code}")
                return

            data = response.json()
            print(f"โœ… Dataset downloaded. Processing {len(data)} rows...")

            documents = []
            for row in data:
                # Row structure: {'text': ..., 'label': ...}; label -1 marks
                # rows that arrived without one.
                content = row.get('text', '')
                label = row.get('label', -1)

                if content:
                    doc = Document(
                        page_content=content,
                        metadata={"source": "hf_dataset", "label": label}
                    )
                    documents.append(doc)

            if documents:
                print(f"๐Ÿ”„ Indexing {len(documents)} documents to Qdrant...")

                # Use a larger chunk size for efficiency since these are likely short texts
                text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=1000,
                    chunk_overlap=100
                )
                chunks = text_splitter.split_documents(documents)

                # Add in batches to avoid hitting API limits or timeouts
                batch_size = 100
                total_chunks = len(chunks)

                for i in range(0, total_chunks, batch_size):
                    batch = chunks[i:i+batch_size]
                    try:
                        self.vector_store.add_documents(batch)
                        print(f"   - Indexed batch {i//batch_size + 1}/{(total_chunks + batch_size - 1)//batch_size}")
                    except Exception as e:
                        # Keep going: a single failed batch shouldn't lose the rest.
                        print(f"   โš ๏ธ Error indexing batch {i}: {e}")

                print(f"โœ… Successfully indexed {total_chunks} chunks from dataset!")
            else:
                print("โš ๏ธ No valid documents found in dataset.")

        except Exception as e:
            print(f"โŒ Error loading HF dataset: {e}")

    def refresh_knowledge_base(self):
        """Drop and fully rebuild the cloud index.

        Returns:
            str: human-readable status message.
        """
        print("โ™ป๏ธ Refreshing Knowledge Base...")
        if self.client:
            try:
                self.client.delete_collection(self.collection_name)
                # BUG FIX: recreate the collection before re-indexing —
                # add_documents against a just-deleted collection fails.
                self._create_collection()
                self._build_index()
                self.load_from_huggingface()
                return "โœ… Knowledge Base Refreshed on Cloud!"
            except Exception as e:
                return f"โŒ Error refreshing: {e}"
        return "โŒ Qdrant Client not initialized."

    def retrieve(self, query, n_results=3):
        """Return up to n_results relevant context strings for the query.

        Args:
            query: free-text search query.
            n_results: maximum number of chunks to return.

        Returns:
            list[str]: page contents of the matching chunks; empty list when
            the store is unavailable or the search fails.
        """
        if not self.vector_store:
            return []

        try:
            results = self.vector_store.similarity_search(query, k=n_results)
            if results:
                return [doc.page_content for doc in results]
        except Exception as e:
            print(f"โš ๏ธ Retrieval Error: {e}")

        return []