dungeon29 committed
Commit ad173f1 · verified · 1 Parent(s): e26305e

Update rag_engine.py

Files changed (1)
  1. rag_engine.py +31 -14
rag_engine.py CHANGED
@@ -147,16 +147,22 @@ class RAGEngine:
 
         return documents
 
-    def load_from_huggingface(self, dataset_name="ealvaradob/phishing-dataset"):
-        """Load and index dataset from Hugging Face"""
-        print(f"📥 Downloading dataset '{dataset_name}'...")
+    def load_from_huggingface(self):
+        """Load and index dataset manually from Hugging Face JSON"""
+        dataset_url = "https://huggingface.co/datasets/ealvaradob/phishing-dataset/resolve/main/combined_reduced.json"
+        print(f"📥 Downloading dataset from {dataset_url}...")
+
         try:
-            dataset = load_dataset(dataset_name, trust_remote_code=True)
-            # Assuming 'train' split exists, or take the first available
-            split = list(dataset.keys())[0]
-            data = dataset[split]
-            print(f"✅ Dataset loaded. Processing {len(data)} rows...")
+            import requests
+            import json
 
+            response = requests.get(dataset_url)
+            if response.status_code != 200:
+                print(f"❌ Failed to download dataset: {response.status_code}")
+                return
+
+            data = response.json()
+            print(f"✅ Dataset downloaded. Processing {len(data)} rows...")
 
             documents = []
             for row in data:
@@ -174,16 +180,27 @@ class RAGEngine:
             if documents:
                 # Batch add to vector store
                 print(f"🔄 Indexing {len(documents)} documents to Qdrant...")
-                # Split if needed, but these are likely short
-                # Let's just add them directly for now, or split if very long
+
+                # Use a larger chunk size for efficiency since these are likely short texts
                 text_splitter = RecursiveCharacterTextSplitter(
-                    chunk_size=500,
-                    chunk_overlap=50
+                    chunk_size=1000,
+                    chunk_overlap=100
                 )
                 chunks = text_splitter.split_documents(documents)
 
-                self.vector_store.add_documents(chunks)
-                print(f"✅ Successfully indexed {len(chunks)} chunks from dataset!")
+                # Add in batches to avoid hitting API limits or timeouts
+                batch_size = 100
+                total_chunks = len(chunks)
+
+                for i in range(0, total_chunks, batch_size):
+                    batch = chunks[i:i+batch_size]
+                    try:
+                        self.vector_store.add_documents(batch)
+                        print(f" - Indexed batch {i//batch_size + 1}/{(total_chunks + batch_size - 1)//batch_size}")
+                    except Exception as e:
+                        print(f" ⚠️ Error indexing batch {i}: {e}")
+
+                print(f"✅ Successfully indexed {total_chunks} chunks from dataset!")
             else:
                 print("⚠️ No valid documents found in dataset.")
 
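The batching loop added above is worth seeing in isolation. The sketch below is illustrative only and not code from this repository: index_in_batches and index_batch are hypothetical stand-ins for the loop over chunks and for self.vector_store.add_documents, and the only assumption is that indexing a slice may raise.

def index_in_batches(items, index_batch, batch_size=100):
    """Submit items to index_batch in fixed-size slices, mirroring the loop in this commit."""
    total = len(items)
    num_batches = (total + batch_size - 1) // batch_size  # ceiling division, as in the progress message
    for i in range(0, total, batch_size):
        batch = items[i:i + batch_size]
        try:
            index_batch(batch)
            print(f" - Indexed batch {i // batch_size + 1}/{num_batches}")
        except Exception as e:
            print(f" ⚠️ Error indexing batch {i}: {e}")

# Example: 250 items with batch_size=100 go out as slices of 100, 100 and 50.
index_in_batches(list(range(250)), lambda batch: None)

With batch_size=100, a failed request loses at most one slice instead of aborting the whole indexing pass, which matches the commit's stated aim of avoiding API limits and timeouts.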