Update rag_engine.py
rag_engine.py CHANGED  (+31 -14)
@@ -147,16 +147,22 @@ class RAGEngine:
 
         return documents
 
-    def load_from_huggingface(self
-        """Load and index dataset from Hugging Face"""
-
+    def load_from_huggingface(self):
+        """Load and index dataset manually from Hugging Face JSON"""
+        dataset_url = "https://huggingface.co/datasets/ealvaradob/phishing-dataset/resolve/main/combined_reduced.json"
+        print(f"📥 Downloading dataset from {dataset_url}...")
+
         try:
-
-
-            split = list(dataset.keys())[0]
-            data = dataset[split]
+            import requests
+            import json
 
-
+            response = requests.get(dataset_url)
+            if response.status_code != 200:
+                print(f"❌ Failed to download dataset: {response.status_code}")
+                return
+
+            data = response.json()
+            print(f"✅ Dataset downloaded. Processing {len(data)} rows...")
 
             documents = []
             for row in data:
@@ -174,16 +180,27 @@ class RAGEngine:
             if documents:
                 # Batch add to vector store
                 print(f"🔄 Indexing {len(documents)} documents to Qdrant...")
-
-                #
+
+                # Use a larger chunk size for efficiency since these are likely short texts
                 text_splitter = RecursiveCharacterTextSplitter(
-                    chunk_size=
-                    chunk_overlap=
+                    chunk_size=1000,
+                    chunk_overlap=100
                 )
                 chunks = text_splitter.split_documents(documents)
 
-
-
+                # Add in batches to avoid hitting API limits or timeouts
+                batch_size = 100
+                total_chunks = len(chunks)
+
+                for i in range(0, total_chunks, batch_size):
+                    batch = chunks[i:i+batch_size]
+                    try:
+                        self.vector_store.add_documents(batch)
+                        print(f" - Indexed batch {i//batch_size + 1}/{(total_chunks + batch_size - 1)//batch_size}")
+                    except Exception as e:
+                        print(f" ⚠️ Error indexing batch {i}: {e}")
+
+                print(f"✅ Successfully indexed {total_chunks} chunks from dataset!")
             else:
                 print("⚠️ No valid documents found in dataset.")
 
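For context on the pattern this commit introduces, below is a minimal standalone sketch of the same flow: download the dataset JSON with requests, wrap rows as LangChain Documents, split them with RecursiveCharacterTextSplitter, and push chunks to a vector store in fixed-size batches. It is not the Space's actual rag_engine.py: the load_from_huggingface function signature, the DummyVectorStore stand-in, and the "text"/"label" row fields are assumptions, since the diff does not show the Qdrant setup or the row-processing loop.

# Hypothetical standalone sketch of the commit's download -> chunk -> batched-index flow.
# Import paths assume a recent LangChain package layout; older versions expose the same
# classes under langchain.text_splitter and langchain.docstore.document.
import requests
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

DATASET_URL = (
    "https://huggingface.co/datasets/ealvaradob/phishing-dataset/"
    "resolve/main/combined_reduced.json"
)


class DummyVectorStore:
    """Stand-in for the Qdrant-backed store; only add_documents() matters for this sketch."""

    def add_documents(self, docs):
        print(f"(would index {len(docs)} chunks)")


def load_from_huggingface(vector_store, batch_size=100):
    response = requests.get(DATASET_URL, timeout=60)
    if response.status_code != 200:
        print(f"Failed to download dataset: {response.status_code}")
        return

    data = response.json()

    # Assumed row schema: the diff does not show the loop body, so "text"/"label" are a guess.
    documents = [
        Document(page_content=row["text"], metadata={"label": row.get("label")})
        for row in data
        if row.get("text")
    ]
    if not documents:
        print("No valid documents found in dataset.")
        return

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(documents)

    # Index in fixed-size batches, as in the commit, so one failure loses only one batch.
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i : i + batch_size]
        try:
            vector_store.add_documents(batch)
        except Exception as e:
            print(f"Error indexing batch starting at {i}: {e}")


if __name__ == "__main__":
    load_from_huggingface(DummyVectorStore())

Batching add_documents() keeps each call to the embedding/Qdrant backend small, so a timeout or API error costs at most one batch of chunks instead of aborting the whole indexing run.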