dungeon29 committed
Commit ad173f1 · verified · 1 Parent(s): e26305e

Update rag_engine.py

Files changed (1)
  1. rag_engine.py +31 -14
rag_engine.py CHANGED
@@ -147,16 +147,22 @@ class RAGEngine:
 
         return documents
 
-    def load_from_huggingface(self, dataset_name="ealvaradob/phishing-dataset"):
-        """Load and index dataset from Hugging Face"""
-        print(f"📥 Downloading dataset '{dataset_name}'...")
+    def load_from_huggingface(self):
+        """Load and index dataset manually from Hugging Face JSON"""
+        dataset_url = "https://huggingface.co/datasets/ealvaradob/phishing-dataset/resolve/main/combined_reduced.json"
+        print(f"📥 Downloading dataset from {dataset_url}...")
+
         try:
-            dataset = load_dataset(dataset_name, trust_remote_code=True)
-            # Assuming 'train' split exists, or take the first available
-            split = list(dataset.keys())[0]
-            data = dataset[split]
-            print(f"✅ Dataset loaded. Processing {len(data)} rows...")
+            import requests
+            import json
 
+            response = requests.get(dataset_url)
+            if response.status_code != 200:
+                print(f"❌ Failed to download dataset: {response.status_code}")
+                return
+
+            data = response.json()
+            print(f"✅ Dataset downloaded. Processing {len(data)} rows...")
 
             documents = []
             for row in data:
@@ -174,16 +180,27 @@ class RAGEngine:
             if documents:
                 # Batch add to vector store
                 print(f"🔄 Indexing {len(documents)} documents to Qdrant...")
-                # Split if needed, but these are likely short
-                # Let's just add them directly for now, or split if very long
+
+                # Use a larger chunk size for efficiency since these are likely short texts
                 text_splitter = RecursiveCharacterTextSplitter(
-                    chunk_size=500,
-                    chunk_overlap=50
+                    chunk_size=1000,
+                    chunk_overlap=100
                 )
                 chunks = text_splitter.split_documents(documents)
 
-                self.vector_store.add_documents(chunks)
-                print(f"✅ Successfully indexed {len(chunks)} chunks from dataset!")
+                # Add in batches to avoid hitting API limits or timeouts
+                batch_size = 100
+                total_chunks = len(chunks)
+
+                for i in range(0, total_chunks, batch_size):
+                    batch = chunks[i:i+batch_size]
+                    try:
+                        self.vector_store.add_documents(batch)
+                        print(f" - Indexed batch {i//batch_size + 1}/{(total_chunks + batch_size - 1)//batch_size}")
+                    except Exception as e:
+                        print(f" ⚠️ Error indexing batch {i}: {e}")
+
+                print(f"✅ Successfully indexed {total_chunks} chunks from dataset!")
             else:
                 print("⚠️ No valid documents found in dataset.")
 
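The batching loop added above is worth seeing in isolation. The sketch below is illustrative only and not code from this repository: index_in_batches and index_batch are hypothetical stand-ins for the loop over chunks and for self.vector_store.add_documents, and the only assumption is that indexing a slice may raise.

def index_in_batches(items, index_batch, batch_size=100):
    """Submit items to index_batch in fixed-size slices, mirroring the loop in this commit."""
    total = len(items)
    num_batches = (total + batch_size - 1) // batch_size  # ceiling division, as in the progress message
    for i in range(0, total, batch_size):
        batch = items[i:i + batch_size]
        try:
            index_batch(batch)
            print(f" - Indexed batch {i // batch_size + 1}/{num_batches}")
        except Exception as e:
            print(f" ⚠️ Error indexing batch {i}: {e}")

# Example: 250 items with batch_size=100 go out as slices of 100, 100 and 50.
index_in_batches(list(range(250)), lambda batch: None)

With batch_size=100, a failed request loses at most one slice instead of aborting the whole indexing pass, which matches the commit's stated aim of avoiding API limits and timeouts.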