Spaces: Running on Zero
Bellok committed
Commit 2ce0e5e · 1 Parent(s): d55147d
feat: expand dataset ingestion to include multiple packs and update dependencies
- Replaced single "prompt-report" dataset download with ingestion of 14 datasets (including arxiv chunks, novels, manuals, etc.) for full deployment readiness
- Added tracking of successful downloads, total documents, and per-pack counts with print feedback
- Upgraded gradio dependency from >=5.0.0 to >=5.5.0 in requirements.txt for improved compatibility
- app.py +24 -3
- requirements.txt +1 -1
app.py
CHANGED
@@ -50,9 +50,30 @@ if len(documents) == 0:
     try:
         from warbler_cda.utils.hf_warbler_ingest import HFWarblerIngestor
         ingestor = HFWarblerIngestor(packs_dir=pack_loader.packs_dir, verbose=True)
-        # Download
-
-
+        # Download all required datasets for deployment
+        datasets_to_download = [
+            "arxiv-1", "arxiv-2", "arxiv-3", "arxiv-4", "arxiv-5",  # First 5 arxiv chunks
+            "novels", "manuals", "enterprise", "edustories", "npc-dialogue", "wisdom-scrolls",
+            "faction-politics", "portuguese-edu", "prompt-report"
+        ]
+
+        total_docs = 0
+        successful_downloads = 0
+
+        for dataset in datasets_to_download:
+            print(f"📦 Downloading {dataset}...")
+            success = ingestor.ingest_dataset(dataset)
+            if success:
+                successful_downloads += 1
+                # Count documents in this pack
+                pack_docs = pack_loader.discover_documents()
+                new_count = len(pack_docs) - total_docs
+                total_docs = len(pack_docs)
+                print(f"✅ {dataset}: {new_count} documents")
+            else:
+                print(f"❌ Failed to download {dataset}")
+
+        print(f"📊 Total: {total_docs} documents from {successful_downloads}/{len(datasets_to_download)} packs")
         if success:
             # Reload after download
             documents = pack_loader.discover_documents()
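The per-pack counts above are deltas of a cumulative re-scan: after each successful ingest, discover_documents() is called again and the previous running total is subtracted. A minimal, self-contained sketch of that pattern; the ingest/discover callables and the stub data below are hypothetical stand-ins for ingestor.ingest_dataset and pack_loader.discover_documents, not part of the repo:

def ingest_all(datasets, ingest, discover):
    """Ingest each dataset, reporting per-pack deltas of a cumulative re-scan."""
    total_docs = 0
    successful = 0
    for name in datasets:
        if ingest(name):  # assumed to return True on success, as in the diff above
            successful += 1
            docs = discover()  # cumulative: documents across all packs so far
            new_count = len(docs) - total_docs
            total_docs = len(docs)
            print(f"{name}: {new_count} documents")
    return total_docs, successful

# Stubbed usage: each "download" adds two documents to a shared store.
store = []
ingest_all(["novels", "manuals"],
           ingest=lambda name: store.extend([f"{name}-a", f"{name}-b"]) or True,
           discover=lambda: store)

Because discover() re-scans everything, the final reload after the loop picks up all packs regardless of individual failures.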
requirements.txt
CHANGED
@@ -14,7 +14,7 @@ uvicorn>=0.32.0
 python-multipart>=0.0.6
 
 # Gradio for HuggingFace Space
-gradio>=5.0.0
+gradio>=5.5.0
 
 # CLI
 click>=8.1.0
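The new floor can be sanity-checked in a local environment before pushing to the Space (an illustrative check, not part of the commit; gradio exposes its version as gradio.__version__):

import gradio

print(gradio.__version__)  # expect 5.5.0 or later after reinstalling requirements.txt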