Spaces: Running on Zero
Bellok committed
Commit 2ce0e5e · 1 Parent(s): d55147d
feat: expand dataset ingestion to include multiple packs and update dependencies
- Replaced single "prompt-report" dataset download with ingestion of 14 datasets (including arxiv chunks, novels, manuals, etc.) for full deployment readiness
- Added tracking of successful downloads, total documents, and per-pack counts with print feedback
- Upgraded gradio dependency from >=5.0.0 to >=5.5.0 in requirements.txt for improved compatibility
- app.py +24 -3
- requirements.txt +1 -1
app.py
CHANGED
@@ -50,9 +50,30 @@ if len(documents) == 0:
     try:
         from warbler_cda.utils.hf_warbler_ingest import HFWarblerIngestor
         ingestor = HFWarblerIngestor(packs_dir=pack_loader.packs_dir, verbose=True)
-        # Download
-
-
+        # Download all required datasets for deployment
+        datasets_to_download = [
+            "arxiv-1", "arxiv-2", "arxiv-3", "arxiv-4", "arxiv-5",  # First 5 arxiv chunks
+            "novels", "manuals", "enterprise", "edustories", "npc-dialogue", "wisdom-scrolls",
+            "faction-politics", "portuguese-edu", "prompt-report"
+        ]
+
+        total_docs = 0
+        successful_downloads = 0
+
+        for dataset in datasets_to_download:
+            print(f"📦 Downloading {dataset}...")
+            success = ingestor.ingest_dataset(dataset)
+            if success:
+                successful_downloads += 1
+                # Count documents in this pack
+                pack_docs = pack_loader.discover_documents()
+                new_count = len(pack_docs) - total_docs
+                total_docs = len(pack_docs)
+                print(f"✅ {dataset}: {new_count} documents")
+            else:
+                print(f"❌ Failed to download {dataset}")
+
+        print(f"📊 Total: {total_docs} documents from {successful_downloads}/{len(datasets_to_download)} packs")
         if success:
             # Reload after download
             documents = pack_loader.discover_documents()
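The per-pack counts above are deltas of a cumulative re-scan: after each successful ingest, discover_documents() is called again and the previous running total is subtracted. A minimal, self-contained sketch of that pattern; the ingest/discover callables and the stub data below are hypothetical stand-ins for ingestor.ingest_dataset and pack_loader.discover_documents, not part of the repo:

def ingest_all(datasets, ingest, discover):
    """Ingest each dataset, reporting per-pack deltas of a cumulative re-scan."""
    total_docs = 0
    successful = 0
    for name in datasets:
        if ingest(name):  # assumed to return True on success, as in the diff above
            successful += 1
            docs = discover()  # cumulative: documents across all packs so far
            new_count = len(docs) - total_docs
            total_docs = len(docs)
            print(f"{name}: {new_count} documents")
    return total_docs, successful

# Stubbed usage: each "download" adds two documents to a shared store.
store = []
ingest_all(["novels", "manuals"],
           ingest=lambda name: store.extend([f"{name}-a", f"{name}-b"]) or True,
           discover=lambda: store)

Because discover() re-scans everything, the final reload after the loop picks up all packs regardless of individual failures.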
requirements.txt
CHANGED
@@ -14,7 +14,7 @@ uvicorn>=0.32.0
 python-multipart>=0.0.6
 
 # Gradio for HuggingFace Space
-gradio>=5.0.0
+gradio>=5.5.0
 
 # CLI
 click>=8.1.0
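The new floor can be sanity-checked in a local environment before pushing to the Space (an illustrative check, not part of the commit; gradio exposes its version as gradio.__version__):

import gradio

print(gradio.__version__)  # expect 5.5.0 or later after reinstalling requirements.txt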