Bellok committed on
Commit
2ce0e5e
·
1 Parent(s): d55147d

feat: expand dataset ingestion to include multiple packs and update dependencies

Browse files

- Replaced single "prompt-report" dataset download with ingestion of 14 datasets (including arxiv chunks, novels, manuals, etc.) for full deployment readiness
- Added tracking of successful downloads, total documents, and per-pack counts with print feedback
- Upgraded gradio dependency from >=5.0.0 to >=5.5.0 in requirements.txt for improved compatibility

Files changed (2) hide show
  1. app.py +24 -3
  2. requirements.txt +1 -1
app.py CHANGED
@@ -50,9 +50,30 @@ if len(documents) == 0:
50
  try:
51
  from warbler_cda.utils.hf_warbler_ingest import HFWarblerIngestor
52
  ingestor = HFWarblerIngestor(packs_dir=pack_loader.packs_dir, verbose=True)
53
- # Download a small demo dataset for deployment
54
- print("📦 Downloading warbler-pack-hf-prompt-report...")
55
- success = ingestor.ingest_dataset("prompt-report")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  if success:
57
  # Reload after download
58
  documents = pack_loader.discover_documents()
 
50
  try:
51
  from warbler_cda.utils.hf_warbler_ingest import HFWarblerIngestor
52
  ingestor = HFWarblerIngestor(packs_dir=pack_loader.packs_dir, verbose=True)
53
+ # Download all required datasets for deployment
54
+ datasets_to_download = [
55
+ "arxiv-1", "arxiv-2", "arxiv-3", "arxiv-4", "arxiv-5", # First 5 arxiv chunks
56
+ "novels", "manuals", "enterprise", "edustories", "npc-dialogue", "wisdom-scrolls",
57
+ "faction-politics", "portuguese-edu", "prompt-report"
58
+ ]
59
+
60
+ total_docs = 0
61
+ successful_downloads = 0
62
+
63
+ for dataset in datasets_to_download:
64
+ print(f"📦 Downloading {dataset}...")
65
+ success = ingestor.ingest_dataset(dataset)
66
+ if success:
67
+ successful_downloads += 1
68
+ # Count documents in this pack
69
+ pack_docs = pack_loader.discover_documents()
70
+ new_count = len(pack_docs) - total_docs
71
+ total_docs = len(pack_docs)
72
+ print(f"✅ {dataset}: {new_count} documents")
73
+ else:
74
+ print(f"❌ Failed to download {dataset}")
75
+
76
+ print(f"📊 Total: {total_docs} documents from {successful_downloads}/{len(datasets_to_download)} packs")
77
  if success:
78
  # Reload after download
79
  documents = pack_loader.discover_documents()
requirements.txt CHANGED
@@ -14,7 +14,7 @@ uvicorn>=0.32.0
14
  python-multipart>=0.0.6
15
 
16
  # Gradio for HuggingFace Space
17
- gradio>=5.0.0
18
 
19
  # CLI
20
  click>=8.1.0
 
14
  python-multipart>=0.0.6
15
 
16
  # Gradio for HuggingFace Space
17
+ gradio>=5.5.0
18
 
19
  # CLI
20
  click>=8.1.0