# warbler-cda / warbler_cda /pack_sync.py
# Bellok's picture
# Upload folder using huggingface_hub
# 0ccf2f0 verified
# raw
# history blame
# 4.24 kB
"""
Pack synchronization and update checking.
Verifies that local packs (built from upstream HuggingFace datasets) are present and readable.
Suggests the re-ingest command if packs are missing or unreadable.
"""
import json
import logging
from pathlib import Path
from typing import Dict, Any, Optional
from datetime import datetime
# Module-level logger named after this module; PackSync uses it for status messages.
logger = logging.getLogger(__name__)
class PackSync:
    """Verify that the packs listed in ``PACK_MANIFEST`` exist locally.

    Each pack is expected at ``<packs_dir>/<pack_name>/<pack_name>.jsonl``.
    The class only inspects local files; it does not contact the upstream
    sources recorded in the manifest and does not run ingestion itself.
    """

    # Known packs keyed by pack name. Only the keys are used by this class;
    # "source", "type" and "description" are informational here.
    PACK_MANIFEST = {
        "warbler-pack-hf-arxiv": {
            "source": "nick007x/arxiv-papers",
            "type": "huggingface",
            "description": "Scholarly papers",
        },
        "warbler-pack-hf-prompt-report": {
            "source": "PromptSystematicReview/ThePromptReport",
            "type": "huggingface",
            "description": "Prompt engineering documentation",
        },
        "warbler-pack-hf-novels": {
            "source": "GOAT-AI/generated-novels",
            "type": "huggingface",
            "description": "Generated novels",
        },
        "warbler-pack-hf-manuals": {
            "source": "nlasso/anac-manuals-23",
            "type": "huggingface",
            "description": "Technical manuals",
        },
        "warbler-pack-hf-enterprise": {
            "source": "AST-FRI/EnterpriseBench",
            "type": "huggingface",
            "description": "Enterprise benchmarks",
        },
        "warbler-pack-hf-portuguese-edu": {
            "source": "Solshine/Portuguese_Language_Education_Texts",
            "type": "huggingface",
            "description": "Portuguese education texts",
        },
    }

    def __init__(self, packs_dir: Optional[Path] = None):
        """Initialize the pack synchronizer.

        Args:
            packs_dir: Directory that contains one sub-directory per pack.
                When ``None``, defaults to ``packs`` two levels above this
                file's location.
        """
        if packs_dir is None:
            packs_dir = Path(__file__).parent.parent / "packs"
        self.packs_dir = Path(packs_dir)
        self.metadata_file = self.packs_dir / ".pack_metadata.json"

    def verify_packs(self) -> Dict[str, Any]:
        """Check every manifest pack and report which are readable.

        Returns:
            A dict with three keys:

            * ``"verified"``: list of ``{"pack", "documents", "path"}`` dicts,
              where ``documents`` is the number of non-blank lines in the
              pack's ``.jsonl`` file.
            * ``"missing"``: list of pack names whose file is absent or
              could not be read.
            * ``"timestamp"``: ISO-8601 string of the local time of the check.
        """
        status: Dict[str, Any] = {
            "verified": [],
            "missing": [],
            "timestamp": datetime.now().isoformat(),
        }

        for pack_name in self.PACK_MANIFEST:
            pack_dir = self.packs_dir / pack_name
            pack_file = pack_dir / f"{pack_name}.jsonl"

            # Open directly instead of checking exists() first: this avoids a
            # check-then-use race and routes permission problems through the
            # same handler as any other read failure.
            try:
                with open(pack_file, "r", encoding="utf-8") as f:
                    # Blank lines are not documents, so they are not counted.
                    line_count = sum(1 for line in f if line.strip())
            except (FileNotFoundError, NotADirectoryError):
                status["missing"].append(pack_name)
                logger.warning("⚠️ %s not found", pack_name)
            except (OSError, UnicodeDecodeError) as e:
                # UnicodeDecodeError is a ValueError, not an OSError; a pack
                # with invalid UTF-8 must not abort verification of the rest.
                logger.warning("⚠️ %s exists but unable to read: %s", pack_name, e)
                status["missing"].append(pack_name)
            else:
                status["verified"].append(
                    {"pack": pack_name, "documents": line_count, "path": str(pack_dir)}
                )
                logger.info("✓ %s: %d documents", pack_name, line_count)

        return status

    def save_metadata(self, status: Dict[str, Any]) -> None:
        """Write ``status`` as indented JSON to ``self.metadata_file``.

        Best-effort: an ``OSError`` while opening or writing the file is
        logged as a warning and not raised.

        Args:
            status: JSON-serializable mapping, e.g. the result of
                ``verify_packs()``.
        """
        try:
            with open(self.metadata_file, "w", encoding="utf-8") as f:
                json.dump(status, f, indent=2)
            logger.debug("Saved pack metadata to %s", self.metadata_file)
        except OSError as e:
            logger.warning("Could not save pack metadata: %s", e)

    def get_sync_status(self) -> str:
        """Return a one-line, human-readable summary of ``verify_packs()``."""
        status = self.verify_packs()
        verified_count = len(status["verified"])
        missing_count = len(status["missing"])

        if missing_count == 0:
            return f"✓ All {verified_count} packs verified and ready"
        return (
            f"⚠️ {verified_count} packs verified, {missing_count} "
            f"missing (run ingest to rebuild)"
        )

    def suggest_reingest(self) -> Optional[str]:
        """Return the re-ingest command if any pack is missing, else ``None``."""
        status = self.verify_packs()
        if status["missing"]:
            return "python -m warbler_cda.utils.hf_warbler_ingest ingest --datasets all"
        return None