Spaces:
Running
on
Zero
Running
on
Zero
| """ | |
| Pack synchronization and update checking. | |
| Verifies that local packs are up-to-date with upstream HuggingFace datasets. | |
| Optionally re-ingests if packs are missing or outdated. | |
| """ | |
| import json | |
| import logging | |
| from pathlib import Path | |
| from typing import Dict, Any, Optional | |
| from datetime import datetime | |
logger = logging.getLogger(__name__)


class PackSync:
    """Check that the expected local pack files are present and readable.

    Each entry in ``PACK_MANIFEST`` is expected to live at
    ``<packs_dir>/<pack_name>/<pack_name>.jsonl``. The methods of this class
    only inspect the local filesystem; they never download or rebuild
    anything themselves.
    """

    # Pack name -> descriptive metadata. Only the keys (pack names) are used
    # by the methods below; "source", "type" and "description" are carried
    # along as information about where each pack comes from.
    PACK_MANIFEST = {
        "warbler-pack-hf-arxiv": {
            "source": "nick007x/arxiv-papers",
            "type": "huggingface",
            "description": "Scholarly papers",
        },
        "warbler-pack-hf-prompt-report": {
            "source": "PromptSystematicReview/ThePromptReport",
            "type": "huggingface",
            "description": "Prompt engineering documentation",
        },
        "warbler-pack-hf-novels": {
            "source": "GOAT-AI/generated-novels",
            "type": "huggingface",
            "description": "Generated novels",
        },
        "warbler-pack-hf-manuals": {
            "source": "nlasso/anac-manuals-23",
            "type": "huggingface",
            "description": "Technical manuals",
        },
        "warbler-pack-hf-enterprise": {
            "source": "AST-FRI/EnterpriseBench",
            "type": "huggingface",
            "description": "Enterprise benchmarks",
        },
        "warbler-pack-hf-portuguese-edu": {
            "source": "Solshine/Portuguese_Language_Education_Texts",
            "type": "huggingface",
            "description": "Portuguese education texts",
        },
    }

    def __init__(self, packs_dir: Optional[Path] = None):
        """Initialize the pack synchronizer.

        Args:
            packs_dir: Directory that contains one sub-directory per pack.
                When omitted, defaults to a ``packs`` directory located two
                levels above this source file.
        """
        if packs_dir is None:
            packs_dir = Path(__file__).parent.parent / "packs"
        self.packs_dir = Path(packs_dir)
        self.metadata_file = self.packs_dir / ".pack_metadata.json"

    def verify_packs(self) -> Dict[str, Any]:
        """Verify that every pack in the manifest exists and can be read.

        Returns:
            A dict with three keys:

            * ``"verified"``: list of dicts with ``"pack"`` (name),
              ``"documents"`` (number of lines in the ``.jsonl`` file) and
              ``"path"`` (pack directory as a string).
            * ``"missing"``: list of pack names whose file is absent or
              could not be read.
            * ``"timestamp"``: local time of the check in ISO-8601 format.
        """
        status: Dict[str, Any] = {
            "verified": [],
            "missing": [],
            "timestamp": datetime.now().isoformat(),
        }

        for pack_name in self.PACK_MANIFEST:
            pack_dir = self.packs_dir / pack_name
            pack_file = pack_dir / f"{pack_name}.jsonl"

            if not (pack_dir.exists() and pack_file.exists()):
                status["missing"].append(pack_name)
                logger.warning("⚠️ %s not found", pack_name)
                continue

            try:
                with open(pack_file, "r", encoding="utf-8") as f:
                    line_count = sum(1 for _ in f)
            except (OSError, UnicodeDecodeError) as e:
                # UnicodeDecodeError is a ValueError, not an OSError: without
                # catching it, one corrupt pack file would abort the whole
                # verification run instead of being reported as unusable.
                logger.warning("⚠️ %s exists but unable to read: %s", pack_name, e)
                status["missing"].append(pack_name)
                continue

            status["verified"].append(
                {"pack": pack_name, "documents": line_count, "path": str(pack_dir)}
            )
            logger.info("✓ %s: %d documents", pack_name, line_count)

        return status

    def save_metadata(self, status: Dict[str, Any]) -> None:
        """Write a verification result to ``self.metadata_file`` as JSON.

        This is best-effort: an ``OSError`` while writing is logged as a
        warning and otherwise ignored.

        Args:
            status: JSON-serializable dict, e.g. the return value of
                :meth:`verify_packs`.
        """
        try:
            with open(self.metadata_file, "w", encoding="utf-8") as f:
                json.dump(status, f, indent=2)
            logger.debug("Saved pack metadata to %s", self.metadata_file)
        except OSError as e:
            logger.warning("Could not save pack metadata: %s", e)

    def get_sync_status(self) -> str:
        """Return a one-line, human-readable summary of pack availability.

        Runs :meth:`verify_packs` (which also emits log messages) and
        summarizes how many packs were verified and how many are missing.
        """
        status = self.verify_packs()
        verified_count = len(status["verified"])
        missing_count = len(status["missing"])

        if missing_count == 0:
            return f"✓ All {verified_count} packs verified and ready"
        return (
            f"⚠️ {verified_count} packs verified, {missing_count} "
            f"missing (run ingest to rebuild)"
        )

    def suggest_reingest(self) -> Optional[str]:
        """Return the re-ingest command if any pack is missing, else ``None``.

        The command is only returned as a string; it is not executed here.
        """
        status = self.verify_packs()
        if status["missing"]:
            return "python -m warbler_cda.utils.hf_warbler_ingest ingest --datasets all"
        return None