"""
HuggingFace Dataset Download Utility for LoRA Training Studio.

Provides a helper to download audio datasets from HuggingFace Hub.
The actual training pipeline lives in acestep/training/.
"""

import logging
import os
import shutil
from pathlib import Path
from typing import Tuple

logger = logging.getLogger(__name__)

# File extensions (lower-case, with leading dot) treated as audio.
AUDIO_SUFFIXES = {".wav", ".mp3", ".flac", ".ogg", ".opus"}


def _link_into_dir(cached_path: str, dest: Path) -> None:
    """Expose the cached file at *dest*, preferring a symlink over a copy."""
    # An absolute target keeps the link valid even if the cache path was
    # reported relative to the current working directory.
    source = Path(cached_path).absolute()

    # Path.exists() follows symlinks, so a dangling link (e.g. the HF cache
    # was purged) looks "missing" and symlink_to() would then raise
    # FileExistsError. Remove such stale links before re-creating them.
    if dest.is_symlink() and not dest.exists():
        dest.unlink()
    if dest.exists():
        return

    try:
        dest.symlink_to(source)
    except FileExistsError:
        # Something created dest in the meantime; keep it (never copy
        # *through* an existing link into the cache).
        return
    except (OSError, NotImplementedError):
        # Symlinks can be unavailable (e.g. unprivileged Windows accounts);
        # fall back to a plain copy so the working directory is still usable.
        shutil.copy2(source, dest)


def download_hf_dataset(
    dataset_id: str,
    max_files: int = 50,
    offset: int = 0,
) -> Tuple[str, str]:
    """
    Download a subset of audio files from a HuggingFace dataset repo.

    Also pulls dataset.json from the repo if it exists (restoring labels and
    preprocessed flags from a previous session).

    Uses HF_TOKEN env var for authentication.

    Args:
        dataset_id: Repo id of the dataset on the Hub (e.g. "user/name").
        max_files: Maximum number of audio files to fetch; must be > 0.
        offset: Index of the first audio file to fetch within the repo
            listing; must be >= 0.

    Returns:
        Tuple of (output_dir, status_message). On any failure (invalid
        arguments, missing huggingface_hub, nothing to download, download
        error) output_dir is "" and status_message explains why; this
        function does not raise.
    """
    # Validate before touching the network (or even importing the hub client)
    # so bad arguments produce a clear message instead of an odd slice.
    if not dataset_id or not str(dataset_id).strip():
        return "", "No dataset ID provided"
    try:
        max_files, offset = int(max_files), int(offset)
    except (TypeError, ValueError):
        return "", "max_files and offset must be integers"
    if max_files <= 0 or offset < 0:
        msg = (
            f"Invalid selection: max_files={max_files}, offset={offset} "
            "(max_files must be > 0 and offset must be >= 0)"
        )
        logger.error(msg)
        return "", msg

    try:
        from huggingface_hub import HfApi, hf_hub_download

        api = HfApi()
        token = os.environ.get("HF_TOKEN")

        logger.info("Listing files in '%s'...", dataset_id)
        # Entries without `rfilename` (folders) are skipped.
        repo_paths = [
            entry.rfilename
            for entry in api.list_repo_tree(
                dataset_id, repo_type="dataset", token=token, recursive=True
            )
            if hasattr(entry, "rfilename")
        ]
        all_files = [
            path for path in repo_paths
            if Path(path).suffix.lower() in AUDIO_SUFFIXES
        ]

        total_available = len(all_files)
        if total_available == 0:
            return "", f"No audio files found in {dataset_id}"

        selected = all_files[offset:offset + max_files]
        if not selected:
            return "", (
                f"Offset {offset} is past the end of {dataset_id} "
                f"({total_available} audio files available)"
            )

        logger.info(
            "Downloading %d/%d audio files...", len(selected), total_available
        )

        output_dir = Path("lora_training") / "hf" / dataset_id.replace("/", "_")
        output_dir.mkdir(parents=True, exist_ok=True)

        # The working dir is flat, so two repo files with the same basename
        # cannot both be exposed; remember which repo path claimed each name.
        claimed = {}
        skipped = 0
        for i, filename in enumerate(selected, start=1):
            logger.info("  [%d/%d] %s", i, len(selected), filename)
            dest = output_dir / Path(filename).name
            owner = claimed.setdefault(dest.name, filename)
            if owner != filename:
                logger.warning(
                    "Skipping '%s': file name collides with '%s'",
                    filename, owner,
                )
                skipped += 1
                continue

            cached_path = hf_hub_download(
                repo_id=dataset_id,
                filename=filename,
                repo_type="dataset",
                token=token,
            )
            # Link from cache into our working dir so scan_directory finds them
            _link_into_dir(cached_path, dest)

        # Pull dataset.json from repo if it exists (restores previous session
        # state). Presence is taken from the listing above so that a real
        # download failure is not misreported as "first session".
        if "dataset.json" in repo_paths:
            try:
                cached_json = hf_hub_download(
                    repo_id=dataset_id,
                    filename="dataset.json",
                    repo_type="dataset",
                    token=token,
                )
                shutil.copy2(cached_json, str(output_dir / "dataset.json"))
                logger.info("Pulled dataset.json from HF repo")
            except Exception as e:
                # Best effort: the audio files are still usable without it.
                logger.warning("Could not pull dataset.json from HF repo: %s", e)
        else:
            logger.info("No dataset.json in HF repo (first session)")

        status = (
            f"Downloaded {len(selected) - skipped} of {total_available} "
            f"audio files from {dataset_id} (offset {offset})"
        )
        if skipped:
            status += f" — skipped {skipped} with duplicate file names"
        logger.info(status)
        return str(output_dir), status

    except ImportError:
        msg = "huggingface_hub is not installed. Run: pip install huggingface_hub"
        logger.error(msg)
        return "", msg
    except Exception as e:
        msg = f"Failed to download dataset: {e}"
        # logger.exception keeps the traceback for debugging.
        logger.exception(msg)
        return "", msg


def upload_dataset_json_to_hf(dataset_id: str, json_path: str) -> str:
    """Push dataset.json to the HF dataset repo for persistence across sessions.

    Args:
        dataset_id: Repo id of the target dataset on the Hub.
        json_path: Local path of the dataset.json file to upload.

    Returns:
        A human-readable status string; failures are reported in the string
        rather than raised.
    """
    # Without a token nothing can be uploaded, so decide that before importing
    # the hub client (which may not even be installed).
    token = os.environ.get("HF_TOKEN")
    if not token:
        return "HF_TOKEN not set — skipped HF sync"

    try:
        from huggingface_hub import HfApi

        HfApi().upload_file(
            path_or_fileobj=json_path,
            path_in_repo="dataset.json",
            repo_id=dataset_id,
            repo_type="dataset",
            token=token,
        )
        return f"Synced dataset.json to {dataset_id}"
    except Exception as e:
        msg = f"HF sync failed: {e}"
        logger.exception(msg)
        return msg