"""
Dataset Loader for Hugging Face Datasets

Downloads dataset files from HF Datasets repository if not present locally
"""

import os
import shutil
import time
from pathlib import Path

from huggingface_hub import hf_hub_download
import streamlit as st

# Dataset configuration
DATASET_REPO = "hirthickraj2015/graphwiz-ireland-dataset"
DATASET_FILES = [
    "chunks.json",
    "graphrag_index.json",
    "graphrag_graphs.pkl",
    "hybrid_hnsw_index.bin",
    "hybrid_indexes.pkl",
    "ireland_articles.json",
    "page_titles.json",
    "chunk_stats.json",
    "graphrag_stats.json",
    "extraction_stats.json",
    "extraction_progress.json",
]


def _clear_placeholders(*placeholders) -> None:
    """Empty every placeholder that was actually created (skips ``None``)."""
    for placeholder in placeholders:
        if placeholder is not None:
            placeholder.empty()


def _install_file(source_path: str, target_path: Path) -> None:
    """Copy ``source_path`` to ``target_path`` without exposing a partial file.

    The data is first written to a ``.part`` sibling and then renamed over the
    target. An interrupted copy therefore never leaves a truncated file under
    the final name, which the presence check in ``ensure_dataset_files`` would
    otherwise treat as a complete download on the next run.
    """
    partial_path = target_path.with_name(target_path.name + ".part")
    try:
        shutil.copy2(source_path, partial_path)
        os.replace(partial_path, target_path)
    finally:
        # After a successful replace the partial file is gone; this only
        # removes leftovers from a failed or interrupted copy.
        if partial_path.exists():
            partial_path.unlink()


def ensure_dataset_files(dataset_dir: str = "dataset/wikipedia_ireland", progress_container=None) -> tuple:
    """
    Ensure all dataset files are available locally.
    Downloads from HF Datasets if missing.

    Every name in ``DATASET_FILES`` that does not exist under ``dataset_dir``
    is fetched with ``hf_hub_download`` from ``DATASET_REPO`` and copied into
    ``dataset_dir``. Any exception raised while downloading is caught,
    reported (stdout and, if given, ``progress_container``) and turned into a
    failure return value rather than propagated.

    Args:
        dataset_dir: Local directory for dataset files (created if missing)
        progress_container: Streamlit container for progress updates (optional)

    Returns:
        Tuple of (success: bool, files_downloaded: bool).
        ``(True, False)`` when nothing was missing, ``(True, True)`` when all
        missing files were downloaded, ``(False, False)`` on any failure.
    """
    dataset_path = Path(dataset_dir)
    dataset_path.mkdir(parents=True, exist_ok=True)

    missing_files = [
        filename
        for filename in DATASET_FILES
        if not (dataset_path / filename).exists()
    ]

    if not missing_files:
        print("[INFO] All dataset files present locally")
        return True, False  # Success, no files downloaded

    total = len(missing_files)
    print(f"[INFO] Missing {total} files, downloading from HF Datasets...")

    # Create UI elements if container provided
    status_placeholder = None
    progress_text_placeholder = None
    progress_bar_placeholder = None

    if progress_container is not None:
        status_placeholder = progress_container.empty()
        progress_text_placeholder = progress_container.empty()
        progress_bar_placeholder = progress_container.empty()
        status_placeholder.info(f"📥 Downloading {total} missing dataset files from Hugging Face...")

    try:
        for idx, filename in enumerate(missing_files, 1):
            print(f"[INFO] Downloading {filename} ({idx}/{total})...")

            # Update progress if container provided
            if progress_text_placeholder is not None and progress_bar_placeholder is not None:
                progress_text_placeholder.text(f"⬇️ {filename} ({idx}/{total})")
                progress_bar_placeholder.progress((idx - 1) / total)

            downloaded_path = hf_hub_download(
                repo_id=DATASET_REPO,
                filename=filename,
                repo_type="dataset",
            )

            # Copy into the target directory (atomically, see _install_file)
            _install_file(downloaded_path, dataset_path / filename)
            print(f"[SUCCESS] Downloaded {filename}")

        if progress_bar_placeholder is not None:
            # Show the completed bar briefly before clearing it. The delay is
            # purely cosmetic, so it is skipped when there is no UI.
            progress_bar_placeholder.progress(1.0)
            time.sleep(0.5)

        # Clear ALL UI elements after successful download
        _clear_placeholders(
            status_placeholder,
            progress_text_placeholder,
            progress_bar_placeholder,
        )

        print("[SUCCESS] All dataset files downloaded successfully!")
        return True, True  # Success, files were downloaded

    except Exception as e:
        # Top-level boundary: report the failure instead of crashing the app.
        print(f"[ERROR] Failed to download dataset files: {e}")
        if progress_container is not None:
            # Clear progress indicators, then show the error in their place
            _clear_placeholders(
                status_placeholder,
                progress_text_placeholder,
                progress_bar_placeholder,
            )
            progress_container.error(f"❌ Failed to download dataset files: {e}")
        return False, False  # Failure reported; files fetched before the error stay on disk


def get_dataset_path(filename: str, dataset_dir: str = "dataset/wikipedia_ireland") -> str:
    """
    Get full path to a dataset file, downloading if necessary.

    Calls ``ensure_dataset_files(dataset_dir)``, which checks (and downloads)
    the whole dataset, not only ``filename``. Its result is not inspected, so
    the path is returned even if the download failed; callers that need the
    file to exist should check for it themselves.

    Args:
        filename: Name of the dataset file
        dataset_dir: Local directory for dataset files

    Returns:
        Full path to the dataset file
    """
    # Ensure dataset files are available
    ensure_dataset_files(dataset_dir)
    return str(Path(dataset_dir) / filename)