"""
Dataset Loader for Hugging Face Datasets
Downloads dataset files from HF Datasets repository if not present locally
"""

import shutil
from pathlib import Path

from huggingface_hub import hf_hub_download
import streamlit as st

# Dataset configuration
DATASET_REPO = "hirthickraj2015/graphwiz-ireland-dataset"
DATASET_FILES = [
    "chunks.json",
    "graphrag_index.json",
    "graphrag_graphs.pkl",
    "hybrid_hnsw_index.bin",
    "hybrid_indexes.pkl",
    "ireland_articles.json",
    "page_titles.json",
    "chunk_stats.json",
    "graphrag_stats.json",
    "extraction_stats.json",
    "extraction_progress.json"
]

def ensure_dataset_files(dataset_dir: str = "dataset/wikipedia_ireland") -> bool:
    """
    Ensure all dataset files are available locally.
    Downloads from HF Datasets if missing.

    Args:
        dataset_dir: Local directory for dataset files

    Returns:
        True if all files are available, False otherwise
    """
    dataset_path = Path(dataset_dir)
    dataset_path.mkdir(parents=True, exist_ok=True)

    missing_files = []
    for filename in DATASET_FILES:
        file_path = dataset_path / filename
        if not file_path.exists():
            missing_files.append(filename)

    if not missing_files:
        print(f"[INFO] All dataset files present locally in {dataset_dir}")
        return True

    print(f"[INFO] Missing {len(missing_files)} files, downloading from HF Datasets...")

    def _fetch(filename: str) -> None:
        # hf_hub_download resolves the file into the local HF cache;
        # copy it from there into the target directory.
        downloaded_path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=filename,
            repo_type="dataset"
        )
        shutil.copy2(downloaded_path, dataset_path / filename)

    # Download the missing files, showing progress in the Streamlit UI
    # when st.status is available (it is missing in older releases).
    try:
        for filename in missing_files:
            print(f"[INFO] Downloading {filename}...")
            if hasattr(st, 'status'):
                with st.status(f"Downloading {filename}...", expanded=True) as status:
                    _fetch(filename)
                    status.update(label=f"✓ Downloaded {filename}", state="complete")
            else:
                _fetch(filename)
            print(f"[SUCCESS] Downloaded {filename}")

        print("[SUCCESS] All dataset files downloaded successfully!")
        return True

    except Exception as e:
        print(f"[ERROR] Failed to download dataset files: {e}")
        if hasattr(st, 'error'):
            st.error(f"Failed to download dataset files: {e}")
        return False


def get_dataset_path(filename: str, dataset_dir: str = "dataset/wikipedia_ireland") -> str:
    """
    Get full path to a dataset file, downloading if necessary.

    Args:
        filename: Name of the dataset file
        dataset_dir: Local directory for dataset files

    Returns:
        Full path to the dataset file
    """
    # Ensure dataset files are available; note that the returned path may
    # still point to a missing file if the download failed.
    ensure_dataset_files(dataset_dir)

    return str(Path(dataset_dir) / filename)
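

# Minimal usage sketch: running this module directly verifies that the files
# listed in DATASET_FILES can be fetched into the default dataset directory.
# The "chunks.json" lookup below is just one representative file from the list.
if __name__ == "__main__":
    if ensure_dataset_files():
        # Resolve a concrete path once the files are known to be present.
        chunks_path = get_dataset_path("chunks.json")
        print(f"[INFO] chunks.json available at {chunks_path}")
    else:
        print("[ERROR] Dataset files could not be downloaded.")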