"""
Dataset Loader for Hugging Face Datasets
Downloads dataset files from HF Datasets repository if not present locally
"""

import shutil
from pathlib import Path

from huggingface_hub import hf_hub_download
import streamlit as st

# Dataset configuration
DATASET_REPO = "hirthickraj2015/graphwiz-ireland-dataset"
DATASET_FILES = [
    "chunks.json",
    "graphrag_index.json",
    "graphrag_graphs.pkl",
    "hybrid_hnsw_index.bin",
    "hybrid_indexes.pkl",
    "ireland_articles.json",
    "page_titles.json",
    "chunk_stats.json",
    "graphrag_stats.json",
    "extraction_stats.json",
    "extraction_progress.json"
]

def ensure_dataset_files(dataset_dir: str = "dataset/wikipedia_ireland") -> bool:
    """
    Ensure all dataset files are available locally.
    Downloads from HF Datasets if missing.

    Args:
        dataset_dir: Local directory for dataset files

    Returns:
        True if all files are available, False otherwise
    """
    dataset_path = Path(dataset_dir)
    dataset_path.mkdir(parents=True, exist_ok=True)

    missing_files = []
    for filename in DATASET_FILES:
        file_path = dataset_path / filename
        if not file_path.exists():
            missing_files.append(filename)

    if not missing_files:
        print(f"[INFO] All dataset files present locally in {dataset_dir}")
        return True

    print(f"[INFO] Missing {len(missing_files)} files, downloading from HF Datasets...")

    def _fetch(filename: str) -> None:
        # hf_hub_download resolves the file into the local HF cache;
        # copy it from there into the target directory.
        downloaded_path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=filename,
            repo_type="dataset"
        )
        shutil.copy2(downloaded_path, dataset_path / filename)

    # Download the missing files, showing progress in the Streamlit UI
    # when st.status is available (it is missing in older releases).
    try:
        for filename in missing_files:
            print(f"[INFO] Downloading {filename}...")
            if hasattr(st, 'status'):
                with st.status(f"Downloading {filename}...", expanded=True) as status:
                    _fetch(filename)
                    status.update(label=f"✓ Downloaded {filename}", state="complete")
            else:
                _fetch(filename)
            print(f"[SUCCESS] Downloaded {filename}")

        print("[SUCCESS] All dataset files downloaded successfully!")
        return True

    except Exception as e:
        print(f"[ERROR] Failed to download dataset files: {e}")
        if hasattr(st, 'error'):
            st.error(f"Failed to download dataset files: {e}")
        return False


def get_dataset_path(filename: str, dataset_dir: str = "dataset/wikipedia_ireland") -> str:
    """
    Get full path to a dataset file, downloading if necessary.

    Args:
        filename: Name of the dataset file
        dataset_dir: Local directory for dataset files

    Returns:
        Full path to the dataset file
    """
    # Ensure dataset files are available; note that the returned path may
    # still point to a missing file if the download failed.
    ensure_dataset_files(dataset_dir)

    return str(Path(dataset_dir) / filename)
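

# Minimal usage sketch: running this module directly verifies that the files
# listed in DATASET_FILES can be fetched into the default dataset directory.
# The "chunks.json" lookup below is just one representative file from the list.
if __name__ == "__main__":
    if ensure_dataset_files():
        # Resolve a concrete path once the files are known to be present.
        chunks_path = get_dataset_path("chunks.json")
        print(f"[INFO] chunks.json available at {chunks_path}")
    else:
        print("[ERROR] Dataset files could not be downloaded.")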