"""
Dataset Loader for Hugging Face Datasets
Downloads dataset files from HF Datasets repository if not present locally
"""

import shutil
from pathlib import Path

from huggingface_hub import hf_hub_download
import streamlit as st

# Dataset configuration
DATASET_REPO = "hirthickraj2015/graphwiz-ireland-dataset"
DATASET_FILES = [
"chunks.json",
"graphrag_index.json",
"graphrag_graphs.pkl",
"hybrid_hnsw_index.bin",
"hybrid_indexes.pkl",
"ireland_articles.json",
"page_titles.json",
"chunk_stats.json",
"graphrag_stats.json",
"extraction_stats.json",
"extraction_progress.json"
]


def ensure_dataset_files(dataset_dir: str = "dataset/wikipedia_ireland") -> bool:
    """
    Ensure all dataset files are available locally.
    Downloads from HF Datasets if missing.

    Args:
        dataset_dir: Local directory for dataset files

    Returns:
        True if all files are available, False otherwise
    """
    dataset_path = Path(dataset_dir)
    dataset_path.mkdir(parents=True, exist_ok=True)

    # Collect the files that are not yet present locally
    missing_files = []
    for filename in DATASET_FILES:
        file_path = dataset_path / filename
        if not file_path.exists():
            missing_files.append(filename)

    if not missing_files:
        print(f"[INFO] All dataset files present locally in {dataset_dir}")
        return True

    print(f"[INFO] Missing {len(missing_files)} files, downloading from HF Datasets...")

    # Download missing files
    try:
        for filename in missing_files:
            print(f"[INFO] Downloading {filename}...")
            if hasattr(st, 'status'):
                # Show per-file progress in the Streamlit UI when available
                with st.status(f"Downloading {filename}...", expanded=True) as status:
                    downloaded_path = hf_hub_download(
                        repo_id=DATASET_REPO,
                        filename=filename,
                        repo_type="dataset"
                    )
                    # hf_hub_download returns a path inside the local HF cache;
                    # copy the file into the target directory
                    target_path = dataset_path / filename
                    shutil.copy2(downloaded_path, target_path)
                    status.update(label=f"✓ Downloaded {filename}", state="complete")
            else:
                downloaded_path = hf_hub_download(
                    repo_id=DATASET_REPO,
                    filename=filename,
                    repo_type="dataset"
                )
                # Copy from the HF cache into the target directory
                target_path = dataset_path / filename
                shutil.copy2(downloaded_path, target_path)
                print(f"[SUCCESS] Downloaded {filename}")

        print("[SUCCESS] All dataset files downloaded successfully!")
        return True
    except Exception as e:
        print(f"[ERROR] Failed to download dataset files: {e}")
        if hasattr(st, 'error'):
            st.error(f"Failed to download dataset files: {e}")
        return False
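
# Design note: huggingface_hub also provides snapshot_download, which could
# fetch the entire dataset repo in one call, e.g. (a sketch, not what this
# module actually does):
#
#     from huggingface_hub import snapshot_download
#     local_dir = snapshot_download(repo_id=DATASET_REPO, repo_type="dataset")
#
# Per-file hf_hub_download is used above so that each file's progress can be
# surfaced individually in the Streamlit status widget.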


def get_dataset_path(filename: str, dataset_dir: str = "dataset/wikipedia_ireland") -> str:
    """
    Get full path to a dataset file, downloading if necessary.

    Args:
        filename: Name of the dataset file
        dataset_dir: Local directory for dataset files

    Returns:
        Full path to the dataset file
    """
    # Ensure dataset files are available (best effort: the path is returned
    # even if a download failed, so callers should check the file exists)
    ensure_dataset_files(dataset_dir)
    return str(Path(dataset_dir) / filename)
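

# A minimal usage sketch (hypothetical entry point, not part of the Space's
# app code): ensure the dataset is present, then resolve one file's path.
# "chunks.json" is taken from DATASET_FILES above.
if __name__ == "__main__":
    if ensure_dataset_files():
        chunks_path = get_dataset_path("chunks.json")
        print(f"[INFO] chunks.json available at: {chunks_path}")
    else:
        print("[ERROR] Dataset files could not be downloaded")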