Use older compatible versions: sentence-transformers 2.2.2, transformers 4.30.2, torch 2.0.1, huggingface-hub 0.16.4 - fix dataset loader for older API
1586c67
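For reference, the version pins named in the commit message map onto a pip requirements file along these lines. This is a sketch: only the four package versions come from the commit message; the file name and pin syntax are standard pip conventions, and any remaining app dependencies (e.g. streamlit) are omitted.

# requirements.txt (sketch) - older, mutually compatible releases per the commit message
sentence-transformers==2.2.2
transformers==4.30.2
torch==2.0.1
huggingface-hub==0.16.4
# (streamlit and any other app dependencies would be pinned here as well)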
| """ | |
| Dataset Loader for Hugging Face Datasets | |
| Downloads dataset files from HF Datasets repository if not present locally | |
| """ | |
| import os | |
| from pathlib import Path | |
| from huggingface_hub import hf_hub_download | |
| import streamlit as st | |
| # Dataset configuration | |
| DATASET_REPO = "hirthickraj2015/graphwiz-ireland-dataset" | |
| DATASET_FILES = [ | |
| "chunks.json", | |
| "graphrag_index.json", | |
| "graphrag_graphs.pkl", | |
| "hybrid_hnsw_index.bin", | |
| "hybrid_indexes.pkl", | |
| "ireland_articles.json", | |
| "page_titles.json", | |
| "chunk_stats.json", | |
| "graphrag_stats.json", | |
| "extraction_stats.json", | |
| "extraction_progress.json" | |
| ] | |
| def ensure_dataset_files(dataset_dir: str = "dataset/wikipedia_ireland") -> bool: | |
| """ | |
| Ensure all dataset files are available locally. | |
| Downloads from HF Datasets if missing. | |
| Args: | |
| dataset_dir: Local directory for dataset files | |
| Returns: | |
| True if all files are available, False otherwise | |
| """ | |
| dataset_path = Path(dataset_dir) | |
| dataset_path.mkdir(parents=True, exist_ok=True) | |
| missing_files = [] | |
| for filename in DATASET_FILES: | |
| file_path = dataset_path / filename | |
| if not file_path.exists(): | |
| missing_files.append(filename) | |
| if not missing_files: | |
| print(f"[INFO] All dataset files present locally in {dataset_dir}") | |
| return True | |
| print(f"[INFO] Missing {len(missing_files)} files, downloading from HF Datasets...") | |
| # Download missing files | |
| import shutil | |
| try: | |
| for filename in missing_files: | |
| print(f"[INFO] Downloading {filename}...") | |
| if hasattr(st, 'status'): | |
| with st.status(f"Downloading {filename}...", expanded=True) as status: | |
| downloaded_path = hf_hub_download( | |
| repo_id=DATASET_REPO, | |
| filename=filename, | |
| repo_type="dataset" | |
| ) | |
| # Move to target directory | |
| target_path = dataset_path / filename | |
| shutil.copy2(downloaded_path, target_path) | |
| status.update(label=f"β Downloaded {filename}", state="complete") | |
| else: | |
| downloaded_path = hf_hub_download( | |
| repo_id=DATASET_REPO, | |
| filename=filename, | |
| repo_type="dataset" | |
| ) | |
| # Move to target directory | |
| target_path = dataset_path / filename | |
| shutil.copy2(downloaded_path, target_path) | |
| print(f"[SUCCESS] Downloaded {filename}") | |
| print("[SUCCESS] All dataset files downloaded successfully!") | |
| return True | |
| except Exception as e: | |
| print(f"[ERROR] Failed to download dataset files: {e}") | |
| if hasattr(st, 'error'): | |
| st.error(f"Failed to download dataset files: {e}") | |
| return False | |
| def get_dataset_path(filename: str, dataset_dir: str = "dataset/wikipedia_ireland") -> str: | |
| """ | |
| Get full path to a dataset file, downloading if necessary. | |
| Args: | |
| filename: Name of the dataset file | |
| dataset_dir: Local directory for dataset files | |
| Returns: | |
| Full path to the dataset file | |
| """ | |
| # Ensure dataset files are available | |
| ensure_dataset_files(dataset_dir) | |
| return str(Path(dataset_dir) / filename) | |