Spaces:
Sleeping
Sleeping
| """Data loading module for HuggingFace datasets.""" | |
| from datasets import load_dataset | |
| from functools import cache | |
| from typing import Any | |
| import logging | |
# Logging: create this module's named logger and configure root logging
# at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@cache
def load_sample_dataset() -> Any:
    """Load the essential-web sample dataset from HuggingFace, once per process.

    The result is memoized with ``functools.cache``: the first successful call
    performs the load, and every later call (including the ones made by the
    other helpers in this module) reuses the same object instead of loading
    again. A failed load raises and is therefore not cached, so the next call
    retries.

    Returns:
        The object returned by ``load_dataset`` for this dataset; this module
        indexes it by the ``'train'`` split.

    Raises:
        Exception: Any error raised during loading is logged and re-raised
            unchanged.
    """
    try:
        logger.info("Loading dataset from HuggingFace...")
        dataset = load_dataset("sumuks/essential-web-v1.0-sample-1M-with-cleaned-text")
        # Lazy %-args: same message text, formatted only if the record is emitted.
        logger.info(
            "Dataset loaded successfully with %s samples", len(dataset['train'])
        )
        return dataset
    except Exception as e:
        logger.error("Failed to load dataset: %s", e)
        raise
def get_dataset_size() -> int:
    """Return the length of the ``'train'`` split of the sample dataset."""
    train_split = load_sample_dataset()['train']
    return len(train_split)
def get_sample(index: int) -> tuple[str, str]:
    """Return the original and cleaned text of one entry in the train split.

    Args:
        index: Position of the entry within the ``'train'`` split.

    Returns:
        A ``(text, cleaned_text)`` pair read from the entry's ``'text'`` and
        ``'cleaned_text'`` fields.
    """
    record = load_sample_dataset()['train'][index]
    original_text = record['text']
    cleaned_text = record['cleaned_text']
    return original_text, cleaned_text