Spaces:
Sleeping
Sleeping
File size: 1,092 Bytes
3a6b206 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
"""Data loading module for HuggingFace datasets."""
from datasets import load_dataset
from functools import cache
from typing import Any
import logging
# Set up logging. This runs at import time: basicConfig sets the root
# logger's level to INFO (per the logging docs it is a no-op when the root
# logger already has handlers configured).
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
@cache
def load_sample_dataset() -> Any:
    """Load the essential-web dataset sample.

    The result is memoized by ``functools.cache``: the dataset is loaded on
    the first successful call and the same object is returned afterwards.
    A failed call is not cached, because the exception propagates.

    Returns:
        Whatever ``load_dataset`` returns for the sample repository. This
        function reads its 'train' split to report the sample count.

    Raises:
        Exception: Any error raised while loading the dataset or reading
            its 'train' split is logged with its traceback and re-raised
            unchanged.
    """
    logger.info("Loading dataset from HuggingFace...")
    try:
        dataset = load_dataset("sumuks/essential-web-v1.0-sample-1M-with-cleaned-text")
        # %-style arguments defer string formatting until the record is
        # actually emitted by a handler.
        logger.info(
            "Dataset loaded successfully with %d samples", len(dataset['train'])
        )
        return dataset
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback,
        # which a plain logger.error call would drop.
        logger.exception("Failed to load dataset: %s", e)
        raise
def get_dataset_size() -> int:
    """Return the number of rows in the 'train' split of the loaded dataset."""
    return len(load_sample_dataset()['train'])
def get_sample(index: int) -> tuple[str, str]:
    """Get original and cleaned text for a specific sample.

    Args:
        index: Position of the row to read from the 'train' split. No
            bounds checking is done here; an invalid index is handled by
            the underlying dataset object.

    Returns:
        A ``(text, cleaned_text)`` pair taken from the row's 'text' and
        'cleaned_text' fields.
    """
    dataset = load_sample_dataset()
    sample = dataset['train'][index]
    return sample['text'], sample['cleaned_text']