File size: 1,092 Bytes
3a6b206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
"""Data loading module for HuggingFace datasets."""

from datasets import load_dataset
from functools import cache
from typing import Any
import logging

# Configure the root logger at INFO level as an import-time side effect, and
# create this module's own logger, named after the module via __name__.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@cache
def load_sample_dataset() -> Any:
    """Fetch the essential-web sample dataset from the HuggingFace Hub.

    The function is wrapped in ``functools.cache`` and takes no arguments,
    so a successful result is stored and handed back on every later call.

    Returns:
        The object produced by ``load_dataset``; the rest of this module
        indexes it with the ``'train'`` key.

    Raises:
        Exception: Any error raised while loading (or while reporting the
            size of the ``'train'`` split) is logged and then re-raised
            unchanged.
    """
    repo_id = "sumuks/essential-web-v1.0-sample-1M-with-cleaned-text"
    try:
        logger.info("Loading dataset from HuggingFace...")
        dataset = load_dataset(repo_id)
        # The size report stays inside the ``try`` so a failure here is
        # logged through the same error path as a failed download.
        logger.info(f"Dataset loaded successfully with {len(dataset['train'])} samples")
    except Exception as e:
        logger.error(f"Failed to load dataset: {e}")
        raise
    return dataset


def get_dataset_size() -> int:
    """Return the number of samples in the dataset's ``'train'`` split.

    The dataset is obtained through ``load_sample_dataset``.
    """
    train_split = load_sample_dataset()['train']
    return len(train_split)


def get_sample(index: int) -> tuple[str, str]:
    """Return the original and cleaned text of one ``'train'`` sample.

    Args:
        index: Position of the sample within the ``'train'`` split.

    Returns:
        A ``(text, cleaned_text)`` pair read from the sample's ``'text'``
        and ``'cleaned_text'`` fields.
    """
    train_split = load_sample_dataset()['train']
    record = train_split[index]
    original = record['text']
    cleaned = record['cleaned_text']
    return original, cleaned