Spaces:
Runtime error
Runtime error
| """ | |
| Dataset Builder Module | |
| Handles train/test splitting and dataset creation. | |
| """ | |
| import random | |
| from typing import List, Dict, Any, Tuple, Optional | |
class DatasetBuilder:
    """Build and split datasets for training.

    All shuffling is driven by a private ``random.Random`` instance seeded in
    ``__init__``. For a given seed and sequence of method calls the splits are
    therefore reproducible, even if other code draws numbers from the global
    ``random`` module in between.
    """

    # Added before truncating ``n * ratio`` so that products which land just
    # below an integer because of float error (e.g. 100 * 0.57) are not
    # rounded down by one.
    _ROUNDING_EPSILON = 1e-9

    def __init__(self, seed: int = 42):
        """
        Initialize dataset builder.

        Args:
            seed: Random seed for reproducibility
        """
        self.seed = seed
        # Earlier versions seeded the global RNG here; preserved so that code
        # relying on that side effect keeps working.
        random.seed(seed)
        # The splits themselves use this private generator so they do not
        # depend on who else touches the global RNG.
        self._rng = random.Random(seed)

    @staticmethod
    def _check_ratio(name: str, value: float) -> None:
        """Raise ValueError unless ``value`` lies in the closed range [0, 1]."""
        # Written as a chained comparison so NaN is rejected as well.
        if not 0.0 <= value <= 1.0:
            raise ValueError(
                f"{name} must be between 0.0 and 1.0, got {value}"
            )

    @classmethod
    def _split_index(cls, n: int, ratio: float) -> int:
        """Return ``floor(n * ratio)``, tolerant of float representation error."""
        return int(n * ratio + cls._ROUNDING_EPSILON)

    def train_test_split(
        self,
        data: List[Dict[str, Any]],
        train_ratio: float = 0.8,
        val_ratio: float = 0.1,
        test_ratio: float = 0.1,
        shuffle: bool = True
    ) -> Tuple[List[Dict], List[Dict], List[Dict]]:
        """
        Split data into train/validation/test sets.

        The input list is never modified; a shallow copy is shuffled and
        sliced. Train and validation sizes are ``floor(n * ratio)``; the test
        set receives every remaining example, so rounding leftovers end up
        there.

        Args:
            data: List of data examples
            train_ratio: Fraction for training
            val_ratio: Fraction for validation
            test_ratio: Fraction for testing
            shuffle: Whether to shuffle data

        Returns:
            Tuple of (train_data, val_data, test_data)

        Raises:
            ValueError: If the ratios do not sum to 1.0 (within 0.01) or any
                single ratio is outside [0, 1].
        """
        # Validate ratios
        total = train_ratio + val_ratio + test_ratio
        if abs(total - 1.0) > 0.01:
            raise ValueError(f"Ratios must sum to 1.0, got {total}")
        # A correct sum alone does not rule out e.g. 1.5 / -0.5 / 0.0.
        self._check_ratio("train_ratio", train_ratio)
        self._check_ratio("val_ratio", val_ratio)
        self._check_ratio("test_ratio", test_ratio)

        # Shuffle if requested (on a copy, so the caller's list is untouched)
        data_copy = list(data)
        if shuffle:
            self._rng.shuffle(data_copy)

        # Calculate split indices
        n = len(data_copy)
        train_end = self._split_index(n, train_ratio)
        val_end = train_end + self._split_index(n, val_ratio)

        # Split
        train_data = data_copy[:train_end]
        val_data = data_copy[train_end:val_end]
        test_data = data_copy[val_end:]
        return train_data, val_data, test_data

    def create_balanced_split(
        self,
        data: List[Dict[str, Any]],
        category_key: str,
        train_ratio: float = 0.8
    ) -> Tuple[List[Dict], List[Dict]]:
        """
        Create balanced train/test split by category.

        Examples are grouped by ``example[category_key]`` (examples without
        the key are grouped under ``"unknown"``), and each group is shuffled
        and split independently. The train share of a group is
        ``floor(len(group) * train_ratio)``, so very small groups may end up
        entirely in the test set.

        Args:
            data: List of data examples
            category_key: Key for category field
            train_ratio: Fraction for training

        Returns:
            Tuple of (train_data, test_data)

        Raises:
            ValueError: If ``train_ratio`` is outside [0, 1].
        """
        self._check_ratio("train_ratio", train_ratio)

        # Group by category (insertion order keeps the result deterministic)
        categories: Dict[Any, List[Dict[str, Any]]] = {}
        for example in data:
            cat = example.get(category_key, "unknown")
            categories.setdefault(cat, []).append(example)

        # Split each category
        train_data: List[Dict] = []
        test_data: List[Dict] = []
        for examples in categories.values():
            self._rng.shuffle(examples)
            split_idx = self._split_index(len(examples), train_ratio)
            train_data.extend(examples[:split_idx])
            test_data.extend(examples[split_idx:])

        # Shuffle final datasets so categories are interleaved
        self._rng.shuffle(train_data)
        self._rng.shuffle(test_data)
        return train_data, test_data

    def save_split(
        self,
        train_data: List[Dict],
        val_data: List[Dict],
        test_data: List[Dict],
        output_dir: str
    ) -> None:
        """
        Save split datasets to files.

        Writes ``train.json``, ``val.json`` and ``test.json`` into
        ``output_dir`` (created, including parents, if it does not exist) and
        prints a short summary.

        Args:
            train_data: Training data
            val_data: Validation data
            test_data: Test data
            output_dir: Output directory
        """
        import json
        from pathlib import Path

        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Save each split; the encoding is explicit so the files do not
        # depend on the platform's default locale.
        splits = (
            ("train.json", train_data),
            ("val.json", val_data),
            ("test.json", test_data),
        )
        for filename, examples in splits:
            with open(output_path / filename, 'w', encoding="utf-8") as f:
                json.dump(examples, f, indent=2)

        print(f"✅ Datasets saved to {output_dir}")
        print(f" Train: {len(train_data)} examples")
        print(f" Val: {len(val_data)} examples")
        print(f" Test: {len(test_data)} examples")