# LaunchLLM/data_aggregation/dataset_builder.py
"""
Dataset Builder Module
Handles train/test splitting and dataset creation.
"""
import json
import random
from pathlib import Path
from typing import Any, Dict, List, Tuple


class DatasetBuilder:
    """Build and split datasets for training."""

    def __init__(self, seed: int = 42):
        """
        Initialize the dataset builder.

        Args:
            seed: Random seed for reproducibility. A private
                random.Random instance is seeded so this class does
                not disturb the module-level random state.
        """
        self.seed = seed
        self._rng = random.Random(seed)

    def train_test_split(
        self,
        data: List[Dict[str, Any]],
        train_ratio: float = 0.8,
        val_ratio: float = 0.1,
        test_ratio: float = 0.1,
        shuffle: bool = True,
    ) -> Tuple[List[Dict], List[Dict], List[Dict]]:
"""
Split data into train/validation/test sets.
Args:
data: List of data examples
train_ratio: Fraction for training
val_ratio: Fraction for validation
test_ratio: Fraction for testing
shuffle: Whether to shuffle data
Returns:
Tuple of (train_data, val_data, test_data)
"""
        # Validate that the ratios (approximately) sum to 1.0
        total = train_ratio + val_ratio + test_ratio
        if abs(total - 1.0) > 0.01:
            raise ValueError(f"Ratios must sum to 1.0, got {total}")

        # Shuffle a copy so the caller's list is left untouched
        data_copy = data.copy()
        if shuffle:
            self._rng.shuffle(data_copy)
        # Compute split boundaries; the test slice absorbs any
        # rounding remainder so no example is dropped
        n = len(data_copy)
        train_end = int(n * train_ratio)
        val_end = train_end + int(n * val_ratio)

        # Slice into the three splits
        train_data = data_copy[:train_end]
        val_data = data_copy[train_end:val_end]
        test_data = data_copy[val_end:]
        return train_data, val_data, test_data
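
    # Usage sketch (added for illustration, not part of the original
    # module; `records` stands in for any list of example dicts):
    #
    #     builder = DatasetBuilder(seed=42)
    #     train, val, test = builder.train_test_split(records)
    #
    # With the default 0.8/0.1/0.1 ratios and 100 records, this
    # yields roughly an 80/10/10 split.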

    def create_balanced_split(
        self,
        data: List[Dict[str, Any]],
        category_key: str,
        train_ratio: float = 0.8,
    ) -> Tuple[List[Dict], List[Dict]]:
"""
Create balanced train/test split by category.
Args:
data: List of data examples
category_key: Key for category field
train_ratio: Fraction for training
Returns:
Tuple of (train_data, test_data)
"""
        # Group examples by category
        categories: Dict[str, List[Dict[str, Any]]] = {}
        for example in data:
            cat = example.get(category_key, "unknown")
            categories.setdefault(cat, []).append(example)
        # Split each category independently so every category is
        # represented in both sets at (roughly) the same ratio
        train_data = []
        test_data = []
        for cat_examples in categories.values():
            self._rng.shuffle(cat_examples)
            split_idx = int(len(cat_examples) * train_ratio)
            train_data.extend(cat_examples[:split_idx])
            test_data.extend(cat_examples[split_idx:])

        # Shuffle the final datasets so categories are interleaved
        self._rng.shuffle(train_data)
        self._rng.shuffle(test_data)
        return train_data, test_data
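
    # Usage sketch (added for illustration; assumes each record
    # carries a "category" field, which is not guaranteed here):
    #
    #     train, test = builder.create_balanced_split(records, "category")
    #
    # Each category is split 80/20 independently, so rare categories
    # stay represented in both splits.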

    def save_split(
        self,
        train_data: List[Dict],
        val_data: List[Dict],
        test_data: List[Dict],
        output_dir: str,
    ) -> None:
"""
Save split datasets to files.
Args:
train_data: Training data
val_data: Validation data
test_data: Test data
output_dir: Output directory
"""
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        # Write each split as pretty-printed JSON
        splits = {"train": train_data, "val": val_data, "test": test_data}
        for name, split in splits.items():
            with open(output_path / f"{name}.json", "w", encoding="utf-8") as f:
                json.dump(split, f, indent=2)
print(f"✅ Datasets saved to {output_dir}")
print(f" Train: {len(train_data)} examples")
print(f" Val: {len(val_data)} examples")
print(f" Test: {len(test_data)} examples")