"""
Dataset preparation and organization script
Helps structure your data for training
"""

import os
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split
import random

CATEGORIES = [
    'recyclable',
    'organic',
    'wet-waste',
    'dry-waste',
    'ewaste',
    'hazardous',
    'landfill'
]

def organize_dataset(raw_data_dir='ml/data/raw', 
                     processed_dir='ml/data/processed',
                     test_split=0.15,
                     val_split=0.15):
    """
    Organize raw images into train/val/test splits
    
    Expected raw structure:
    ml/data/raw/
        recyclable/
            img1.jpg
            img2.jpg
        organic/
            img1.jpg
        ...
    
    Output structure:
    ml/data/processed/
        train/
            recyclable/
            organic/
            ...
        val/
            ...
        test/
            ...
    """
    
    raw_path = Path(raw_data_dir)
    processed_path = Path(processed_dir)
    
    # Create output directories
    for split in ['train', 'val', 'test']:
        for category in CATEGORIES:
            (processed_path / split / category).mkdir(parents=True, exist_ok=True)
    
    print("Organizing dataset...")
    
    total_images = 0
    
    for category in CATEGORIES:
        category_path = raw_path / category
        
        if not category_path.exists():
            print(f"Warning: {category} directory not found, skipping...")
            continue
        
        # Get all images
        images = []
        for ext in ['*.jpg', '*.jpeg', '*.png', '*.JPG', '*.JPEG', '*.PNG']:
            images.extend(list(category_path.glob(ext)))
        
        if len(images) == 0:
            print(f"Warning: No images found for {category}")
            continue
        
        # Shuffle
        random.shuffle(images)
        
        # Split
        train_val, test = train_test_split(images, test_size=test_split, random_state=42)
        train, val = train_test_split(train_val, test_size=val_split/(1-test_split), random_state=42)
        
        # Copy files
        for img in train:
            shutil.copy(img, processed_path / 'train' / category / img.name)
        
        for img in val:
            shutil.copy(img, processed_path / 'val' / category / img.name)
        
        for img in test:
            shutil.copy(img, processed_path / 'test' / category / img.name)
        
        total_images += len(images)
        print(f"{category}: {len(train)} train, {len(val)} val, {len(test)} test")
    
    print(f"\nDataset organized successfully!")
    print(f"Total images: {total_images}")
    print(f"Train: {len(list((processed_path / 'train').rglob('*.jpg'))) + len(list((processed_path / 'train').rglob('*.png')))}")
    print(f"Val: {len(list((processed_path / 'val').rglob('*.jpg'))) + len(list((processed_path / 'val').rglob('*.png')))}")
    print(f"Test: {len(list((processed_path / 'test').rglob('*.jpg'))) + len(list((processed_path / 'test').rglob('*.png')))}")

def download_sample_datasets():
    """
    Instructions for downloading public waste classification datasets
    """
    
    datasets = """
    PUBLIC WASTE CLASSIFICATION DATASETS:
    
    1. Kaggle - Waste Classification Data
       URL: https://www.kaggle.com/datasets/techsash/waste-classification-data
       Categories: Organic, Recyclable
       Size: ~25k images
    
    2. TrashNet Dataset
       URL: https://github.com/garythung/trashnet
       Categories: Glass, Paper, Cardboard, Plastic, Metal, Trash
       Size: ~2.5k images
    
    3. Waste Pictures Dataset (Kaggle)
       URL: https://www.kaggle.com/datasets/wangziang/waste-pictures
       Categories: Multiple waste types
       Size: ~20k images
    
    4. TACO Dataset (Trash Annotations in Context)
       URL: http://tacodataset.org/
       Categories: 60 categories of litter
       Size: ~1.5k images with annotations
    
    SETUP INSTRUCTIONS:
    
    1. Download one or more datasets from above
    2. Extract to ml/data/raw/
    3. Organize by category (recyclable, organic, etc.)
    4. Run: python ml/dataset_prep.py
    
    For Indian waste types, you can:
    - Capture your own images using the webcam interface
    - Map categories from public datasets to Indian categories
    - Combine multiple datasets for better coverage
    """
    
    print(datasets)
    
    # Save to file
    with open('ml/DATASET_SOURCES.txt', 'w') as f:
        f.write(datasets)
    
    print("\nDataset sources saved to ml/DATASET_SOURCES.txt")

if __name__ == "__main__":
    import sys
    
    if len(sys.argv) > 1 and sys.argv[1] == 'info':
        download_sample_datasets()
    else:
        organize_dataset()