""" Dataset preparation and organization script Helps structure your data for training """ import os import shutil from pathlib import Path from sklearn.model_selection import train_test_split import random CATEGORIES = [ 'recyclable', 'organic', 'wet-waste', 'dry-waste', 'ewaste', 'hazardous', 'landfill' ] def organize_dataset(raw_data_dir='ml/data/raw', processed_dir='ml/data/processed', test_split=0.15, val_split=0.15): """ Organize raw images into train/val/test splits Expected raw structure: ml/data/raw/ recyclable/ img1.jpg img2.jpg organic/ img1.jpg ... Output structure: ml/data/processed/ train/ recyclable/ organic/ ... val/ ... test/ ... """ raw_path = Path(raw_data_dir) processed_path = Path(processed_dir) # Create output directories for split in ['train', 'val', 'test']: for category in CATEGORIES: (processed_path / split / category).mkdir(parents=True, exist_ok=True) print("Organizing dataset...") total_images = 0 for category in CATEGORIES: category_path = raw_path / category if not category_path.exists(): print(f"Warning: {category} directory not found, skipping...") continue # Get all images images = [] for ext in ['*.jpg', '*.jpeg', '*.png', '*.JPG', '*.JPEG', '*.PNG']: images.extend(list(category_path.glob(ext))) if len(images) == 0: print(f"Warning: No images found for {category}") continue # Shuffle random.shuffle(images) # Split train_val, test = train_test_split(images, test_size=test_split, random_state=42) train, val = train_test_split(train_val, test_size=val_split/(1-test_split), random_state=42) # Copy files for img in train: shutil.copy(img, processed_path / 'train' / category / img.name) for img in val: shutil.copy(img, processed_path / 'val' / category / img.name) for img in test: shutil.copy(img, processed_path / 'test' / category / img.name) total_images += len(images) print(f"{category}: {len(train)} train, {len(val)} val, {len(test)} test") print(f"\nDataset organized successfully!") print(f"Total images: {total_images}") print(f"Train: {len(list((processed_path / 'train').rglob('*.jpg'))) + len(list((processed_path / 'train').rglob('*.png')))}") print(f"Val: {len(list((processed_path / 'val').rglob('*.jpg'))) + len(list((processed_path / 'val').rglob('*.png')))}") print(f"Test: {len(list((processed_path / 'test').rglob('*.jpg'))) + len(list((processed_path / 'test').rglob('*.png')))}") def download_sample_datasets(): """ Instructions for downloading public waste classification datasets """ datasets = """ PUBLIC WASTE CLASSIFICATION DATASETS: 1. Kaggle - Waste Classification Data URL: https://www.kaggle.com/datasets/techsash/waste-classification-data Categories: Organic, Recyclable Size: ~25k images 2. TrashNet Dataset URL: https://github.com/garythung/trashnet Categories: Glass, Paper, Cardboard, Plastic, Metal, Trash Size: ~2.5k images 3. Waste Pictures Dataset (Kaggle) URL: https://www.kaggle.com/datasets/wangziang/waste-pictures Categories: Multiple waste types Size: ~20k images 4. TACO Dataset (Trash Annotations in Context) URL: http://tacodataset.org/ Categories: 60 categories of litter Size: ~1.5k images with annotations SETUP INSTRUCTIONS: 1. Download one or more datasets from above 2. Extract to ml/data/raw/ 3. Organize by category (recyclable, organic, etc.) 4. 
if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == 'info':
        download_sample_datasets()
    else:
        organize_dataset()
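
# Usage (run from the repository root so the relative ml/ paths resolve):
#   python ml/dataset_prep.py        # organize ml/data/raw into train/val/test
#   python ml/dataset_prep.py info   # print and save the dataset sources list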