| """ | |
| Dataset preparation and organization script | |
| Helps structure your data for training | |
| """ | |
| import os | |
| import shutil | |
| from pathlib import Path | |
| from sklearn.model_selection import train_test_split | |
| import random | |
# Waste categories expected as sub-directories of ml/data/raw/.
CATEGORIES = [
    'recyclable',
    'organic',
    'wet-waste',
    'dry-waste',
    'ewaste',
    'hazardous',
    'landfill',
]

def organize_dataset(raw_data_dir='ml/data/raw',
                     processed_dir='ml/data/processed',
                     test_split=0.15,
                     val_split=0.15):
    """
    Organize raw images into train/val/test splits.

    Expected raw structure:
        ml/data/raw/
            recyclable/
                img1.jpg
                img2.jpg
            organic/
                img1.jpg
            ...

    Output structure:
        ml/data/processed/
            train/
                recyclable/
                organic/
                ...
            val/
                ...
            test/
                ...
    """
    raw_path = Path(raw_data_dir)
    processed_path = Path(processed_dir)

    # Create output directories
    for split in ['train', 'val', 'test']:
        for category in CATEGORIES:
            (processed_path / split / category).mkdir(parents=True, exist_ok=True)

| print("Organizing dataset...") | |
| total_images = 0 | |
| for category in CATEGORIES: | |
| category_path = raw_path / category | |
| if not category_path.exists(): | |
| print(f"Warning: {category} directory not found, skipping...") | |
| continue | |
| # Get all images | |
| images = [] | |
| for ext in ['*.jpg', '*.jpeg', '*.png', '*.JPG', '*.JPEG', '*.PNG']: | |
| images.extend(list(category_path.glob(ext))) | |
| if len(images) == 0: | |
| print(f"Warning: No images found for {category}") | |
| continue | |
| # Shuffle | |
| random.shuffle(images) | |
| # Split | |
| train_val, test = train_test_split(images, test_size=test_split, random_state=42) | |
| train, val = train_test_split(train_val, test_size=val_split/(1-test_split), random_state=42) | |
| # Copy files | |
| for img in train: | |
| shutil.copy(img, processed_path / 'train' / category / img.name) | |
| for img in val: | |
| shutil.copy(img, processed_path / 'val' / category / img.name) | |
| for img in test: | |
| shutil.copy(img, processed_path / 'test' / category / img.name) | |
| total_images += len(images) | |
| print(f"{category}: {len(train)} train, {len(val)} val, {len(test)} test") | |
| print(f"\nDataset organized successfully!") | |
| print(f"Total images: {total_images}") | |
| print(f"Train: {len(list((processed_path / 'train').rglob('*.jpg'))) + len(list((processed_path / 'train').rglob('*.png')))}") | |
| print(f"Val: {len(list((processed_path / 'val').rglob('*.jpg'))) + len(list((processed_path / 'val').rglob('*.png')))}") | |
| print(f"Test: {len(list((processed_path / 'test').rglob('*.jpg'))) + len(list((processed_path / 'test').rglob('*.png')))}") | |


def download_sample_datasets():
    """
    Print pointers to public waste classification datasets and save them
    to ml/DATASET_SOURCES.txt.
    """
    datasets = """
PUBLIC WASTE CLASSIFICATION DATASETS:

1. Kaggle - Waste Classification Data
   URL: https://www.kaggle.com/datasets/techsash/waste-classification-data
   Categories: Organic, Recyclable
   Size: ~25k images

2. TrashNet Dataset
   URL: https://github.com/garythung/trashnet
   Categories: Glass, Paper, Cardboard, Plastic, Metal, Trash
   Size: ~2.5k images

3. Waste Pictures Dataset (Kaggle)
   URL: https://www.kaggle.com/datasets/wangziang/waste-pictures
   Categories: Multiple waste types
   Size: ~20k images

4. TACO Dataset (Trash Annotations in Context)
   URL: http://tacodataset.org/
   Categories: 60 categories of litter
   Size: ~1.5k images with annotations

SETUP INSTRUCTIONS:
1. Download one or more datasets from above
2. Extract to ml/data/raw/
3. Organize by category (recyclable, organic, etc.)
4. Run: python ml/dataset_prep.py

For Indian waste types, you can:
- Capture your own images using the webcam interface
- Map categories from public datasets to Indian categories
  (see the mapping sketch after this function)
- Combine multiple datasets for better coverage
"""
    print(datasets)

    # Save to file
    with open('ml/DATASET_SOURCES.txt', 'w') as f:
        f.write(datasets)

    print("\nDataset sources saved to ml/DATASET_SOURCES.txt")


if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == 'info':
        download_sample_datasets()
    else:
        organize_dataset()