"""
Dataset preparation and organization script
Helps structure your data for training
"""

import shutil
from pathlib import Path

from sklearn.model_selection import train_test_split

CATEGORIES = [
    'recyclable',
    'organic',
    'wet-waste',
    'dry-waste',
    'ewaste',
    'hazardous',
    'landfill'
]

def organize_dataset(raw_data_dir='ml/data/raw', 
                     processed_dir='ml/data/processed',
                     test_split=0.15,
                     val_split=0.15):
    """
    Organize raw images into train/val/test splits
    
    Expected raw structure:
    ml/data/raw/
        recyclable/
            img1.jpg
            img2.jpg
        organic/
            img1.jpg
        ...
    
    Output structure:
    ml/data/processed/
        train/
            recyclable/
            organic/
            ...
        val/
            ...
        test/
            ...
    """
    
    raw_path = Path(raw_data_dir)
    processed_path = Path(processed_dir)
    
    # Create output directories
    for split in ['train', 'val', 'test']:
        for category in CATEGORIES:
            (processed_path / split / category).mkdir(parents=True, exist_ok=True)
    
    print("Organizing dataset...")
    
    total_images = 0
    
    for category in CATEGORIES:
        category_path = raw_path / category
        
        if not category_path.exists():
            print(f"Warning: {category} directory not found, skipping...")
            continue
        
        # Gather images, matching extensions case-insensitively so files are
        # not double-counted on case-insensitive filesystems (macOS, Windows)
        exts = {'.jpg', '.jpeg', '.png'}
        images = sorted(p for p in category_path.iterdir()
                        if p.is_file() and p.suffix.lower() in exts)
        
        if len(images) == 0:
            print(f"Warning: No images found for {category}")
            continue
        
        # Split; sorting above plus the fixed random_state makes the splits
        # reproducible (train_test_split shuffles internally, so a separate
        # unseeded shuffle would break reproducibility)
        train_val, test = train_test_split(images, test_size=test_split, random_state=42)
        # Rescale val_split against the remaining train+val pool so it stays
        # a fraction of the whole dataset
        train, val = train_test_split(train_val,
                                      test_size=val_split / (1 - test_split),
                                      random_state=42)
        
        # Copy files
        for img in train:
            shutil.copy(img, processed_path / 'train' / category / img.name)
        
        for img in val:
            shutil.copy(img, processed_path / 'val' / category / img.name)
        
        for img in test:
            shutil.copy(img, processed_path / 'test' / category / img.name)
        
        total_images += len(images)
        print(f"{category}: {len(train)} train, {len(val)} val, {len(test)} test")
    
    print("\nDataset organized successfully!")
    print(f"Total images: {total_images}")
    # Recount copied files case-insensitively so .jpeg/.JPG images are included
    for split in ['train', 'val', 'test']:
        n = sum(1 for p in (processed_path / split).rglob('*')
                if p.suffix.lower() in {'.jpg', '.jpeg', '.png'})
        print(f"{split.capitalize()}: {n}")

def download_sample_datasets():
    """
    Instructions for downloading public waste classification datasets
    """
    
    datasets = """
    PUBLIC WASTE CLASSIFICATION DATASETS:
    
    1. Kaggle - Waste Classification Data
       URL: https://www.kaggle.com/datasets/techsash/waste-classification-data
       Categories: Organic, Recyclable
       Size: ~25k images
    
    2. TrashNet Dataset
       URL: https://github.com/garythung/trashnet
       Categories: Glass, Paper, Cardboard, Plastic, Metal, Trash
       Size: ~2.5k images
    
    3. Waste Pictures Dataset (Kaggle)
       URL: https://www.kaggle.com/datasets/wangziang/waste-pictures
       Categories: Multiple waste types
       Size: ~20k images
    
    4. TACO Dataset (Trash Annotations in Context)
       URL: http://tacodataset.org/
       Categories: 60 categories of litter
       Size: ~1.5k images with annotations
    
    SETUP INSTRUCTIONS:
    
    1. Download one or more datasets from above
    2. Extract to ml/data/raw/
    3. Organize by category (recyclable, organic, etc.)
    4. Run: python ml/dataset_prep.py
    
    For Indian waste types, you can:
    - Capture your own images using the webcam interface
    - Map categories from public datasets to Indian categories
    - Combine multiple datasets for better coverage
    """
    
    print(datasets)
    
    # Save to file, creating ml/ first in case it does not exist yet
    out_path = Path('ml/DATASET_SOURCES.txt')
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(datasets)
    
    print(f"\nDataset sources saved to {out_path}")
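
# ---------------------------------------------------------------------------
# Hypothetical helper (a sketch, not part of the original pipeline): one way
# to do step 3 of the setup instructions -- mapping a public dataset's labels
# onto CATEGORIES. The TrashNet-to-local mapping below is an assumption;
# review it against your own data before relying on it.
# ---------------------------------------------------------------------------
TRASHNET_TO_LOCAL = {
    'glass': 'recyclable',
    'paper': 'recyclable',
    'cardboard': 'recyclable',
    'plastic': 'recyclable',
    'metal': 'recyclable',
    'trash': 'landfill',
}

def remap_public_dataset(src_dir='ml/data/trashnet',
                         dest_dir='ml/data/raw',
                         mapping=TRASHNET_TO_LOCAL):
    """Copy images from a public dataset into the raw/ layout that
    organize_dataset() expects, translating categories via `mapping`."""
    for src_cat, local_cat in mapping.items():
        cat_dir = Path(src_dir) / src_cat
        if not cat_dir.exists():
            print(f"Warning: {src_cat} not found in {src_dir}, skipping...")
            continue
        dest_cat_dir = Path(dest_dir) / local_cat
        dest_cat_dir.mkdir(parents=True, exist_ok=True)
        for img in cat_dir.iterdir():
            if img.suffix.lower() in {'.jpg', '.jpeg', '.png'}:
                # Prefix with the source category so merged datasets don't
                # overwrite each other's files
                shutil.copy(img, dest_cat_dir / f"{src_cat}_{img.name}")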

if __name__ == "__main__":
    import sys
    
    if len(sys.argv) > 1 and sys.argv[1] == 'info':
        download_sample_datasets()
    else:
        organize_dataset()
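
# Example invocations (assumes the script is run from the repository root):
#   python ml/dataset_prep.py        # split ml/data/raw into train/val/test
#   python ml/dataset_prep.py info   # print and save public dataset sources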