Spaces:

Rahiq
/

garbage-segregate

Sleeping

App Files Files Community

garbage-segregate / ml /dataset_prep.py

Rahiq

Deploy waste classification backend with ML model

bf17f74 26 days ago

raw

history blame contribute delete

4.75 kB

	"""
	Dataset preparation and organization script
	Helps structure your data for training
	"""

	import os
	import shutil
	from pathlib import Path
	from sklearn.model_selection import train_test_split
	import random

	# Waste classes handled by this script. Each name doubles as a directory
	# name: it is looked up under the raw data directory and created under
	# every train/val/test split directory.
	CATEGORIES = [
	    'recyclable',
	    'organic',
	    'wet-waste',
	    'dry-waste',
	    'ewaste',
	    'hazardous',
	    'landfill',
	]

	# Lower-cased file suffixes that are treated as images.
	_IMAGE_SUFFIXES = frozenset({'.jpg', '.jpeg', '.png'})


	def _find_images(directory, recursive=False):
	    """Return the sorted image files in *directory* (optionally its subtree)."""
	    candidates = directory.rglob('*') if recursive else directory.iterdir()
	    return sorted(
	        path for path in candidates
	        if path.is_file() and path.suffix.lower() in _IMAGE_SUFFIXES
	    )


	def organize_dataset(raw_data_dir='ml/data/raw',
	                     processed_dir='ml/data/processed',
	                     test_split=0.15,
	                     val_split=0.15):
	    """
	    Organize raw images into train/val/test splits.

	    Every category in CATEGORIES is split independently and its images are
	    copied (not moved) into the processed directory. Files with a .jpg,
	    .jpeg or .png suffix (any letter case) are treated as images.

	    Expected raw structure:
	        ml/data/raw/
	            recyclable/
	                img1.jpg
	                img2.jpg
	            organic/
	                img1.jpg
	            ...

	    Output structure:
	        ml/data/processed/
	            train/
	                recyclable/
	                organic/
	                ...
	            val/
	                ...
	            test/
	                ...

	    Args:
	        raw_data_dir: Directory holding one sub-directory per category.
	        processed_dir: Directory that receives the train/val/test trees.
	        test_split: Fraction of each category placed in the test split.
	        val_split: Fraction of each category placed in the validation split.

	    Raises:
	        ValueError: If the fractions are not strictly between 0 and 1 or
	            leave no room for a training split.
	    """
	    if not (0 < test_split < 1 and 0 < val_split < 1
	            and test_split + val_split < 1):
	        raise ValueError(
	            "test_split and val_split must each be in (0, 1) and sum to "
	            f"less than 1 (got test_split={test_split}, val_split={val_split})"
	        )

	    raw_path = Path(raw_data_dir)
	    processed_path = Path(processed_dir)
	    splits = ('train', 'val', 'test')

	    # Create output directories
	    for split in splits:
	        for category in CATEGORIES:
	            (processed_path / split / category).mkdir(parents=True, exist_ok=True)

	    print("Organizing dataset...")

	    total_images = 0

	    for category in CATEGORIES:
	        category_path = raw_path / category

	        if not category_path.is_dir():
	            print(f"Warning: {category} directory not found, skipping...")
	            continue

	        # Suffix matching instead of one glob per extension: a bare '.jpg'
	        # pattern has no wildcard and matches nothing useful, and globbing
	        # both '.jpg' and '.JPG' double-counts files on case-insensitive
	        # filesystems.
	        images = _find_images(category_path)

	        if not images:
	            print(f"Warning: No images found for {category}")
	            continue

	        # The list is sorted and train_test_split shuffles with a fixed
	        # seed, so re-running yields the same split and an image cannot
	        # end up in two different splits across runs.
	        try:
	            train_val, test = train_test_split(
	                images, test_size=test_split, random_state=42)
	            # val_split is a fraction of the whole category, so rescale it
	            # to the part that is left after removing the test split.
	            train, val = train_test_split(
	                train_val, test_size=val_split / (1 - test_split),
	                random_state=42)
	        except ValueError:
	            # Fractions were validated above, so this means the category
	            # has too few images to fill every split.
	            print(f"Warning: too few images to split {category}, "
	                  "using all of them for training")
	            train, val, test = images, [], []

	        # Copy files
	        for split, members in zip(splits, (train, val, test)):
	            destination = processed_path / split / category
	            for img in members:
	                shutil.copy(img, destination / img.name)

	        total_images += len(images)
	        print(f"{category}: {len(train)} train, {len(val)} val, {len(test)} test")

	    print("\nDataset organized successfully!")
	    print(f"Total images: {total_images}")
	    # Counts are read back from disk, so they include images left in the
	    # processed directory by earlier runs.
	    for split, label in zip(splits, ('Train', 'Val', 'Test')):
	        found = _find_images(processed_path / split, recursive=True)
	        print(f"{label}: {len(found)}")

	def download_sample_datasets():
	    """
	    Print a list of public waste classification datasets and save it to disk.

	    Nothing is downloaded: the text below (dataset names, URLs, categories,
	    approximate sizes and setup steps) is printed to stdout and written to
	    ml/DATASET_SOURCES.txt, relative to the current working directory.
	    """

	    datasets = """
	PUBLIC WASTE CLASSIFICATION DATASETS:

	1. Kaggle - Waste Classification Data
	URL: https://www.kaggle.com/datasets/techsash/waste-classification-data
	Categories: Organic, Recyclable
	Size: ~25k images

	2. TrashNet Dataset
	URL: https://github.com/garythung/trashnet
	Categories: Glass, Paper, Cardboard, Plastic, Metal, Trash
	Size: ~2.5k images

	3. Waste Pictures Dataset (Kaggle)
	URL: https://www.kaggle.com/datasets/wangziang/waste-pictures
	Categories: Multiple waste types
	Size: ~20k images

	4. TACO Dataset (Trash Annotations in Context)
	URL: http://tacodataset.org/
	Categories: 60 categories of litter
	Size: ~1.5k images with annotations

	SETUP INSTRUCTIONS:

	1. Download one or more datasets from above
	2. Extract to ml/data/raw/
	3. Organize by category (recyclable, organic, etc.)
	4. Run: python ml/dataset_prep.py

	For Indian waste types, you can:
	- Capture your own images using the webcam interface
	- Map categories from public datasets to Indian categories
	- Combine multiple datasets for better coverage
	"""

	    print(datasets)

	    # Save to file
	    sources_file = Path('ml/DATASET_SOURCES.txt')
	    # Create ml/ first so the write does not fail with FileNotFoundError
	    # when the script is run from a directory that has no ml/ folder yet.
	    sources_file.parent.mkdir(parents=True, exist_ok=True)
	    # Explicit encoding keeps the output independent of the platform default.
	    with open(sources_file, 'w', encoding='utf-8') as f:
	        f.write(datasets)

	    print("\nDataset sources saved to ml/DATASET_SOURCES.txt")

	if __name__ == "__main__":
	    import sys

	    # Command-line dispatch: "info" as the first argument prints and saves
	    # the dataset source list; anything else (or no argument) runs the
	    # train/val/test organization with its default paths and fractions.
	    wants_info = sys.argv[1:2] == ['info']
	    entry_point = download_sample_datasets if wants_info else organize_dataset
	    entry_point()