from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
import os
import json
import gradio as gr
from datasets import load_dataset, Dataset, concatenate_datasets
import pandas as pd
from PIL import Image
from tqdm import tqdm
import logging
import yaml
from huggingface_hub import login, upload_file
# Set up logging first, so the logger is available to everything below
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Authenticate with Hugging Face
HF_TOKEN = os.getenv('HF_TOKEN')
if HF_TOKEN:
    login(token=HF_TOKEN)
else:
    logger.warning("No Hugging Face token found. Please add HF_TOKEN to your Space secrets.")

# Load settings
if not os.path.exists("settings.yaml"):
    raise FileNotFoundError("settings.yaml file is missing. Please add it with 'client_secrets_file'.")

with open('settings.yaml', 'r') as file:
    settings = yaml.safe_load(file)
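
# Illustrative settings.yaml (an assumption -- only 'client_secrets_file' is
# read below, pointing at your Google OAuth client secrets JSON):
#
#   client_secrets_file: client_secrets.json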

# Utility Functions
def safe_load_dataset(dataset_name):
    """Load Hugging Face dataset safely."""
    try:
        dataset = load_dataset(dataset_name)
        return dataset, len(dataset['train']) if 'train' in dataset else 0
    except Exception as e:
        logger.info(f"No existing dataset found. Starting fresh. Error: {str(e)}")
        return None, 0

def is_valid_image(file_path):
    """Check if a file is a valid image."""
    try:
        with Image.open(file_path) as img:
            img.verify()
            return True
    except Exception as e:
        logger.error(f"Invalid image: {file_path}. Error: {str(e)}")
        return False

def validate_input(folder_id, naming_convention):
    """Validate user input."""
    if not folder_id or not folder_id.strip():
        return False, "Folder ID cannot be empty"
    if not naming_convention or not naming_convention.strip():
        return False, "Naming convention cannot be empty"
    if not naming_convention.replace('_', '').isalnum():
        return False, "Naming convention should only contain letters, numbers, and underscores"
    return True, ""
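
# For example:
#   validate_input("1AbC2dE", "sports_card")  -> (True, "")
#   validate_input("", "sports_card")         -> (False, "Folder ID cannot be empty")
#   validate_input("1AbC2dE", "sports-card")  -> (False, "Naming convention should only contain ...")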

def initialize_dataset():
    """Initialize or verify the dataset structure."""
    try:
        # Create the README.md dataset card
        readme_content = """# Sports Cards Dataset

This dataset contains sports card images with structured metadata. Each image is named using a consistent convention and includes relevant information about the card.

## Dataset Structure

```
sports_card_{number}.jpg - Card images
```

## Features

- file_path: Path to the image file
- original_name: Original filename of the card
- new_name: Standardized filename
- image: Image data

## Usage

```python
from datasets import load_dataset
dataset = load_dataset("GotThatData/sports-cards")
```

## License

This dataset is licensed under MIT.

## Creator

Created by GotThatData
"""
        # Create dataset info content
        dataset_info = {
            "description": "A collection of sports card images with metadata",
            "citation": "",
            "homepage": "https://huggingface.co/datasets/GotThatData/sports-cards",
            "license": "mit",
            "features": {
                "file_path": {"dtype": "string", "_type": "Value"},
                "original_name": {"dtype": "string", "_type": "Value"},
                "new_name": {"dtype": "string", "_type": "Value"},
                "image": {"dtype": "string", "_type": "Value"}
            },
            "splits": ["train"]
        }

        # Write files locally
        with open("README.md", "w") as f:
            f.write(readme_content)
        with open("dataset-info.json", "w") as f:
            json.dump(dataset_info, f, indent=2)

        # Upload files to the dataset repository
        upload_file(
            path_or_fileobj="README.md",
            path_in_repo="README.md",
            repo_id="GotThatData/sports-cards",
            repo_type="dataset"
        )
        upload_file(
            path_or_fileobj="dataset-info.json",
            path_in_repo="dataset-info.json",
            repo_id="GotThatData/sports-cards",
            repo_type="dataset"
        )
        return True, "Dataset structure initialized successfully"
    except Exception as e:
        return False, f"Failed to initialize dataset: {str(e)}"

# DatasetManager Class
class DatasetManager:
    def __init__(self, local_images_dir="downloaded_cards"):
        self.local_images_dir = local_images_dir
        self.drive = None
        self.dataset_name = "GotThatData/sports-cards"
        os.makedirs(local_images_dir, exist_ok=True)
        # Initialize dataset structure
        success, message = initialize_dataset()
        if not success:
            logger.warning(f"Dataset initialization warning: {message}")

    def authenticate_drive(self):
        """Authenticate with Google Drive."""
        try:
            gauth = GoogleAuth()
            gauth.settings['client_config_file'] = settings['client_secrets_file']
            # Try to load saved credentials
            gauth.LoadCredentialsFile("credentials.txt")
            if gauth.credentials is None:
                gauth.LocalWebserverAuth()
            elif gauth.access_token_expired:
                gauth.Refresh()
            else:
                gauth.Authorize()
            gauth.SaveCredentialsFile("credentials.txt")
            self.drive = GoogleDrive(gauth)
            return True, "Successfully authenticated with Google Drive"
        except Exception as e:
            logger.error(f"Authentication failed: {str(e)}")
            return False, f"Authentication failed: {str(e)}"

    def download_and_rename_files(self, drive_folder_id, naming_convention):
        """Download files from Google Drive and rename them."""
        if not self.drive:
            return False, "Google Drive not authenticated", []
        try:
            query = f"'{drive_folder_id}' in parents and trashed=false"
            file_list = self.drive.ListFile({'q': query}).GetList()
            if not file_list:
                logger.warning(f"No files found in folder: {drive_folder_id}")
                return False, "No files found in the specified folder.", []

            existing_dataset, start_index = safe_load_dataset(self.dataset_name)
            renamed_files = []
            processed_count = 0
            error_count = 0

            for file in tqdm(file_list, desc="Downloading files", unit="file"):
                if 'mimeType' in file and 'image' in file['mimeType'].lower():
                    new_filename = f"{naming_convention}_{start_index + processed_count + 1}.jpg"
                    file_path = os.path.join(self.local_images_dir, new_filename)
                    try:
                        file.GetContentFile(file_path)
                        if is_valid_image(file_path):
                            renamed_files.append({
                                'file_path': file_path,
                                'original_name': file['title'],
                                'new_name': new_filename
                            })
                            processed_count += 1
                            logger.info(f"Successfully processed: {file['title']} -> {new_filename}")
                        else:
                            error_count += 1
                            if os.path.exists(file_path):
                                os.remove(file_path)
                    except Exception as e:
                        error_count += 1
                        logger.error(f"Error processing file {file['title']}: {str(e)}")
                        if os.path.exists(file_path):
                            os.remove(file_path)

            status_message = f"Processed {processed_count} images successfully"
            if error_count > 0:
                status_message += f" ({error_count} files failed)"
            return True, status_message, renamed_files
        except Exception as e:
            logger.error(f"Download error: {str(e)}")
            return False, f"Error during download: {str(e)}", []

    def update_huggingface_dataset(self, renamed_files):
        """Update Hugging Face dataset with new images."""
        if not renamed_files:
            return False, "No files to update"
        try:
            df = pd.DataFrame(renamed_files)
            new_dataset = Dataset.from_pandas(df)
            existing_dataset, _ = safe_load_dataset(self.dataset_name)
            if existing_dataset and 'train' in existing_dataset:
                combined_dataset = concatenate_datasets([existing_dataset['train'], new_dataset])
            else:
                combined_dataset = new_dataset
            combined_dataset.push_to_hub(self.dataset_name, split="train")
            return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images."
        except Exception as e:
            logger.error(f"Dataset update error: {str(e)}")
            return False, f"Error updating Hugging Face dataset: {str(e)}"

def process_pipeline(folder_id, naming_convention):
    """Main pipeline for processing images and updating the dataset."""
    # Validate input
    is_valid, error_message = validate_input(folder_id, naming_convention)
    if not is_valid:
        return error_message, []

    manager = DatasetManager()

    # Step 1: Authenticate with Google Drive
    auth_success, auth_message = manager.authenticate_drive()
    if not auth_success:
        return auth_message, []

    # Step 2: Download and rename files
    success, message, renamed_files = manager.download_and_rename_files(folder_id, naming_convention)
    if not success:
        return message, []

    # Step 3: Update the Hugging Face dataset
    success, hf_message = manager.update_huggingface_dataset(renamed_files)
    return f"{message}\n{hf_message}", renamed_files

def process_ui(folder_id, naming_convention):
    """UI handler for the process pipeline."""
    status, renamed_files = process_pipeline(folder_id, naming_convention)
    table_data = [[file['original_name'], file['new_name'], file['file_path']]
                  for file in renamed_files] if renamed_files else []
    return status, table_data

# Custom CSS for web-safe fonts and improved styling
custom_css = """
div.gradio-container,
div.gradio-container button,
div.gradio-container input {
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif !important;
}
.gr-form {
    background-color: #ffffff;
    border-radius: 8px;
    padding: 20px;
    box-shadow: 0 1px 3px rgba(0,0,0,0.1);
}
.gr-button {
    background-color: #2c5282;
    color: white;
}
.gr-button:hover {
    background-color: #2b6cb0;
}
.gr-input {
    border: 1px solid #e2e8f0;
}
.gr-input:focus {
    border-color: #4299e1;
    box-shadow: 0 0 0 1px #4299e1;
}
"""

# Gradio interface
demo = gr.Interface(
    fn=process_ui,
    inputs=[
        gr.Textbox(
            label="Google Drive Folder ID",
            placeholder="Enter the folder ID from the URL",
            info="Found in your Google Drive folder's URL"
        ),
        gr.Textbox(
            label="Naming Convention",
            placeholder="e.g., sports_card",
            value="sports_card",
            info="Use only letters, numbers, and underscores"
        )
    ],
    outputs=[
        gr.Textbox(
            label="Status",
            lines=3
        ),
        gr.Dataframe(
            headers=["Original Name", "New Name", "File Path"],
            wrap=True
        )
    ],
    title="Sports Cards Dataset Processor",
    description="""
    Instructions:
    1. Enter the Google Drive folder ID (found in the folder's URL)
    2. Specify a naming convention for the files (e.g., 'sports_card')
    3. Click submit to start processing

    Note: Only image files will be processed. Invalid images will be skipped.
    """,
    css=custom_css,
    theme="default"
)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860
    )
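
# Suggested requirements.txt for this Space (package list inferred from the
# imports above; version pins are left to the reader):
#   pydrive2
#   gradio
#   datasets
#   pandas
#   Pillow
#   tqdm
#   PyYAML
#   huggingface_hub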