Spaces:

szili2011
/

TrainAI

Sleeping

App Files Files Community

TrainAI / app.py

szili2011

Update app.py

0987ee1 verified 4 months ago

raw

history blame contribute delete

15.5 kB

	# --- Standard Library Imports ---
	import os
	import time
	import traceback
	import tempfile
	import json
	import math
	import collections
	import collections.abc # For Gradio compatibility with newer Python versions

	# --- UI Framework ---
	import gradio as gr

	# --- Data Handling & Numerical Ops ---
	import pandas as pd
	import numpy as np

	# --- Core Machine Learning (Scikit-learn) ---
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
	from sklearn.impute import SimpleImputer
	from sklearn.compose import ColumnTransformer
	from sklearn.pipeline import Pipeline
	from sklearn.linear_model import LogisticRegression, LinearRegression
	from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
	from sklearn.svm import SVC, SVR
	from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
	from sklearn.datasets import make_classification, make_regression
	import joblib

	# --- ONNX Support for Model Interoperability ---
	import skl2onnx
	from skl2onnx import convert_sklearn
	from skl2onnx.common.data_types import FloatTensorType, StringTensorType

	# --- Visualization ---
	import matplotlib
	matplotlib.use('Agg') # Use non-interactive backend for server environments
	import matplotlib.pyplot as plt

	# --- Graceful ONNX Runtime Handling ---
	# This addresses the system-level ImportError on platforms like Hugging Face Spaces.
	# onnxruntime is an optional dependency: record whether it imported so that
	# later code can decide whether to load exported ONNX models.
	try:
	    import onnxruntime as rt
	except ImportError:
	    ONNX_RUNTIME_AVAILABLE = False
	    print("Warning: onnxruntime could not be imported. ONNX model validation will be skipped.")
	else:
	    ONNX_RUNTIME_AVAILABLE = True
	# --- End of Imports ---


	# --- Global Variables & Constants ---
	# Upper bounds applied to the dimensions of generated datasets.
	MAX_GENERATED_COLS = 100
	MAX_GENERATED_ROWS = 50_000

	# Output directory for generated files, created at import time if it is missing.
	TEMP_DIR = "temp_outputs"
	os.makedirs(TEMP_DIR, exist_ok=True)

	# --- Helper Functions ---
	def get_temp_filepath(filename_base, extension):
	    """Build a unique file path inside TEMP_DIR.

	    The name combines ``filename_base``, a second-resolution timestamp and a
	    short random suffix. The suffix is what makes the path unique: the
	    timestamp alone repeats for every call made within the same second, which
	    would let one request overwrite another request's file.

	    Args:
	        filename_base: Leading part of the file name (no directory, no extension).
	        extension: File extension, with or without leading dots.

	    Returns:
	        The path as a string; the file itself is not created.
	    """
	    import uuid  # local import keeps this function self-contained

	    clean_extension = extension.lstrip('.')
	    timestamp = time.strftime('%Y%m%d-%H%M%S')
	    unique_suffix = uuid.uuid4().hex[:8]
	    return os.path.join(TEMP_DIR, f"{filename_base}_{timestamp}_{unique_suffix}.{clean_extension}")

	# --- Dataset and Preprocessing Logic ---
	def generate_dataset_backend(task_type, n_samples, n_features, n_classes_or_informative, dataset_format):
	    """Generate a synthetic tabular dataset and write it to a temporary file.

	    Args:
	        task_type: "Tabular Classification" or "Tabular Regression".
	        n_samples: Requested number of rows; clamped to [10, MAX_GENERATED_ROWS].
	        n_features: Requested number of feature columns; clamped to
	            [1, MAX_GENERATED_COLS].
	        n_classes_or_informative: Number of classes for classification (at
	            least 2 is used) or number of informative features for regression
	            (clamped to [1, n_features]).
	        dataset_format: ".csv", ".json" or ".parquet".

	    Returns:
	        ``(preview, df, logs, file_path)`` where ``preview`` is ``df.head()``.
	        On any failure the traceback is appended to ``logs`` and
	        ``(None, None, logs, None)`` is returned instead of raising.
	    """
	    logs = "\n--- Generating Dataset ---\n"
	    supported_formats = (".csv", ".json", ".parquet")

	    try:
	        # Coerce inside the try block so that a missing or non-numeric value
	        # is reported through the log like every other failure.
	        n_samples = max(10, min(int(n_samples), MAX_GENERATED_ROWS))
	        n_features = max(1, min(int(n_features), MAX_GENERATED_COLS))
	        n_classes_or_informative = int(n_classes_or_informative)

	        # Reject unknown formats up front; otherwise a path to a file that
	        # was never written would be returned.
	        if dataset_format not in supported_formats:
	            raise ValueError(
	                f"Unsupported dataset format '{dataset_format}'. "
	                f"Choose one of: {', '.join(supported_formats)}"
	            )

	        if task_type == "Tabular Classification":
	            n_classes = max(2, n_classes_or_informative)
	            n_informative = max(1, n_features // 2)
	            n_clusters_per_class = 2  # scikit-learn's default, kept whenever feasible

	            # make_classification requires
	            #     n_classes * n_clusters_per_class <= 2 ** n_informative.
	            # (x - 1).bit_length() is ceil(log2(x)) for x >= 1, i.e. the
	            # smallest n_informative that satisfies the constraint.
	            required_informative = (n_classes * n_clusters_per_class - 1).bit_length()
	            n_informative = min(n_features, max(n_informative, required_informative))
	            if n_classes * n_clusters_per_class > 2 ** n_informative:
	                n_clusters_per_class = 1
	            if n_classes > 2 ** n_informative:
	                n_classes = 2 ** n_informative
	                logs += (
	                    f"Requested class count reduced to {n_classes}: "
	                    f"{n_features} feature(s) cannot separate more classes.\n"
	                )

	            X, y = make_classification(
	                n_samples=n_samples,
	                n_features=n_features,
	                n_informative=n_informative,
	                n_redundant=0,
	                n_classes=n_classes,
	                n_clusters_per_class=n_clusters_per_class,
	                random_state=42,
	            )
	        elif task_type == "Tabular Regression":
	            X, y = make_regression(
	                n_samples=n_samples,
	                n_features=n_features,
	                n_informative=max(1, min(n_features, n_classes_or_informative)),
	                noise=10,
	                random_state=42,
	            )
	        else:
	            raise NotImplementedError(f"Dataset generation for '{task_type}' is not implemented.")

	        df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(n_features)])
	        df['target'] = y
	        logs += f"Generated data with shape: {df.shape}\n"

	        file_path = get_temp_filepath("generated_dataset", dataset_format)
	        if dataset_format == ".csv":
	            df.to_csv(file_path, index=False)
	        elif dataset_format == ".json":
	            df.to_json(file_path, orient='records', lines=True)
	        else:  # ".parquet" (validated above)
	            df.to_parquet(file_path, index=False)

	        logs += f"Dataset saved to temporary file: {os.path.basename(file_path)}\n"
	        return df.head(), df, logs, file_path

	    except Exception:
	        # UI boundary: report the failure in the log instead of raising.
	        logs += f"Error generating dataset: {traceback.format_exc()}\n"
	        return None, None, logs, None

	# --- Core Training Functions ---
	def _load_training_frame(data_input):
	    """Return the training data as a DataFrame, reading it from disk when given a path."""
	    if not isinstance(data_input, str):
	        return data_input
	    if data_input.endswith('.csv'):
	        return pd.read_csv(data_input)
	    if data_input.endswith('.json'):
	        return pd.read_json(data_input, lines=True)
	    if data_input.endswith('.parquet'):
	        return pd.read_parquet(data_input)
	    raise ValueError("Unsupported file type for upload.")


	def _build_preprocessor(X):
	    """Build the imputing/scaling/one-hot ColumnTransformer for the columns of X."""
	    numeric_features = X.select_dtypes(include=np.number).columns
	    categorical_features = X.select_dtypes(include='object').columns
	    if len(numeric_features) == 0 and len(categorical_features) == 0:
	        # Without this check the failure only shows up later, inside the estimator.
	        raise ValueError("No numeric or text feature columns are available for training.")

	    numeric_steps = Pipeline([
	        ('imputer', SimpleImputer(strategy='mean')),
	        ('scaler', StandardScaler()),
	    ])
	    categorical_steps = Pipeline([
	        ('imputer', SimpleImputer(strategy='most_frequent')),
	        ('onehot', OneHotEncoder(handle_unknown='ignore')),
	    ])
	    return ColumnTransformer(transformers=[
	        ('num', numeric_steps, numeric_features),
	        ('cat', categorical_steps, categorical_features),
	    ])


	def _select_model(task_type, model_name):
	    """Return a fresh estimator for model_name; any non-classification task is treated as regression."""
	    if task_type == "Tabular Classification":
	        models = {
	            "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
	            "Random Forest Classifier": RandomForestClassifier(random_state=42),
	            "Support Vector Machine (SVM) Classifier": SVC(random_state=42, probability=True),
	        }
	    else:
	        models = {
	            "Linear Regression": LinearRegression(),
	            "Random Forest Regressor": RandomForestRegressor(random_state=42),
	            "Support Vector Machine (SVR) Regressor": SVR(),
	        }
	    if model_name not in models:
	        raise ValueError(
	            f"Model '{model_name}' is not available for task '{task_type}'. "
	            f"Choose one of: {', '.join(models)}"
	        )
	    return models[model_name]


	def _format_metrics(task_type, y_test, y_pred):
	    """Return the evaluation summary text for the held-out predictions."""
	    if task_type == "Tabular Classification":
	        acc = accuracy_score(y_test, y_pred)
	        report = classification_report(y_test, y_pred, zero_division=0)
	        return f"Accuracy: {acc:.4f}\n\nClassification Report:\n{report}"
	    mse = mean_squared_error(y_test, y_pred)
	    r2 = r2_score(y_test, y_pred)
	    return f"Mean Squared Error: {mse:.4f}\nR² Score: {r2:.4f}"


	def _onnx_input_types(X):
	    """Describe each column of X as a one-column float or string ONNX input."""
	    return [
	        (
	            col_name,
	            FloatTensorType([None, 1])
	            if pd.api.types.is_numeric_dtype(X[col_name].dtype)
	            else StringTensorType([None, 1]),
	        )
	        for col_name in X.columns
	    ]


	def train_model_sklearn(data_input, target_column, task_type, model_name, model_output_format, logs=""):
	    """Train, evaluate and optionally export a scikit-learn pipeline.

	    Args:
	        data_input: A DataFrame, or a path to a .csv / .json (JSON lines) /
	            .parquet file.
	        target_column: Name of the column to predict.
	        task_type: "Tabular Classification"; any other value is handled as
	            regression.
	        model_name: Display name of the estimator to train.
	        model_output_format: ".pkl (Scikit-learn)" or ".onnx (ONNX)"; any other
	            value skips saving.
	        logs: Existing log text that new messages are appended to.

	    Returns:
	        ``(logs, metrics, model_path)``. ``model_path`` is None when nothing
	        was saved. On any failure the traceback is appended to ``logs`` and
	        ``(logs, error_message, None)`` is returned instead of raising.
	    """
	    logs += f"\n--- Training Scikit-learn Model: {model_name} ---\n"

	    try:
	        df = _load_training_frame(data_input)
	        if target_column not in df.columns:
	            raise ValueError(f"Target column '{target_column}' not found.")

	        X = df.drop(columns=[target_column])
	        y = df[target_column]
	        if task_type == "Tabular Classification":
	            # Estimators are trained on integer-encoded class labels.
	            y = LabelEncoder().fit_transform(y)

	        pipeline = Pipeline([
	            ('preprocessor', _build_preprocessor(X)),
	            ('model', _select_model(task_type, model_name)),
	        ])

	        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
	        logs += f"Data split into training ({X_train.shape}) and testing ({X_test.shape}) sets.\n"

	        # Training
	        start_time = time.time()
	        pipeline.fit(X_train, y_train)
	        logs += f"Training completed in {time.time() - start_time:.2f}s.\n"

	        # Evaluation on the held-out split
	        metrics = _format_metrics(task_type, y_test, pipeline.predict(X_test))
	        logs += "\n--- Evaluation Metrics ---\n" + metrics + "\n"

	        # Model saving
	        model_filename_base = f"sklearn_{model_name.replace(' ', '_').lower()}"
	        model_path = None
	        if model_output_format == ".pkl (Scikit-learn)":
	            model_path = get_temp_filepath(model_filename_base, "pkl")
	            joblib.dump(pipeline, model_path)
	            logs += f"Model pipeline saved to {os.path.basename(model_path)} as PKL.\n"
	        elif model_output_format == ".onnx (ONNX)":
	            model_path = get_temp_filepath(model_filename_base, "onnx")
	            options = {'zipmap': False} if task_type == "Tabular Classification" else {}
	            onnx_model = convert_sklearn(
	                pipeline,
	                initial_types=_onnx_input_types(X),
	                target_opset=12,
	                options=options,
	            )
	            with open(model_path, "wb") as f:
	                f.write(onnx_model.SerializeToString())
	            logs += f"Model pipeline saved to {os.path.basename(model_path)} as ONNX.\n"

	            if ONNX_RUNTIME_AVAILABLE:
	                # Creating the session is the check: it raises if the file cannot be loaded.
	                rt.InferenceSession(model_path)
	                logs += "ONNX model successfully loaded and validated with onnxruntime.\n"
	            else:
	                logs += "ONNX model validation skipped because onnxruntime is not available in this environment.\n"

	        return logs, metrics, model_path

	    except Exception:
	        # UI boundary: report the failure in the log and metrics fields instead of raising.
	        error_msg = f"Scikit-learn training failed: {traceback.format_exc()}"
	        logs += error_msg + "\n"
	        return logs, error_msg, None

	# --- Main Training Dispatcher ---
	def train_model_wrapper(data_input, target_column, task_type, model_family, model_specific,
	                        model_output_format, logs):
	    """Dispatch a training request to the backend for the chosen model family.

	    Args:
	        data_input: Dataset to train on, or None when no dataset exists yet.
	        target_column: Name of the column to predict.
	        task_type: Task type string forwarded to the training function.
	        model_family: Model family; only "Scikit-learn (Classical ML)" is handled.
	        model_specific: Name of the specific model within the family.
	        model_output_format: Requested export format for the trained model.
	        logs: Existing log text to append to; None is treated as empty.

	    Returns:
	        ``(logs, metrics_or_error, model_path, plot)``. ``plot`` is always None
	        and ``model_path`` is None whenever no model file was produced.
	    """
	    # NOTE(review): guards "logs +=" against a None log value (for example an
	    # unset textbox) — confirm whether the UI can actually deliver None here.
	    if logs is None:
	        logs = ""

	    if data_input is None:
	        logs += "ERROR: No dataset has been generated or uploaded. Please go to Tab 2.\n"
	        return logs, "Error: No dataset available.", None, None

	    # Placeholder for future PyTorch integration
	    if model_family != "Scikit-learn (Classical ML)":
	        logs += f"The selected model family '{model_family}' is not supported yet.\n"
	        return logs, "Error: Model family not supported.", None, None

	    logs, metrics, model_path = train_model_sklearn(
	        data_input, target_column, task_type, model_specific, model_output_format, logs
	    )
	    return logs, metrics, model_path, None  # No plot for sklearn

	# --- Gradio UI Definition ---
	def update_model_options(task_choice, model_family_choice):
	    """Return a Gradio update offering the models that fit the chosen task and family.

	    The first available model becomes the selected value. When the
	    combination has no models, the update carries an empty choice list, no
	    value, and ``visible=False``.
	    """
	    is_sklearn = model_family_choice == "Scikit-learn (Classical ML)"

	    if is_sklearn and task_choice == "Tabular Classification":
	        options = ["Logistic Regression", "Random Forest Classifier", "Support Vector Machine (SVM) Classifier"]
	    elif is_sklearn and task_choice == "Tabular Regression":
	        options = ["Linear Regression", "Random Forest Regressor", "Support Vector Machine (SVR) Regressor"]
	    else:
	        options = []

	    default = options[0] if options else None
	    return gr.update(choices=options, value=default, visible=len(options) > 0)

	def update_model_output_formats(model_family_choice):
	    """Return a Gradio update listing the export formats for the chosen model family.

	    The first format becomes the selected value; an unrecognised family gets
	    an empty choice list and no value.
	    """
	    if model_family_choice != "Scikit-learn (Classical ML)":
	        return gr.update(choices=[], value=None)

	    available = [".pkl (Scikit-learn)", ".onnx (ONNX)"]
	    return gr.update(choices=available, value=available[0])

	# The Gradio App Layout
	# Builds the three-tab UI (task/model selection, dataset configuration,
	# training/results), wires the component events to the backend functions
	# defined in this file, then launches the app.
	# NOTE(review): the original nesting of the `with` blocks is not recoverable
	# from this copy of the file; the comments below describe components and
	# event wiring only, not which Row/Tab each component belongs to.
	with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange")) as demo:
	gr.Markdown("# 🧠 TrainAI ⚙️")
	gr.Markdown("A simple interface to create, train, and download machine learning models.")

	# State variables to hold data between interactions
	# Receives the second output of generate_dataset_backend (the full DataFrame)
	# and is passed to train_model_wrapper as its data_input.
	generated_data_state = gr.State(None)

	with gr.Tabs():
	with gr.TabItem("1. Define Task & Model"):
	with gr.Row():
	task_type_dd = gr.Dropdown(["Tabular Classification", "Tabular Regression"], label="Select Task Type", value="Tabular Classification")
	model_family_dd = gr.Dropdown(["Scikit-learn (Classical ML)"], label="Select Model Family", value="Scikit-learn (Classical ML)")

	# Initial choices are the classification models, matching the default task type above.
	model_specific_dd = gr.Dropdown(label="Select Specific Model", choices=["Logistic Regression", "Random Forest Classifier", "Support Vector Machine (SVM) Classifier"], value="Logistic Regression", interactive=True)

	with gr.TabItem("2. Configure Dataset"):
	with gr.Row():
	ds_gen_samples_num = gr.Number(label="# Samples", value=1000, minimum=10, step=100)
	ds_gen_features_num = gr.Number(label="# Features", value=10, minimum=1, step=1)
	# One field serves both tasks: class count for classification, informative-feature count for regression.
	ds_gen_classes_num = gr.Number(label="Classes (Classif) / Informative (Regr)", value=2, minimum=1, step=1)
	ds_gen_format_dd = gr.Dropdown([".csv", ".json", ".parquet"], label="Generated Dataset Format", value=".csv")
	generate_dataset_btn = gr.Button("Generate & Preview Dataset", variant="secondary")

	# Defaults to "target", the column name generate_dataset_backend gives the label column.
	target_column_name_txt = gr.Textbox(label="Target Column Name", value="target", interactive=True)

	# --- FIX: Replaced 'height' with 'row_count' ---
	dataset_preview_df = gr.DataFrame(label="Dataset Preview (First 5 Rows)", interactive=False, row_count=5)
	# --- END FIX ---

	generated_dataset_download_file = gr.File(label="Download Generated Dataset", interactive=False)

	with gr.TabItem("3. Train Model & Get Results"):
	model_output_format_dd = gr.Dropdown(label="Select Model Output Format", choices=[".pkl (Scikit-learn)", ".onnx (ONNX)"], value=".pkl (Scikit-learn)")
	train_model_btn = gr.Button("🚀 Train Model", variant="primary")
	gr.Markdown("---")
	gr.Markdown("### Training Progress & Results")
	training_log_txt = gr.Textbox(label="Training Log & Status", lines=15, interactive=False, max_lines=50)
	evaluation_metrics_txt = gr.Textbox(label="Evaluation Metrics", lines=7, interactive=False)
	download_trained_model_file = gr.File(label="Download Trained Model", interactive=False)
	loss_plot_img = gr.Plot(label="Training Loss Curve (PyTorch only)", visible=False) # Hide as PyTorch is not used

	# --- Event Handlers ---

	# Update model choices when task or family changes
	task_type_dd.change(fn=update_model_options, inputs=[task_type_dd, model_family_dd], outputs=model_specific_dd)
	model_family_dd.change(fn=update_model_options, inputs=[task_type_dd, model_family_dd], outputs=model_specific_dd)

	# Update output formats when family changes
	model_family_dd.change(fn=update_model_output_formats, inputs=model_family_dd, outputs=model_output_format_dd)

	# Dataset generation button
	# Outputs map, in order, to the (preview, df, logs, file_path) tuple returned by
	# generate_dataset_backend; the log textbox is set to that function's fresh log text.
	generate_dataset_btn.click(
	fn=generate_dataset_backend,
	inputs=[task_type_dd, ds_gen_samples_num, ds_gen_features_num, ds_gen_classes_num, ds_gen_format_dd],
	outputs=[dataset_preview_df, generated_data_state, training_log_txt, generated_dataset_download_file]
	)

	# Main training button
	# The current log text is passed in as `logs` and the returned log text is written
	# back to the same textbox; the fourth output (plot) is always None from the wrapper.
	train_model_btn.click(
	fn=train_model_wrapper,
	inputs=[generated_data_state, target_column_name_txt, task_type_dd, model_family_dd, model_specific_dd, model_output_format_dd, training_log_txt],
	outputs=[training_log_txt, evaluation_metrics_txt, download_trained_model_file, loss_plot_img]
	)

	# Launch the application
	demo.queue().launch(debug=True, show_error=True)