|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
import sys |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from dataclasses import dataclass |
|
|
|
|
|
|
|
|
from src.components.data_ingestion import DataIngestion |
|
|
from src.exception import CustomException |
|
|
from src.logger import logging |
|
|
from src.utils import save_object |
|
|
from src.components.model_trainer import ModelTrainer |
|
|
|
|
|
|
|
|
from sklearn.compose import ColumnTransformer |
|
|
from sklearn.impute import SimpleImputer |
|
|
from sklearn.pipeline import Pipeline |
|
|
from sklearn.preprocessing import OneHotEncoder, StandardScaler |
|
|
|
|
|
|
|
|
@dataclass
class DataTransformationConfig:
    """Configuration for the data-transformation step.

    Attributes:
        preprocessor_obj_file_path: Destination path for the serialized
            (pickled) preprocessing object.
    """

    # The type annotation is required: without it @dataclass ignores the
    # attribute entirely (it stays a plain class variable instead of a field).
    preprocessor_obj_file_path: str = os.path.join('artifacts', 'preprocessor.pkl')
|
|
|
|
|
class DataTransformation:
    """Builds a preprocessing pipeline and applies it to train/test datasets.

    The pipeline imputes and scales numeric columns, and imputes, one-hot
    encodes, and scales categorical columns. The fitted preprocessor is
    persisted to the path given by :class:`DataTransformationConfig`.
    """

    def __init__(self):
        # Artifact paths (e.g. where the fitted preprocessor is saved).
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformer_object(self):
        """Create the ColumnTransformer that preprocesses all features.

        Returns:
            sklearn.compose.ColumnTransformer: unfitted preprocessor covering
            both numeric and categorical columns.

        Raises:
            CustomException: wraps any error raised while building the pipelines.
        """
        try:
            logging.info("Data Transformation initiated")
            # Column names match the raw CSV headers exactly
            # (including spaces and the slash in 'race/ethnicity').
            numerical_columns = ['reading score', 'writing score']
            categorical_columns = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']

            # Numeric features: median imputation (robust to outliers),
            # then standardization.
            num_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler())
                ]
            )
            # Categorical features: mode imputation, one-hot encoding, scaling.
            # handle_unknown='ignore' prevents transform() from raising when the
            # test split contains a category never seen during fit() on train.
            # with_mean=False keeps StandardScaler valid on the sparse one-hot output.
            cat_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
                    ('scaler', StandardScaler(with_mean=False))
                ]
            )
            logging.info("Numerical and Categorical pipelines created")

            preprocessor = ColumnTransformer(
                [
                    ('num_pipeline', num_pipeline, numerical_columns),
                    ('cat_pipeline', cat_pipeline, categorical_columns)
                ]
            )
            return preprocessor
        except Exception as e:
            # Lazy %-style args: message is only formatted if the record is emitted.
            logging.error("Error in Data Transformation %s", e)
            raise CustomException(e, sys)

    def initiate_data_transformation(self, train_path, test_path):
        """Read train/test CSVs, fit the preprocessor on train, transform both.

        Args:
            train_path: Path to the training-split CSV.
            test_path: Path to the test-split CSV.

        Returns:
            tuple: ``(train_arr, test_arr, preprocessor_path)`` where the arrays
            have the transformed features with the target appended as the last
            column, and ``preprocessor_path`` is where the fitted preprocessor
            was saved.

        Raises:
            CustomException: wraps any error raised during the transformation.
        """
        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)
            logging.info("Read train and test data completed")

            logging.info("Obtaining preprocessor object")
            preprocessing_obj = self.get_data_transformer_object()

            target_column_name = "math score"

            # Split each frame into features and target.
            input_feature_train_df = train_df.drop(columns=[target_column_name])
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name])
            target_feature_test_df = test_df[target_column_name]

            logging.info("Applying preprocessing object on training and testing dataframes")
            # Fit only on the training data to avoid leaking test statistics.
            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            logging.info("Preprocessing completed successfully")
            # Re-attach the target as the final column of each array.
            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            logging.info("Saved preprocessing object")
            # Persist the fitted preprocessor for reuse at prediction time.
            save_object(
                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj
            )

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path,
            )
        except Exception as e:
            logging.error("Error in initiate_data_transformation %s", e)
            raise CustomException(e, sys)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # End-to-end pipeline run: ingest raw data, transform it, train models.
    ingestion = DataIngestion()
    train_path, test_path = ingestion.initiate_data_ingestion()

    transformer = DataTransformation()
    train_array, test_array, preprocessor_path = transformer.initiate_data_transformation(
        train_path, test_path
    )

    trainer = ModelTrainer()
    print(trainer.initiate_model_trainer(train_array, test_array))