"""Transform data for model training and evaluation."""

# Python Libraries
import os
import sys
import pandas as pd
import numpy as np
from dataclasses import dataclass

# local Imports
from src.components.data_ingestion import DataIngestion
from src.exception import CustomException
from src.logger import logging
from src.utils import save_object
from src.components.model_trainer import ModelTrainer

# Sklearn Imports
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


@dataclass
class DataTransformationConfig:
    """Configuration for the data-transformation step."""

    # Location where the fitted preprocessor is written.
    # The annotation is required for @dataclass to treat this as a real field
    # (so it appears in __init__ and repr); the default keeps the no-argument
    # constructor working exactly as before.
    preprocessor_obj_file_path: str = os.path.join('artifacts', 'preprocessor.pkl')


class DataTransformation:
    """Build, fit and persist the feature preprocessor for the dataset."""

    def __init__(self):
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformer_object(self):
        """Return an unfitted ``ColumnTransformer`` for the input features.

        Numerical columns are median-imputed and standardized. Categorical
        columns are imputed with the most frequent value, one-hot encoded and
        scaled without centering.

        Returns:
            The unfitted ``ColumnTransformer``.

        Raises:
            CustomException: If building the transformer fails.
        """
        try:
            logging.info("Data Transformation initiated")

            numerical_columns = ['reading score', 'writing score']
            categorical_columns = [
                'gender',
                'race/ethnicity',
                'parental level of education',
                'lunch',
                'test preparation course',
            ]

            num_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler()),
                ]
            )

            cat_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    # Categories missing from the training split must not make
                    # transform() raise on the test split; encode them as
                    # all-zero rows instead.
                    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
                    # with_mean=False: centering is not supported on the
                    # sparse output of the one-hot encoder.
                    ('scaler', StandardScaler(with_mean=False)),
                ]
            )

            logging.info("Numerical and Categorical pipelines created")

            preprocessor = ColumnTransformer(
                [
                    ('num_pipeline', num_pipeline, numerical_columns),
                    ('cat_pipeline', cat_pipeline, categorical_columns),
                ],
                # Always return a dense array: the caller concatenates the
                # result with the target using np.c_, which needs dense input.
                sparse_threshold=0,
            )

            return preprocessor

        except Exception as e:
            logging.error("Error in Data Transformation {0}".format(e))
            raise CustomException(e, sys) from e

    def initiate_data_transformation(self, train_path, test_path):
        """Fit the preprocessor on the train CSV and transform both splits.

        Args:
            train_path: Path to the training CSV file.
            test_path: Path to the test CSV file.

        Returns:
            A tuple ``(train_arr, test_arr, preprocessor_path)``. Each array
            holds the transformed features with the ``"math score"`` target
            appended as the last column; ``preprocessor_path`` is the path the
            fitted preprocessor was passed to ``save_object`` with.

        Raises:
            CustomException: If reading, transforming or saving fails.
        """
        try:
            # Reading train and test data
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)
            logging.info("Read train and test data completed")

            logging.info("Obtaining preprocessor object")
            preprocessing_obj = self.get_data_transformer_object()

            target_column_name = "math score"

            input_feature_train_df = train_df.drop(columns=[target_column_name])
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name])
            target_feature_test_df = test_df[target_column_name]

            logging.info("Applying preprocessing object on training and testing dataframes")

            # Fit on the training split only, then reuse the fitted
            # transformer on the test split.
            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            logging.info("Preprocessing completed successfully")

            # Append the target as the last column of each array.
            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            save_object(
                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj
            )
            # Logged only after save_object returns, so the message is true.
            logging.info("Saved preprocessing object")

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path,
            )

        except Exception as e:
            logging.error("Error in initiate_data_transformation {0}".format(e))
            raise CustomException(e, sys) from e


if __name__ == "__main__":
    # Run ingestion -> transformation -> training end to end.
    obj = DataIngestion()
    train_data, test_data = obj.initiate_data_ingestion()

    data_transformation = DataTransformation()
    train_arr, test_arr, preprocessor_path = data_transformation.initiate_data_transformation(
        train_data, test_data
    )

    model_trainer = ModelTrainer()
    print(model_trainer.initiate_model_trainer(train_arr, test_arr))