math-score-ml-project / src /components /data_transformation.py
Aryan
added all environment project
a244ac5
# Transform data for model training and evaluation.
# Python Libraries
import os
import sys
import pandas as pd
import numpy as np
from dataclasses import dataclass
# local Imports
from src.components.data_ingestion import DataIngestion
from src.exception import CustomException
from src.logger import logging
from src.utils import save_object
from src.components.model_trainer import ModelTrainer
# Sklearn Imports
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
@dataclass
class DataTransformationConfig:
    """Configuration for the data-transformation step.

    The attribute is type-annotated so that ``@dataclass`` registers it as a
    real field: it then appears in the generated ``__init__``, ``__repr__``
    and ``__eq__``, and can be overridden per instance, while
    ``DataTransformationConfig()`` keeps yielding the same default path.
    """

    # File path where the fitted preprocessor object is written.
    preprocessor_obj_file_path: str = os.path.join('artifacts', 'preprocessor.pkl')
class DataTransformation:
    """Build, fit and persist the preprocessing pipeline for the score data.

    ``get_data_transformer_object`` assembles an unfitted preprocessor;
    ``initiate_data_transformation`` fits it on the training CSV, applies it
    to both splits and saves the fitted object to the configured path.
    """

    def __init__(self):
        # Carries the path the fitted preprocessor is saved to.
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformer_object(self):
        """Return an unfitted ``ColumnTransformer`` for the input features.

        Numerical columns are median-imputed and standardized. Categorical
        columns are imputed with the most frequent value, one-hot encoded and
        scaled without centering.

        Raises:
            CustomException: wraps any exception raised while building the
                pipelines.
        """
        try:
            logging.info("Data Transformation initiated")

            numerical_columns = ['reading score', 'writing score']
            categorical_columns = [
                'gender',
                'race/ethnicity',
                'parental level of education',
                'lunch',
                'test preparation course',
            ]

            num_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler()),
                ]
            )

            cat_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    # handle_unknown='ignore': a category that never appeared
                    # in the training split is encoded as all zeros instead
                    # of raising at transform time.
                    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
                    # with_mean=False: centering is not supported on the
                    # sparse output of the encoder.
                    ('scaler', StandardScaler(with_mean=False)),
                ]
            )

            logging.info("Numerical and Categorical pipelines created")

            preprocessor = ColumnTransformer(
                [
                    ('num_pipeline', num_pipeline, numerical_columns),
                    ('cat_pipeline', cat_pipeline, categorical_columns),
                ],
                # Always return a dense ndarray: the caller concatenates the
                # result with the target via np.c_, which does not handle
                # scipy sparse matrices.
                sparse_threshold=0,
            )

            return preprocessor

        except Exception as e:
            logging.error("Error in Data Transformation {0}".format(e))
            raise CustomException(e, sys)

    def initiate_data_transformation(self, train_path: str, test_path: str):
        """Fit the preprocessor on the train split and transform both splits.

        Args:
            train_path: path of the training CSV file.
            test_path: path of the test CSV file.

        Returns:
            A 3-tuple ``(train_arr, test_arr, preprocessor_path)``. Each array
            holds the transformed features with the ``"math score"`` target
            appended as the last column; ``preprocessor_path`` is the path
            the fitted preprocessor was saved to.

        Raises:
            CustomException: wraps any exception raised while reading,
                transforming or saving.
        """
        try:
            # Reading train and test data
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)
            logging.info("Read train and test data completed")

            logging.info("Obtaining preprocessor object")
            preprocessing_obj = self.get_data_transformer_object()

            target_column_name = "math score"

            # Split each frame into input features and the target column.
            input_feature_train_df = train_df.drop(columns=[target_column_name])
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name])
            target_feature_test_df = test_df[target_column_name]

            logging.info("Applying preprocessing object on training and testing dataframes")

            # Fit on the training split only; the test split is transformed
            # with the statistics learned from training data.
            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)
            logging.info("Preprocessing completed successfully")

            # Append the target as the last column of each array.
            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            save_object(
                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj
            )
            # Logged only after the save call has returned.
            logging.info("Saved preprocessing object")

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path,
            )

        except Exception as e:
            logging.error("Error in initiate_data_transformation {0}".format(e))
            raise CustomException(e, sys)
if __name__ == "__main__":
    # Script entry point: run ingestion, transformation and model training
    # in sequence, then print whatever the trainer returns.
    ingestion = DataIngestion()
    train_csv, test_csv = ingestion.initiate_data_ingestion()

    transformer = DataTransformation()
    # The third element (the saved preprocessor path) is not used here.
    train_matrix, test_matrix, _ = transformer.initiate_data_transformation(train_csv, test_csv)

    trainer = ModelTrainer()
    training_result = trainer.initiate_model_trainer(train_matrix, test_matrix)
    print(training_result)