|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
import sys |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from dataclasses import dataclass |
|
|
|
|
|
|
|
|
from src.components.data_ingestion import DataIngestion |
|
|
from src.exception import CustomException |
|
|
from src.logger import logging |
|
|
from src.utils import save_object |
|
|
from src.components.model_trainer import ModelTrainer |
|
|
|
|
|
|
|
|
from sklearn.compose import ColumnTransformer |
|
|
from sklearn.impute import SimpleImputer |
|
|
from sklearn.pipeline import Pipeline |
|
|
from sklearn.preprocessing import OneHotEncoder, StandardScaler |
|
|
|
|
|
|
|
|
@dataclass
class DataTransformationConfig:
    """Configuration for the data-transformation step.

    Attributes:
        preprocessor_obj_file_path: Destination path for the serialized
            (pickled) preprocessing object.
    """

    # The type annotation is required: without it @dataclass ignores the
    # attribute entirely (it stays a plain class variable instead of a field).
    preprocessor_obj_file_path: str = os.path.join('artifacts', 'preprocessor.pkl')
|
|
|
|
|
class DataTransformation:
    """Builds a preprocessing pipeline and applies it to train/test datasets.

    The pipeline imputes and scales numeric columns, and imputes, one-hot
    encodes, and scales categorical columns. The fitted preprocessor is
    persisted to the path given by :class:`DataTransformationConfig`.
    """

    def __init__(self):
        # Artifact paths (e.g. where the fitted preprocessor is saved).
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformer_object(self):
        """Create the ColumnTransformer that preprocesses all features.

        Returns:
            sklearn.compose.ColumnTransformer: unfitted preprocessor covering
            both numeric and categorical columns.

        Raises:
            CustomException: wraps any error raised while building the pipelines.
        """
        try:
            logging.info("Data Transformation initiated")
            # Column names match the raw CSV headers exactly
            # (including spaces and the slash in 'race/ethnicity').
            numerical_columns = ['reading score', 'writing score']
            categorical_columns = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']

            # Numeric features: median imputation (robust to outliers),
            # then standardization.
            num_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler())
                ]
            )
            # Categorical features: mode imputation, one-hot encoding, scaling.
            # handle_unknown='ignore' prevents transform() from raising when the
            # test split contains a category never seen during fit() on train.
            # with_mean=False keeps StandardScaler valid on the sparse one-hot output.
            cat_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
                    ('scaler', StandardScaler(with_mean=False))
                ]
            )
            logging.info("Numerical and Categorical pipelines created")

            preprocessor = ColumnTransformer(
                [
                    ('num_pipeline', num_pipeline, numerical_columns),
                    ('cat_pipeline', cat_pipeline, categorical_columns)
                ]
            )
            return preprocessor
        except Exception as e:
            # Lazy %-style args: message is only formatted if the record is emitted.
            logging.error("Error in Data Transformation %s", e)
            raise CustomException(e, sys)

    def initiate_data_transformation(self, train_path, test_path):
        """Read train/test CSVs, fit the preprocessor on train, transform both.

        Args:
            train_path: Path to the training-split CSV.
            test_path: Path to the test-split CSV.

        Returns:
            tuple: ``(train_arr, test_arr, preprocessor_path)`` where the arrays
            have the transformed features with the target appended as the last
            column, and ``preprocessor_path`` is where the fitted preprocessor
            was saved.

        Raises:
            CustomException: wraps any error raised during the transformation.
        """
        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)
            logging.info("Read train and test data completed")

            logging.info("Obtaining preprocessor object")
            preprocessing_obj = self.get_data_transformer_object()

            target_column_name = "math score"

            # Split each frame into features and target.
            input_feature_train_df = train_df.drop(columns=[target_column_name])
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name])
            target_feature_test_df = test_df[target_column_name]

            logging.info("Applying preprocessing object on training and testing dataframes")
            # Fit only on the training data to avoid leaking test statistics.
            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            logging.info("Preprocessing completed successfully")
            # Re-attach the target as the final column of each array.
            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            logging.info("Saved preprocessing object")
            # Persist the fitted preprocessor for reuse at prediction time.
            save_object(
                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj
            )

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path,
            )
        except Exception as e:
            logging.error("Error in initiate_data_transformation %s", e)
            raise CustomException(e, sys)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # End-to-end pipeline run: ingest raw data, transform it, train models.
    ingestion = DataIngestion()
    train_path, test_path = ingestion.initiate_data_ingestion()

    transformer = DataTransformation()
    train_array, test_array, preprocessor_path = transformer.initiate_data_transformation(
        train_path, test_path
    )

    trainer = ModelTrainer()
    print(trainer.initiate_model_trainer(train_array, test_array))