import os
import sys
from dataclasses import dataclass

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from src.exception import CustomException
from src.logger import logging
from src.utils import save_object


@dataclass
class DataTransformationConfig:
    preprocessor_obj_file_path: str = os.path.join("artifacts", "preprocessor.pkl")


class DataTransformation:
    def __init__(self) -> None:
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformer_object(self):
        """
        Build and return the preprocessing object: median imputation and standard
        scaling for numerical columns, most-frequent imputation and one-hot
        encoding for categorical columns.
        """
        try:
            numerical_columns = ["reading_score", "writing_score"]
            categorical_columns = [
                "gender",
                "race_ethnicity",
                "parental_level_of_education",
                "lunch",
                "test_preparation_course",
            ]

            num_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ]
            )
            logging.info("Numerical columns standard scaling completed")

            cat_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("one_hot_encoder", OneHotEncoder()),
                ]
            )
            logging.info("Categorical columns encoding completed")

            logging.info(f"Numerical columns: {numerical_columns}")
            logging.info(f"Categorical columns: {categorical_columns}")

            preprocessor = ColumnTransformer(
                transformers=[
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipeline", cat_pipeline, categorical_columns),
                ]
            )

            return preprocessor
        except Exception as e:
            raise CustomException(e, sys)

    def initiate_data_transformation(self, train_path, test_path):
        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)
            logging.info("Read train and test data completed")

            logging.info("Obtaining preprocessing object")
            preprocessing_obj = self.get_data_transformer_object()

            target_column_name = "math_score"

            input_feature_train_df = train_df.drop(columns=[target_column_name])
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name])
            target_feature_test_df = test_df[target_column_name]

            logging.info("Applying preprocessing object on training and testing dataframes")

            # Fit the preprocessor on the training features only, then reuse it on the test set.
            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            # Re-attach the target as the last column so downstream components can split X and y.
            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            save_object(
                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj,
            )
            logging.info("Saved preprocessing object")

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path,
            )
        except Exception as e:
            raise CustomException(e, sys)
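

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original component): in this kind of
    # pipeline the train/test CSVs are usually produced by a separate data-ingestion
    # step, so the "artifacts/train.csv" and "artifacts/test.csv" paths below are
    # assumptions for illustration only.
    data_transformation = DataTransformation()
    train_arr, test_arr, preprocessor_path = data_transformation.initiate_data_transformation(
        train_path=os.path.join("artifacts", "train.csv"),
        test_path=os.path.join("artifacts", "test.csv"),
    )
    logging.info(f"Train array shape: {train_arr.shape}, test array shape: {test_arr.shape}")
    logging.info(f"Preprocessor saved at: {preprocessor_path}")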