# Libraries
import os
import sys
import pandas as pd
import numpy as np
from dataclasses import dataclass
# Local imports
from src.exception import CustomException
from src.logger import logging
from src.utils import save_object, evaluate_models
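# NOTE: assumed contracts for these project utilities (defined in src/utils.py):
# save_object(file_path, obj) pickles obj to file_path, and evaluate_models
# returns a dict mapping model name -> test-set R2 score (see the sketch at
# the bottom of this file).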
# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
# Imports for hyperparameter tuning (the search itself is assumed to run inside evaluate_models)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Metric imports to measure model performance
from sklearn.metrics import r2_score


@dataclass
class ModelTrainerConfig:
    """Configuration for the model trainer: where the best model gets pickled."""
    trained_model_file_path: str = os.path.join('artifacts', 'model.pkl')


class ModelTrainer:
    def __init__(self):
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_trainer(self, train_array, test_array):
        try:
            logging.info("Model Trainer initiated")
            # The last column of each array is the target; the rest are features.
            X_train, y_train = train_array[:, :-1], train_array[:, -1]
            X_test, y_test = test_array[:, :-1], test_array[:, -1]
            models = {
                "Linear Regression": LinearRegression(),
                "Ridge": Ridge(),
                "Lasso": Lasso(),
                "Decision Tree": DecisionTreeRegressor(),
                "Random Forest": RandomForestRegressor(),
                "Gradient Boosting": GradientBoostingRegressor(),
                "AdaBoost": AdaBoostRegressor(),
                "SVR": SVR(),
                "KNeighbors": KNeighborsRegressor(),
                "CatBoost": CatBoostRegressor(verbose=False),
                "XGBRegressor": XGBRegressor()
            }
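            # Hyperparameter search grids, keyed by the same names used in models.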
            params = {
                "Decision Tree": {
                    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                    'splitter': ['best', 'random'],
                    'max_depth': [3, 5, 10, 15, 20, None]
                },
                "Random Forest": {
                    'n_estimators': [50, 100, 200],
                    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                    'max_depth': [3, 5, 10, 15, 20, None]
                },
                "Gradient Boosting": {
                    'learning_rate': [0.01, 0.1, 0.2, 0.3],
                    'n_estimators': [50, 100, 200],
                    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0]
                },
                "SVR": {
                    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                    'C': [0.1, 1, 10, 100],
                    'gamma': ['scale', 'auto']
                },
                "KNeighbors": {
                    'n_neighbors': [3, 5, 7, 9],
                    'weights': ['uniform', 'distance'],
                    'metric': ['euclidean', 'manhattan']
                },
                "CatBoost": {
                    'depth': [4, 6, 8],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'iterations': [100, 200, 300]
                },
                "XGBRegressor": {
                    'learning_rate': [0.01, 0.1, 0.2],
                    'n_estimators': [100, 200, 300],
                    'max_depth': [3, 5, 7]
                }
            }
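            # NOTE: Linear Regression, Ridge, Lasso, and AdaBoost have no grid above;
            # evaluate_models is assumed to fit them with default hyperparameters
            # (e.g. via an empty grid).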
logging.info("Models defined for training")
model_report = evaluate_models(X_train, y_train, X_test, y_test, models , params)
logging.info("Model evaluation completed finding best model")
best_model_name = max(model_report, key=model_report.get)
best_model = models[best_model_name]
best_r2_score = model_report[best_model_name]
            # Reject every candidate if none reaches the 0.6 R2 threshold.
            if best_r2_score < 0.6:
                logging.info("No model found with an R2 score of at least 0.6")
                raise CustomException("No best model found with an R2 score of at least 0.6", sys)

            logging.info(f"Best model: {best_model_name} with R2 score: {best_r2_score}")
            save_object(
                file_path=self.model_trainer_config.trained_model_file_path,
                obj=best_model
            )

            return best_r2_score
        except Exception as e:
            logging.error(f"Error in Model Trainer: {e}")
            raise CustomException(e, sys)
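
# For reference, a minimal sketch of what evaluate_models is assumed to do; the
# real implementation lives in src/utils.py and may differ. It tunes each model
# with GridSearchCV over its grid (an empty grid keeps the defaults), refits on
# the full training set, and reports the test-set R2 score:
#
#     def evaluate_models(X_train, y_train, X_test, y_test, models, params):
#         report = {}
#         for name, model in models.items():
#             gs = GridSearchCV(model, params.get(name, {}), cv=3)
#             gs.fit(X_train, y_train)
#             model.set_params(**gs.best_params_)
#             model.fit(X_train, y_train)
#             report[name] = r2_score(y_test, model.predict(X_test))
#         return report
#
# Typical usage, assuming an upstream transformation step has produced numpy
# arrays whose last column is the target:
#
#     trainer = ModelTrainer()
#     r2 = trainer.initiate_model_trainer(train_arr, test_arr)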