chkp-talexm committed
Commit b8bf9dd · Parent(s): da39861

init model

Files changed (7):
  1. config.py +14 -0
  2. data_loader.py +184 -0
  3. main.py +45 -0
  4. model_manager.py +23 -0
  5. model_predictor.py +17 -0
  6. model_trainer.py +35 -0
  7. requirements.txt +9 -0
config.py ADDED
@@ -0,0 +1,14 @@
+ import os
+
+ # Directories
+ MODEL_DIR = "models"
+
+ # Model File Paths
+ CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.cbm")
+ XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.json")
+ RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
+
+ # Model Parameters
+ CATBOOST_PARAMS = {"iterations": 800, "depth": 6, "learning_rate": 0.05, "random_seed": 42, "task_type": "CPU", "verbose": 100}
+ XGB_PARAMS = {"n_estimators": 800, "learning_rate": 0.05, "max_depth": 6, "tree_method": "hist", "random_state": 42}
+ RF_PARAMS = {"n_estimators": 200, "max_depth": 15, "random_state": 42, "n_jobs": -1}
data_loader.py ADDED
@@ -0,0 +1,184 @@
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
+ from imblearn.over_sampling import SMOTE
+
+ # ===========================
+ # CONFIGURATION
+ # ===========================
+
+ TRAIN_PATH = "~/Downloads/train_dataset_full - train_dataset_full (1).csv"
+ TEST_PATH = "~/Downloads/X_test_1st (1).csv"  # Replace with the actual test dataset path
+
+ CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
+ TARGET_COLUMN = "is_click"
+
+ FEATURE_COLUMNS = [
+     "age_level", "gender", "product", "campaign_id", "webpage_id",
+     "product_category_1", "product_category_2", "user_group_id",
+     "user_depth", "city_development_index", "var_1"
+ ]
+
+ AGGREGATED_COLUMNS = [
+     "click_sum_age_sex_prod", "click_count_age_sex_prod",
+     "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
+     "click_sum_city_age_prod", "click_count_city_age_prod",
+     "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
+ ]
+
+ # ===========================
+ # LOAD DATASETS
+ # ===========================
+
+ def load_data(train_path=TRAIN_PATH, test_path=TEST_PATH):
+     """Load train & test datasets, filling missing values with -1."""
+     train_df = pd.read_csv(train_path)
+     test_df = pd.read_csv(test_path)
+
+     # Fill missing values
+     train_df.fillna(-1, inplace=True)
+     test_df.fillna(-1, inplace=True)
+
+     return train_df, test_df
+
+
+ # ===========================
+ # FEATURE ENGINEERING: AGGREGATIONS
+ # ===========================
+
+ def add_aggregated_features(df, test_df):
+     """Creates aggregated features based on age, gender, city, and product interactions."""
+
+     # Aggregate by age & gender vs product
+     age_sex_product_agg = df.groupby(["age_level", "gender", "product"]).agg({
+         "is_click": ["sum", "count"],
+         "campaign_id": "nunique",
+         "webpage_id": "nunique"
+     }).reset_index()
+
+     # Rename columns after aggregation
+     age_sex_product_agg.columns = ["age_level", "gender", "product",
+                                    "click_sum_age_sex_prod", "click_count_age_sex_prod",
+                                    "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod"]
+
+     # Merge into train & test datasets
+     df = df.merge(age_sex_product_agg, on=["age_level", "gender", "product"], how="left")
+     test_df = test_df.merge(age_sex_product_agg, on=["age_level", "gender", "product"], how="left")
+
+     # Aggregate by city, age, product
+     city_age_product_agg = df.groupby(["city_development_index", "age_level", "product"]).agg({
+         "is_click": ["sum", "count"],
+         "campaign_id": "nunique",
+         "webpage_id": "nunique"
+     }).reset_index()
+
+     # Rename columns
+     city_age_product_agg.columns = ["city_development_index", "age_level", "product",
+                                     "click_sum_city_age_prod", "click_count_city_age_prod",
+                                     "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"]
+
+     # Merge into train & test datasets
+     df = df.merge(city_age_product_agg, on=["city_development_index", "age_level", "product"], how="left")
+     test_df = test_df.merge(city_age_product_agg, on=["city_development_index", "age_level", "product"], how="left")
+
+     # Fill aggregates left missing by the merges (unseen group combinations)
+     df.fillna(0, inplace=True)
+     test_df.fillna(0, inplace=True)
+
+     return df, test_df
+
+
+ # ===========================
+ # ENCODE & NORMALIZE FEATURES
+ # ===========================
+
+ def preprocess_data(df, test_df, categorical_columns):
+     """Encodes categorical features, normalizes numerical features, and prepares the dataset."""
+
+     label_encoders = {}
+     for col in categorical_columns:
+         le = LabelEncoder()
+         df[col] = le.fit_transform(df[col].astype(str))
+         # Map test categories through the fitted encoder; unseen categories become -1
+         mapping = {cls: idx for idx, cls in enumerate(le.classes_)}
+         test_df[col] = test_df[col].astype(str).map(mapping).fillna(-1).astype(int)
+         label_encoders[col] = le  # Store encoders for later use
+
+     numerical_columns = [col for col in FEATURE_COLUMNS + AGGREGATED_COLUMNS if col not in categorical_columns]
+
+     scaler = StandardScaler()
+     df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
+     test_df[numerical_columns] = scaler.transform(test_df[numerical_columns])
+
+     return df, test_df, label_encoders, scaler
+
+
+ # ===========================
+ # SPLIT DATA & HANDLE IMBALANCE
+ # ===========================
+
+ def split_and_balance_data(df, target_column):
+     """Splits data into training and validation sets, then applies SMOTE to balance the training classes."""
+
+     X = df[FEATURE_COLUMNS + AGGREGATED_COLUMNS]
+     y = df[target_column]
+
+     # Split first so the validation set contains only real (non-synthetic) samples
+     X_train, X_val, y_train, y_val = train_test_split(
+         X, y, test_size=0.2, random_state=42, stratify=y
+     )
+
+     # Handle class imbalance with SMOTE on the training portion only,
+     # avoiding leakage of synthetic samples into validation
+     smote = SMOTE(sampling_strategy="auto", random_state=42)
+     X_train, y_train = smote.fit_resample(X_train, y_train)
+
+     return X_train, X_val, y_train, y_val
+
+
+ # ===========================
+ # VISUALIZE FEATURES
+ # ===========================
+
+ def visualize_features():
+     """Generates visualizations for aggregated features."""
+
+     df, _ = load_data()
+     df, _ = add_aggregated_features(df, df)  # the test frame is unused here, so the train frame is passed twice
+
+     sns.set_style("whitegrid")
+
+     fig, axes = plt.subplots(1, 2, figsize=(14, 6))
+
+     sns.barplot(x="age_level", y="click_sum_age_sex_prod", hue="gender",
+                 data=df, ax=axes[0], palette="coolwarm")
+     axes[0].set_title("Total Clicks by Age & Gender vs Product")
+
+     sns.barplot(x="city_development_index", y="click_sum_city_age_prod", hue="age_level",
+                 data=df, ax=axes[1], palette="viridis")
+     axes[1].set_title("Total Clicks by City Development Index & Age")
+
+     plt.tight_layout()
+     plt.show()
+
+
+ # ===========================
+ # RUN FULL DATA PROCESSING PIPELINE
+ # ===========================
+
+ def load_and_process_data():
+     """Runs the full data processing pipeline and returns preprocessed training & test data."""
+
+     df, test_df = load_data()
+     df, test_df = add_aggregated_features(df, test_df)
+     df, test_df, label_encoders, scaler = preprocess_data(df, test_df, CATEGORICAL_COLUMNS)
+     X_train, X_val, y_train, y_val = split_and_balance_data(df, TARGET_COLUMN)
+
+     # Keep only the model features so the test matrix matches the training matrix
+     X_test = test_df[FEATURE_COLUMNS + AGGREGATED_COLUMNS]
+
+     return X_train, X_val, y_train, y_val, X_test
+
+
+ if __name__ == "__main__":
+     print("🔹 Loading and processing data...")
+     X_train, X_val, y_train, y_val, X_test = load_and_process_data()
+     print("✅ Data successfully loaded and processed!")
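For reference, the groupby → rename → merge pattern in add_aggregated_features reduces to the following standalone sketch (toy data and shortened column names, purely illustrative):

import pandas as pd

toy = pd.DataFrame({
    "age_level": [1, 1, 2],
    "gender": ["M", "F", "M"],
    "product": ["A", "A", "B"],
    "campaign_id": [10, 11, 10],
    "webpage_id": [100, 100, 101],
    "is_click": [1, 0, 1],
})

# Aggregate click totals and unique counts per (age_level, gender, product) group
agg = toy.groupby(["age_level", "gender", "product"]).agg({
    "is_click": ["sum", "count"],
    "campaign_id": "nunique",
    "webpage_id": "nunique",
}).reset_index()
agg.columns = ["age_level", "gender", "product",
               "click_sum", "click_count", "unique_campaigns", "unique_webpages"]

# Merge the group statistics back onto each row as new features
print(toy.merge(agg, on=["age_level", "gender", "product"], how="left"))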
main.py ADDED
@@ -0,0 +1,45 @@
+ import argparse
+ import os
+ from data_loader import load_and_process_data
+ from model_trainer import train_models
+ from model_manager import save_models, load_models
+ from model_predictor import predict
+ from config import MODEL_DIR
+
+ # ===========================
+ # MAIN FUNCTION
+ # ===========================
+
+ def main(train=True, retrain=False):
+     """Main entry point to train, retrain, or predict."""
+     # Create the model directory if it doesn't exist
+     os.makedirs(MODEL_DIR, exist_ok=True)
+
+     print("\n🚀 Loading data...")
+     X_train, X_val, y_train, y_val, X_test = load_and_process_data()
+
+     if train or retrain:
+         print("\n🚀 Training models...")
+         models = train_models(X_train, y_train)
+         save_models(models)
+     else:
+         print("\n🚀 Loading existing models...")
+         models = load_models()
+
+     print("\n🔍 Making predictions...")
+     predictions = predict(models, X_test)
+
+     # Save final predictions
+     predictions.to_csv("final_predictions.csv", index=False)
+     print("\n✅ Predictions saved successfully as 'final_predictions.csv'!")
+
+
+ # ===========================
+ # COMMAND-LINE EXECUTION
+ # ===========================
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Train, retrain, or make predictions")
+     parser.add_argument("--train", action="store_true", help="Train new models")
+     parser.add_argument("--retrain", action="store_true", help="Retrain models with updated data")
+     args = parser.parse_args()
+     main(train=args.train, retrain=args.retrain)
model_manager.py ADDED
@@ -0,0 +1,23 @@
+ import joblib
+ from catboost import CatBoostClassifier
+ from xgboost import XGBClassifier
+ from config import CATBOOST_MODEL_PATH, XGB_MODEL_PATH, RF_MODEL_PATH
+
+ def save_models(models):
+     """Save trained models to the paths defined in config."""
+     models["CatBoost"].save_model(CATBOOST_MODEL_PATH)
+     models["XGBoost"].save_model(XGB_MODEL_PATH)
+     joblib.dump(models["RandomForest"], RF_MODEL_PATH)
+     print("✅ Models saved successfully!")
+
+ def load_models():
+     """Load trained models from disk."""
+     catboost = CatBoostClassifier()
+     catboost.load_model(CATBOOST_MODEL_PATH)
+
+     xgb = XGBClassifier()
+     xgb.load_model(XGB_MODEL_PATH)
+
+     rf = joblib.load(RF_MODEL_PATH)
+
+     return {"CatBoost": catboost, "XGBoost": xgb, "RandomForest": rf}
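A minimal round-trip sanity check for save_models/load_models; the dummy data and untuned model settings below are assumptions for the sketch, not part of the commit:

import os
import numpy as np
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from config import MODEL_DIR
from model_manager import save_models, load_models

os.makedirs(MODEL_DIR, exist_ok=True)

# Tiny random dataset just to make the models fit-able
X = np.random.rand(40, 3)
y = np.random.randint(0, 2, 40)

models = {
    "CatBoost": CatBoostClassifier(iterations=10, verbose=0).fit(X, y),
    "XGBoost": XGBClassifier(n_estimators=10).fit(X, y),
    "RandomForest": RandomForestClassifier(n_estimators=10).fit(X, y),
}
save_models(models)

# Reload from disk and confirm the round trip preserves predictions
reloaded = load_models()
assert (reloaded["RandomForest"].predict(X) == models["RandomForest"].predict(X)).all()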
model_predictor.py ADDED
@@ -0,0 +1,17 @@
+ import numpy as np
+ import pandas as pd
+
+ def predict(models, X_test):
+     """Make predictions with each trained model and combine them."""
+     test_predictions = {name: np.array(model.predict(X_test)).squeeze() for name, model in models.items()}
+
+     test_predictions_df = pd.DataFrame(test_predictions)
+
+     # Ensure binary values (0 or 1)
+     for col in test_predictions_df.columns:
+         test_predictions_df[col] = (test_predictions_df[col] > 0.5).astype(int)
+
+     # Apply the "at least one model predicts 1" rule
+     test_predictions_df["is_click_predicted"] = test_predictions_df.max(axis=1)
+
+     return test_predictions_df
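Continuing the dummy-model sketch above (same models dict and X), predict yields one 0/1 column per model plus the OR-combined is_click_predicted column:

from model_predictor import predict

preds = predict(models, X)  # columns: CatBoost, XGBoost, RandomForest, is_click_predicted
print(preds.head())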
model_trainer.py ADDED
@@ -0,0 +1,35 @@
+ import time
+ from catboost import CatBoostClassifier
+ from xgboost import XGBClassifier
+ from sklearn.ensemble import RandomForestClassifier
+ from config import CATBOOST_PARAMS, XGB_PARAMS, RF_PARAMS
+
+ def train_models(X_train, y_train, categorical_columns=None):
+     """Train and return the machine learning models."""
+     models = {}
+
+     # Train CatBoost (cat_features is passed only when categorical columns are
+     # supplied; after SMOTE the resampled features are continuous, so main.py omits them)
+     start_time = time.time()
+     catboost = CatBoostClassifier(**CATBOOST_PARAMS)
+     cat_features = [X_train.columns.get_loc(col) for col in categorical_columns] if categorical_columns else None
+     catboost.fit(X_train, y_train, cat_features=cat_features)
+     models["CatBoost"] = catboost
+     print(f"✅ CatBoost trained in {time.time() - start_time:.2f} sec")
+
+     # Train XGBoost
+     if set(y_train.unique()) <= {0, 1}:  # Ensure only valid binary labels exist
+         start_time = time.time()
+         xgb = XGBClassifier(**XGB_PARAMS)
+         xgb.fit(X_train, y_train)
+         models["XGBoost"] = xgb
+         print(f"✅ XGBoost trained in {time.time() - start_time:.2f} sec")
+     else:
+         print("⚠ XGBoost training skipped due to invalid labels!")
+
+     # Train RandomForest
+     start_time = time.time()
+     rf = RandomForestClassifier(**RF_PARAMS)
+     rf.fit(X_train, y_train)
+     models["RandomForest"] = rf
+     print(f"✅ RandomForest trained in {time.time() - start_time:.2f} sec")
+
+     return models
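To sanity-check the trained models on the held-out split, something like the following works with the pipeline's own outputs (a sketch, not part of the commit):

from sklearn.metrics import roc_auc_score
from data_loader import load_and_process_data
from model_trainer import train_models

X_train, X_val, y_train, y_val, X_test = load_and_process_data()
models = train_models(X_train, y_train)

for name, model in models.items():
    # A rank-based metric is more informative than accuracy on imbalanced click data
    auc = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
    print(f"{name}: validation ROC-AUC = {auc:.3f}")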
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ pandas
+ numpy
+ scikit-learn
+ imbalanced-learn
+ matplotlib
+ seaborn
+ catboost
+ xgboost
+ joblib
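All dependencies are unpinned, so a plain pip install -r requirements.txt is enough to set up the pipeline.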