|
|
import os |
|
|
import sys |
|
|
from src.exception import CustomException |
|
|
from src.logger import logging |
|
|
import pandas as pd |
|
|
|
|
|
from sklearn.model_selection import train_test_split |
|
|
from dataclasses import dataclass |
|
|
|
|
|
@dataclass
class DataIngestionConfig:
    """Filesystem locations of the CSV files produced by data ingestion.

    Every field has a default under the relative ``artifacts`` directory, so
    ``DataIngestionConfig()`` yields the same three paths as before. Because
    the defaults are declared on the fields (instead of being assigned in a
    hand-written ``__init__``), the dataclass-generated constructor is used
    and any path can be overridden by keyword, e.g.
    ``DataIngestionConfig(raw_data_path="tmp/data.csv")``.
    """

    # Where the training split is written.
    train_data_path: str = os.path.join('artifacts', 'train.csv')
    # Where the test split is written.
    test_data_path: str = os.path.join('artifacts', 'test.csv')
    # Where the unsplit copy of the source dataset is written.
    raw_data_path: str = os.path.join('artifacts', 'data.csv')
|
|
|
|
|
class DataIngestion:
    """Read the source dataset and write raw, train and test CSV files."""

    # Path the dataset was originally hard-coded to be read from. It is kept
    # as the default for backward compatibility, but it is machine-specific:
    # pass ``source_path`` to ``initiate_data_ingestion`` when running elsewhere.
    DEFAULT_SOURCE_PATH = '/Users/aryan/Documents/aryan/ml-projects/project-1/notebook/data/StudentsPerformance.csv'

    def __init__(self):
        """Create the ingestion step with a default ``DataIngestionConfig``."""
        self.ingestion_config = DataIngestionConfig()

    def initiate_data_ingestion(self, source_path=None):
        """Load the dataset, persist it, and write an 80/20 train/test split.

        Args:
            source_path: CSV file to read. When ``None`` (the default),
                ``DEFAULT_SOURCE_PATH`` is used, matching the previous
                behaviour.

        Returns:
            A ``(train_data_path, test_data_path)`` tuple taken from
            ``self.ingestion_config``.

        Raises:
            CustomException: Constructed from the underlying exception and the
                ``sys`` module whenever reading, splitting or writing fails;
                the original exception is chained as the cause.
        """
        if source_path is None:
            source_path = self.DEFAULT_SOURCE_PATH

        logging.info("Data Ingestion started")
        try:
            df = pd.read_csv(source_path)
            logging.info("Dataset read as dataframe")

            config = self.ingestion_config

            # Create the parent directory of every output file, not only the
            # raw one, so a config with differing directories still works.
            # A bare filename has an empty dirname, which os.makedirs rejects,
            # so that case is skipped.
            for out_path in (
                config.raw_data_path,
                config.train_data_path,
                config.test_data_path,
            ):
                out_dir = os.path.dirname(out_path)
                if out_dir:
                    os.makedirs(out_dir, exist_ok=True)

            df.to_csv(config.raw_data_path, index=False)
            logging.info("Raw data saved")

            # Fixed random_state keeps the split reproducible across runs.
            train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

            train_set.to_csv(config.train_data_path, index=False)
            test_set.to_csv(config.test_data_path, index=False)
            logging.info("Train and Test data saved")

            return (
                config.train_data_path,
                config.test_data_path,
            )
        except Exception as e:
            logging.error("Error in Data Ingestion {0}".format(e))
            # Chain explicitly so the original traceback stays attached.
            raise CustomException(e, sys) from e
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the ingestion step and run it once.
    DataIngestion().initiate_data_ingestion()
|
|
|