"""Data-ingestion step: read the source CSV, persist it, and split it.

Running this module as a script performs one ingestion with the default
configuration.
"""

import os
import sys
from dataclasses import dataclass
from typing import Optional, Tuple

import pandas as pd
from sklearn.model_selection import train_test_split

from src.exception import CustomException
from src.logger import logging


@dataclass
class DataIngestionConfig:
    """Filesystem locations used by the data-ingestion step.

    Every field has a default, so ``DataIngestionConfig()`` keeps working
    without arguments; any path can be overridden by keyword.
    """

    # Where the train split is written.
    train_data_path: str = os.path.join('artifacts', 'train.csv')
    # Where the test split is written.
    test_data_path: str = os.path.join('artifacts', 'test.csv')
    # Where an unmodified copy of the source dataset is written.
    raw_data_path: str = os.path.join('artifacts', 'data.csv')
    # CSV file the dataset is read from.
    # NOTE(review): this default is a machine-specific absolute path kept only
    # for backward compatibility; callers on other machines should override it.
    source_data_path: str = (
        '/Users/aryan/Documents/aryan/ml-projects/project-1/'
        'notebook/data/StudentsPerformance.csv'
    )


class DataIngestion:
    """Read the source dataset and write raw, train and test CSV files."""

    def __init__(self, config: Optional[DataIngestionConfig] = None):
        """Store the ingestion configuration.

        Args:
            config: Paths to use. When omitted, a default
                ``DataIngestionConfig`` is created.
        """
        self.ingestion_config = (
            config if config is not None else DataIngestionConfig()
        )

    def initiate_data_ingestion(self) -> Tuple[str, str]:
        """Run the ingestion: load the CSV, save a raw copy, split and save.

        The dataset is split 80/20 with ``random_state=42`` so that repeated
        runs over the same data produce the same split.

        Returns:
            A ``(train_data_path, test_data_path)`` tuple of the files written.

        Raises:
            CustomException: Wraps any exception raised while reading,
                splitting or writing the data.
        """
        logging.info("Data Ingestion started")
        config = self.ingestion_config
        try:
            df = pd.read_csv(config.source_data_path)
            logging.info("Dataset read as dataframe")

            # The three outputs are independently configurable, so make sure
            # each one's parent directory exists (not only the raw file's).
            output_paths = (
                config.raw_data_path,
                config.train_data_path,
                config.test_data_path,
            )
            for output_path in output_paths:
                parent_dir = os.path.dirname(output_path)
                # A bare file name has no parent; os.makedirs('') would raise.
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)

            df.to_csv(config.raw_data_path, index=False)
            logging.info("Raw data saved")

            train_set, test_set = train_test_split(
                df, test_size=0.2, random_state=42
            )
            train_set.to_csv(config.train_data_path, index=False)
            test_set.to_csv(config.test_data_path, index=False)
            logging.info("Train and Test data saved")

            return (config.train_data_path, config.test_data_path)
        except Exception as e:
            logging.error("Error in Data Ingestion {0}".format(e))
            # Chain explicitly so the original traceback stays attached.
            raise CustomException(e, sys) from e


if __name__ == "__main__":
    obj = DataIngestion()
    obj.initiate_data_ingestion()