""" Kit Composition Data Cleaner This script converts the Kit_Composition_and_relation.csv file into a cleaned format with line types according to the following rules: 1. Master Kits: - If appears only once (standalone master): line_type = "long line" - If appears multiple times: line_type = "" (empty/theoretical) 2. Sub Kits: - All sub kits get line_type = "long line" 3. Prepacks: - All prepacks get line_type = "miniload" The output includes columns: kit_name, kit_description, kit_type, line_type """ import pandas as pd import os from typing import Tuple class KitCompositionCleaner: """ Cleans and processes kit composition data with line type assignments. This class maintains state across processing steps, allowing for: - Single data load - Step-by-step processing - Intermediate result storage """ def __init__(self, input_file: str, output_file: str = None): """ Initialize the cleaner with file paths. Args: input_file: Path to input CSV file (Kit_Composition_and_relation.csv) output_file: Path to output CSV file (optional, can be set later) """ self.input_file = input_file self.output_file = output_file # State variables for processing pipeline self.df = None self.master_df = None self.subkit_df = None self.prepack_df = None self.final_df = None def load_data(self) -> pd.DataFrame: """Load the Kit Composition and relation CSV file.""" if not os.path.exists(self.input_file): raise FileNotFoundError(f"File not found: {self.input_file}") self.df = pd.read_csv(self.input_file) print(f"Loaded {len(self.df)} rows from {self.input_file}") return self.df def process_master_kits(self) -> pd.DataFrame: """ Process Master Kits according to business rules: - Standalone masters (no subkits/prepacks, only components): line_type = "long line" - Non-standalone masters (have subkits/prepacks): line_type = "" (empty - no production needed) """ if self.df is None: raise ValueError("Data not loaded. Call load_data() first.") print("Processing Master Kits...") # Identify masters with hierarchy (subkits or prepacks) masters_with_subkits = set(self.df[self.df['Sub kit'].notna()]['Master Kit'].unique()) masters_with_prepacks = set(self.df[self.df['Prepack'].notna()]['Master Kit'].unique()) masters_with_hierarchy = masters_with_subkits.union(masters_with_prepacks) # All masters all_masters = set(self.df['Master Kit'].unique()) # Standalone masters are those WITHOUT subkits/prepacks (only have components) standalone_masters = all_masters - masters_with_hierarchy print(f"Total unique Master Kits: {len(all_masters)}") print(f"Masters with subkits/prepacks: {len(masters_with_hierarchy)}") print(f"Standalone masters (only components): {len(standalone_masters)}") # Create master kit records master_data = [] # Get unique master kits with descriptions unique_masters = self.df[['Master Kit', 'Master Kit Description']].drop_duplicates() for _, row in unique_masters.iterrows(): master_kit = row['Master Kit'] master_desc = row['Master Kit Description'] # Determine line_type based on standalone status if master_kit in standalone_masters: line_type = "long line" else: line_type = "" # Empty for non-standalone (theoretical) master_data.append({ 'kit_name': master_kit, 'kit_description': master_desc, 'kit_type': 'master', 'line_type': line_type }) self.master_df = pd.DataFrame(master_data) return self.master_df def process_sub_kits(self) -> pd.DataFrame: """ Process Sub Kits according to business rules: - All sub kits get line_type = "long line" - Remove duplicates """ if self.df is None: raise ValueError("Data not loaded. Call load_data() first.") print("Processing Sub Kits...") # Filter rows that have sub kits subkit_df = self.df[self.df['Sub kit'].notna()].copy() if len(subkit_df) == 0: print("No sub kits found") self.subkit_df = pd.DataFrame(columns=['kit_name', 'kit_description', 'kit_type', 'line_type']) return self.subkit_df # Get unique sub kits with descriptions unique_subkits = subkit_df[['Sub kit', 'Sub kit description']].drop_duplicates() subkit_data = [] for _, row in unique_subkits.iterrows(): subkit_data.append({ 'kit_name': row['Sub kit'], 'kit_description': row['Sub kit description'], 'kit_type': 'subkit', 'line_type': 'long line' }) self.subkit_df = pd.DataFrame(subkit_data) print(f"Created {len(self.subkit_df)} sub kit records") return self.subkit_df def process_prepacks(self) -> pd.DataFrame: """ Process Prepacks according to business rules: - All prepacks get line_type = "miniload" - Remove duplicates """ if self.df is None: raise ValueError("Data not loaded. Call load_data() first.") print("Processing Prepacks...") # Filter rows that have prepacks prepack_df = self.df[self.df['Prepack'].notna()].copy() if len(prepack_df) == 0: print("No prepacks found") self.prepack_df = pd.DataFrame(columns=['kit_name', 'kit_description', 'kit_type', 'line_type']) return self.prepack_df # Get unique prepacks with descriptions unique_prepacks = prepack_df[['Prepack', 'Prepack Description']].drop_duplicates() prepack_data = [] for _, row in unique_prepacks.iterrows(): prepack_data.append({ 'kit_name': row['Prepack'], 'kit_description': row['Prepack Description'], 'kit_type': 'prepack', 'line_type': 'miniload' }) self.prepack_df = pd.DataFrame(prepack_data) print(f"Created {len(self.prepack_df)} prepack records") return self.prepack_df def concatenate_and_save(self, output_path: str = None) -> pd.DataFrame: """ Concatenate all processed dataframes and save to output file. Args: output_path: Path to save the output file (uses self.output_file if not provided) """ if self.master_df is None or self.subkit_df is None or self.prepack_df is None: raise ValueError("Processing not complete. Run process_master_kits(), process_sub_kits(), and process_prepacks() first.") print("Concatenating results...") # Concatenate all dataframes self.final_df = pd.concat([self.master_df, self.subkit_df, self.prepack_df], ignore_index=True) # Ensure empty strings instead of NaN for line_type self.final_df['line_type'] = self.final_df['line_type'].fillna('') # Sort by kit_type for better organization self.final_df = self.final_df.sort_values(['kit_type', 'kit_name']).reset_index(drop=True) print(f"Final dataset contains {len(self.final_df)} records:") print(f" - Masters: {len(self.master_df)}") print(f" - Subkits: {len(self.subkit_df)}") print(f" - Prepacks: {len(self.prepack_df)}") # Determine output path save_path = output_path or self.output_file if save_path is None: raise ValueError("No output path provided. Specify output_path parameter or set self.output_file") # Save to file (keep empty strings as empty, not NaN) self.final_df.to_csv(save_path, index=False, na_rep='') print(f"Saved cleaned data to: {save_path}") return self.final_df def main(): """Main function to execute the kit composition cleaning process.""" # Define file paths base_dir = "/Users/halimjun/Coding_local/SD_roster_real" input_file = os.path.join(base_dir, "data/real_data_excel/converted_csv/Kit_Composition_and_relation.csv") output_file = os.path.join(base_dir, "data/real_data_excel/converted_csv/Kit_Composition_and_relation_cleaned_with_line_type.csv") try: # Initialize cleaner with class cleaner = KitCompositionCleaner(input_file, output_file) # Execute pipeline step by step cleaner.load_data() cleaner.process_master_kits() cleaner.process_sub_kits() cleaner.process_prepacks() final_df = cleaner.concatenate_and_save() # Display summary statistics print("Line type distribution:") print(final_df['line_type'].value_counts(dropna=False)) print("\nKit type distribution:") print(final_df['kit_type'].value_counts()) print("\nSample of final data:") print(final_df.head(10)) except Exception as e: print(f"❌ Error processing kit composition data: {e}") raise if __name__ == "__main__": main()