File size: 9,748 Bytes
17be6b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8504f5a
17be6b7
8504f5a
17be6b7
8504f5a
 
 
 
17be6b7
 
8504f5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17be6b7
8504f5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28016d1
8504f5a
 
17be6b7
8504f5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17be6b7
8504f5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17be6b7
8504f5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17be6b7
 
 
 
 
 
 
 
 
 
8504f5a
 
17be6b7
8504f5a
 
 
 
 
 
17be6b7
 
 
 
 
 
 
 
 
8504f5a
17be6b7
8504f5a
17be6b7
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
"""
Kit Composition Data Cleaner

This script converts the Kit_Composition_and_relation.csv file into a cleaned format
with line types according to the following rules:

1. Master Kits:
   - If appears only once (standalone master): line_type = "long line"
   - If appears multiple times: line_type = "" (empty/theoretical)

2. Sub Kits:
   - All sub kits get line_type = "long line"

3. Prepacks:
   - All prepacks get line_type = "miniload"

The output includes columns: kit_name, kit_description, kit_type, line_type
"""

import os
from typing import Optional, Tuple

import pandas as pd


class KitCompositionCleaner:
    """
    Cleans and processes kit composition data with line type assignments.

    Business rules applied:
    - Master kits with no subkits/prepacks (standalone): line_type "long line".
    - Master kits that have subkits/prepacks: line_type "" (theoretical,
      no production line needed).
    - Sub kits: line_type "long line".
    - Prepacks: line_type "miniload".

    This class maintains state across processing steps, allowing for:
    - Single data load
    - Step-by-step processing
    - Intermediate result storage
    """

    # Schema of every produced dataframe (intermediate and final).
    OUTPUT_COLUMNS = ['kit_name', 'kit_description', 'kit_type', 'line_type']

    def __init__(self, input_file: str, output_file: Optional[str] = None):
        """
        Initialize the cleaner with file paths.

        Args:
            input_file: Path to input CSV file (Kit_Composition_and_relation.csv)
            output_file: Path to output CSV file (optional, can be set later)
        """
        self.input_file = input_file
        self.output_file = output_file

        # State variables for the processing pipeline; populated step by step.
        self.df = None            # raw input data (set by load_data)
        self.master_df = None     # cleaned master kit records
        self.subkit_df = None     # cleaned sub kit records
        self.prepack_df = None    # cleaned prepack records
        self.final_df = None      # concatenated, sorted result

    def _require_data(self) -> None:
        """Raise ValueError if load_data() has not been called yet."""
        if self.df is None:
            raise ValueError("Data not loaded. Call load_data() first.")

    def load_data(self) -> pd.DataFrame:
        """
        Load the Kit Composition and relation CSV file.

        Returns:
            The loaded dataframe (also stored in self.df).

        Raises:
            FileNotFoundError: If the input file does not exist.
        """
        if not os.path.exists(self.input_file):
            raise FileNotFoundError(f"File not found: {self.input_file}")

        self.df = pd.read_csv(self.input_file)
        print(f"Loaded {len(self.df)} rows from {self.input_file}")
        return self.df

    def process_master_kits(self) -> pd.DataFrame:
        """
        Process Master Kits according to business rules:
        - Standalone masters (no subkits/prepacks, only components): line_type = "long line"
        - Non-standalone masters (have subkits/prepacks): line_type = "" (empty - no production needed)

        Returns:
            Dataframe of unique master kit records (also stored in self.master_df).

        Raises:
            ValueError: If load_data() has not been called.
        """
        self._require_data()

        print("Processing Master Kits...")

        # Masters that appear on any row carrying a subkit or prepack have a
        # hierarchy and therefore need no production line of their own.
        masters_with_subkits = set(self.df[self.df['Sub kit'].notna()]['Master Kit'].unique())
        masters_with_prepacks = set(self.df[self.df['Prepack'].notna()]['Master Kit'].unique())
        masters_with_hierarchy = masters_with_subkits.union(masters_with_prepacks)

        all_masters = set(self.df['Master Kit'].unique())

        # Standalone masters are those WITHOUT subkits/prepacks (only components).
        standalone_masters = all_masters - masters_with_hierarchy

        print(f"Total unique Master Kits: {len(all_masters)}")
        print(f"Masters with subkits/prepacks: {len(masters_with_hierarchy)}")
        print(f"Standalone masters (only components): {len(standalone_masters)}")

        # NOTE: 'Master Kit  Description' contains a double space — this
        # matches the source CSV header exactly.
        unique_masters = self.df[['Master Kit', 'Master Kit  Description']].drop_duplicates()

        master_data = []
        for _, row in unique_masters.iterrows():
            master_kit = row['Master Kit']
            # Standalone -> produced on the long line; otherwise theoretical.
            line_type = "long line" if master_kit in standalone_masters else ""
            master_data.append({
                'kit_name': master_kit,
                'kit_description': row['Master Kit  Description'],
                'kit_type': 'master',
                'line_type': line_type,
            })

        self.master_df = pd.DataFrame(master_data)
        print(f"Created {len(self.master_df)} master kit records")

        return self.master_df

    def _process_child_kits(self, name_col: str, desc_col: str, kit_type: str,
                            line_type: str, display_name: str, noun: str) -> pd.DataFrame:
        """
        Shared implementation for sub kits and prepacks: collect the unique
        (name, description) pairs from rows where `name_col` is present and
        assign a fixed line type.

        Args:
            name_col: Source column holding the kit identifier.
            desc_col: Source column holding the kit description.
            kit_type: Value for the output 'kit_type' column.
            line_type: Value for the output 'line_type' column.
            display_name: Human-readable label for the progress message.
            noun: Singular noun used in the count/empty messages.

        Returns:
            Cleaned dataframe with OUTPUT_COLUMNS (empty if no rows match).

        Raises:
            ValueError: If load_data() has not been called.
        """
        self._require_data()

        print(f"Processing {display_name}...")

        rows = self.df[self.df[name_col].notna()]
        if len(rows) == 0:
            print(f"No {noun}s found")
            return pd.DataFrame(columns=self.OUTPUT_COLUMNS)

        unique_kits = rows[[name_col, desc_col]].drop_duplicates()
        records = [
            {
                'kit_name': row[name_col],
                'kit_description': row[desc_col],
                'kit_type': kit_type,
                'line_type': line_type,
            }
            for _, row in unique_kits.iterrows()
        ]

        result = pd.DataFrame(records)
        print(f"Created {len(result)} {noun} records")
        return result

    def process_sub_kits(self) -> pd.DataFrame:
        """
        Process Sub Kits according to business rules:
        - All sub kits get line_type = "long line"
        - Remove duplicates

        Returns:
            Dataframe of unique sub kit records (also stored in self.subkit_df).
        """
        self.subkit_df = self._process_child_kits(
            'Sub kit', 'Sub kit description', 'subkit', 'long line',
            'Sub Kits', 'sub kit')
        return self.subkit_df

    def process_prepacks(self) -> pd.DataFrame:
        """
        Process Prepacks according to business rules:
        - All prepacks get line_type = "miniload"
        - Remove duplicates

        Returns:
            Dataframe of unique prepack records (also stored in self.prepack_df).
        """
        self.prepack_df = self._process_child_kits(
            'Prepack', 'Prepack Description', 'prepack', 'miniload',
            'Prepacks', 'prepack')
        return self.prepack_df

    def concatenate_and_save(self, output_path: Optional[str] = None) -> pd.DataFrame:
        """
        Concatenate all processed dataframes and save to output file.

        Args:
            output_path: Path to save the output file (uses self.output_file if not provided)

        Returns:
            The final concatenated dataframe (also stored in self.final_df).

        Raises:
            ValueError: If any processing step has not run, or no output path
                is available.
        """
        if self.master_df is None or self.subkit_df is None or self.prepack_df is None:
            raise ValueError("Processing not complete. Run process_master_kits(), process_sub_kits(), and process_prepacks() first.")

        print("Concatenating results...")

        self.final_df = pd.concat([self.master_df, self.subkit_df, self.prepack_df], ignore_index=True)

        # Guarantee empty strings (not NaN) so blank line types stay blank in CSV.
        self.final_df['line_type'] = self.final_df['line_type'].fillna('')

        # Sort by kit_type for better organization.
        self.final_df = self.final_df.sort_values(['kit_type', 'kit_name']).reset_index(drop=True)

        print(f"Final dataset contains {len(self.final_df)} records:")
        print(f"  - Masters: {len(self.master_df)}")
        print(f"  - Subkits: {len(self.subkit_df)}")
        print(f"  - Prepacks: {len(self.prepack_df)}")

        save_path = output_path or self.output_file
        if save_path is None:
            raise ValueError("No output path provided. Specify output_path parameter or set self.output_file")

        # Keep empty strings as empty cells, not NaN.
        self.final_df.to_csv(save_path, index=False, na_rep='')
        print(f"Saved cleaned data to: {save_path}")

        return self.final_df


def main():
    """Main function to execute the kit composition cleaning process."""
    # Machine-specific locations of the raw and cleaned CSV files.
    base_dir = "/Users/halimjun/Coding_local/SD_roster_real"
    input_file = os.path.join(base_dir, "data/real_data_excel/converted_csv/Kit_Composition_and_relation.csv")
    output_file = os.path.join(base_dir, "data/real_data_excel/converted_csv/Kit_Composition_and_relation_cleaned_with_line_type.csv")

    try:
        pipeline = KitCompositionCleaner(input_file, output_file)

        # Run each processing stage in order, then combine and persist.
        pipeline.load_data()
        for stage in (pipeline.process_master_kits,
                      pipeline.process_sub_kits,
                      pipeline.process_prepacks):
            stage()
        result = pipeline.concatenate_and_save()

        # Summary statistics for a quick sanity check of the output.
        print("Line type distribution:")
        print(result['line_type'].value_counts(dropna=False))
        print("\nKit type distribution:")
        print(result['kit_type'].value_counts())

        print("\nSample of final data:")
        print(result.head(10))

    except Exception as e:
        print(f"❌ Error processing kit composition data: {e}")
        raise


# Entry point when executed as a script (no side effects on import).
if __name__ == "__main__":
    main()