# File size: 1,937 Bytes
# 8504f5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas as pd


def process_Kit_Composition_and_relation(
    output_csv_path: str = 'data/real_data_excel/converted_csv/Kit_Composition_and_relation_cleaned_with_line_type_and_id.csv',
    input_csv_path: str = 'data/real_data_excel/converted_csv/Kit_Composition_and_relation.csv',
) -> pd.DataFrame:
    """
    Stack the kit composition table into one row per kit reference and tag each row with a line.

    The input CSV is read and its three kit levels (master kit, sub kit,
    prepack) are stacked on top of each other into the columns ``kit_name``,
    ``kit_description`` and ``kit_type`` (``'master'``, ``'subkit'`` or
    ``'prepack'``). Two columns are then derived:

    * ``line_type`` -- ``'mini load'`` for every prepack row, ``'long line'``
      for every subkit row, and ``'long line'`` for a master row only when its
      ``kit_name`` occurs in exactly one stacked row. All other master rows
      are left without a line type (NaN).
    * ``line_id`` -- ``7`` for ``'mini load'``, ``6`` for ``'long line'``,
      NaN where ``line_type`` is missing. Stored as float because of the
      missing values.

    Rows are not de-duplicated, and the index of the returned frame is not
    reset: the input row labels repeat once per kit level.

    Args:
        output_csv_path: Where the resulting table is written (without the
            index column).
        input_csv_path: CSV to read. It must contain the columns
            ``Master Kit``, ``Master Kit  Description`` (two spaces),
            ``Sub kit``, ``Sub kit description``, ``Prepack`` and
            ``Prepack Description``.

    Returns:
        The stacked and tagged DataFrame, identical to what was written to
        ``output_csv_path``.

    Raises:
        KeyError: If any of the expected input columns is missing.
    """
    df = pd.read_csv(input_csv_path)

    # (name column, description column, kit_type label) for each kit level.
    # Header spellings are copied verbatim from the input file, including the
    # double space in "Master Kit  Description" and the lower-case
    # "description" on the sub kit column.
    kit_levels = (
        ("Master Kit", "Master Kit  Description", "master"),
        ("Sub kit", "Sub kit description", "subkit"),
        ("Prepack", "Prepack Description", "prepack"),
    )

    # rename()/assign() return new frames, so no slice of `df` is mutated.
    level_frames = [
        df[[name_col, description_col]]
        .rename(columns={name_col: "kit_name", description_col: "kit_description"})
        .assign(kit_type=kit_type)
        for name_col, description_col, kit_type in kit_levels
    ]
    cleaned_df = pd.concat(level_frames)

    # Rows are deliberately left un-deduplicated: the per-name row count below
    # decides which kits occur exactly once, and dropping duplicates first
    # would change that count. value_counts() ignores missing kit names.
    name_counts = cleaned_df['kit_name'].value_counts()
    standalone_masterkit_list = name_counts[name_counts == 1].index

    # Prepack and subkit rows get a fixed line type; master rows map to NaN
    # here and are filled in below only when they are standalone.
    line_type = cleaned_df['kit_type'].map({'prepack': 'mini load', 'subkit': 'long line'})
    is_standalone_master = (
        (cleaned_df['kit_type'] == 'master')
        & cleaned_df['kit_name'].isin(standalone_masterkit_list)
    )
    # Pass a plain array so no index alignment is attempted on the
    # (intentionally) duplicated index labels.
    line_type = line_type.mask(is_standalone_master.to_numpy(), 'long line')

    cleaned_df['line_type'] = line_type
    # Float ids keep the column dtype stable whether or not NaN is present.
    cleaned_df['line_id'] = line_type.map({'mini load': 7.0, 'long line': 6.0})

    cleaned_df.to_csv(output_csv_path, index=False)
    return cleaned_df