RobertoBarrosoLuque committed on
Commit
099c385
·
1 Parent(s): 8b0ccc1

Filter to only top 5 categories

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -2
  2. src/data_prep/data_prep.py +13 -0
requirements.txt CHANGED
@@ -1,7 +1,7 @@
1
  gradio==5.42.0
2
  openai
3
- python-dotenv==1.0.0
4
- datasets
5
  numpy
6
  pandas
7
  scikit-learn
 
1
  gradio==5.42.0
2
  openai
3
+ python-dotenv
4
+ datasets>=2.19.0
5
  numpy
6
  pandas
7
  scikit-learn
src/data_prep/data_prep.py CHANGED
@@ -44,6 +44,19 @@ def prepare_amazon_product_data(df: pd.DataFrame) -> pd.DataFrame:
44
  # Drop dupes
45
  df = df.drop_duplicates(subset=["FullText"])
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  return df.loc[
48
  :,
49
  [
 
44
  # Drop dupes
45
  df = df.drop_duplicates(subset=["FullText"])
46
 
47
+ # Downsample rows where MainCategory == "Toys & Games" to 650, since they make up over 70% of the raw data
48
+ df_non_toys = df[df["MainCategory"] != "Toys & Games"]
49
+ df_toys = df[df["MainCategory"] == "Toys & Games"]
50
+ df_toys = df_toys.sample(n=650, random_state=42)
51
+ df = pd.concat([df_non_toys, df_toys])
52
+
53
+ # Filter to only top 5 MainCategories
54
+ df = df[df["MainCategory"].isin(df["MainCategory"].value_counts().index[:5])]
55
+
56
+ print(
57
+ f"Prepared dataset with {len(df)} products with \n Count of MainCategories: {df['MainCategory'].value_counts()}"
58
+ )
59
+
60
  return df.loc[
61
  :,
62
  [