Spaces:
Running
Running
RobertoBarrosoLuque
committed on
Commit
·
099c385
1
Parent(s):
8b0ccc1
Filter to only top 5 categories
Browse files
- requirements.txt +2 -2
- src/data_prep/data_prep.py +13 -0
requirements.txt
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
gradio==5.42.0
|
| 2 |
openai
|
| 3 |
-
python-dotenv
|
| 4 |
-
datasets
|
| 5 |
numpy
|
| 6 |
pandas
|
| 7 |
scikit-learn
|
|
|
|
| 1 |
gradio==5.42.0
|
| 2 |
openai
|
| 3 |
+
python-dotenv
|
| 4 |
+
datasets>=2.19.0
|
| 5 |
numpy
|
| 6 |
pandas
|
| 7 |
scikit-learn
|
src/data_prep/data_prep.py
CHANGED
|
@@ -44,6 +44,19 @@ def prepare_amazon_product_data(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 44 |
# Drop dupes
|
| 45 |
df = df.drop_duplicates(subset=["FullText"])
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
return df.loc[
|
| 48 |
:,
|
| 49 |
[
|
|
|
|
| 44 |
# Drop dupes
|
| 45 |
df = df.drop_duplicates(subset=["FullText"])
|
| 46 |
|
| 47 |
+
# Downsample where MainCategory == Toys and Games to 650 since in raw data its over 70% of data
|
| 48 |
+
df_non_toys = df[df["MainCategory"] != "Toys & Games"]
|
| 49 |
+
df_toys = df[df["MainCategory"] == "Toys & Games"]
|
| 50 |
+
df_toys = df_toys.sample(n=650, random_state=42)
|
| 51 |
+
df = pd.concat([df_non_toys, df_toys])
|
| 52 |
+
|
| 53 |
+
# Filter to only top 5 MainCategories
|
| 54 |
+
df = df[df["MainCategory"].isin(df["MainCategory"].value_counts().index[:5])]
|
| 55 |
+
|
| 56 |
+
print(
|
| 57 |
+
f"Prepared dataset with {len(df)} products with \n Count of MainCategories: {df['MainCategory'].value_counts()}"
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
return df.loc[
|
| 61 |
:,
|
| 62 |
[
|