RobertoBarrosoLuque committed · Commit 2f3a721
Parent(s): 03263ac
Add data prep and vector DB
Files changed:
- requirements.txt +1 -0
- src/modules/__init__.py +0 -0
- src/modules/data_prep.py +147 -0
requirements.txt CHANGED

@@ -1,6 +1,7 @@
 gradio==5.42.0
 openai
 python-dotenv==1.0.0
+datasets
 numpy
 pandas
 scikit-learn
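
The new datasets entry backs the raw-data loader added below. A minimal sketch of what it pulls in, assuming the public ckandemir/amazon-products dataset keeps its current split layout and column names:

    from datasets import load_dataset

    # Assumption: the dataset exposes a "train" split with
    # 'Product Name', 'Category', and 'Description' columns.
    ds = load_dataset("ckandemir/amazon-products")
    print(ds["train"].column_names)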
src/modules/__init__.py ADDED

(empty file)
src/modules/data_prep.py ADDED

@@ -0,0 +1,147 @@
import pandas as pd
from datasets import load_dataset
from pathlib import Path
import numpy as np
import faiss
from openai import OpenAI
from dotenv import load_dotenv
import os
from src.config import EMBEDDING_MODEL

load_dotenv()

_FILE_PATH = Path(__file__).parents[2]


def load_amazon_raw_product_data() -> pd.DataFrame:
    ds = load_dataset("ckandemir/amazon-products")
    df = ds["train"].to_pandas()
    return df


def load_clean_amazon_product_data() -> pd.DataFrame:
    return pd.read_parquet(_FILE_PATH / "data" / "amazon_products.parquet")


def prepare_amazon_product_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Data preparation for Amazon products.

    Args:
        df: DataFrame with 'Product Name', 'Category', 'Description' columns

    Returns:
        Cleaned DataFrame with split category columns and a FullText column
    """
    # FullText combines Product Name, Category, and Description
    df.loc[:, "FullText"] = (
        df["Product Name"] + " | " + df["Category"] + " | " + df["Description"]
    )
    df.loc[:, "FullText"] = df.FullText.str.lower().str.strip().str.replace("\n", " ")

    # Split the pipe-delimited Category into up to three levels
    df[["MainCategory", "SecondaryCategory", "TertiaryCategory"]] = df[
        "Category"
    ].str.split(r" \| ", n=2, expand=True, regex=True)
    df = df.dropna(subset=["MainCategory", "SecondaryCategory"])

    # Drop duplicates
    df = df.drop_duplicates(subset=["FullText"])

    return df.loc[
        :,
        [
            "Product Name",
            "Description",
            "MainCategory",
            "SecondaryCategory",
            "TertiaryCategory",
            "FullText",
        ],
    ]


def save_as_parquet(df: pd.DataFrame):
    """
    Save DataFrame to a parquet file.
    """
    df.to_parquet(_FILE_PATH / "data" / "amazon_products.parquet", index=False)
    print(f"Saved to {_FILE_PATH / 'data' / 'amazon_products.parquet'}")


def create_faiss_index(df: pd.DataFrame, batch_size: int = 100):
    """
    Create FAISS index from product data using Fireworks AI embeddings.

    Args:
        df: DataFrame with 'FullText' column to embed
        batch_size: Number of texts to embed in each API call

    Returns:
        Tuple of (faiss_index, embeddings_array)
    """
    assert (
        os.getenv("FIREWORKS_API_KEY") is not None
    ), "FIREWORKS_API_KEY not found in environment variables"
    client = OpenAI(
        api_key=os.getenv("FIREWORKS_API_KEY"),
        base_url="https://api.fireworks.ai/inference/v1",
    )

    print(f"Generating embeddings for {len(df)} products...")

    all_embeddings = []
    texts = df["FullText"].tolist()

    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        print(
            f"Processing batch {i // batch_size + 1}/{(len(texts) + batch_size - 1) // batch_size}"
        )

        response = client.embeddings.create(model=EMBEDDING_MODEL, input=batch)

        batch_embeddings = [item.embedding for item in response.data]
        all_embeddings.extend(batch_embeddings)

    embeddings_array = np.array(all_embeddings, dtype=np.float32)

    dimension = embeddings_array.shape[1]
    index = faiss.IndexFlatL2(
        dimension
    )  # L2 distance is rank-equivalent to cosine similarity after normalization

    # Normalize embeddings for cosine similarity
    faiss.normalize_L2(embeddings_array)

    index.add(embeddings_array)

    print(f"Created FAISS index with {index.ntotal} vectors of dimension {dimension}")

    faiss.write_index(index, str(_FILE_PATH / "data" / "faiss_index.bin"))
    np.save(_FILE_PATH / "data" / "embeddings.npy", embeddings_array)

    print(f"Saved FAISS index to {_FILE_PATH / 'data' / 'faiss_index.bin'}")
    print(f"Saved embeddings to {_FILE_PATH / 'data' / 'embeddings.npy'}")

    return index, embeddings_array


def load_faiss_index():
    """
    Load pre-computed FAISS index and embeddings from disk.

    Returns:
        Tuple of (faiss_index, embeddings_array)
    """
    index = faiss.read_index(str(_FILE_PATH / "data" / "faiss_index.bin"))
    embeddings = np.load(_FILE_PATH / "data" / "embeddings.npy")
    print(f"Loaded FAISS index with {index.ntotal} vectors")
    return index, embeddings


if __name__ == "__main__":
    _df = load_amazon_raw_product_data()
    _df = prepare_amazon_product_data(_df)
    save_as_parquet(_df)

    create_faiss_index(_df)
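
The module builds and persists the index but stops short of querying it. A minimal retrieval sketch under this commit's conventions: search_products is a hypothetical helper (not part of this commit), and it assumes the parquet rows are stored in the same order the vectors were indexed, so FAISS positions map back to DataFrame rows:

    import os

    import faiss
    import numpy as np
    import pandas as pd
    from openai import OpenAI

    from src.config import EMBEDDING_MODEL
    from src.modules.data_prep import load_clean_amazon_product_data, load_faiss_index


    def search_products(query: str, k: int = 5) -> pd.DataFrame:
        """Hypothetical helper: embed a query and return the k nearest products."""
        client = OpenAI(
            api_key=os.getenv("FIREWORKS_API_KEY"),
            base_url="https://api.fireworks.ai/inference/v1",
        )
        response = client.embeddings.create(model=EMBEDDING_MODEL, input=[query])
        query_vec = np.array([response.data[0].embedding], dtype=np.float32)
        faiss.normalize_L2(query_vec)  # match the normalization applied at index time

        index, _ = load_faiss_index()
        products = load_clean_amazon_product_data()
        _, ids = index.search(query_vec, k)  # positions align with parquet row order
        return products.iloc[ids[0]]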