RobertoBarrosoLuque committed on
Commit
2f3a721
·
1 Parent(s): 03263ac

Add data prep and vector DB

Browse files
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  gradio==5.42.0
2
  openai
3
  python-dotenv==1.0.0
 
4
  numpy
5
  pandas
6
  scikit-learn
 
1
  gradio==5.42.0
2
  openai
3
  python-dotenv==1.0.0
4
+ datasets
5
  numpy
6
  pandas
7
  scikit-learn
src/modules/__init__.py ADDED
File without changes
src/modules/data_prep.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from datasets import load_dataset
3
+ from pathlib import Path
4
+ import numpy as np
5
+ import faiss
6
+ from openai import OpenAI
7
+ from dotenv import load_dotenv
8
+ import os
9
+ from src.config import EMBEDDING_MODEL
10
+
11
# Load variables from a local .env file into the process environment so that
# later os.getenv(...) lookups (e.g. FIREWORKS_API_KEY) can see them.
load_dotenv()

# Third ancestor directory of this file (parents[0] is the containing folder).
# All data artifacts are read from / written to its "data" subdirectory.
# resolve() makes the path absolute first, so parents[2] exists even when
# __file__ is a short relative path.
_FILE_PATH = Path(__file__).resolve().parents[2]
14
+
15
+
16
def load_amazon_raw_product_data() -> pd.DataFrame:
    """
    Load the raw "ckandemir/amazon-products" dataset.

    Returns:
        The dataset's "train" split converted to a pandas DataFrame.
    """
    train_split = load_dataset("ckandemir/amazon-products")["train"]
    return train_split.to_pandas()
20
+
21
+
22
def load_clean_amazon_product_data() -> pd.DataFrame:
    """
    Read the prepared product table from disk.

    Returns:
        Contents of ``data/amazon_products.parquet`` under ``_FILE_PATH``.
    """
    parquet_path = _FILE_PATH / "data" / "amazon_products.parquet"
    return pd.read_parquet(parquet_path)
24
+
25
+
26
def prepare_amazon_product_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Data preparation for Amazon products.

    Builds a normalized ``FullText`` field, splits ``Category`` into up to
    three levels, and drops unusable or duplicated rows. The input frame is
    not modified; all work happens on a copy.

    Args:
        df: DataFrame with 'Product Name', 'Category', 'Description' columns.
            'Category' levels are expected to be separated by " | ".

    Returns:
        DataFrame with the columns 'Product Name', 'Description',
        'MainCategory', 'SecondaryCategory', 'TertiaryCategory' and
        'FullText'. Rows lacking a main or secondary category, rows whose
        'FullText' could not be built (a source field is missing), and rows
        with a duplicated 'FullText' are removed. 'TertiaryCategory' may be
        missing for individual rows.
    """
    category_levels = ["MainCategory", "SecondaryCategory", "TertiaryCategory"]

    # Work on a copy so the caller's DataFrame does not silently gain columns.
    df = df.copy()

    # Full text is the combination of Product Name + Category + Description,
    # lower-cased, trimmed at both ends, with newlines flattened to spaces.
    full_text = df["Product Name"] + " | " + df["Category"] + " | " + df["Description"]
    df["FullText"] = full_text.str.lower().str.strip().str.replace("\n", " ")

    # n=2 caps the split at three parts; anything beyond the second separator
    # stays inside the tertiary level.
    levels = df["Category"].str.split(r" \| ", n=2, expand=True, regex=True)
    # expand=True only yields as many columns as the deepest category present,
    # so pad to exactly three columns before assigning the three names.
    levels = levels.reindex(columns=range(len(category_levels)))
    levels.columns = category_levels
    df[category_levels] = levels

    # FullText is NaN whenever one of its source fields is missing; such rows
    # cannot be embedded later, so drop them together with rows that have no
    # main/secondary category.
    df = df.dropna(subset=["MainCategory", "SecondaryCategory", "FullText"])

    # Drop dupes
    df = df.drop_duplicates(subset=["FullText"])

    return df.loc[
        :,
        [
            "Product Name",
            "Description",
            "MainCategory",
            "SecondaryCategory",
            "TertiaryCategory",
            "FullText",
        ],
    ]
61
+
62
+
63
def save_as_parquet(df: pd.DataFrame) -> None:
    """
    Save DataFrame to parquet file.

    The file is written to ``data/amazon_products.parquet`` under
    ``_FILE_PATH``; the ``data`` directory is created if it does not exist
    yet. The DataFrame index is not stored.

    Args:
        df: DataFrame to persist.
    """
    output_path = _FILE_PATH / "data" / "amazon_products.parquet"
    # to_parquet does not create missing parent directories, so make sure the
    # target folder exists (no-op when it already does).
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(output_path, index=False)
    print(f"Saved to {output_path}")
69
+
70
+
71
def create_faiss_index(df: pd.DataFrame, batch_size: int = 100):
    """
    Create FAISS index from product data using Fireworks AI embeddings.

    Embeds every 'FullText' value in batches through the Fireworks
    OpenAI-compatible endpoint, L2-normalizes the vectors, adds them to a
    flat L2 index, and writes both the index (``data/faiss_index.bin``) and
    the normalized embeddings (``data/embeddings.npy``) under ``_FILE_PATH``.

    Args:
        df: DataFrame with 'FullText' column to embed
        batch_size: Number of texts to embed in each API call

    Returns:
        Tuple of (faiss_index, embeddings_array). The array is float32 and
        already L2-normalized (the same values that are saved to disk).

    Raises:
        RuntimeError: If FIREWORKS_API_KEY is missing or empty.
        ValueError: If batch_size is smaller than 1 or df has no rows.
    """
    api_key = os.getenv("FIREWORKS_API_KEY")
    # Explicit raise instead of assert: assert statements are removed when
    # Python runs with -O, and an empty key is as unusable as a missing one.
    if not api_key:
        raise RuntimeError("FIREWORKS_API_KEY not found in environment variables")
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    texts = df["FullText"].tolist()
    # Without at least one vector the embedding dimension is unknown and no
    # index can be built; fail before making any API call.
    if not texts:
        raise ValueError("Cannot build a FAISS index from an empty DataFrame")

    client = OpenAI(
        api_key=api_key,
        base_url="https://api.fireworks.ai/inference/v1",
    )

    print(f"Generating embeddings for {len(df)} products...")

    # Ceiling division, computed once rather than on every iteration.
    total_batches = (len(texts) + batch_size - 1) // batch_size
    all_embeddings = []
    for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
        batch = texts[start : start + batch_size]
        print(f"Processing batch {batch_number}/{total_batches}")

        response = client.embeddings.create(model=EMBEDDING_MODEL, input=batch)
        all_embeddings.extend(item.embedding for item in response.data)

    embeddings_array = np.array(all_embeddings, dtype=np.float32)

    # Normalize embeddings (in place) for cosine similarity: on unit vectors
    # L2 distance ranks neighbours the same way cosine similarity does.
    faiss.normalize_L2(embeddings_array)

    dimension = embeddings_array.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings_array)

    print(f"Created FAISS index with {index.ntotal} vectors of dimension {dimension}")

    data_dir = _FILE_PATH / "data"
    # Make sure the output folder exists before writing either artifact.
    data_dir.mkdir(parents=True, exist_ok=True)
    index_path = data_dir / "faiss_index.bin"
    embeddings_path = data_dir / "embeddings.npy"

    faiss.write_index(index, str(index_path))
    np.save(embeddings_path, embeddings_array)

    print(f"Saved FAISS index to {index_path}")
    print(f"Saved embeddings to {embeddings_path}")

    return index, embeddings_array
127
+
128
+
129
def load_faiss_index():
    """
    Read the saved FAISS index and embedding matrix back from disk.

    Both files are expected in the ``data`` directory under ``_FILE_PATH``
    (``faiss_index.bin`` and ``embeddings.npy``).

    Returns:
        Tuple of (faiss_index, embeddings_array)
    """
    data_dir = _FILE_PATH / "data"
    index_file = data_dir / "faiss_index.bin"
    embeddings_file = data_dir / "embeddings.npy"

    # faiss.read_index is given a plain string path, hence the str() conversion.
    index = faiss.read_index(str(index_file))
    embeddings = np.load(embeddings_file)

    print(f"Loaded FAISS index with {index.ntotal} vectors")
    return index, embeddings
140
+
141
+
142
if __name__ == "__main__":
    # End-to-end build when run as a script: fetch the raw dataset, clean it,
    # persist the cleaned table, then embed it and build the FAISS index.
    _products = prepare_amazon_product_data(load_amazon_raw_product_data())
    save_as_parquet(_products)

    create_faiss_index(_products)