RobertoBarrosoLuque committed on
Commit
385bc37
·
1 Parent(s): 076aa73

Add stage 2 embeddings

Browse files
src/app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  import time
3
- from typing import List, Dict, Tuple
4
  from pathlib import Path
5
  import os
6
  from config import (
@@ -9,7 +9,15 @@ from config import (
9
  EXAMPLE_QUERIES_BY_CATEGORY,
10
  )
11
  from src.search.bm25_lexical_search import search_bm25
 
12
  from src.data_prep.data_prep import load_clean_amazon_product_data
 
 
 
 
 
 
 
13
 
14
  _FILE_PATH = Path(__file__).parents[1]
15
 
@@ -59,6 +67,26 @@ def format_results(results: List[Dict], stage_name: str, metrics: Dict) -> str:
59
  """
60
  html_parts = [f"## 🔍 {stage_name}\n\n"]
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  for idx, result in enumerate(results, 1):
63
  category = f"{result.get('main_category', 'N/A')} > {result.get('secondary_category', 'N/A')}"
64
  html_parts.append(
@@ -66,42 +94,30 @@ def format_results(results: List[Dict], stage_name: str, metrics: Dict) -> str:
66
  <div class="result-card">
67
  <strong>{idx}. {result['product_name']}</strong><br/>
68
  <span style="color: #64748B; font-size: 0.9em;">{result['description'][:150]}...</span><br/>
69
- <span style="color: #94A3B8; font-size: 0.85em;">Category: {category}</span><br/>
70
- <span style="color: #6720FF; font-weight: 600;">Score: {result['score']:.3f}</span>
71
  </div>
72
  """
73
  )
74
-
75
- html_parts.append("\n---\n\n### Performance Metrics\n\n")
76
- html_parts.append(
77
- f"""
78
- | Metric | Score |
79
- |--------|-------|
80
- | **Semantic Match** | {metrics['semantic_match']:.3f} |
81
- | **Diversity** | {metrics['diversity']:.3f} |
82
- | **Latency** | {metrics['latency_ms']}ms |
83
- """
84
- )
85
 
86
  return "".join(html_parts)
87
 
 
 
 
 
 
 
88
 
89
  def search_stage_1(query: str) -> Tuple[str, Dict]:
90
  """Stage 1: Baseline BM25 keyword search."""
91
- start_time = time.time()
92
-
93
- results = search_bm25(query, top_k=5)
94
- latency = int((time.time() - start_time) * 1000)
95
-
96
- unique_categories = len(set(r["main_category"] for r in results)) if results else 0
97
- diversity = min(1.0, unique_categories / 5.0)
98
 
99
  avg_score = sum(r["score"] for r in results) / len(results) if results else 0
100
  semantic_match = min(1.0, avg_score / 10.0)
101
 
102
  metrics = {
103
  "semantic_match": semantic_match,
104
- "diversity": diversity,
105
  "latency_ms": latency,
106
  }
107
  print(f"Searched BM25 for {query} in {latency}ms")
@@ -110,30 +126,20 @@ def search_stage_1(query: str) -> Tuple[str, Dict]:
110
 
111
 
112
  def search_stage_2(query: str) -> Tuple[str, Dict]:
113
- """Stage 2: BM25 + Vector Embeddings."""
114
- start_time = time.time()
115
-
116
- # Placeholder: Simulated embedding search with correct format
117
- results = [
118
- {
119
- "product_name": product["title"],
120
- "description": product["description"],
121
- "main_category": product["category"],
122
- "secondary_category": "Placeholder",
123
- "score": 0.72 + (idx * 0.04),
124
- }
125
- for idx, product in enumerate(SAMPLE_PRODUCTS[:4])
126
- ]
127
 
128
- latency = int((time.time() - start_time) * 1000)
 
129
 
130
  metrics = {
131
- "semantic_match": 0.72,
132
- "diversity": 0.70,
133
- "latency_ms": max(100, latency),
134
  }
 
135
 
136
- return format_results(results, "Stage 2: + Vector Embeddings", metrics), metrics
 
137
 
138
 
139
  def search_stage_3(query: str) -> Tuple[str, Dict]:
@@ -156,7 +162,6 @@ def search_stage_3(query: str) -> Tuple[str, Dict]:
156
 
157
  metrics = {
158
  "semantic_match": 0.81,
159
- "diversity": 0.75,
160
  "latency_ms": max(150, latency),
161
  }
162
 
@@ -183,7 +188,6 @@ def search_stage_4(query: str) -> Tuple[str, Dict]:
183
 
184
  metrics = {
185
  "semantic_match": 0.88,
186
- "diversity": 0.80,
187
  "latency_ms": max(200, latency),
188
  }
189
 
@@ -209,19 +213,19 @@ def search_all_stages(query: str) -> Tuple[str, str, str, str, str]:
209
  def generate_comparison_table(all_metrics: List[Dict]) -> str:
210
  """Generate comparison table for all stages."""
211
  stage_names = [
212
- "Stage 1: BM25",
213
- "Stage 2: + Embeddings",
214
- "Stage 3: + Query Expansion",
215
- "Stage 4: + Reranking",
216
  ]
217
 
218
  # Build markdown table
219
  html = "## Stage-by-Stage Comparison\n\n"
220
- html += "| Stage | Semantic Match | Diversity | Latency (ms) |\n"
221
- html += "|-------|----------------|-----------|---------------|\n"
222
 
223
  for name, metrics in zip(stage_names, all_metrics):
224
- html += f"| **{name}** | {metrics['semantic_match']:.3f} | {metrics['diversity']:.3f} | {metrics['latency_ms']} |\n"
225
 
226
  # Calculate improvements
227
  semantic_improvement = (
@@ -233,22 +237,13 @@ def generate_comparison_table(all_metrics: List[Dict]) -> str:
233
  if all_metrics[0]["semantic_match"] > 0
234
  else 0
235
  )
236
- diversity_improvement = (
237
- (
238
- (all_metrics[3]["diversity"] - all_metrics[0]["diversity"])
239
- / all_metrics[0]["diversity"]
240
- * 100
241
- )
242
- if all_metrics[0]["diversity"] > 0
243
- else 0
244
- )
245
 
246
  html += "\n---\n\n"
247
  html += "## Key Insights\n\n"
248
- html += f"- **Semantic Match** improves by **{semantic_improvement:.0f}%** from Stage 1 to Stage 4\n"
249
- html += f"- **Diversity** increases by **{diversity_improvement:.0f}%** showing more varied results\n"
250
  html += f"- **Latency** stays under **{max(m['latency_ms'] for m in all_metrics)}ms** maintaining fast performance\n"
251
- html += "- Each stage adds incremental value to search quality\n"
 
252
 
253
  return html
254
 
@@ -340,106 +335,6 @@ def generate_sample_data_table() -> str:
340
  return html
341
 
342
 
343
- # Code snippets for each stage
344
- CODE_STAGE_1 = """
345
- ```python
346
- import bm25s
347
- import pandas as pd
348
-
349
- # Step 1: Create BM25 index (one-time setup)
350
- df = pd.read_parquet("data/amazon_products.parquet")
351
- corpus = df["FullText"].tolist()
352
- corpus_tokens = bm25s.tokenize(corpus, stopwords="en")
353
-
354
- retriever = bm25s.BM25()
355
- retriever.index(corpus_tokens)
356
- retriever.save("data/bm25_index")
357
-
358
- # Step 2: Load index and search
359
- bm25_index = bm25s.BM25.load("data/bm25_index", load_corpus=False)
360
- query_tokens = bm25s.tokenize(query, stopwords="en")
361
- results, scores = bm25_index.retrieve(query_tokens, k=5)
362
-
363
- # Extract top results
364
- top_products = [df.iloc[idx] for idx in results[0]]
365
- ```
366
- """
367
-
368
- CODE_STAGE_2 = """
369
- ```python
370
- from openai import OpenAI
371
- import faiss
372
- import numpy as np
373
-
374
- client = OpenAI(
375
- base_url="https://api.fireworks.ai/inference/v1"
376
- )
377
-
378
- # Generate embeddings
379
- response = client.embeddings.create(
380
- model="accounts/fireworks/models/qwen3-embedding-8b",
381
- input=[query] + documents
382
- )
383
-
384
- # Extract embeddings
385
- query_emb = np.array(response.data[0].embedding)
386
- doc_embs = np.array([d.embedding for d in response.data[1:]])
387
-
388
- # FAISS search
389
- index = faiss.IndexFlatIP(doc_embs.shape[1])
390
- index.add(doc_embs)
391
- scores, indices = index.search(query_emb.reshape(1, -1), k=5)
392
- ```
393
- """
394
-
395
- CODE_STAGE_3 = """
396
- ```python
397
- # Query expansion with LLM
398
- response = client.chat.completions.create(
399
- model="accounts/fireworks/models/llama-v3p1-8b-instruct",
400
- messages=[{
401
- "role": "user",
402
- "content": f"Extract 2-3 key search concepts from: {query}"
403
- }]
404
- )
405
-
406
- expanded_query = response.choices[0].message.content
407
-
408
- # Search with expanded query
409
- response = client.embeddings.create(
410
- model="accounts/fireworks/models/qwen3-embedding-8b",
411
- input=[expanded_query] + documents
412
- )
413
-
414
- # Continue with embedding search...
415
- ```
416
- """
417
-
418
- CODE_STAGE_4 = """
419
- ```python
420
- # First get top 20 candidates from Stage 3
421
- top_20_results = get_stage_3_results(query, k=20)
422
-
423
- # Rerank with Fireworks reranker
424
- rerank_response = client.post(
425
- "https://api.fireworks.ai/inference/v1/rerank",
426
- json={
427
- "model": "fireworks/qwen3-reranker-8b",
428
- "query": query,
429
- "documents": [r["text"] for r in top_20_results],
430
- "top_n": 5
431
- }
432
- )
433
-
434
- # Get final ranked results
435
- final_results = [
436
- top_20_results[r["index"]]
437
- for r in rerank_response.json()["results"]
438
- ]
439
- ```
440
- """
441
-
442
-
443
  # Build Gradio Interface
444
  with gr.Blocks(
445
  css=CUSTOM_CSS, theme=GRADIO_THEME, title="Search Alchemy - Fireworks AI"
 
1
  import gradio as gr
2
  import time
3
+ from typing import List, Dict, Tuple, Callable
4
  from pathlib import Path
5
  import os
6
  from config import (
 
9
  EXAMPLE_QUERIES_BY_CATEGORY,
10
  )
11
  from src.search.bm25_lexical_search import search_bm25
12
+ from src.search.vector_search import search_vector
13
  from src.data_prep.data_prep import load_clean_amazon_product_data
14
+ from src.constants.code_snippets import (
15
+ CODE_STAGE_1,
16
+ CODE_STAGE_2,
17
+ CODE_STAGE_3,
18
+ CODE_STAGE_4,
19
+ )
20
+
21
 
22
  _FILE_PATH = Path(__file__).parents[1]
23
 
 
67
  """
68
  html_parts = [f"## 🔍 {stage_name}\n\n"]
69
 
70
+ # Performance metrics at the top with prominent styling
71
+ html_parts.append(
72
+ f"""
73
+ <div style="display: flex; gap: 20px; margin-bottom: 28px;">
74
+ <div class="metric-box" style="flex: 1;">
75
+ <div style="color: #6720FF; font-size: 0.9em; font-weight: 600; margin-bottom: 6px; letter-spacing: 0.5px;">SEMANTIC MATCH</div>
76
+ <div style="font-size: 2.2em; font-weight: 700; color: #1E293B;">{metrics['semantic_match']:.3f}</div>
77
+ <div style="color: #64748B; font-size: 0.8em; margin-top: 4px;">Higher is better</div>
78
+ </div>
79
+ <div class="metric-box" style="flex: 1;">
80
+ <div style="color: #6720FF; font-size: 0.9em; font-weight: 600; margin-bottom: 6px; letter-spacing: 0.5px;">LATENCY</div>
81
+ <div style="font-size: 2.2em; font-weight: 700; color: #1E293B;">{metrics['latency_ms']}<span style="font-size: 0.45em; color: #64748B; font-weight: 400;">ms</span></div>
82
+ <div style="color: #64748B; font-size: 0.8em; margin-top: 4px;">Response time</div>
83
+ </div>
84
+ </div>
85
+ """
86
+ )
87
+
88
+ # Results section
89
+ html_parts.append('<div style="margin-top: 20px;">\n\n')
90
  for idx, result in enumerate(results, 1):
91
  category = f"{result.get('main_category', 'N/A')} > {result.get('secondary_category', 'N/A')}"
92
  html_parts.append(
 
94
  <div class="result-card">
95
  <strong>{idx}. {result['product_name']}</strong><br/>
96
  <span style="color: #64748B; font-size: 0.9em;">{result['description'][:150]}...</span><br/>
97
+ <span style="color: #94A3B8; font-size: 0.85em;">Category: {category}</span>
 
98
  </div>
99
  """
100
  )
101
+ html_parts.append('</div>')
 
 
 
 
 
 
 
 
 
 
102
 
103
  return "".join(html_parts)
104
 
105
+ def run_search_function_and_time(query: str, func: Callable):
106
+ start = time.time()
107
+ results = func(query)
108
+ latency = int((time.time() - start) * 1000)
109
+ return results, latency
110
+
111
 
112
  def search_stage_1(query: str) -> Tuple[str, Dict]:
113
  """Stage 1: Baseline BM25 keyword search."""
114
+ results, latency = run_search_function_and_time(query, search_bm25)
 
 
 
 
 
 
115
 
116
  avg_score = sum(r["score"] for r in results) / len(results) if results else 0
117
  semantic_match = min(1.0, avg_score / 10.0)
118
 
119
  metrics = {
120
  "semantic_match": semantic_match,
 
121
  "latency_ms": latency,
122
  }
123
  print(f"Searched BM25 for {query} in {latency}ms")
 
126
 
127
 
128
  def search_stage_2(query: str) -> Tuple[str, Dict]:
129
+ """Stage 2: Vector Embeddings using FAISS."""
130
+ results, latency = run_search_function_and_time(query, search_vector)
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
+ avg_score = sum(r["score"] for r in results) / len(results) if results else 0
133
+ semantic_match = avg_score
134
 
135
  metrics = {
136
+ "semantic_match": semantic_match,
137
+ "latency_ms": latency,
 
138
  }
139
+ print(f"Searched vector embeddings for '{query}' in {latency}ms")
140
 
141
+ # Return top 5 for display
142
+ return format_results(results[:5], "Stage 2: Vector Embeddings", metrics), metrics
143
 
144
 
145
  def search_stage_3(query: str) -> Tuple[str, Dict]:
 
162
 
163
  metrics = {
164
  "semantic_match": 0.81,
 
165
  "latency_ms": max(150, latency),
166
  }
167
 
 
188
 
189
  metrics = {
190
  "semantic_match": 0.88,
 
191
  "latency_ms": max(200, latency),
192
  }
193
 
 
213
  def generate_comparison_table(all_metrics: List[Dict]) -> str:
214
  """Generate comparison table for all stages."""
215
  stage_names = [
216
+ "Baseline: BM25",
217
+ "Stage 1: + Embeddings",
218
+ "Stage 2: + Query Expansion",
219
+ "Stage 3: + Reranking",
220
  ]
221
 
222
  # Build markdown table
223
  html = "## Stage-by-Stage Comparison\n\n"
224
+ html += "| Stage | Semantic Match | Latency (ms) |\n"
225
+ html += "|-------|----------------|--------------|\n"
226
 
227
  for name, metrics in zip(stage_names, all_metrics):
228
+ html += f"| **{name}** | {metrics['semantic_match']:.3f} | {metrics['latency_ms']} |\n"
229
 
230
  # Calculate improvements
231
  semantic_improvement = (
 
237
  if all_metrics[0]["semantic_match"] > 0
238
  else 0
239
  )
 
 
 
 
 
 
 
 
 
240
 
241
  html += "\n---\n\n"
242
  html += "## Key Insights\n\n"
243
+ html += f"- **Semantic Match** improves by **{semantic_improvement:.0f}%** from baseline to final stage\n"
 
244
  html += f"- **Latency** stays under **{max(m['latency_ms'] for m in all_metrics)}ms** maintaining fast performance\n"
245
+ html += "- Each stage progressively enhances search relevance while keeping response times low\n"
246
+ html += "- Vector embeddings provide the biggest jump in semantic understanding\n"
247
 
248
  return html
249
 
 
335
  return html
336
 
337
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  # Build Gradio Interface
339
  with gr.Blocks(
340
  css=CUSTOM_CSS, theme=GRADIO_THEME, title="Search Alchemy - Fireworks AI"
src/constants/__init__.py ADDED
File without changes
src/constants/code_snippets.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Code snippets for displaying implementation examples in the Gradio UI.
3
+ Each snippet shows the actual implementation approach for each search stage.
4
+ """
5
+
6
+ CODE_STAGE_1 = """
7
+ ```python
8
+ import bm25s
9
+ import pandas as pd
10
+
11
+ # Step 1: Create BM25 index (one-time setup)
12
+ df = pd.read_parquet("data/amazon_products.parquet")
13
+ corpus = df["FullText"].tolist()
14
+ corpus_tokens = bm25s.tokenize(corpus, stopwords="en")
15
+
16
+ retriever = bm25s.BM25()
17
+ retriever.index(corpus_tokens)
18
+ retriever.save("data/bm25_index")
19
+
20
+ # Step 2: Load index and search
21
+ bm25_index = bm25s.BM25.load("data/bm25_index", load_corpus=False)
22
+ query_tokens = bm25s.tokenize(query, stopwords="en")
23
+ results, scores = bm25_index.retrieve(query_tokens, k=5)
24
+
25
+ # Extract top results
26
+ top_products = [df.iloc[idx] for idx in results[0]]
27
+ ```
28
+ """
29
+
30
+ CODE_STAGE_2 = """
31
+ ```python
32
+ from openai import OpenAI
33
+ import faiss
34
+ import numpy as np
35
+
36
+ # Initialize Fireworks AI client
37
+ client = OpenAI(
38
+ api_key="your_fireworks_api_key",
39
+ base_url="https://api.fireworks.ai/inference/v1"
40
+ )
41
+
42
+ # Generate query embedding
43
+ response = client.embeddings.create(
44
+ model="accounts/fireworks/models/qwen3-embedding-8b",
45
+ input=query
46
+ )
47
+ query_embedding = np.array(response.data[0].embedding, dtype=np.float32)
48
+ query_vector = query_embedding.reshape(1, -1)
49
+
50
+ # Normalize for cosine similarity using L2 distance
51
+ faiss.normalize_L2(query_vector)
52
+
53
+ # Load pre-built FAISS index
54
+ index = faiss.read_index("data/faiss_index.bin")
55
+
56
+ # Search for top-k similar documents
57
+ distances, indices = index.search(query_vector, k=10)
58
+
59
+ # Convert L2 distances to cosine similarity scores
60
+ # After normalization: L2_distance = 2 * (1 - cosine_similarity)
61
+ # So: cosine_similarity = 1 - (L2_distance / 2)
62
+ similarity_scores = 1 - (distances[0] / 2)
63
+
64
+ # Get top results
65
+ top_results = [
66
+ {
67
+ "product": df.iloc[idx],
68
+ "score": float(score)
69
+ }
70
+ for idx, score in zip(indices[0], similarity_scores)
71
+ ]
72
+ ```
73
+ """
74
+
75
+ CODE_STAGE_3 = """
76
+ ```python
77
+ # Query expansion with LLM
78
+ response = client.chat.completions.create(
79
+ model="accounts/fireworks/models/llama-v3p1-8b-instruct",
80
+ messages=[{
81
+ "role": "user",
82
+ "content": f"Extract 2-3 key search concepts from: {query}"
83
+ }]
84
+ )
85
+
86
+ expanded_query = response.choices[0].message.content
87
+
88
+ # Search with expanded query
89
+ response = client.embeddings.create(
90
+ model="accounts/fireworks/models/qwen3-embedding-8b",
91
+ input=[expanded_query] + documents
92
+ )
93
+
94
+ # Continue with embedding search...
95
+ ```
96
+ """
97
+
98
+ CODE_STAGE_4 = """
99
+ ```python
100
+ # First get top 20 candidates from Stage 3
101
+ top_20_results = get_stage_3_results(query, k=20)
102
+
103
+ # Rerank with Fireworks reranker
104
+ rerank_response = client.post(
105
+ "https://api.fireworks.ai/inference/v1/rerank",
106
+ json={
107
+ "model": "fireworks/qwen3-reranker-8b",
108
+ "query": query,
109
+ "documents": [r["text"] for r in top_20_results],
110
+ "top_n": 5
111
+ }
112
+ )
113
+
114
+ # Get final ranked results
115
+ final_results = [
116
+ top_20_results[r["index"]]
117
+ for r in rerank_response.json()["results"]
118
+ ]
119
+ ```
120
+ """
src/{data_prep → constants}/constants.py RENAMED
File without changes
src/search/bm25_lexical_search.py CHANGED
@@ -1,7 +1,7 @@
1
  import bm25s
2
  from typing import List, Dict
3
  from pathlib import Path
4
- from src.data_prep.constants import BM25_INDEX, PRODUCTS_DF
5
 
6
  _FILE_PATH = Path(__file__).parents[2]
7
 
 
1
  import bm25s
2
  from typing import List, Dict
3
  from pathlib import Path
4
+ from constants.constants import BM25_INDEX, PRODUCTS_DF
5
 
6
  _FILE_PATH = Path(__file__).parents[2]
7
 
src/search/vector_search.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import faiss
3
+ from typing import List, Dict
4
+ from pathlib import Path
5
+ from src.fireworks.inference import get_embedding
6
+ from constants.constants import FAISS_INDEX, PRODUCTS_DF
7
+
8
+ _FILE_PATH = Path(__file__).parents[2]
9
+
10
+
11
+ def search_vector(query: str, top_k: int = 10) -> List[Dict[str, any]]:
12
+ """
13
+ Search products using vector embeddings and FAISS for semantic search.
14
+
15
+ This is Stage 2: semantic search using vector embeddings to understand
16
+ query meaning and intent beyond exact keyword matching.
17
+
18
+ Args:
19
+ query: Search query string
20
+ top_k: Number of top results to return (default: 10)
21
+
22
+ Returns:
23
+ List of dictionaries containing product information and scores
24
+ """
25
+ query_embedding = get_embedding(query)
26
+ query_vector = np.array([query_embedding], dtype=np.float32)
27
+
28
+ # Normalize query vector for cosine similarity
29
+ faiss.normalize_L2(query_vector)
30
+
31
+ # Unpack FAISS index tuple (index, embeddings)
32
+ faiss_index = FAISS_INDEX[0]
33
+
34
+ # Search FAISS index
35
+ distances, indices = faiss_index.search(query_vector, top_k)
36
+
37
+ # Convert L2 distances to similarity scores (0-1 range)
38
+ # After normalization, L2 distance = 2 * (1 - cosine_similarity)
39
+ # So cosine_similarity = 1 - (L2_distance / 2)
40
+ similarity_scores = 1 - (distances[0] / 2)
41
+
42
+ return [
43
+ {
44
+ "product_name": PRODUCTS_DF.iloc[idx]["Product Name"],
45
+ "description": PRODUCTS_DF.iloc[idx]["Description"],
46
+ "main_category": PRODUCTS_DF.iloc[idx]["MainCategory"],
47
+ "secondary_category": PRODUCTS_DF.iloc[idx]["SecondaryCategory"],
48
+ "score": float(score),
49
+ }
50
+ for idx, score in zip(indices[0], similarity_scores)
51
+ ]