RobertoBarrosoLuque committed on
Commit
8601e54
·
1 Parent(s): fe002f3

Last cleanup items

Browse files
Files changed (2) hide show
  1. src/app.py +59 -70
  2. src/config.py +5 -5
src/app.py CHANGED
@@ -18,41 +18,6 @@ from src.data_prep.data_prep import load_clean_amazon_product_data
18
  _FILE_PATH = Path(__file__).parents[1]
19
 
20
 
21
- # Placeholder data for demo
22
- SAMPLE_PRODUCTS = [
23
- {
24
- "id": 1,
25
- "title": "Wireless Bluetooth Headphones",
26
- "description": "High-quality wireless headphones with 30-hour battery life and noise cancellation.",
27
- "category": "Electronics",
28
- },
29
- {
30
- "id": 2,
31
- "title": "Science Kit for Kids",
32
- "description": "Educational science experiments kit perfect for children ages 5-10.",
33
- "category": "Toys",
34
- },
35
- {
36
- "id": 3,
37
- "title": "Running Shoes - Men's",
38
- "description": "Lightweight running shoes with cushioned soles and breathable mesh.",
39
- "category": "Sports",
40
- },
41
- {
42
- "id": 4,
43
- "title": "Portable Bluetooth Speaker",
44
- "description": "Waterproof speaker with 12-hour battery life and deep bass.",
45
- "category": "Electronics",
46
- },
47
- {
48
- "id": 5,
49
- "title": "Ergonomic Office Chair",
50
- "description": "Adjustable office chair with lumbar support and breathable fabric.",
51
- "category": "Furniture",
52
- },
53
- ]
54
-
55
-
56
  def format_results(results: List[Dict], stage_name: str, metrics: Dict) -> str:
57
  """Format search results as HTML.
58
 
@@ -221,6 +186,13 @@ def search_all_stages(query: str) -> Tuple[str, str, str, str, str]:
221
  return results_1, results_2, results_3, results_4, comparison
222
 
223
 
 
 
 
 
 
 
 
224
  def generate_comparison_table(all_metrics: List[Dict]) -> str:
225
  """Generate comparison table for all stages."""
226
  stage_names = [
@@ -238,25 +210,9 @@ def generate_comparison_table(all_metrics: List[Dict]) -> str:
238
  for name, metrics in zip(stage_names, all_metrics):
239
  html += f"| **{name}** | {metrics['top1_score']:.3f} | {metrics['top5_avg']:.3f} | {metrics['latency_ms']} |\n"
240
 
241
- # Calculate improvements based on top-5 average
242
- top5_improvement = (
243
- (
244
- (all_metrics[3]["top5_avg"] - all_metrics[0]["top5_avg"])
245
- / all_metrics[0]["top5_avg"]
246
- * 100
247
- )
248
- if all_metrics[0]["top5_avg"] > 0
249
- else 0
250
- )
251
-
252
- top1_improvement = (
253
- (
254
- (all_metrics[3]["top1_score"] - all_metrics[0]["top1_score"])
255
- / all_metrics[0]["top1_score"]
256
- * 100
257
- )
258
- if all_metrics[0]["top1_score"] > 0
259
- else 0
260
  )
261
 
262
  html += "\n---\n\n"
@@ -393,10 +349,10 @@ with gr.Blocks(
393
  with gr.Column(scale=3):
394
  gr.Markdown(
395
  """
396
- <h1 class="header-title" style="font-size: 2.5em; text-align: left;">Search Alchemy</h1>
397
  <p style="color: #64748B; font-size: 1.1em; margin-top: 0; text-align: left;">Building Production Search Pipelines with Fireworks AI</p>
398
  <p style="color: #475569; font-size: 1.0em; line-height: 1.6; margin: 0;">
399
- Four progressive stages demonstrating production-grade semantic search:
400
  <strong>BM25</strong> → <strong>Vector Embeddings</strong> → <strong>Query Expansion</strong> → <strong>Reranking</strong>
401
  </p>
402
  """
@@ -417,15 +373,24 @@ with gr.Blocks(
417
  show_share_button=False,
418
  )
419
 
 
420
  with gr.Row():
421
- with gr.Column(scale=4):
422
- query_input = gr.Textbox(
423
- label="Search Query",
424
- placeholder="Enter your search query...",
425
- value=EXAMPLE_QUERIES_BY_CATEGORY["Toys & Games"]["clear"],
426
- scale=3,
427
- elem_classes="search-box",
428
- )
 
 
 
 
 
 
 
 
429
 
430
  with gr.Row():
431
  gr.Markdown(
@@ -442,29 +407,53 @@ with gr.Blocks(
442
  with gr.Column(scale=1):
443
  ambiguity_dropdown = gr.Dropdown(
444
  choices=["Clear", "Somewhat Ambiguous", "Ambiguous"],
445
- value="Clear",
446
  label="Query Specificity",
447
  container=True,
448
  )
449
  with gr.Column(scale=1):
450
  search_btn = gr.Button("Search", variant="primary", scale=1, size="lg")
451
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
452
  with gr.Tabs() as tabs:
453
 
454
  with gr.Tab("Stage 1: BM25 Baseline"):
455
- stage1_output = gr.Markdown(label="Results")
 
 
456
 
457
  with gr.Tab("Stage 2: + Vector Embeddings"):
458
- stage2_output = gr.Markdown(label="Results")
 
 
459
 
460
  with gr.Tab("Stage 3: + Query Expansion"):
461
- stage3_output = gr.Markdown(label="Results")
 
 
462
 
463
  with gr.Tab("Stage 4: + LLM Reranking"):
464
- stage4_output = gr.Markdown(label="Results")
 
 
465
 
466
  with gr.Tab("Compare All Stages"):
467
- comparison_output = gr.Markdown(label="Comparison")
 
 
468
 
469
  with gr.Accordion("Dataset Information", open=False):
470
  gr.Markdown("Explore the dataset used for this search demo")
 
18
  _FILE_PATH = Path(__file__).parents[1]
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  def format_results(results: List[Dict], stage_name: str, metrics: Dict) -> str:
22
  """Format search results as HTML.
23
 
 
186
  return results_1, results_2, results_3, results_4, comparison
187
 
188
 
189
+ def calculate_improvement(metric1, metric2, metric_name):
190
+ """Calculate improvement as a percentage."""
191
+ if metric2[metric_name] == 0:
192
+ return (metric1[metric_name] - metric2[metric_name]) * 100
193
+ return (metric1[metric_name] - metric2[metric_name]) / metric2[metric_name] * 100
194
+
195
+
196
  def generate_comparison_table(all_metrics: List[Dict]) -> str:
197
  """Generate comparison table for all stages."""
198
  stage_names = [
 
210
  for name, metrics in zip(stage_names, all_metrics):
211
  html += f"| **{name}** | {metrics['top1_score']:.3f} | {metrics['top5_avg']:.3f} | {metrics['latency_ms']} |\n"
212
 
213
+ top5_improvement = calculate_improvement(all_metrics[3], all_metrics[0], "top5_avg")
214
+ top1_improvement = calculate_improvement(
215
+ all_metrics[3], all_metrics[0], "top1_score"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  )
217
 
218
  html += "\n---\n\n"
 
349
  with gr.Column(scale=3):
350
  gr.Markdown(
351
  """
352
+ <h1 class="header-title" style="font-size: 2.5em; text-align: left;">🧙 Search Alchemy 🧙</h1>
353
  <p style="color: #64748B; font-size: 1.1em; margin-top: 0; text-align: left;">Building Production Search Pipelines with Fireworks AI</p>
354
  <p style="color: #475569; font-size: 1.0em; line-height: 1.6; margin: 0;">
355
+ Four progressive stages demonstrating how to build production-grade semantic search:
356
  <strong>BM25</strong> → <strong>Vector Embeddings</strong> → <strong>Query Expansion</strong> → <strong>Reranking</strong>
357
  </p>
358
  """
 
373
  show_share_button=False,
374
  )
375
 
376
+ # Introduction Section
377
  with gr.Row():
378
+ gr.Markdown(
379
+ """
380
+ **The Context:**
381
+ - **[Dataset](https://huggingface.co/datasets/ckandemir/amazon-products):** 10,000+ Amazon products across Toys, Home & Kitchen, Clothing, Sports, and Baby Products
382
+ - **The Problem:** Users search with vague terms like "keep kids busy" or "make bedroom nicer" instead of specific product names
383
+ - **The Solution:** Four progressive stages showing how semantic search handles ambiguity better than keyword matching
384
+
385
+ **How to Use:**
386
+ 1. The default query "keep kids busy" is intentionally ambiguous - try it first to see the dramatic improvement across stages
387
+ 2. Select different categories and specificity levels to explore more examples
388
+ 3. Click **Search** and compare **Top-1 Score** and **Top-5 Avg** across all stages in the "Compare All Stages" tab
389
+ 4. Notice how BM25 (keyword matching) struggles with ambiguous queries while vector embeddings + reranking excel
390
+
391
+ Note: scores are normalized to a 0-1, higher is better.
392
+ """
393
+ )
394
 
395
  with gr.Row():
396
  gr.Markdown(
 
407
  with gr.Column(scale=1):
408
  ambiguity_dropdown = gr.Dropdown(
409
  choices=["Clear", "Somewhat Ambiguous", "Ambiguous"],
410
+ value="Ambiguous",
411
  label="Query Specificity",
412
  container=True,
413
  )
414
  with gr.Column(scale=1):
415
  search_btn = gr.Button("Search", variant="primary", scale=1, size="lg")
416
 
417
+ with gr.Row():
418
+ gr.Markdown(
419
+ "**Or write your own query:** Write your own query to find product in the database"
420
+ )
421
+ with gr.Row():
422
+ with gr.Column(scale=4):
423
+ query_input = gr.Textbox(
424
+ label="Search Query",
425
+ placeholder="...",
426
+ value=EXAMPLE_QUERIES_BY_CATEGORY["Baby Products"]["ambiguous"],
427
+ scale=3,
428
+ elem_classes="search-box",
429
+ )
430
+
431
  with gr.Tabs() as tabs:
432
 
433
  with gr.Tab("Stage 1: BM25 Baseline"):
434
+ stage1_output = gr.Markdown(
435
+ value="Click **Search** to see results", label="Results"
436
+ )
437
 
438
  with gr.Tab("Stage 2: + Vector Embeddings"):
439
+ stage2_output = gr.Markdown(
440
+ value="Click **Search** to see results", label="Results"
441
+ )
442
 
443
  with gr.Tab("Stage 3: + Query Expansion"):
444
+ stage3_output = gr.Markdown(
445
+ value="Click **Search** to see results", label="Results"
446
+ )
447
 
448
  with gr.Tab("Stage 4: + LLM Reranking"):
449
+ stage4_output = gr.Markdown(
450
+ value="Click **Search** to see results", label="Results"
451
+ )
452
 
453
  with gr.Tab("Compare All Stages"):
454
+ comparison_output = gr.Markdown(
455
+ value="Click **Search** to see results", label="Comparison"
456
+ )
457
 
458
  with gr.Accordion("Dataset Information", open=False):
459
  gr.Markdown("Explore the dataset used for this search demo")
src/config.py CHANGED
@@ -221,6 +221,11 @@ button.primary:hover {
221
 
222
 
223
  EXAMPLE_QUERIES_BY_CATEGORY = {
 
 
 
 
 
224
  "Toys & Games": {
225
  "clear": "magnetic construction building blocks educational toy",
226
  "somewhat_ambiguous": "creative play for young children",
@@ -241,9 +246,4 @@ EXAMPLE_QUERIES_BY_CATEGORY = {
241
  "somewhat_ambiguous": "active outdoor toy",
242
  "ambiguous": "yard activity",
243
  },
244
- "Baby Products": {
245
- "clear": "nursery wall decor quotes motivational stickers",
246
- "somewhat_ambiguous": "baby room essentials",
247
- "ambiguous": "expecting soon",
248
- },
249
  }
 
221
 
222
 
223
  EXAMPLE_QUERIES_BY_CATEGORY = {
224
+ "Baby Products": {
225
+ "clear": "nursery wall decor quotes motivational stickers",
226
+ "somewhat_ambiguous": "baby room essentials",
227
+ "ambiguous": "expecting soon",
228
+ },
229
  "Toys & Games": {
230
  "clear": "magnetic construction building blocks educational toy",
231
  "somewhat_ambiguous": "creative play for young children",
 
246
  "somewhat_ambiguous": "active outdoor toy",
247
  "ambiguous": "yard activity",
248
  },
 
 
 
 
 
249
  }