Spaces:

fireworks-ai
/

search-alchemy

Running

App Files Files Community

RobertoBarrosoLuque committed 22 days ago

Commit

8601e54

1 Parent(s): fe002f3

Last cleanup items

Browse files

Files changed (2) hide show

src/app.py +59 -70
src/config.py +5 -5

src/app.py CHANGED Viewed

@@ -18,41 +18,6 @@ from src.data_prep.data_prep import load_clean_amazon_product_data
 _FILE_PATH = Path(__file__).parents[1]
-# Placeholder data for demo
-SAMPLE_PRODUCTS = [
-    {
-        "id": 1,
-        "title": "Wireless Bluetooth Headphones",
-        "description": "High-quality wireless headphones with 30-hour battery life and noise cancellation.",
-        "category": "Electronics",
-    },
-    {
-        "id": 2,
-        "title": "Science Kit for Kids",
-        "description": "Educational science experiments kit perfect for children ages 5-10.",
-        "category": "Toys",
-    },
-    {
-        "id": 3,
-        "title": "Running Shoes - Men's",
-        "description": "Lightweight running shoes with cushioned soles and breathable mesh.",
-        "category": "Sports",
-    },
-    {
-        "id": 4,
-        "title": "Portable Bluetooth Speaker",
-        "description": "Waterproof speaker with 12-hour battery life and deep bass.",
-        "category": "Electronics",
-    },
-    {
-        "id": 5,
-        "title": "Ergonomic Office Chair",
-        "description": "Adjustable office chair with lumbar support and breathable fabric.",
-        "category": "Furniture",
-    },
-]
 def format_results(results: List[Dict], stage_name: str, metrics: Dict) -> str:
     """Format search results as HTML.
@@ -221,6 +186,13 @@ def search_all_stages(query: str) -> Tuple[str, str, str, str, str]:
     return results_1, results_2, results_3, results_4, comparison
 def generate_comparison_table(all_metrics: List[Dict]) -> str:
     """Generate comparison table for all stages."""
     stage_names = [
@@ -238,25 +210,9 @@ def generate_comparison_table(all_metrics: List[Dict]) -> str:
     for name, metrics in zip(stage_names, all_metrics):
         html += f"| **{name}** | {metrics['top1_score']:.3f} | {metrics['top5_avg']:.3f} | {metrics['latency_ms']} |\n"
-    # Calculate improvements based on top-5 average
-    top5_improvement = (
-        (
-            (all_metrics[3]["top5_avg"] - all_metrics[0]["top5_avg"])
-            / all_metrics[0]["top5_avg"]
-            * 100
-        )
-        if all_metrics[0]["top5_avg"] > 0
-        else 0
-    )
-    top1_improvement = (
-        (
-            (all_metrics[3]["top1_score"] - all_metrics[0]["top1_score"])
-            / all_metrics[0]["top1_score"]
-            * 100
-        )
-        if all_metrics[0]["top1_score"] > 0
-        else 0
     )
     html += "\n---\n\n"
@@ -393,10 +349,10 @@ with gr.Blocks(
         with gr.Column(scale=3):
             gr.Markdown(
                 """
-            <h1 class="header-title" style="font-size: 2.5em; text-align: left;">Search Alchemy</h1>
             <p style="color: #64748B; font-size: 1.1em; margin-top: 0; text-align: left;">Building Production Search Pipelines with Fireworks AI</p>
             <p style="color: #475569; font-size: 1.0em; line-height: 1.6; margin: 0;">
-            Four progressive stages demonstrating production-grade semantic search:
             <strong>BM25</strong> → <strong>Vector Embeddings</strong> → <strong>Query Expansion</strong> → <strong>Reranking</strong>
             </p>
             """
@@ -417,15 +373,24 @@ with gr.Blocks(
                     show_share_button=False,
                 )
     with gr.Row():
-        with gr.Column(scale=4):
-            query_input = gr.Textbox(
-                label="Search Query",
-                placeholder="Enter your search query...",
-                value=EXAMPLE_QUERIES_BY_CATEGORY["Toys & Games"]["clear"],
-                scale=3,
-                elem_classes="search-box",
-            )
     with gr.Row():
         gr.Markdown(
@@ -442,29 +407,53 @@ with gr.Blocks(
         with gr.Column(scale=1):
             ambiguity_dropdown = gr.Dropdown(
                 choices=["Clear", "Somewhat Ambiguous", "Ambiguous"],
-                value="Clear",
                 label="Query Specificity",
                 container=True,
             )
         with gr.Column(scale=1):
             search_btn = gr.Button("Search", variant="primary", scale=1, size="lg")
     with gr.Tabs() as tabs:
         with gr.Tab("Stage 1: BM25 Baseline"):
-            stage1_output = gr.Markdown(label="Results")
         with gr.Tab("Stage 2: + Vector Embeddings"):
-            stage2_output = gr.Markdown(label="Results")
         with gr.Tab("Stage 3: + Query Expansion"):
-            stage3_output = gr.Markdown(label="Results")
         with gr.Tab("Stage 4: + LLM Reranking"):
-            stage4_output = gr.Markdown(label="Results")
         with gr.Tab("Compare All Stages"):
-            comparison_output = gr.Markdown(label="Comparison")
     with gr.Accordion("Dataset Information", open=False):
         gr.Markdown("Explore the dataset used for this search demo")

 _FILE_PATH = Path(__file__).parents[1]
 def format_results(results: List[Dict], stage_name: str, metrics: Dict) -> str:
     """Format search results as HTML.
     return results_1, results_2, results_3, results_4, comparison
+def calculate_improvement(metric1, metric2, metric_name):  # metric2 is the baseline that metric1 is compared against
+    """Calculate improvement as a percentage."""
+    if metric2[metric_name] == 0:  # zero baseline: the relative formula below would divide by zero
+        return (metric1[metric_name] - metric2[metric_name]) * 100  # NOTE(review): absolute difference x100, not a relative change
+    return (metric1[metric_name] - metric2[metric_name]) / metric2[metric_name] * 100  # relative change vs. metric2, in percent
 def generate_comparison_table(all_metrics: List[Dict]) -> str:
     """Generate comparison table for all stages."""
     stage_names = [
     for name, metrics in zip(stage_names, all_metrics):
         html += f"| **{name}** | {metrics['top1_score']:.3f} | {metrics['top5_avg']:.3f} | {metrics['latency_ms']} |\n"
+    top5_improvement = calculate_improvement(all_metrics[3], all_metrics[0], "top5_avg")
+    top1_improvement = calculate_improvement(
+        all_metrics[3], all_metrics[0], "top1_score"
     )
     html += "\n---\n\n"
         with gr.Column(scale=3):
             gr.Markdown(
                 """
+            <h1 class="header-title" style="font-size: 2.5em; text-align: left;">🧙 Search Alchemy 🧙</h1>
             <p style="color: #64748B; font-size: 1.1em; margin-top: 0; text-align: left;">Building Production Search Pipelines with Fireworks AI</p>
             <p style="color: #475569; font-size: 1.0em; line-height: 1.6; margin: 0;">
+            Four progressive stages demonstrating how to build production-grade semantic search:
             <strong>BM25</strong> → <strong>Vector Embeddings</strong> → <strong>Query Expansion</strong> → <strong>Reranking</strong>
             </p>
             """
                     show_share_button=False,
                 )
+    # Introduction Section
     with gr.Row():
+        gr.Markdown(
+            """
+**The Context:**
+- **[Dataset](https://huggingface.co/datasets/ckandemir/amazon-products):** 10,000+ Amazon products across Toys, Home & Kitchen, Clothing, Sports, and Baby Products
+- **The Problem:** Users search with vague terms like "keep kids busy" or "make bedroom nicer" instead of specific product names
+- **The Solution:** Four progressive stages showing how semantic search handles ambiguity better than keyword matching
+**How to Use:**
+1. The default query "expecting soon" is intentionally ambiguous - try it first to see the dramatic improvement across stages
+2. Select different categories and specificity levels to explore more examples
+3. Click **Search** and compare **Top-1 Score** and **Top-5 Avg** across all stages in the "Compare All Stages" tab
+4. Notice how BM25 (keyword matching) struggles with ambiguous queries while vector embeddings + reranking excel
+Note: scores are normalized to a 0-1 range; higher is better.
+            """
+        )
     with gr.Row():
         gr.Markdown(
         with gr.Column(scale=1):
             ambiguity_dropdown = gr.Dropdown(
                 choices=["Clear", "Somewhat Ambiguous", "Ambiguous"],
+                value="Ambiguous",
                 label="Query Specificity",
                 container=True,
             )
         with gr.Column(scale=1):
             search_btn = gr.Button("Search", variant="primary", scale=1, size="lg")
+    with gr.Row():
+        gr.Markdown(
+            "**Or write your own query:** Write your own query to find products in the database"
+        )
+    with gr.Row():
+        with gr.Column(scale=4):
+            query_input = gr.Textbox(
+                label="Search Query",
+                placeholder="...",
+                value=EXAMPLE_QUERIES_BY_CATEGORY["Baby Products"]["ambiguous"],
+                scale=3,
+                elem_classes="search-box",
+            )
     with gr.Tabs() as tabs:
         with gr.Tab("Stage 1: BM25 Baseline"):
+            stage1_output = gr.Markdown(
+                value="Click **Search** to see results", label="Results"
+            )
         with gr.Tab("Stage 2: + Vector Embeddings"):
+            stage2_output = gr.Markdown(
+                value="Click **Search** to see results", label="Results"
+            )
         with gr.Tab("Stage 3: + Query Expansion"):
+            stage3_output = gr.Markdown(
+                value="Click **Search** to see results", label="Results"
+            )
         with gr.Tab("Stage 4: + LLM Reranking"):
+            stage4_output = gr.Markdown(
+                value="Click **Search** to see results", label="Results"
+            )
         with gr.Tab("Compare All Stages"):
+            comparison_output = gr.Markdown(
+                value="Click **Search** to see results", label="Comparison"
+            )
     with gr.Accordion("Dataset Information", open=False):
         gr.Markdown("Explore the dataset used for this search demo")

src/config.py CHANGED Viewed

@@ -221,6 +221,11 @@ button.primary:hover {
 EXAMPLE_QUERIES_BY_CATEGORY = {
     "Toys & Games": {
         "clear": "magnetic construction building blocks educational toy",
         "somewhat_ambiguous": "creative play for young children",
@@ -241,9 +246,4 @@ EXAMPLE_QUERIES_BY_CATEGORY = {
         "somewhat_ambiguous": "active outdoor toy",
         "ambiguous": "yard activity",
     },
-    "Baby Products": {
-        "clear": "nursery wall decor quotes motivational stickers",
-        "somewhat_ambiguous": "baby room essentials",
-        "ambiguous": "expecting soon",
-    },
 }

 EXAMPLE_QUERIES_BY_CATEGORY = {
+    "Baby Products": {
+        "clear": "nursery wall decor quotes motivational stickers",
+        "somewhat_ambiguous": "baby room essentials",
+        "ambiguous": "expecting soon",
+    },
     "Toys & Games": {
         "clear": "magnetic construction building blocks educational toy",
         "somewhat_ambiguous": "creative play for young children",
         "somewhat_ambiguous": "active outdoor toy",
         "ambiguous": "yard activity",
     },
 }