Spaces:
Running
Running
from pathlib import Path
from typing import Any, Dict, List

import bm25s

from constants.constants import BM25_INDEX, PRODUCTS_DF
# Directory two levels above the folder that contains this module
# (parents[0] is the module's own folder, parents[2] its grandparent).
# Not referenced anywhere in the visible part of this file.
_FILE_PATH = Path(__file__).parents[2]
def search_bm25(query: str, top_k: int = 5) -> List[Dict[str, Any]]:
    """Search products using BM25 lexical search (keyword matching).

    This is Stage 1: traditional keyword-based ranking without semantic
    understanding. Fast, but misses semantic meaning and intent.

    Args:
        query: Search query string.
        top_k: Maximum number of results to return (default: 5). Values
            below 1 yield an empty list.

    Returns:
        A list of dictionaries, one per retrieved product, in the order the
        index returned them. Each dictionary has the keys ``product_name``,
        ``description``, ``main_category``, ``secondary_category`` and
        ``score`` (the BM25 score converted to a plain ``float``).
    """
    # Nothing to retrieve; bail out before touching the index at all.
    if top_k <= 0:
        return []

    # bm25s refuses a k larger than the number of indexed documents.
    # NOTE(review): assumes BM25_INDEX was built from the rows of PRODUCTS_DF
    # (the ids it returns are used as row positions below) -- confirm.
    k = min(top_k, len(PRODUCTS_DF))
    if k == 0:
        return []

    query_tokens = bm25s.tokenize(query, stopwords="en")
    doc_ids, doc_scores = BM25_INDEX.retrieve(query_tokens, k=k)

    hits: List[Dict[str, Any]] = []
    # retrieve() returns one row of ids/scores per query; there is exactly
    # one query here, hence the [0].
    for idx, score in zip(doc_ids[0], doc_scores[0]):
        # Look the row up once instead of once per field.
        row = PRODUCTS_DF.iloc[idx]
        hits.append(
            {
                "product_name": row["Product Name"],
                "description": row["Description"],
                "main_category": row["MainCategory"],
                "secondary_category": row["SecondaryCategory"],
                "score": float(score),
            }
        )
    return hits