Spaces:

bharatverse11
/

new_recommender_system_nlp2

Running

App Files Files Community

bharatverse11 committed 16 days ago

Commit

113d0e3

verified ·

1 Parent(s): 454b411

Update keyword_boosting_layer.py

Browse files

Files changed (1) hide show

keyword_boosting_layer.py +216 -186

keyword_boosting_layer.py CHANGED Viewed

@@ -1,186 +1,216 @@
-"""
-FILE: keyword_boosting_layer.py (ENHANCED VERSION)
-PURPOSE:
-- Advanced keyword-based boosting for final score refinement
-- Exact match detection for precise results
-- Location-based boosting
-- Expanded category coverage
-"""
-import re
-from difflib import SequenceMatcher
-# ---------------- KEYWORD CATEGORIES ----------------
-ROMANTIC_BOOST_WORDS = [
-    "love", "romantic", "couple", "honeymoon", "beautiful", "sunset",
-    "palace", "view", "lake", "memories", "historic", "architecture"
-]
-SPIRITUAL_BOOST = [
-    "peace", "meditation", "spiritual", "calm", "holy", "divine", "quiet",
-    "temple", "church", "mosque", "monastery", "shrine", "sacred"
-]
-FOOD_SPICY = [
-    "spicy", "masala", "hot", "tangy", "flavour", "chilli", "pepper"
-]
-ADVENTURE_BOOST = [
-    "trek", "hike", "camping", "rafting", "adventure", "mountain", "climb",
-    "rappelling", "kayaking", "paragliding", "safari", "jungle"
-]
-NATURE_BOOST = [
-    "waterfall", "lake", "forest", "valley", "river", "hill", "mountain",
-    "beach", "nature", "wildlife", "scenic", "green", "meadow", "canyon"
-]
-HERITAGE_BOOST = [
-    "fort", "palace", "museum", "ruins", "ancient", "heritage", "historic",
-    "archaeological", "monument", "tomb", "temple", "architecture"
-]
-SHOPPING_BOOST = [
-    "market", "bazaar", "silk", "saree", "shopping", "handicraft", "souvenir",
-    "textile", "jewellery", "craft"
-]
-FOOD_GENERAL = [
-    "authentic", "street food", "cafe", "bakery", "restaurant", "cuisine",
-    "traditional", "local food", "breakfast", "lunch", "dinner"
-]
-# Indian states and major cities for location matching
-INDIAN_LOCATIONS = [
-    "delhi", "mumbai", "bangalore", "bengaluru", "kolkata", "chennai", "hyderabad",
-    "pune", "ahmedabad", "jaipur", "lucknow", "varanasi", "banaras", "agra",
-    "goa", "kerala", "karnataka", "tamil nadu", "maharashtra", "rajasthan",
-    "gujarat", "punjab", "himachal pradesh", "uttarakhand", "kashmir", "jammu",
-    "meghalaya", "assam", "nagaland", "manipur", "tripura", "mizoram", "sikkim",
-    "west bengal", "odisha", "chhattisgarh", "madhya pradesh", "uttar pradesh",
-    "bihar", "jharkhand", "andhra pradesh", "telangana", "ladakh", "arunachal pradesh",
-    "shillong", "guwahati", "imphal", "kohima", "gangtok", "darjeeling"
-]
-# ---------------- HELPER FUNCTIONS ----------------
-def similarity_ratio(a, b):
-    """Calculate similarity between two strings (0.0 to 1.0)"""
-    return SequenceMatcher(None, a.lower(), b.lower()).ratio()
-def extract_location_from_query(query):
-    """Extract potential location names from query"""
-    query_lower = query.lower()
-    found_locations = []
-    for loc in INDIAN_LOCATIONS:
-        if loc in query_lower:
-            found_locations.append(loc)
-    return found_locations
-# ---------------- MAIN BOOSTING FUNCTION ----------------
-def apply_keyword_boost(query, candidates):
-    """
-    Apply advanced keyword boosting to candidates.
-    Args:
-        query: User query string
-        candidates: List of candidate dictionaries with 'name', 'text', 'region', 'state', etc.
-    Returns:
-        Sorted list of candidates with 'final_score' field
-    """
-    query_lower = query.lower()
-    query_locations = extract_location_from_query(query_lower)
-    for item in candidates:
-        # Start with rerank score or embedding score
-        base_score = float(item.get("rerank_score", item.get("embedding_score", 0)))
-        boost = 0.0
-        name = item.get("name", "").lower()
-        text = item.get("text", "").lower()
-        region = item.get("region", "").lower()
-        state = str(item.get("state", "")).lower()
-        # ============ EXACT/NEAR EXACT MATCH BOOST ============
-        # Massive boost if query contains the exact name or very close variant
-        if name in query_lower or query_lower in name:
-            boost += 10.0
-        elif similarity_ratio(query_lower, name) > 0.85:
-            boost += 8.0
-        # ============ LOCATION MATCH BOOST ============
-        # Check if query mentions the location
-        for loc in query_locations:
-            if loc in region or loc in state:
-                boost += 8.0
-                break
-        # ============ INTENT MATCH BOOST ============
-        # "Soft Filter": Boost items that match the detected intent
-        detected_intent = item.get("intent", "general")
-        item_domain = str(item.get("domain", "")).lower()
-        # Map 'nature' intent to 'nature' domain (and 'travel' as secondary)
-        if detected_intent == "nature":
-            if item_domain == "nature":
-                boost += 3.0
-            elif item_domain == "travel":
-                boost += 1.5 # Secondary boost for travel items in nature query
-        # Standard intent match
-        elif detected_intent != "general" and detected_intent in item_domain:
-            boost += 3.0
-        # ============ CATEGORY KEYWORD BOOSTS ============
-        # Count matching keywords and apply scaled boost
-        # Romantic
-        romantic_matches = sum(1 for word in ROMANTIC_BOOST_WORDS if word in query_lower and word in text)
-        if romantic_matches > 0:
-            boost += min(romantic_matches * 0.8, 3.0)
-        # Spiritual
-        spiritual_matches = sum(1 for word in SPIRITUAL_BOOST if word in query_lower and word in text)
-        if spiritual_matches > 0:
-            boost += min(spiritual_matches * 0.7, 2.5)
-        # Spicy Food
-        spicy_matches = sum(1 for word in FOOD_SPICY if word in query_lower and word in text)
-        if spicy_matches > 0:
-            boost += min(spicy_matches * 0.6, 2.0)
-        # Adventure
-        adventure_matches = sum(1 for word in ADVENTURE_BOOST if word in query_lower and word in text)
-        if adventure_matches > 0:
-            boost += min(adventure_matches * 0.7, 2.5)
-        # Nature
-        nature_matches = sum(1 for word in NATURE_BOOST if word in query_lower and word in text)
-        if nature_matches > 0:
-            boost += min(nature_matches * 0.6, 2.5)
-        # Heritage
-        heritage_matches = sum(1 for word in HERITAGE_BOOST if word in query_lower and word in text)
-        if heritage_matches > 0:
-            boost += min(heritage_matches * 0.6, 2.5)
-        # Shopping
-        shopping_matches = sum(1 for word in SHOPPING_BOOST if word in query_lower and word in text)
-        if shopping_matches > 0:
-            boost += min(shopping_matches * 0.5, 2.0)
-        # Food General
-        food_matches = sum(1 for word in FOOD_GENERAL if word in query_lower and word in text)
-        if food_matches > 0:
-            boost += min(food_matches * 0.5, 2.0)
-        # Calculate final score
-        item["final_score"] = base_score + boost
-        item["boost_applied"] = boost
-    # Sort by final score (descending)
-    return sorted(candidates, key=lambda x: x["final_score"], reverse=True)

+"""
+FILE: keyword_boosting_layer.py (ENHANCED VERSION)
+PURPOSE:
+- Advanced keyword-based boosting for final score refinement
+- Exact match detection for precise results
+- Location-based boosting
+- Expanded category coverage
+"""
+import re
+from difflib import SequenceMatcher
+# ---------------- KEYWORD CATEGORIES ----------------
+# Keyword lists used by apply_keyword_boost(). A keyword counts as a match
+# for a candidate only when it occurs in BOTH the lowercased query and the
+# candidate's lowercased "text". Matching is a plain substring test, so short
+# entries also hit inside longer words (e.g. "hot" matches "hotel", "hill"
+# matches "chill").
+# Several words appear in more than one list ("lake", "palace", "historic",
+# "architecture", "temple", "mountain"); such a word earns a boost from every
+# category that contains it.
+# Romance / couples-oriented vocabulary.
+ROMANTIC_BOOST_WORDS = [
+    "love", "romantic", "couple", "honeymoon", "beautiful", "sunset",
+    "palace", "view", "lake", "memories", "historic", "architecture"
+]
+# Spiritual mood words and places of worship.
+SPIRITUAL_BOOST = [
+    "peace", "meditation", "spiritual", "calm", "holy", "divine", "quiet",
+    "temple", "church", "mosque", "monastery", "shrine", "sacred"
+]
+# Spice / heat related food descriptors.
+FOOD_SPICY = [
+    "spicy", "masala", "hot", "tangy", "flavour", "chilli", "pepper"
+]
+# Outdoor adventure activities and settings.
+ADVENTURE_BOOST = [
+    "trek", "hike", "camping", "rafting", "adventure", "mountain", "climb",
+    "rappelling", "kayaking", "paragliding", "safari", "jungle"
+]
+# Natural landscape features.
+NATURE_BOOST = [
+    "waterfall", "lake", "forest", "valley", "river", "hill", "mountain",
+    "beach", "nature", "wildlife", "scenic", "green", "meadow", "canyon"
+]
+# Historical sites and monuments.
+HERITAGE_BOOST = [
+    "fort", "palace", "museum", "ruins", "ancient", "heritage", "historic",
+    "archaeological", "monument", "tomb", "temple", "architecture"
+]
+# Markets and things to buy.
+SHOPPING_BOOST = [
+    "market", "bazaar", "silk", "saree", "shopping", "handicraft", "souvenir",
+    "textile", "jewellery", "craft"
+]
+# Generic food vocabulary: meal names and kinds of eating places.
+# Multi-word entries ("street food", "local food") must appear verbatim,
+# including the single space, to match.
+FOOD_GENERAL = [
+    "authentic", "street food", "cafe", "bakery", "restaurant", "cuisine",
+    "traditional", "local food", "breakfast", "lunch", "dinner", "eatery", "mess",
+    "bhavan", "tiffin", "dhaba", "canteen", "bistro"
+]
+# Sweets and desserts, including named regional sweets.
+SWEET_BOOST = [
+    "sweet", "dessert", "halwa", "mysore pak", "payasam", "kaja", "laddu",
+    "barfi", "ghevar", "petha", "rasgulla", "rosogolla", "sandesh", "mishti",
+    "jalebi", "gulab jamun", "double ka meetha", "qubani", "chhena poda", "bebinca"
+]
+# Named dishes.
+# NOTE(review): "pulla heady" looks misspelled -- confirm the intended entry.
+# NOTE(review): "poh" is possibly meant to be "poha"; as a substring it still
+# matches "poha", but it also matches any other text containing "poh".
+SPECIFIC_DISH_BOOST = [
+    "biryani", "dosa", "idli", "vada", "sambar", "fish curry", "thali", "meals",
+    "kebab", "tikka", "tandoori", "butter chicken", "dal makhani", "chole bhature",
+    "paratha", "kulcha", "pulla heady", "haleem", "nihari", "galouti", "kachori",
+    "dhokla", "khandvi", "pav bhaji", "vada pav", "misal", "poh", "upma", "bisi bele bath",
+    "ragi mudde", "appam", "stew", "puttu", "beef fry", "porotta", "litti chokha",
+    "momos", "thukpa", "fish fry", "bamboo chicken", "pongal", "avial"
+]
+# Lowercase Indian state, region and city names that
+# extract_location_from_query() searches for inside the query. Alternate
+# spellings of the same place are listed as separate entries (e.g.
+# "bangalore"/"bengaluru", "alleppey"/"alappuzha", "pondicherry"/"puducherry").
+INDIAN_LOCATIONS = [
+    "delhi", "mumbai", "bangalore", "bengaluru", "kolkata", "chennai", "hyderabad",
+    "pune", "ahmedabad", "jaipur", "lucknow", "varanasi", "banaras", "agra",
+    "goa", "kerala", "karnataka", "tamil nadu", "maharashtra", "rajasthan",
+    "gujarat", "punjab", "himachal pradesh", "uttarakhand", "kashmir", "jammu",
+    "meghalaya", "assam", "nagaland", "manipur", "tripura", "mizoram", "sikkim",
+    "west bengal", "odisha", "chhattisgarh", "madhya pradesh", "uttar pradesh",
+    "bihar", "jharkhand", "andhra pradesh", "telangana", "ladakh", "arunachal pradesh",
+    "shillong", "guwahati", "imphal", "kohima", "gangtok", "darjeeling",
+    "ongole", "tirupati", "vijayawada", "visakhapatnam", "vizag", "srisailam",
+    "simhachalam", "lepakshi", "ahobilam", "mangalagiri", "srikalahasti", "kurnool",
+    "warangal", "madurai", "rameshwaram", "thanjavur", "coimbatore", "mysore", "hampi",
+    "coorg", "wayanad", "munnar", "alleppey", "alappuzha", "pondicherry", "puducherry"
+]
+# ---------------- HELPER FUNCTIONS ----------------
+def similarity_ratio(a, b):
+    """Calculate similarity between two strings (0.0 to 1.0)"""
+    return SequenceMatcher(None, a.lower(), b.lower()).ratio()
+def extract_location_from_query(query):
+    """Extract potential location names from query"""
+    query_lower = query.lower()
+    found_locations = []
+    for loc in INDIAN_LOCATIONS:
+        if loc in query_lower:
+            found_locations.append(loc)
+    return found_locations
+# ---------------- MAIN BOOSTING FUNCTION ----------------
+def apply_keyword_boost(query, candidates):
+    """
+    Apply advanced keyword boosting to candidates.
+    Args:
+        query: User query string
+        candidates: List of candidate dictionaries with 'name', 'text', 'region', 'state', etc.
+    Returns:
+        Sorted list of candidates with 'final_score' field
+    """
+    query_lower = query.lower()
+    query_locations = extract_location_from_query(query_lower)
+    for item in candidates:
+        # Start with rerank score or embedding score
+        base_score = float(item.get("rerank_score", item.get("embedding_score", 0)))
+        boost = 0.0
+        name = item.get("name", "").lower()
+        text = item.get("text", "").lower()
+        region = item.get("region", "").lower()
+        state = str(item.get("state", "")).lower()
+        # ============ EXACT/NEAR EXACT MATCH BOOST ============
+        # Massive boost if query contains the exact name or very close variant
+        if name in query_lower or query_lower in name:
+            boost += 10.0
+        elif similarity_ratio(query_lower, name) > 0.85:
+            boost += 8.0
+        # ============ LOCATION MATCH BOOST ============
+        # Check if query mentions the location
+        for loc in query_locations:
+            if loc in region or loc in state:
+                boost += 8.0
+                break
+        # ============ INTENT MATCH BOOST ============
+        # "Soft Filter": Boost items that match the detected intent
+        detected_intent = item.get("intent", "general")
+        item_domain = str(item.get("domain", "")).lower()
+        # Map 'nature' intent to 'nature' domain (and 'travel' as secondary)
+        if detected_intent == "nature":
+            if item_domain == "nature":
+                boost += 3.0
+            elif item_domain == "travel":
+                boost += 1.5 # Secondary boost for travel items in nature query
+        # Standard intent match
+        elif detected_intent != "general" and detected_intent in item_domain:
+            boost += 3.0
+        # ============ CATEGORY KEYWORD BOOSTS ============
+        # Count matching keywords and apply scaled boost
+        # Romantic
+        romantic_matches = sum(1 for word in ROMANTIC_BOOST_WORDS if word in query_lower and word in text)
+        if romantic_matches > 0:
+            boost += min(romantic_matches * 0.8, 3.0)
+        # Spiritual
+        spiritual_matches = sum(1 for word in SPIRITUAL_BOOST if word in query_lower and word in text)
+        if spiritual_matches > 0:
+            boost += min(spiritual_matches * 0.7, 2.5)
+        # Spicy Food
+        spicy_matches = sum(1 for word in FOOD_SPICY if word in query_lower and word in text)
+        if spicy_matches > 0:
+            boost += min(spicy_matches * 0.6, 2.0)
+        # Adventure
+        adventure_matches = sum(1 for word in ADVENTURE_BOOST if word in query_lower and word in text)
+        if adventure_matches > 0:
+            boost += min(adventure_matches * 0.7, 2.5)
+        # Nature
+        nature_matches = sum(1 for word in NATURE_BOOST if word in query_lower and word in text)
+        if nature_matches > 0:
+            boost += min(nature_matches * 0.6, 2.5)
+        # Heritage
+        heritage_matches = sum(1 for word in HERITAGE_BOOST if word in query_lower and word in text)
+        if heritage_matches > 0:
+            boost += min(heritage_matches * 0.6, 2.5)
+        # Shopping
+        shopping_matches = sum(1 for word in SHOPPING_BOOST if word in query_lower and word in text)
+        if shopping_matches > 0:
+            boost += min(shopping_matches * 0.5, 2.0)
+        # Food General
+        food_matches = sum(1 for word in FOOD_GENERAL if word in query_lower and word in text)
+        if food_matches > 0:
+            boost += min(food_matches * 0.5, 2.0)
+        # Sweets/Desserts
+        sweet_matches = sum(1 for word in SWEET_BOOST if word in query_lower and word in text)
+        if sweet_matches > 0:
+            boost += min(sweet_matches * 0.8, 3.0)
+        # Specific Dishes
+        dish_matches = sum(1 for word in SPECIFIC_DISH_BOOST if word in query_lower and word in text)
+        if dish_matches > 0:
+            boost += min(dish_matches * 1.0, 4.0)
+        # Calculate final score
+        item["final_score"] = base_score + boost
+        item["boost_applied"] = boost
+    # Sort by final score (descending)
+    return sorted(candidates, key=lambda x: x["final_score"], reverse=True)