"""
FILE: keyword_boosting_layer.py (ENHANCED VERSION)

PURPOSE:
- Advanced keyword-based boosting for final score refinement
- Exact match detection for precise results
- Location-based boosting
- Expanded category coverage
"""
|
|
|
|
|
import re
from difflib import SequenceMatcher
|
|
|
|
|
|
|
|
|
|
|
# Keyword lists used by apply_keyword_boost(). A keyword only counts when it
# occurs (as a lower-case substring) in BOTH the query and the candidate text.
# Some words appear in more than one list (e.g. "lake", "palace", "temple"),
# so a single word can contribute to several categories.

# Romantic / couple-oriented vocabulary.
ROMANTIC_BOOST_WORDS = [
    "love", "romantic", "couple", "honeymoon", "beautiful", "sunset",
    "palace", "view", "lake", "memories", "historic", "architecture",
]

# Spiritual / religious vocabulary.
SPIRITUAL_BOOST = [
    "peace", "meditation", "spiritual", "calm", "holy", "divine", "quiet",
    "temple", "church", "mosque", "monastery", "shrine", "sacred",
]

# Spicy-food vocabulary.
FOOD_SPICY = [
    "spicy", "masala", "hot", "tangy", "flavour", "chilli", "pepper",
]

# Adventure-activity vocabulary.
ADVENTURE_BOOST = [
    "trek", "hike", "camping", "rafting", "adventure", "mountain", "climb",
    "rappelling", "kayaking", "paragliding", "safari", "jungle",
]

# Nature / scenery vocabulary.
NATURE_BOOST = [
    "waterfall", "lake", "forest", "valley", "river", "hill", "mountain",
    "beach", "nature", "wildlife", "scenic", "green", "meadow", "canyon",
]

# Heritage / history vocabulary.
HERITAGE_BOOST = [
    "fort", "palace", "museum", "ruins", "ancient", "heritage", "historic",
    "archaeological", "monument", "tomb", "temple", "architecture",
]

# Shopping vocabulary.
SHOPPING_BOOST = [
    "market", "bazaar", "silk", "saree", "shopping", "handicraft", "souvenir",
    "textile", "jewellery", "craft",
]

# General food / eatery vocabulary.
FOOD_GENERAL = [
    "authentic", "street food", "cafe", "bakery", "restaurant", "cuisine",
    "traditional", "local food", "breakfast", "lunch", "dinner", "eatery", "mess",
    "bhavan", "tiffin", "dhaba", "canteen", "bistro",
]

# Sweets and desserts.
SWEET_BOOST = [
    "sweet", "dessert", "halwa", "mysore pak", "payasam", "kaja", "laddu",
    "barfi", "ghevar", "petha", "rasgulla", "rosogolla", "sandesh", "mishti",
    "jalebi", "gulab jamun", "double ka meetha", "qubani", "chhena poda", "bebinca",
]

# Named dishes.
# NOTE(review): "pulla heady" and "poh" look like typos (possibly
# "pulla reddy" and "poha") - confirm with the data owner before changing,
# since these strings drive runtime matching.
SPECIFIC_DISH_BOOST = [
    "biryani", "dosa", "idli", "vada", "sambar", "fish curry", "thali", "meals",
    "kebab", "tikka", "tandoori", "butter chicken", "dal makhani", "chole bhature",
    "paratha", "kulcha", "pulla heady", "haleem", "nihari", "galouti", "kachori",
    "dhokla", "khandvi", "pav bhaji", "vada pav", "misal", "poh", "upma", "bisi bele bath",
    "ragi mudde", "appam", "stew", "puttu", "beef fry", "porotta", "litti chokha",
    "momos", "thukpa", "fish fry", "bamboo chicken", "pongal", "avial",
]
|
|
|
|
|
|
|
|
# Lower-case names of Indian cities, states and regions that
# extract_location_from_query() looks for in a query. Alternate spellings of
# the same place are listed separately (e.g. "bangalore"/"bengaluru",
# "alleppey"/"alappuzha") so either form is recognised.
INDIAN_LOCATIONS = [
    "delhi", "mumbai", "bangalore", "bengaluru", "kolkata", "chennai", "hyderabad",
    "pune", "ahmedabad", "jaipur", "lucknow", "varanasi", "banaras", "agra",
    "goa", "kerala", "karnataka", "tamil nadu", "maharashtra", "rajasthan",
    "gujarat", "punjab", "himachal pradesh", "uttarakhand", "kashmir", "jammu",
    "meghalaya", "assam", "nagaland", "manipur", "tripura", "mizoram", "sikkim",
    "west bengal", "odisha", "chhattisgarh", "madhya pradesh", "uttar pradesh",
    "bihar", "jharkhand", "andhra pradesh", "telangana", "ladakh", "arunachal pradesh",
    "shillong", "guwahati", "imphal", "kohima", "gangtok", "darjeeling",
    "ongole", "tirupati", "vijayawada", "visakhapatnam", "vizag", "srisailam",
    "simhachalam", "lepakshi", "ahobilam", "mangalagiri", "srikalahasti", "kurnool",
    "warangal", "madurai", "rameshwaram", "thanjavur", "coimbatore", "mysore", "hampi",
    "coorg", "wayanad", "munnar", "alleppey", "alappuzha", "pondicherry", "puducherry",
]
|
|
|
|
|
|
|
|
|
|
|
def similarity_ratio(a, b):
    """Return how alike two strings are, ignoring case.

    Args:
        a: First string.
        b: Second string.

    Returns:
        The ``SequenceMatcher.ratio()`` of the lower-cased inputs: a float
        between 0.0 (nothing in common) and 1.0 (identical; two empty
        strings also score 1.0).
    """
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()
|
|
|
|
|
def extract_location_from_query(query, locations=None):
    """Return the known location names mentioned in ``query``.

    Matching is case-insensitive and on whole words/phrases, so "agra" is
    not found inside "fragrant" and "goa" is not found inside "goat".
    Possessives and punctuation still match ("goa's", "goa,").

    Args:
        query: User query string. ``None`` or an empty string yields ``[]``.
        locations: Optional iterable of location names to look for.
            Defaults to the module-level ``INDIAN_LOCATIONS``.

    Returns:
        List of the matching entries, in the order they appear in
        ``locations``.
    """
    if not query:
        return []
    if locations is None:
        locations = INDIAN_LOCATIONS

    query_lower = query.lower()
    found_locations = []
    for loc in locations:
        # Lookarounds instead of \b so names are bounded by non-word
        # characters on both sides regardless of what the name itself
        # starts or ends with.
        pattern = r"(?<!\w)" + re.escape(loc.lower()) + r"(?!\w)"
        if re.search(pattern, query_lower):
            found_locations.append(loc)
    return found_locations
|
|
|
|
|
|
|
|
|
|
|
def _lower_text(value):
    """Return ``value`` as a lower-cased string; ``None`` becomes ``""``."""
    if value is None:
        return ""
    return str(value).lower()


def _base_score(item):
    """Return ``rerank_score`` if set, else ``embedding_score``, else 0.0."""
    for key in ("rerank_score", "embedding_score"):
        score = item.get(key)
        if score is not None:
            return float(score)
    return 0.0


def _active_keyword_rules(query_lower):
    """Return ``(words, weight, cap)`` per category, keeping only query words.

    Only keywords that occur in the query can ever score, so they are
    filtered once here instead of once per candidate.
    """
    rules = (
        (ROMANTIC_BOOST_WORDS, 0.8, 3.0),
        (SPIRITUAL_BOOST, 0.7, 2.5),
        (FOOD_SPICY, 0.6, 2.0),
        (ADVENTURE_BOOST, 0.7, 2.5),
        (NATURE_BOOST, 0.6, 2.5),
        (HERITAGE_BOOST, 0.6, 2.5),
        (SHOPPING_BOOST, 0.5, 2.0),
        (FOOD_GENERAL, 0.5, 2.0),
        (SWEET_BOOST, 0.8, 3.0),
        (SPECIFIC_DISH_BOOST, 1.0, 4.0),
    )
    active = []
    for words, weight, cap in rules:
        words_in_query = [word for word in words if word in query_lower]
        if words_in_query:
            active.append((words_in_query, weight, cap))
    return active


def apply_keyword_boost(query, candidates):
    """Add keyword, name, location and intent boosts, then rank candidates.

    Each candidate dict is updated in place with two keys:
    ``final_score`` (base score plus boost) and ``boost_applied``.
    The base score is ``rerank_score`` when present, otherwise
    ``embedding_score``, otherwise 0.

    Boosts, all additive:
      * Name: +10 when the candidate name contains the query or vice
        versa; otherwise +8 when ``similarity_ratio`` exceeds 0.85.
        Candidates without a name (and empty queries) get no name boost.
      * Location: +8 when any location found in the query occurs in the
        candidate's ``region`` or ``state`` (applied at most once).
      * Intent: for intent "nature", +3 if ``domain`` is "nature" or +1.5
        if it is "travel"; for any other non-"general" intent, +3 when
        the intent occurs in ``domain``.
      * Keywords: per category, ``hits * weight`` capped at the category
        maximum, where a hit is a keyword present in both the query and
        the candidate ``text`` (plain substring test).

    Args:
        query: User query string.
        candidates: List of candidate dictionaries with 'name', 'text',
            'region', 'state', 'intent', 'domain' and score fields.
            Missing or ``None`` fields are treated as empty.

    Returns:
        New list of the same candidate dicts sorted by ``final_score``,
        highest first. Empty or ``None`` ``candidates`` yields ``[]``.
    """
    if not candidates:
        return []

    query_lower = _lower_text(query)
    query_locations = extract_location_from_query(query_lower)
    keyword_rules = _active_keyword_rules(query_lower)

    for item in candidates:
        boost = 0.0

        name = _lower_text(item.get("name"))
        text = _lower_text(item.get("text"))
        region = _lower_text(item.get("region"))
        state = _lower_text(item.get("state"))

        # Exact / near-exact name match. Both sides must be non-empty:
        # "" is a substring of every string, which would otherwise hand
        # the top boost to every unnamed candidate.
        if name and query_lower and (name in query_lower or query_lower in name):
            boost += 10.0
        elif name and similarity_ratio(query_lower, name) > 0.85:
            boost += 8.0

        # Location match, counted once no matter how many locations hit.
        if any(loc in region or loc in state for loc in query_locations):
            boost += 8.0

        # Intent vs. domain. Intent is lower-cased to match the lower-cased
        # domain; a missing/empty intent is treated as "general".
        intent = _lower_text(item.get("intent")) or "general"
        domain = _lower_text(item.get("domain"))
        if intent == "nature":
            if domain == "nature":
                boost += 3.0
            elif domain == "travel":
                boost += 1.5
        elif intent != "general" and intent in domain:
            boost += 3.0

        # Category keyword boosts, in the fixed category order.
        for words, weight, cap in keyword_rules:
            hits = sum(1 for word in words if word in text)
            if hits:
                boost += min(hits * weight, cap)

        item["final_score"] = _base_score(item) + boost
        item["boost_applied"] = boost

    return sorted(candidates, key=lambda x: x["final_score"], reverse=True)