|
|
"""
|
|
|
FILE: keyword_boosting_layer.py (ENHANCED VERSION)
|
|
|
|
|
|
PURPOSE:
|
|
|
- Advanced keyword-based boosting for final score refinement
|
|
|
- Exact match detection for precise results
|
|
|
- Location-based boosting
|
|
|
- Expanded category coverage
|
|
|
"""
|
|
|
|
|
|
import re
|
|
|
from difflib import SequenceMatcher
|
|
|
|
|
|
|
|
|
|
|
|
# Keyword vocabularies, one list per theme.  All entries are lowercase because
# they are compared against lowercased query and candidate text.

ROMANTIC_BOOST_WORDS = [
    "love", "romantic", "couple", "honeymoon",
    "beautiful", "sunset", "palace", "view",
    "lake", "memories", "historic", "architecture",
]

SPIRITUAL_BOOST = [
    "peace", "meditation", "spiritual", "calm", "holy",
    "divine", "quiet", "temple", "church", "mosque",
    "monastery", "shrine", "sacred",
]

FOOD_SPICY = [
    "spicy", "masala", "hot", "tangy",
    "flavour", "chilli", "pepper",
]

ADVENTURE_BOOST = [
    "trek", "hike", "camping", "rafting",
    "adventure", "mountain", "climb", "rappelling",
    "kayaking", "paragliding", "safari", "jungle",
]

NATURE_BOOST = [
    "waterfall", "lake", "forest", "valley", "river",
    "hill", "mountain", "beach", "nature", "wildlife",
    "scenic", "green", "meadow", "canyon",
]

HERITAGE_BOOST = [
    "fort", "palace", "museum", "ruins",
    "ancient", "heritage", "historic", "archaeological",
    "monument", "tomb", "temple", "architecture",
]

SHOPPING_BOOST = [
    "market", "bazaar", "silk", "saree", "shopping",
    "handicraft", "souvenir", "textile", "jewellery", "craft",
]

# Contains multi-word phrases ("street food", "local food") as well as single words.
FOOD_GENERAL = [
    "authentic", "street food", "cafe", "bakery",
    "restaurant", "cuisine", "traditional", "local food",
    "breakfast", "lunch", "dinner",
]
|
|
|
|
|
|
|
|
|
# Lowercase place names (cities, states and regions) that are searched for in
# the user query.  Alternate spellings of the same place are listed separately
# (e.g. "bangalore"/"bengaluru", "varanasi"/"banaras").
INDIAN_LOCATIONS = [
    # cities
    "delhi", "mumbai", "bangalore", "bengaluru", "kolkata",
    "chennai", "hyderabad", "pune", "ahmedabad", "jaipur",
    "lucknow", "varanasi", "banaras", "agra",
    # states and regions
    "goa", "kerala", "karnataka", "tamil nadu", "maharashtra",
    "rajasthan", "gujarat", "punjab", "himachal pradesh", "uttarakhand",
    "kashmir", "jammu", "meghalaya", "assam", "nagaland",
    "manipur", "tripura", "mizoram", "sikkim", "west bengal",
    "odisha", "chhattisgarh", "madhya pradesh", "uttar pradesh", "bihar",
    "jharkhand", "andhra pradesh", "telangana", "ladakh", "arunachal pradesh",
    # north-east and hill towns
    "shillong", "guwahati", "imphal", "kohima", "gangtok", "darjeeling",
]
|
|
|
|
|
|
|
|
|
|
|
|
def similarity_ratio(a, b):
    """Return the case-insensitive similarity of two strings.

    Both inputs are lowercased and compared with ``difflib.SequenceMatcher``;
    the result is its ``ratio()``, a float between 0.0 (nothing in common)
    and 1.0 (identical).
    """
    left, right = a.lower(), b.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
|
|
|
|
|
|
def extract_location_from_query(query, locations=None):
    """Return the known location names mentioned in ``query``.

    Matching is case-insensitive and on whole words/phrases, so "goa" is found
    in "beaches in Goa" but not inside "goal" or "goat".

    Args:
        query: User query string. ``None`` or an empty string yields ``[]``.
        locations: Optional iterable of location names to look for. Defaults
            to the module-level ``INDIAN_LOCATIONS`` list.

    Returns:
        List of matching location names, in the order they appear in
        ``locations`` (not in the order they appear in the query).
    """
    if not query:
        return []
    if locations is None:
        locations = INDIAN_LOCATIONS

    query_lower = query.lower()
    found_locations = []
    for loc in locations:
        # Lookarounds instead of a plain substring test: the name must not be
        # glued to another word character on either side.
        pattern = r"(?<!\w)" + re.escape(loc.lower()) + r"(?!\w)"
        if re.search(pattern, query_lower):
            found_locations.append(loc)
    return found_locations
|
|
|
|
|
|
|
|
|
|
|
|
def _lowered(value):
    """Return ``value`` as a lowercase string; ``None`` becomes ``""``."""
    return "" if value is None else str(value).lower()


def apply_keyword_boost(query, candidates):
    """Apply keyword, name, location and intent boosts to candidates.

    For every candidate the boost is the sum of:

    * name match: +10.0 when the candidate name contains the query or vice
      versa, otherwise +8.0 when ``similarity_ratio`` exceeds 0.85. Skipped
      when either the name or the query is empty.
    * location match: +8.0 (at most once) when a location found in the query
      occurs in the candidate's ``region`` or ``state``.
    * intent match: for intent ``"nature"``, +3.0 if the domain is
      ``"nature"`` or +1.5 if it is ``"travel"``; for any other non-general
      intent, +3.0 when the intent occurs in the domain.
    * themed keywords: for each keyword list, ``weight`` per keyword present
      in BOTH the query and the candidate text, capped per list.

    Args:
        query: User query string (``None`` is treated as empty).
        candidates: List of candidate dictionaries. Keys read: ``name``,
            ``text``, ``region``, ``state``, ``intent``, ``domain``,
            ``rerank_score`` and ``embedding_score``. Missing or ``None``
            values are treated as empty / 0.

    Returns:
        New list of the same candidate dicts sorted by ``final_score``
        descending (stable for ties). Each dict is updated in place with
        ``final_score`` (base score + boost) and ``boost_applied``.
    """
    if not candidates:
        return []

    query_lower = _lowered(query)
    query_locations = extract_location_from_query(query_lower)

    # (keyword list, weight per matched keyword, cap for the whole list),
    # listed in the order the boosts are accumulated.
    keyword_rules = (
        (ROMANTIC_BOOST_WORDS, 0.8, 3.0),
        (SPIRITUAL_BOOST, 0.7, 2.5),
        (FOOD_SPICY, 0.6, 2.0),
        (ADVENTURE_BOOST, 0.7, 2.5),
        (NATURE_BOOST, 0.6, 2.5),
        (HERITAGE_BOOST, 0.6, 2.5),
        (SHOPPING_BOOST, 0.5, 2.0),
        (FOOD_GENERAL, 0.5, 2.0),
    )
    # Which keywords occur in the query is the same for every candidate, so
    # filter each list once here instead of inside the candidate loop.
    # NOTE: keyword tests are substring tests (e.g. "hot" also hits "hotel").
    active_rules = []
    for words, weight, cap in keyword_rules:
        query_words = [word for word in words if word in query_lower]
        if query_words:
            active_rules.append((query_words, weight, cap))

    for item in candidates:
        # Prefer the reranker score; fall back to the embedding score, then 0.
        raw_score = item.get("rerank_score")
        if raw_score is None:
            raw_score = item.get("embedding_score")
        base_score = float(raw_score) if raw_score is not None else 0.0
        boost = 0.0

        name = _lowered(item.get("name"))
        text = _lowered(item.get("text"))
        region = _lowered(item.get("region"))
        state = _lowered(item.get("state"))

        # Name match. An empty string is a substring of every string, so the
        # check must be skipped when either side is empty; otherwise every
        # unnamed candidate would receive the full exact-match boost.
        if name and query_lower:
            if name in query_lower or query_lower in name:
                boost += 10.0
            elif similarity_ratio(query_lower, name) > 0.85:
                boost += 8.0

        # Location match: applied once even if several locations hit.
        if any(loc in region or loc in state for loc in query_locations):
            boost += 8.0

        # Intent/domain agreement. A missing or empty intent counts as
        # "general" (no boost). Intent is lowercased because the domain is.
        detected_intent = _lowered(item.get("intent")) or "general"
        item_domain = _lowered(item.get("domain"))

        if detected_intent == "nature":
            if item_domain == "nature":
                boost += 3.0
            elif item_domain == "travel":
                boost += 1.5
        elif detected_intent != "general" and detected_intent in item_domain:
            boost += 3.0

        # Themed keyword boosts: keyword must be in the query AND the text.
        for query_words, weight, cap in active_rules:
            matches = sum(1 for word in query_words if word in text)
            if matches > 0:
                boost += min(matches * weight, cap)

        item["final_score"] = base_score + boost
        item["boost_applied"] = boost

    return sorted(candidates, key=lambda x: x["final_score"], reverse=True)
|
|
|
|