bharatverse11 committed on
Commit
113d0e3
·
verified ·
1 Parent(s): 454b411

Update keyword_boosting_layer.py

Browse files
Files changed (1) hide show
  1. keyword_boosting_layer.py +216 -186
keyword_boosting_layer.py CHANGED
@@ -1,186 +1,216 @@
1
- """
2
- FILE: keyword_boosting_layer.py (ENHANCED VERSION)
3
-
4
- PURPOSE:
5
- - Advanced keyword-based boosting for final score refinement
6
- - Exact match detection for precise results
7
- - Location-based boosting
8
- - Expanded category coverage
9
- """
10
-
11
- import re
12
- from difflib import SequenceMatcher
13
-
14
- # ---------------- KEYWORD CATEGORIES ----------------
15
-
16
# Each list holds lowercase keywords/phrases for one boost category.
# apply_keyword_boost counts a keyword only when it occurs in BOTH the query
# and the candidate text. Some words appear in more than one list (for example
# "lake", "palace", "historic", "temple", "mountain"), so they can contribute
# to several categories at once.

ROMANTIC_BOOST_WORDS = [
    "love", "romantic", "couple", "honeymoon",
    "beautiful", "sunset", "palace", "view",
    "lake", "memories", "historic", "architecture",
]

SPIRITUAL_BOOST = [
    "peace", "meditation", "spiritual", "calm",
    "holy", "divine", "quiet", "temple",
    "church", "mosque", "monastery", "shrine", "sacred",
]

FOOD_SPICY = [
    "spicy", "masala", "hot", "tangy",
    "flavour", "chilli", "pepper",
]

ADVENTURE_BOOST = [
    "trek", "hike", "camping", "rafting",
    "adventure", "mountain", "climb", "rappelling",
    "kayaking", "paragliding", "safari", "jungle",
]

NATURE_BOOST = [
    "waterfall", "lake", "forest", "valley", "river",
    "hill", "mountain", "beach", "nature", "wildlife",
    "scenic", "green", "meadow", "canyon",
]

HERITAGE_BOOST = [
    "fort", "palace", "museum", "ruins",
    "ancient", "heritage", "historic", "archaeological",
    "monument", "tomb", "temple", "architecture",
]

SHOPPING_BOOST = [
    "market", "bazaar", "silk", "saree", "shopping",
    "handicraft", "souvenir", "textile", "jewellery", "craft",
]

FOOD_GENERAL = [
    "authentic", "street food", "cafe", "bakery",
    "restaurant", "cuisine", "traditional", "local food",
    "breakfast", "lunch", "dinner",
]

# Lowercase Indian place names that extract_location_from_query looks for in
# the query. Alternate spellings are separate entries (e.g. "bangalore" and
# "bengaluru", "varanasi" and "banaras").
INDIAN_LOCATIONS = [
    "delhi", "mumbai", "bangalore", "bengaluru", "kolkata",
    "chennai", "hyderabad", "pune", "ahmedabad", "jaipur",
    "lucknow", "varanasi", "banaras", "agra", "goa",
    "kerala", "karnataka", "tamil nadu", "maharashtra", "rajasthan",
    "gujarat", "punjab", "himachal pradesh", "uttarakhand", "kashmir",
    "jammu", "meghalaya", "assam", "nagaland", "manipur",
    "tripura", "mizoram", "sikkim", "west bengal", "odisha",
    "chhattisgarh", "madhya pradesh", "uttar pradesh", "bihar", "jharkhand",
    "andhra pradesh", "telangana", "ladakh", "arunachal pradesh", "shillong",
    "guwahati", "imphal", "kohima", "gangtok", "darjeeling",
]
66
-
67
- # ---------------- HELPER FUNCTIONS ----------------
68
-
69
def similarity_ratio(a, b):
    """Return how alike *a* and *b* are, ignoring case.

    Both strings are lowercased and compared with ``difflib.SequenceMatcher``
    (no junk filter). The result is the matcher's ``ratio()``: a float from
    0.0 (nothing in common) to 1.0 (identical).
    """
    left, right = a.lower(), b.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
72
-
73
def extract_location_from_query(query):
    """Return the known locations mentioned in *query*.

    Args:
        query: Free-text user query. ``None`` or an empty string yields ``[]``.

    Returns:
        The entries of ``INDIAN_LOCATIONS`` found in the lowercased query, in
        the order they are listed in ``INDIAN_LOCATIONS``.

    A location counts only when it begins at a word boundary. That keeps
    derived forms working ("rajasthani", "goan") while no longer matching a
    name buried inside an unrelated word ("agra" inside "diagram"). Words that
    merely start with a place name (e.g. "goat") are still matched.
    """
    if not query:
        return []

    query_lower = query.lower()
    return [
        loc
        for loc in INDIAN_LOCATIONS
        if re.search(r"\b" + re.escape(loc), query_lower)
    ]
81
-
82
- # ---------------- MAIN BOOSTING FUNCTION ----------------
83
-
84
def _clean_text(value):
    """Return *value* as a lowercase string; ``None`` becomes ``""``."""
    return "" if value is None else str(value).lower()


def _base_score(item):
    """Return the rerank score if set, else the embedding score, else 0.0."""
    for key in ("rerank_score", "embedding_score"):
        score = item.get(key)
        if score is not None:
            return float(score)
    return 0.0


def _intent_boost(intent, domain):
    """Return the boost earned by a candidate whose domain fits the intent."""
    if intent == "nature":
        if domain == "nature":
            return 3.0
        if domain == "travel":
            return 1.5  # Secondary boost for travel items in a nature query
        return 0.0
    if intent != "general" and intent in domain:
        return 3.0
    return 0.0


def apply_keyword_boost(query, candidates):
    """Add keyword-based boosts to candidate scores and rank the candidates.

    Each candidate dict is modified in place: ``final_score`` is set to the
    base score plus the boost, and ``boost_applied`` to the boost alone.

    Boosts, all additive:
      * +10.0 when the name contains the query or the query contains the
        name; otherwise +8.0 when ``similarity_ratio`` exceeds 0.85. Skipped
        when either string is empty, because an empty string is a substring
        of everything and would boost every candidate.
      * +8.0 (once) when a location found in the query occurs in the
        candidate's ``region`` or ``state``.
      * +3.0 / +1.5 for an intent/domain match (see ``_intent_boost``).
      * Per keyword category: ``min(matches * weight, cap)``, where a match
        is a category word present in both the query and the candidate text.

    Args:
        query: User query string; ``None`` is treated as empty.
        candidates: List of candidate dicts. Keys read: ``rerank_score``,
            ``embedding_score``, ``name``, ``text``, ``region``, ``state``,
            ``intent``, ``domain``. Missing or ``None`` values are treated as
            empty / 0.

    Returns:
        A new list of the same candidate dicts sorted by ``final_score``,
        highest first. An empty or ``None`` *candidates* yields ``[]``.
    """
    if not candidates:
        return []

    query_lower = _clean_text(query)
    query_locations = extract_location_from_query(query_lower)

    # (keywords, weight per match, cap) -- applied in this order.
    categories = (
        (ROMANTIC_BOOST_WORDS, 0.8, 3.0),
        (SPIRITUAL_BOOST, 0.7, 2.5),
        (FOOD_SPICY, 0.6, 2.0),
        (ADVENTURE_BOOST, 0.7, 2.5),
        (NATURE_BOOST, 0.6, 2.5),
        (HERITAGE_BOOST, 0.6, 2.5),
        (SHOPPING_BOOST, 0.5, 2.0),
        (FOOD_GENERAL, 0.5, 2.0),
    )
    # Which keywords occur in the query does not depend on the candidate,
    # so work it out once instead of once per candidate.
    query_keywords = [
        ([word for word in words if word in query_lower], weight, cap)
        for words, weight, cap in categories
    ]

    for item in candidates:
        boost = 0.0

        name = _clean_text(item.get("name"))
        text = _clean_text(item.get("text"))
        region = _clean_text(item.get("region"))
        state = _clean_text(item.get("state"))

        # Exact / near-exact name match.
        if name and query_lower:
            if name in query_lower or query_lower in name:
                boost += 10.0
            elif similarity_ratio(query_lower, name) > 0.85:
                boost += 8.0

        # Location mentioned in the query matches the candidate's place.
        if any(loc in region or loc in state for loc in query_locations):
            boost += 8.0

        # "Soft filter" on intent. The intent is read from the candidate
        # itself -- presumably stamped there by an upstream step; confirm
        # against the caller. An empty intent is treated as "general".
        intent = _clean_text(item.get("intent")) or "general"
        boost += _intent_boost(intent, _clean_text(item.get("domain")))

        # Category keyword boosts, scaled by match count and capped.
        for words_in_query, weight, cap in query_keywords:
            matches = sum(1 for word in words_in_query if word in text)
            if matches:
                boost += min(matches * weight, cap)

        item["final_score"] = _base_score(item) + boost
        item["boost_applied"] = boost

    # Sort by final score (descending); ties keep their input order.
    return sorted(candidates, key=lambda x: x["final_score"], reverse=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FILE: keyword_boosting_layer.py (ENHANCED VERSION)
3
+
4
+ PURPOSE:
5
+ - Advanced keyword-based boosting for final score refinement
6
+ - Exact match detection for precise results
7
+ - Location-based boosting
8
+ - Expanded category coverage
9
+ """
10
+
11
+ import re
12
+ from difflib import SequenceMatcher
13
+
14
+ # ---------------- KEYWORD CATEGORIES ----------------
15
+
16
+ ROMANTIC_BOOST_WORDS = [
17
+ "love", "romantic", "couple", "honeymoon", "beautiful", "sunset",
18
+ "palace", "view", "lake", "memories", "historic", "architecture"
19
+ ]
20
+
21
+ SPIRITUAL_BOOST = [
22
+ "peace", "meditation", "spiritual", "calm", "holy", "divine", "quiet",
23
+ "temple", "church", "mosque", "monastery", "shrine", "sacred"
24
+ ]
25
+
26
+ FOOD_SPICY = [
27
+ "spicy", "masala", "hot", "tangy", "flavour", "chilli", "pepper"
28
+ ]
29
+
30
+ ADVENTURE_BOOST = [
31
+ "trek", "hike", "camping", "rafting", "adventure", "mountain", "climb",
32
+ "rappelling", "kayaking", "paragliding", "safari", "jungle"
33
+ ]
34
+
35
+ NATURE_BOOST = [
36
+ "waterfall", "lake", "forest", "valley", "river", "hill", "mountain",
37
+ "beach", "nature", "wildlife", "scenic", "green", "meadow", "canyon"
38
+ ]
39
+
40
+ HERITAGE_BOOST = [
41
+ "fort", "palace", "museum", "ruins", "ancient", "heritage", "historic",
42
+ "archaeological", "monument", "tomb", "temple", "architecture"
43
+ ]
44
+
45
+ SHOPPING_BOOST = [
46
+ "market", "bazaar", "silk", "saree", "shopping", "handicraft", "souvenir",
47
+ "textile", "jewellery", "craft"
48
+ ]
49
+
50
+ FOOD_GENERAL = [
51
+ "authentic", "street food", "cafe", "bakery", "restaurant", "cuisine",
52
+ "traditional", "local food", "breakfast", "lunch", "dinner", "eatery", "mess",
53
+ "bhavan", "tiffin", "dhaba", "canteen", "bistro"
54
+ ]
55
+
56
+ SWEET_BOOST = [
57
+ "sweet", "dessert", "halwa", "mysore pak", "payasam", "kaja", "laddu",
58
+ "barfi", "ghevar", "petha", "rasgulla", "rosogolla", "sandesh", "mishti",
59
+ "jalebi", "gulab jamun", "double ka meetha", "qubani", "chhena poda", "bebinca"
60
+ ]
61
+
62
+ SPECIFIC_DISH_BOOST = [
63
+ "biryani", "dosa", "idli", "vada", "sambar", "fish curry", "thali", "meals",
64
+ "kebab", "tikka", "tandoori", "butter chicken", "dal makhani", "chole bhature",
65
+ "paratha", "kulcha", "pulla heady", "haleem", "nihari", "galouti", "kachori",
66
+ "dhokla", "khandvi", "pav bhaji", "vada pav", "misal", "poh", "upma", "bisi bele bath",
67
+ "ragi mudde", "appam", "stew", "puttu", "beef fry", "porotta", "litti chokha",
68
+ "momos", "thukpa", "fish fry", "bamboo chicken", "pongal", "avial"
69
+ ]
70
+
71
+ # Indian states and major cities for location matching
72
+ INDIAN_LOCATIONS = [
73
+ "delhi", "mumbai", "bangalore", "bengaluru", "kolkata", "chennai", "hyderabad",
74
+ "pune", "ahmedabad", "jaipur", "lucknow", "varanasi", "banaras", "agra",
75
+ "goa", "kerala", "karnataka", "tamil nadu", "maharashtra", "rajasthan",
76
+ "gujarat", "punjab", "himachal pradesh", "uttarakhand", "kashmir", "jammu",
77
+ "meghalaya", "assam", "nagaland", "manipur", "tripura", "mizoram", "sikkim",
78
+ "west bengal", "odisha", "chhattisgarh", "madhya pradesh", "uttar pradesh",
79
+ "bihar", "jharkhand", "andhra pradesh", "telangana", "ladakh", "arunachal pradesh",
80
+ "shillong", "guwahati", "imphal", "kohima", "gangtok", "darjeeling",
81
+ "ongole", "tirupati", "vijayawada", "visakhapatnam", "vizag", "srisailam",
82
+ "simhachalam", "lepakshi", "ahobilam", "mangalagiri", "srikalahasti", "kurnool",
83
+ "warangal", "madurai", "rameshwaram", "thanjavur", "coimbatore", "mysore", "hampi",
84
+ "coorg", "wayanad", "munnar", "alleppey", "alappuzha", "pondicherry", "puducherry"
85
+ ]
86
+
87
+ # ---------------- HELPER FUNCTIONS ----------------
88
+
89
def similarity_ratio(a, b):
    """Return how alike *a* and *b* are, ignoring case.

    Both strings are lowercased and compared with ``difflib.SequenceMatcher``
    (no junk filter). The result is the matcher's ``ratio()``: a float from
    0.0 (nothing in common) to 1.0 (identical).
    """
    left, right = a.lower(), b.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
92
+
93
def extract_location_from_query(query):
    """Return the known locations mentioned in *query*.

    Args:
        query: Free-text user query. ``None`` or an empty string yields ``[]``.

    Returns:
        The entries of ``INDIAN_LOCATIONS`` found in the lowercased query, in
        the order they are listed in ``INDIAN_LOCATIONS``.

    A location counts only when it begins at a word boundary. That keeps
    derived forms working ("rajasthani", "goan") while no longer matching a
    name buried inside an unrelated word ("agra" inside "diagram"). Words that
    merely start with a place name (e.g. "goat") are still matched.
    """
    if not query:
        return []

    query_lower = query.lower()
    return [
        loc
        for loc in INDIAN_LOCATIONS
        if re.search(r"\b" + re.escape(loc), query_lower)
    ]
101
+
102
+ # ---------------- MAIN BOOSTING FUNCTION ----------------
103
+
104
def _clean_text(value):
    """Return *value* as a lowercase string; ``None`` becomes ``""``."""
    return "" if value is None else str(value).lower()


def _base_score(item):
    """Return the rerank score if set, else the embedding score, else 0.0."""
    for key in ("rerank_score", "embedding_score"):
        score = item.get(key)
        if score is not None:
            return float(score)
    return 0.0


def _intent_boost(intent, domain):
    """Return the boost earned by a candidate whose domain fits the intent."""
    if intent == "nature":
        if domain == "nature":
            return 3.0
        if domain == "travel":
            return 1.5  # Secondary boost for travel items in a nature query
        return 0.0
    if intent != "general" and intent in domain:
        return 3.0
    return 0.0


def apply_keyword_boost(query, candidates):
    """Add keyword-based boosts to candidate scores and rank the candidates.

    Each candidate dict is modified in place: ``final_score`` is set to the
    base score plus the boost, and ``boost_applied`` to the boost alone.

    Boosts, all additive:
      * +10.0 when the name contains the query or the query contains the
        name; otherwise +8.0 when ``similarity_ratio`` exceeds 0.85. Skipped
        when either string is empty, because an empty string is a substring
        of everything and would boost every candidate.
      * +8.0 (once) when a location found in the query occurs in the
        candidate's ``region`` or ``state``.
      * +3.0 / +1.5 for an intent/domain match (see ``_intent_boost``).
      * Per keyword category: ``min(matches * weight, cap)``, where a match
        is a category word present in both the query and the candidate text.

    Args:
        query: User query string; ``None`` is treated as empty.
        candidates: List of candidate dicts. Keys read: ``rerank_score``,
            ``embedding_score``, ``name``, ``text``, ``region``, ``state``,
            ``intent``, ``domain``. Missing or ``None`` values are treated as
            empty / 0.

    Returns:
        A new list of the same candidate dicts sorted by ``final_score``,
        highest first. An empty or ``None`` *candidates* yields ``[]``.
    """
    if not candidates:
        return []

    query_lower = _clean_text(query)
    query_locations = extract_location_from_query(query_lower)

    # (keywords, weight per match, cap) -- applied in this order.
    categories = (
        (ROMANTIC_BOOST_WORDS, 0.8, 3.0),
        (SPIRITUAL_BOOST, 0.7, 2.5),
        (FOOD_SPICY, 0.6, 2.0),
        (ADVENTURE_BOOST, 0.7, 2.5),
        (NATURE_BOOST, 0.6, 2.5),
        (HERITAGE_BOOST, 0.6, 2.5),
        (SHOPPING_BOOST, 0.5, 2.0),
        (FOOD_GENERAL, 0.5, 2.0),
        (SWEET_BOOST, 0.8, 3.0),
        (SPECIFIC_DISH_BOOST, 1.0, 4.0),
    )
    # Which keywords occur in the query does not depend on the candidate,
    # so work it out once instead of once per candidate.
    query_keywords = [
        ([word for word in words if word in query_lower], weight, cap)
        for words, weight, cap in categories
    ]

    for item in candidates:
        boost = 0.0

        name = _clean_text(item.get("name"))
        text = _clean_text(item.get("text"))
        region = _clean_text(item.get("region"))
        state = _clean_text(item.get("state"))

        # Exact / near-exact name match.
        if name and query_lower:
            if name in query_lower or query_lower in name:
                boost += 10.0
            elif similarity_ratio(query_lower, name) > 0.85:
                boost += 8.0

        # Location mentioned in the query matches the candidate's place.
        if any(loc in region or loc in state for loc in query_locations):
            boost += 8.0

        # "Soft filter" on intent. The intent is read from the candidate
        # itself -- presumably stamped there by an upstream step; confirm
        # against the caller. An empty intent is treated as "general".
        intent = _clean_text(item.get("intent")) or "general"
        boost += _intent_boost(intent, _clean_text(item.get("domain")))

        # Category keyword boosts, scaled by match count and capped.
        for words_in_query, weight, cap in query_keywords:
            matches = sum(1 for word in words_in_query if word in text)
            if matches:
                boost += min(matches * weight, cap)

        item["final_score"] = _base_score(item) + boost
        item["boost_applied"] = boost

    # Sort by final score (descending); ties keep their input order.
    return sorted(candidates, key=lambda x: x["final_score"], reverse=True)