davidtran999 committed on
Commit
eac17f7
·
verified ·
1 Parent(s): 9748a61

Upload backend/chatbot/query_expansion.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. backend/chatbot/query_expansion.py +228 -0
backend/chatbot/query_expansion.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Query expansion and paraphrasing utilities for improving search recall.
3
+ """
4
+ import re
5
+ import unicodedata
6
+ from typing import List, Dict, Any, Optional, Set
7
+ from hue_portal.core.models import Synonym
8
+ from hue_portal.core.search_ml import expand_query_with_synonyms
9
+
10
+
11
def normalize_vietnamese_query(query: str) -> str:
    """
    Normalize a Vietnamese query so equivalent spellings compare equal.

    The query is trimmed, internal whitespace runs are collapsed to a single
    space, the text is lowercased, and the result is converted to Unicode
    NFC so that decomposed diacritics (base letter followed by combining
    marks) and their precomposed forms produce the same string.

    Args:
        query: Input query string.

    Returns:
        Normalized query string; an empty string for empty input.
    """
    if not query:
        return ""

    # Trim both ends and collapse every whitespace run to one space
    query = re.sub(r'\s+', ' ', query.strip())

    # Lowercase before composing, since lowercasing can itself emit
    # combining marks for a few characters
    query = query.lower()

    # Compose diacritics so NFD and NFC input yield identical text
    return unicodedata.normalize("NFC", query)
31
+
32
+
33
def extract_key_phrases(query: str) -> List[str]:
    """
    Extract key words and two-word phrases from a query.

    The query is lowercased and composed to Unicode NFC, multi-word stop
    phrases are removed, and the remaining text is split into word tokens.
    Tokens that are stopwords or have two characters or fewer are dropped.
    Bigrams are then built from consecutive surviving tokens.

    Args:
        query: Input query string.

    Returns:
        List with the surviving single words first, followed by the
        bigrams built from them; an empty list for empty input.
    """
    if not query:
        return []

    # Single-token stopwords, checked against each token after splitting
    stopwords = {
        "là", "gì", "của", "và", "hoặc",
        "tôi", "bạn", "có", "không", "được", "một", "các", "với", "cho",
    }

    # Multi-word stop phrases can never equal a single token, so they have
    # to be stripped from the text before it is tokenized
    stop_phrases = ("như thế nào", "bao nhiêu", "ở đâu")

    # NFC keeps each accented letter as one code point; otherwise \w+ would
    # split words at combining marks
    text = unicodedata.normalize("NFC", query.lower())

    for phrase in stop_phrases:
        # Tolerate any amount of whitespace between the words of a phrase
        phrase_pattern = r"\b" + r"\s+".join(
            re.escape(unicodedata.normalize("NFC", part))
            for part in phrase.split()
        ) + r"\b"
        text = re.sub(phrase_pattern, " ", text)

    # Split into words
    words = re.findall(r'\b\w+\b', text)

    # Filter stopwords and short words
    key_words = [w for w in words if w not in stopwords and len(w) > 2]

    # Bigrams are formed from neighbours in the filtered list
    phrases = [f"{left} {right}" for left, right in zip(key_words, key_words[1:])]

    # Single words first, then phrases
    return key_words + phrases
68
+
69
+
70
def expand_query_semantically(query: str, context: Optional[Dict[str, Any]] = None) -> List[str]:
    """
    Expand query with synonyms and related terms.

    The result starts with the original query, followed by the output of
    expand_query_with_synonyms, context-derived variations, and the
    variations from _get_vietnamese_variations. Entries whose normalized
    form was already seen are dropped; the first occurrence is kept.

    Args:
        query: Original query string.
        context: Optional context dictionary. Its "entities" mapping is read
            for "fine_code" and "procedure_name"; a missing or None mapping
            and empty/None entity values are ignored.

    Returns:
        List of expanded query variations, without duplicates, in the order
        they were generated.
    """
    expanded = [query]

    # Use existing synonym expansion
    # NOTE(review): assumes expand_query_with_synonyms returns an iterable
    # of strings; confirm against hue_portal.core.search_ml
    expanded.extend(expand_query_with_synonyms(query))

    # Add context-based expansions
    if context:
        # "entities" may be present but None, so fall back to an empty dict
        entities = context.get("entities") or {}

        # Could look up the fine name from the database and add variations
        for entity_key in ("fine_code", "procedure_name"):
            entity_value = entities.get(entity_key)
            # Skip None/empty values: they would only yield "query None"
            # or a duplicate of the query itself
            if entity_value:
                expanded.append(f"{query} {entity_value}")

    # Add common Vietnamese variations
    expanded.extend(_get_vietnamese_variations(query))

    # Remove duplicates while preserving order; comparison is done on the
    # normalized form but the original spelling is what gets returned
    seen = set()
    unique_expanded = []
    for candidate in expanded:
        candidate_key = normalize_vietnamese_query(candidate)
        if candidate_key not in seen:
            seen.add(candidate_key)
            unique_expanded.append(candidate)

    return unique_expanded
116
+
117
+
118
def _get_vietnamese_variations(query: str) -> List[str]:
    """
    Build query variants by swapping known Vietnamese terms for synonyms.

    The query is lowercased; for every table entry whose key occurs in it,
    one variant is produced per synonym by replacing all occurrences of the
    key.

    Args:
        query: Input query.

    Returns:
        List of lowercased variations, in table order; empty when no key
        occurs in the query.
    """
    lowered = query.lower()

    # Term -> interchangeable wordings
    replacement_table = {
        "mức phạt": ["tiền phạt", "phạt", "xử phạt"],
        "thủ tục": ["hồ sơ", "giấy tờ", "quy trình"],
        "địa chỉ": ["nơi", "chỗ", "điểm"],
        "số điện thoại": ["điện thoại", "số liên hệ", "hotline"],
        "giờ làm việc": ["thời gian", "giờ", "lịch làm việc"],
        "cảnh báo": ["thông báo", "lưu ý", "chú ý"],
        "lừa đảo": ["scam", "gian lận", "lừa"],
    }

    results = []
    for term, alternatives in replacement_table.items():
        if term not in lowered:
            continue
        rewritten = (lowered.replace(term, alternative) for alternative in alternatives)
        # Keep only rewrites that actually differ from the lowercased query
        results.extend(text for text in rewritten if text != lowered)

    return results
150
+
151
+
152
def paraphrase_query(query: str) -> List[str]:
    """
    Produce alternative phrasings of a query to widen search recall.

    The original query always comes first. Regex rewrite rules and
    question-word substitutions are then applied to the lowercased query,
    and duplicates are removed while keeping first-seen order.

    Args:
        query: Original query string.

    Returns:
        List of paraphrased queries, starting with the original.
    """
    lowered = query.lower()
    candidates = [query]

    # (pattern, replacement) rewrites for common Vietnamese question shapes
    rewrite_rules = (
        (r"mức phạt (.+) là bao nhiêu", r"phạt \1 bao nhiêu tiền"),
        (r"thủ tục (.+) cần gì", r"làm thủ tục \1 cần giấy tờ gì"),
        (r"địa chỉ (.+) ở đâu", r"\1 ở đâu"),
        (r"(.+) như thế nào", r"cách \1"),
    )
    for regex, template in rewrite_rules:
        rewritten, hit_count = re.subn(regex, template, lowered)
        if hit_count and rewritten != lowered:
            candidates.append(rewritten)

    # Question-word marker -> substitutes tried in order
    question_swaps = (
        ("bao nhiêu", ("mức", "giá")),
        ("như thế nào", ("cách", "quy trình")),
    )
    for marker, substitutes in question_swaps:
        if marker in lowered:
            candidates.extend(lowered.replace(marker, word) for word in substitutes)

    # Order-preserving de-duplication
    unique = []
    seen = set()
    for candidate in candidates:
        if candidate in seen:
            continue
        seen.add(candidate)
        unique.append(candidate)
    return unique
191
+
192
+
193
def enhance_query_with_context(query: str, context: Optional[Dict[str, Any]] = None) -> str:
    """
    Enhance query with context information.

    Entity values ("fine_code", "procedure_name", "office_name") found in
    context["entities"] are appended to the query in that order, followed
    by a fixed keyword phrase for a recognized context["intent"].

    Args:
        query: Original query string.
        context: Optional context dictionary. A missing or None "entities"
            mapping is treated as empty; None or empty entity values are
            skipped and other values are converted with str().

    Returns:
        Enhanced query string: the parts joined by single spaces, or the
        query unchanged when no context is given.
    """
    if not context:
        return query

    enhanced_parts = [query]

    # "entities" may be present but None, so fall back to an empty dict
    entities = context.get("entities") or {}
    for entity_key in ("fine_code", "procedure_name", "office_name"):
        entity_value = entities.get(entity_key)
        # None/empty values add nothing but stray spaces
        if entity_value is None or entity_value == "":
            continue
        # str() keeps the final join from raising on non-string values
        enhanced_parts.append(str(entity_value))

    # Intent -> keyword phrase appended to steer the search
    intent_keywords = {
        "search_fine": "mức phạt vi phạm",
        "search_procedure": "thủ tục hành chính",
        "search_office": "đơn vị công an",
    }
    intent = context.get("intent", "")
    # The isinstance check keeps unhashable intents from breaking the lookup
    if isinstance(intent, str) and intent in intent_keywords:
        enhanced_parts.append(intent_keywords[intent])

    return " ".join(enhanced_parts)
228
+