Spaces:
Sleeping
Sleeping
File size: 6,824 Bytes
eac17f7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 |
"""
Query expansion and paraphrasing utilities for improving search recall.
"""
import re
import unicodedata
from typing import List, Dict, Any, Optional, Set
from hue_portal.core.models import Synonym
from hue_portal.core.search_ml import expand_query_with_synonyms
def normalize_vietnamese_query(query: str) -> str:
    """
    Normalize a Vietnamese query so equivalent spellings compare equal.

    An accented Vietnamese letter can be encoded either as one precomposed
    code point or as a base letter followed by combining marks. Both forms
    render identically but compare unequal as strings, so besides collapsing
    whitespace and lowercasing, the text is converted to Unicode NFC.

    Args:
        query: Input query string.

    Returns:
        The query stripped of leading/trailing whitespace, with every run of
        whitespace collapsed to one space, lowercased and NFC-normalized.
        A falsy input (empty string, None) yields "".
    """
    if not query:
        return ""
    # Collapse any run of whitespace (spaces, tabs, newlines) to one space.
    collapsed = re.sub(r'\s+', ' ', query.strip())
    # Lowercase first, then compose, so decomposed uppercase input also ends
    # up in the single canonical (precomposed) form.
    return unicodedata.normalize("NFC", collapsed.lower())
def extract_key_phrases(query: str) -> List[str]:
    """
    Extract key words and two-word phrases from a query.

    The query is NFC-normalized and lowercased, multi-word stop phrases are
    removed, and the rest is split into words. Stopwords and words of two
    characters or fewer are dropped. Bigrams are then built from each pair of
    neighbouring surviving words.

    Args:
        query: Input query string.

    Returns:
        The surviving single words in query order, followed by the bigrams
        built from them. An empty or falsy query yields an empty list.
    """
    if not query:
        return []

    # Multi-word stop phrases have to be removed from the text itself: once
    # the query is split into single words they can no longer be matched.
    stop_phrases = ("như thế nào", "bao nhiêu", "ở đâu")
    stopwords = {
        "là", "gì", "của", "và", "hoặc",
        "tôi", "bạn", "có", "không", "được", "một", "các", "với", "cho",
    }

    # Compose combining marks first; otherwise `\w+` would split a decomposed
    # accented word at every combining mark.
    text = unicodedata.normalize("NFC", query).lower()

    # Allow any amount of whitespace between the words of a stop phrase.
    phrase_pattern = "|".join(
        r"\s+".join(re.escape(part) for part in phrase.split())
        for phrase in stop_phrases
    )
    text = re.sub(rf"\b(?:{phrase_pattern})\b", " ", text)

    words = re.findall(r'\b\w+\b', text)
    key_words = [w for w in words if w not in stopwords and len(w) > 2]

    # Bigrams pair neighbouring surviving words, so they may span a removed
    # stopword or stop phrase.
    bigrams = [f"{first} {second}" for first, second in zip(key_words, key_words[1:])]
    return key_words + bigrams
def expand_query_semantically(query: str, context: Optional[Dict[str, Any]] = None) -> List[str]:
    """
    Produce a de-duplicated list of variations of a query.

    Variations are gathered in this order: the query itself, the results of
    ``expand_query_with_synonyms``, the query suffixed with entity values
    found in the context, and the results of ``_get_vietnamese_variations``.

    Args:
        query: Original query string.
        context: Optional context dictionary. Only its "entities" entry is
            read, and within it only "fine_code" and "procedure_name".

    Returns:
        The variations in the order gathered. Entries whose normalized form
        (via ``normalize_vietnamese_query``) was already seen are dropped;
        the first spelling encountered is the one kept.
    """
    candidates = [query, *expand_query_with_synonyms(query)]

    if context:
        entities = context.get("entities", {})
        # Append each known entity value to the query as an extra variation.
        for entity_key in ("fine_code", "procedure_name"):
            if entity_key in entities:
                candidates.append(f"{query} {entities[entity_key]}")

    candidates += _get_vietnamese_variations(query)

    # Order-preserving de-duplication keyed on the normalized text.
    seen_keys = set()
    deduped = []
    for candidate in candidates:
        key = normalize_vietnamese_query(candidate)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        deduped.append(candidate)
    return deduped
def _get_vietnamese_variations(query: str) -> List[str]:
    """
    Rewrite a query by swapping known Vietnamese phrases for alternatives.

    Args:
        query: Input query.

    Returns:
        Lowercased copies of the query, one per alternative of every table
        phrase that occurs in it, in table order. Empty when no phrase occurs.
    """
    substitutions = {
        "mức phạt": ["tiền phạt", "phạt", "xử phạt"],
        "thủ tục": ["hồ sơ", "giấy tờ", "quy trình"],
        "địa chỉ": ["nơi", "chỗ", "điểm"],
        "số điện thoại": ["điện thoại", "số liên hệ", "hotline"],
        "giờ làm việc": ["thời gian", "giờ", "lịch làm việc"],
        "cảnh báo": ["thông báo", "lưu ý", "chú ý"],
        "lừa đảo": ["scam", "gian lận", "lừa"],
    }

    lowered = query.lower()
    results = []
    for phrase, alternatives in substitutions.items():
        if phrase not in lowered:
            continue
        swapped = (lowered.replace(phrase, alternative) for alternative in alternatives)
        # Keep only rewrites that actually differ from the lowercased query.
        results.extend(rewrite for rewrite in swapped if rewrite != lowered)
    return results
def paraphrase_query(query: str) -> List[str]:
    """
    Build alternative phrasings of a query to widen search recall.

    Args:
        query: Original query string.

    Returns:
        The original query followed by lowercased rewrites in the order they
        were produced, with repeated entries removed.
    """
    lowered = query.lower()
    candidates = [query]

    # Whole-question templates: (regex to match, replacement template).
    rewrite_rules = (
        (r"mức phạt (.+) là bao nhiêu", r"phạt \1 bao nhiêu tiền"),
        (r"thủ tục (.+) cần gì", r"làm thủ tục \1 cần giấy tờ gì"),
        (r"địa chỉ (.+) ở đâu", r"\1 ở đâu"),
        (r"(.+) như thế nào", r"cách \1"),
    )
    for rule, template in rewrite_rules:
        matcher = re.compile(rule)
        if matcher.search(lowered) is None:
            continue
        rewritten = matcher.sub(template, lowered)
        if rewritten != lowered:
            candidates.append(rewritten)

    # Question-word swaps: each marker found is replaced by every substitute.
    question_swaps = (
        ("bao nhiêu", ("mức", "giá")),
        ("như thế nào", ("cách", "quy trình")),
    )
    for marker, substitutes in question_swaps:
        if marker in lowered:
            candidates.extend(lowered.replace(marker, word) for word in substitutes)

    # Drop repeats while keeping first-seen order.
    unique = []
    seen = set()
    for candidate in candidates:
        if candidate not in seen:
            seen.add(candidate)
            unique.append(candidate)
    return unique
def enhance_query_with_context(query: str, context: Optional[Dict[str, Any]] = None) -> str:
    """
    Append context-derived terms to a query.

    Args:
        query: Original query string.
        context: Optional context dictionary. Its "entities" entry may supply
            "fine_code", "procedure_name" and "office_name"; its "intent"
            entry selects an extra keyword phrase.

    Returns:
        The query followed by the available entity values (in the order
        above) and the intent keyword phrase, joined by single spaces. The
        query is returned unchanged when no context is given.
    """
    if not context:
        return query

    parts = [query]

    # "entities" may be absent or explicitly None; treat both as empty.
    entities = context.get("entities") or {}
    for entity_key in ("fine_code", "procedure_name", "office_name"):
        value = entities.get(entity_key)
        # Skip missing/blank values, and coerce the rest to text so a
        # non-string value (e.g. a numeric code) cannot break the join below.
        if value is not None and value != "":
            parts.append(str(value))

    intent_keywords = {
        "search_fine": "mức phạt vi phạm",
        "search_procedure": "thủ tục hành chính",
        "search_office": "đơn vị công an",
    }
    intent = context.get("intent", "")
    # Guard the lookup so an unexpected (unhashable) intent value is ignored.
    if isinstance(intent, str) and intent in intent_keywords:
        parts.append(intent_keywords[intent])

    return " ".join(parts)
|