# davidtran999's picture
# Upload backend/chatbot/query_expansion.py with huggingface_hub
# eac17f7 verified
"""
Query expansion and paraphrasing utilities for improving search recall.
"""
import re
import unicodedata
from typing import List, Dict, Any, Optional, Set
from hue_portal.core.models import Synonym
from hue_portal.core.search_ml import expand_query_with_synonyms
def normalize_vietnamese_query(query: str) -> str:
    """
    Normalize a Vietnamese query so that equivalent spellings compare equal.

    Surrounding whitespace is stripped, internal whitespace runs are
    collapsed to a single space, the text is lowercased, and the result is
    composed to Unicode NFC. The NFC step matters for Vietnamese: the same
    accented letter can arrive either precomposed or as a base letter
    followed by combining marks, and without composition the two forms
    would be treated as different strings.

    Args:
        query: Input query string. A falsy value (``None`` or ``""``)
            yields an empty string.

    Returns:
        Normalized query string.
    """
    if not query:
        return ""

    # Collapse every whitespace run (spaces, tabs, newlines) to one space.
    collapsed = re.sub(r'\s+', ' ', query.strip())

    # Compose last so the returned value is guaranteed to be in NFC form.
    return unicodedata.normalize("NFC", collapsed.lower())
def extract_key_phrases(query: str) -> List[str]:
    """
    Extract key words and adjacent-word phrases from a query.

    The query is composed to Unicode NFC and lowercased, split into word
    tokens, and filtered: stopwords (including multi-word stopword phrases
    such as "bao nhiêu") and tokens of two characters or fewer are
    dropped. Bigrams are then built from each pair of neighbouring
    surviving words.

    Args:
        query: Input query string. A falsy value yields an empty list.

    Returns:
        The surviving single words in order, followed by the bigrams
        built from them.
    """
    if not query:
        return []

    stopwords = {
        "là", "gì", "bao nhiêu", "như thế nào", "ở đâu", "của", "và", "hoặc",
        "tôi", "bạn", "có", "không", "được", "một", "các", "với", "cho"
    }
    single_stopwords = {s for s in stopwords if " " not in s}
    # Multi-word stopwords can never equal a single token, so they are
    # matched as token sequences instead. Longest first, so a longer phrase
    # is preferred over a shorter one starting at the same position.
    stopword_phrases = sorted(
        (tuple(s.split()) for s in stopwords if " " in s),
        key=len,
        reverse=True,
    )

    # Compose to NFC before tokenizing: ``\w`` does not match combining
    # marks, so decomposed diacritics would otherwise split a word in two.
    normalized = unicodedata.normalize("NFC", query).lower()
    words = re.findall(r'\b\w+\b', normalized)

    key_words = []
    i = 0
    while i < len(words):
        phrase = next(
            (p for p in stopword_phrases if tuple(words[i:i + len(p)]) == p),
            None,
        )
        if phrase is not None:
            # Skip the whole stopword phrase in one step.
            i += len(phrase)
            continue
        word = words[i]
        if word not in single_stopwords and len(word) > 2:
            key_words.append(word)
        i += 1

    # Bigrams are formed over the filtered words, so two words separated
    # only by dropped tokens still form a phrase.
    bigrams = [f"{first} {second}" for first, second in zip(key_words, key_words[1:])]

    return key_words + bigrams
def expand_query_semantically(query: str, context: Optional[Dict[str, Any]] = None) -> List[str]:
    """
    Expand a query with synonyms, context entities and Vietnamese variations.

    The result starts with the original query, followed by whatever
    ``expand_query_with_synonyms`` yields, then one "<query> <entity>"
    variant for each usable ``fine_code`` / ``procedure_name`` entity in
    the context, then the output of ``_get_vietnamese_variations``.
    Entries whose normalized form has already been seen are dropped.

    Args:
        query: Original query string.
        context: Optional context dictionary. Only its ``"entities"``
            mapping is read here; a missing or ``None`` value is treated
            as no entities.

    Returns:
        List of expanded query variations, de-duplicated in first-seen
        order. The entries keep their original (un-normalized) text.
    """
    expanded = [query]

    # Use existing synonym expansion
    expanded.extend(expand_query_with_synonyms(query))

    # Add context-based expansions
    if context:
        # ``.get("entities", {})`` would still return None when the key is
        # present with a None value, so fall back explicitly.
        entities = context.get("entities") or {}
        for entity_key in ("fine_code", "procedure_name"):
            value = entities.get(entity_key)
            # Skip None/empty values rather than emitting "<query> None".
            if value:
                expanded.append(f"{query} {value}")

    # Add common Vietnamese variations
    expanded.extend(_get_vietnamese_variations(query))

    # Remove duplicates while preserving order; comparison is done on the
    # normalized form, but the original text is what gets returned.
    seen = set()
    unique_expanded = []
    for candidate in expanded:
        normalized = normalize_vietnamese_query(candidate)
        if normalized not in seen:
            seen.add(normalized)
            unique_expanded.append(candidate)

    return unique_expanded
def _get_vietnamese_variations(query: str) -> List[str]:
    """
    Build query variants by swapping known Vietnamese phrases for synonyms.

    The query is lowercased; for every known phrase it contains, one
    variant is produced per synonym by replacing all occurrences of that
    phrase.

    Args:
        query: Input query.

    Returns:
        List of lowercased variations; empty when no known phrase occurs.
    """
    lowered = query.lower()

    # Phrase -> interchangeable wordings, checked in this order.
    replacements = {
        "mức phạt": ["tiền phạt", "phạt", "xử phạt"],
        "thủ tục": ["hồ sơ", "giấy tờ", "quy trình"],
        "địa chỉ": ["nơi", "chỗ", "điểm"],
        "số điện thoại": ["điện thoại", "số liên hệ", "hotline"],
        "giờ làm việc": ["thời gian", "giờ", "lịch làm việc"],
        "cảnh báo": ["thông báo", "lưu ý", "chú ý"],
        "lừa đảo": ["scam", "gian lận", "lừa"],
    }

    results: List[str] = []
    for phrase, alternatives in replacements.items():
        if phrase not in lowered:
            continue
        candidates = (lowered.replace(phrase, alternative) for alternative in alternatives)
        # Keep only candidates that actually differ from the input.
        results.extend(c for c in candidates if c != lowered)

    return results
def paraphrase_query(query: str) -> List[str]:
    """
    Produce alternative phrasings of a query to widen search recall.

    The original query always comes first. Rewrites are computed on the
    lowercased query: first the regex rewrite rules, then plain
    substitutions of the question words "bao nhiêu" and "như thế nào".

    Args:
        query: Original query string.

    Returns:
        List of paraphrased queries without duplicates, in the order they
        were generated.
    """
    lowered = query.lower()

    # (pattern, replacement template) pairs for common Vietnamese questions.
    rewrite_rules = (
        (r"mức phạt (.+) là bao nhiêu", r"phạt \1 bao nhiêu tiền"),
        (r"thủ tục (.+) cần gì", r"làm thủ tục \1 cần giấy tờ gì"),
        (r"địa chỉ (.+) ở đâu", r"\1 ở đâu"),
        (r"(.+) như thế nào", r"cách \1"),
    )

    candidates = [query]
    for regex, template in rewrite_rules:
        # re.sub returns the input unchanged when the pattern does not
        # match, so the inequality check covers the no-match case too.
        rewritten = re.sub(regex, template, lowered)
        if rewritten != lowered:
            candidates.append(rewritten)

    # Question word -> substitutes tried in order.
    question_word_swaps = (
        ("bao nhiêu", ("mức", "giá")),
        ("như thế nào", ("cách", "quy trình")),
    )
    for question_word, substitutes in question_word_swaps:
        if question_word in lowered:
            candidates.extend(
                lowered.replace(question_word, substitute) for substitute in substitutes
            )

    # Drop repeats while keeping first-seen order.
    already_seen = set()
    unique = []
    for candidate in candidates:
        if candidate not in already_seen:
            already_seen.add(candidate)
            unique.append(candidate)
    return unique
def enhance_query_with_context(query: str, context: Optional[Dict[str, Any]] = None) -> str:
    """
    Append context-derived terms to a query.

    Usable ``fine_code``, ``procedure_name`` and ``office_name`` entity
    values are appended in that order, followed by a fixed keyword phrase
    when the context intent is ``search_fine``, ``search_procedure`` or
    ``search_office``. All parts are joined with single spaces.

    Args:
        query: Original query string.
        context: Optional context dictionary; its ``"entities"`` mapping
            and ``"intent"`` value are read. A missing or ``None``
            ``"entities"`` entry is treated as no entities.

    Returns:
        Enhanced query string, or ``query`` unchanged when ``context`` is
        falsy.
    """
    if not context:
        return query

    enhanced_parts = [query]

    # ``.get("entities", {})`` would still return None when the key is
    # present with a None value, so fall back explicitly.
    entities = context.get("entities") or {}
    for entity_key in ("fine_code", "procedure_name", "office_name"):
        value = entities.get(entity_key)
        # Skip None/empty values, and stringify so a non-string entity
        # does not make ``" ".join`` raise TypeError.
        if value:
            enhanced_parts.append(str(value))

    # Add intent-based keywords
    intent = context.get("intent", "")
    if intent == "search_fine":
        enhanced_parts.append("mức phạt vi phạm")
    elif intent == "search_procedure":
        enhanced_parts.append("thủ tục hành chính")
    elif intent == "search_office":
        enhanced_parts.append("đơn vị công an")

    return " ".join(enhanced_parts)