"""Vietnamese text normalization for address matching.

Uses underthesea character normalization (NFC + character map) to fix
encoding issues, then applies address-specific transformations
(abbreviation expansion, diacritics removal, key generation).
"""

import re
import unicodedata

from underthesea.pipeline.text_normalize.character_normalize import (
    normalize_characters_in_text,
)

| | ABBREVIATIONS = { |
| | "tp.": "thành phố ", |
| | "tp ": "thành phố ", |
| | "t.p.": "thành phố ", |
| | "t.p ": "thành phố ", |
| | "p.": "phường ", |
| | "q.": "quận ", |
| | "h.": "huyện ", |
| | "tx.": "thị xã ", |
| | "t.x.": "thị xã ", |
| | "tt.": "thị trấn ", |
| | "t.t.": "thị trấn ", |
| | "x.": "xã ", |
| | } |
| | def remove_diacritics(text: str) -> str: |
| | """Remove Vietnamese diacritics from text. |
| | |
| | First applies underthesea character normalization (NFC + character map) |
| | to fix encoding issues, then strips combining marks via NFKD decomposition. |
| | """ |
| | text = normalize_characters_in_text(text) |
| | nfkd = unicodedata.normalize("NFKD", text) |
| | result = "".join(c for c in nfkd if not unicodedata.combining(c)) |
| | |
| | result = result.replace("đ", "d").replace("Đ", "D") |
| | return result |
| | def normalize_key(text: str) -> str: |
| | """Normalize text to a lookup key (lowercase, no diacritics, no spaces/punctuation).""" |
| | text = text.lower().strip() |
| | text = remove_diacritics(text) |
| | text = re.sub(r"[^a-z0-9]", "", text) |
| | return text |
| | def expand_abbreviations(text: str) -> str: |
| | """Expand common Vietnamese address abbreviations.""" |
| | result = text.lower().strip() |
| | |
| | for abbr, full in sorted(ABBREVIATIONS.items(), key=lambda x: -len(x[0])): |
| | result = result.replace(abbr, full) |
| | return result.strip() |
| | def normalize_for_matching(text: str) -> str: |
| | """Full normalization pipeline for fuzzy matching.""" |
| | text = expand_abbreviations(text) |
| | return normalize_key(text) |