status-law-gbot / src /language_utils.py
Rulga's picture
Refactor language utilities to enhance language support and detection accuracy
7073ee1
# src/language_utils.py
from langdetect import detect, DetectorFactory
from typing import Optional, List
import logging
# Pin langdetect's random seed so that detecting the same text repeatedly
# yields the same result (per the langdetect docs, detection is otherwise
# non-deterministic).
DetectorFactory.seed = 0
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class LanguageUtils:
    """Utility class for language-code lookups.

    Every method is a classmethod that works on the class-level tables
    below, so they can be called on the class itself or on an instance.
    """

    # Code returned when no supported match can be found.
    DEFAULT_LANGUAGE = 'en'

    # Language code -> English display name.
    SUPPORTED_LANGUAGES = {
        # Common European languages
        'en': 'English',
        'ru': 'Russian',
        'de': 'German',
        'fr': 'French',
        'es': 'Spanish',
        'it': 'Italian',
        'pt': 'Portuguese',
        'nl': 'Dutch',
        'pl': 'Polish',
        'sv': 'Swedish',
        'no': 'Norwegian',
        'da': 'Danish',
        'fi': 'Finnish',
        # Asian languages
        'zh': 'Chinese',
        'ja': 'Japanese',
        'ko': 'Korean',
        # Other widely used languages
        'ar': 'Arabic',
        'hi': 'Hindi',
        'tr': 'Turkish',
        'cs': 'Czech',
        'uk': 'Ukrainian',
        'bg': 'Bulgarian',
        'el': 'Greek',
        'he': 'Hebrew',
        'th': 'Thai',
        'vi': 'Vietnamese',
        'hu': 'Hungarian',
        'sk': 'Slovak',
        'ro': 'Romanian',
        'id': 'Indonesian',
        'ms': 'Malay',
    }

    # Regional variants and closely related languages -> nearest relative.
    # Targets are re-checked against SUPPORTED_LANGUAGES at lookup time, so
    # an entry whose target is not supported (currently 'sr') resolves to
    # DEFAULT_LANGUAGE instead of leaking an unsupported code to callers.
    _SIMILAR_LANGUAGES = {
        'nb': 'no',     # Norwegian Bokmål → Norwegian
        'nn': 'no',     # Norwegian Nynorsk → Norwegian
        'zh-cn': 'zh',  # Chinese Simplified → Chinese
        'zh-tw': 'zh',  # Chinese Traditional → Chinese
        'hr': 'sr',     # Croatian → Serbian (similar)
        'bs': 'sr',     # Bosnian → Serbian (similar)
        'mk': 'bg',     # Macedonian → Bulgarian (similar)
        'be': 'ru',     # Belarusian → Russian (similar)
        'ca': 'es',     # Catalan → Spanish (similar)
        'gl': 'pt',     # Galician → Portuguese (similar)
        'af': 'nl',     # Afrikaans → Dutch (similar)
    }

    @classmethod
    def get_language_name(cls, lang_code: str) -> str:
        """Return the English name for *lang_code*, or "Unknown".

        The lookup is exact: no case folding or variant mapping is applied.
        """
        return cls.SUPPORTED_LANGUAGES.get(lang_code, "Unknown")

    @classmethod
    def is_supported(cls, lang_code: str) -> bool:
        """Return True if *lang_code* is an exact key of SUPPORTED_LANGUAGES."""
        return lang_code in cls.SUPPORTED_LANGUAGES

    @classmethod
    def get_closest_supported_language(cls, lang_code: str) -> str:
        """Return a supported language code that best matches *lang_code*.

        Resolution order:

        1. *lang_code* itself, if it is already supported.
        2. The normalized code (trimmed, lower-cased, ``_`` -> ``-``) or its
           mapping in the similar-language table (e.g. ``'zh-cn'`` -> ``'zh'``,
           ``'nb'`` -> ``'no'``).
        3. The primary subtag of a regional code (``'pt-BR'`` -> ``'pt'``) or
           its mapping in the similar-language table.
        4. ``DEFAULT_LANGUAGE`` (``'en'``).

        The returned code is always a key of ``SUPPORTED_LANGUAGES``.
        Non-string input (such as ``None``) yields the default.
        """
        if not isinstance(lang_code, str):
            return cls.DEFAULT_LANGUAGE
        if lang_code in cls.SUPPORTED_LANGUAGES:
            return lang_code

        normalized = lang_code.strip().lower().replace('_', '-')
        primary = normalized.partition('-')[0]
        candidates = (
            normalized,
            cls._SIMILAR_LANGUAGES.get(normalized),
            primary,
            cls._SIMILAR_LANGUAGES.get(primary),
        )
        for candidate in candidates:
            # A missing mapping is None, which is never a supported key.
            if candidate in cls.SUPPORTED_LANGUAGES:
                return candidate
        return cls.DEFAULT_LANGUAGE
# Shared module-level instance for convenient import. Every LanguageUtils
# method is a classmethod, so this instance carries no state of its own.
language_processor = LanguageUtils()