Spaces:
Running
Running
File size: 2,867 Bytes
a19d0ca 7073ee1 a19d0ca 7073ee1 a19d0ca 7073ee1 a19d0ca 7073ee1 a19d0ca 7073ee1 a19d0ca 7073ee1 a19d0ca 7073ee1 a19d0ca 7073ee1 a19d0ca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
# src/language_utils.py
from langdetect import detect, DetectorFactory
from typing import Optional, List
import logging
# Pin langdetect's seed at import time so that detecting the same text
# repeatedly yields the same result (the original note here: "for more
# stable language detection"). This mutates library-global state.
DetectorFactory.seed = 0
# Module-level logger named after this module; handlers and levels are
# expected to be configured by the application, not here.
logger = logging.getLogger(__name__)
class LanguageUtils:
    """Utility class for language-code operations.

    Every method is a classmethod working on the class-level tables below,
    so the class can be used directly without creating an instance.
    """

    # Language code -> English display name for every supported language.
    SUPPORTED_LANGUAGES = {
        # Common European languages
        'en': 'English',
        'ru': 'Russian',
        'de': 'German',
        'fr': 'French',
        'es': 'Spanish',
        'it': 'Italian',
        'pt': 'Portuguese',
        'nl': 'Dutch',
        'pl': 'Polish',
        'sv': 'Swedish',
        'no': 'Norwegian',
        'da': 'Danish',
        'fi': 'Finnish',
        # Asian languages
        'zh': 'Chinese',
        'ja': 'Japanese',
        'ko': 'Korean',
        # Other widely used languages
        'ar': 'Arabic',
        'hi': 'Hindi',
        'tr': 'Turkish',
        'cs': 'Czech',
        'uk': 'Ukrainian',
        'bg': 'Bulgarian',
        'el': 'Greek',
        'he': 'Hebrew',
        'th': 'Thai',
        'vi': 'Vietnamese',
        'hu': 'Hungarian',
        'sk': 'Slovak',
        'ro': 'Romanian',
        'id': 'Indonesian',
        'ms': 'Malay',
    }

    # Code returned when no supported match can be found.
    _FALLBACK_LANGUAGE = 'en'

    # Unsupported code -> closest related code. Targets are validated against
    # SUPPORTED_LANGUAGES at lookup time, so an entry whose target is not
    # (yet) supported simply resolves to the fallback instead of leaking an
    # unsupported code to the caller.
    _SIMILAR_LANGUAGES = {
        'nb': 'no',     # Norwegian Bokmål -> Norwegian
        'nn': 'no',     # Norwegian Nynorsk -> Norwegian
        'zh-cn': 'zh',  # Chinese Simplified -> Chinese
        'zh-tw': 'zh',  # Chinese Traditional -> Chinese
        'hr': 'sr',     # Croatian -> Serbian (inactive while 'sr' is unsupported)
        'bs': 'sr',     # Bosnian -> Serbian (inactive while 'sr' is unsupported)
        'mk': 'bg',     # Macedonian -> Bulgarian (similar)
        'be': 'ru',     # Belarusian -> Russian (similar)
        'ca': 'es',     # Catalan -> Spanish (similar)
        'gl': 'pt',     # Galician -> Portuguese (similar)
        'af': 'nl',     # Afrikaans -> Dutch (similar)
    }

    @classmethod
    def get_language_name(cls, lang_code: str) -> str:
        """Return the English name for ``lang_code``, or ``"Unknown"``."""
        return cls.SUPPORTED_LANGUAGES.get(lang_code, "Unknown")

    @classmethod
    def is_supported(cls, lang_code: str) -> bool:
        """Return True if ``lang_code`` is a key of ``SUPPORTED_LANGUAGES``."""
        return lang_code in cls.SUPPORTED_LANGUAGES

    @classmethod
    def get_closest_supported_language(cls, lang_code: str) -> str:
        """Return the supported language code closest to ``lang_code``.

        Helps with detector output that is a variant or close relative of a
        supported language, e.g. 'nb' (Norwegian Bokmål) -> 'no'.

        Resolution order:
          1. ``lang_code`` itself, if supported.
          2. The normalized tag (trimmed, lower-cased, '_' -> '-').
          3. The similar-language mapping of the normalized tag.
          4. The primary subtag (text before the first '-'), directly or
             through the similar-language mapping.
          5. ``'en'`` as the final fallback.

        The result is always a key of ``SUPPORTED_LANGUAGES``.

        Args:
            lang_code: Language code such as 'en', 'zh-cn' or 'pt_BR'.
                Non-string values resolve to the fallback.

        Returns:
            A supported language code.
        """
        supported = cls.SUPPORTED_LANGUAGES
        if not isinstance(lang_code, str):
            return cls._FALLBACK_LANGUAGE
        if lang_code in supported:
            return lang_code

        normalized = lang_code.strip().lower().replace('_', '-')
        primary = normalized.split('-', 1)[0]
        similar = cls._SIMILAR_LANGUAGES
        candidates = (
            normalized,
            similar.get(normalized),
            primary,
            similar.get(primary),
        )
        for candidate in candidates:
            # A missing mapping yields None, which is never a supported key.
            if candidate in supported:
                return candidate
        return cls._FALLBACK_LANGUAGE
# Shared module-level instance for convenient import
# (`from ... import language_processor`). LanguageUtils exposes only
# classmethods, so calls through this instance behave the same as calls
# on the class itself.
language_processor = LanguageUtils()
|