File size: 2,867 Bytes
a19d0ca
 
 
 
 
 
 
 
 
 
 
 
7073ee1
a19d0ca
7073ee1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a19d0ca
 
7073ee1
 
 
 
 
 
 
 
a19d0ca
 
7073ee1
a19d0ca
7073ee1
a19d0ca
7073ee1
 
a19d0ca
7073ee1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a19d0ca
 
7073ee1
a19d0ca
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# src/language_utils.py
from langdetect import detect, DetectorFactory
from typing import Optional, List
import logging

# Seed langdetect's detector factory with a fixed value so that repeated
# detections of the same text yield the same (more stable) result.
DetectorFactory.seed = 0

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

class LanguageUtils:
    """Utility class for language-code operations.

    Every method is a classmethod working on the class-level tables below,
    so the helpers can be called on the class itself or on an instance.
    """

    # Language code -> English language name.
    SUPPORTED_LANGUAGES = {
        # Common European languages
        'en': 'English',
        'ru': 'Russian',
        'de': 'German',
        'fr': 'French',
        'es': 'Spanish',
        'it': 'Italian',
        'pt': 'Portuguese',
        'nl': 'Dutch',
        'pl': 'Polish',
        'sv': 'Swedish',
        'no': 'Norwegian',
        'da': 'Danish',
        'fi': 'Finnish',

        # Asian languages
        'zh': 'Chinese',
        'ja': 'Japanese',
        'ko': 'Korean',

        # Other widely used languages
        'ar': 'Arabic',
        'hi': 'Hindi',
        'tr': 'Turkish',
        'cs': 'Czech',
        'uk': 'Ukrainian',
        'bg': 'Bulgarian',
        'el': 'Greek',
        'he': 'Hebrew',
        'th': 'Thai',
        'vi': 'Vietnamese',
        'hu': 'Hungarian',
        'sk': 'Slovak',
        'ro': 'Romanian',
        'id': 'Indonesian',
        'ms': 'Malay',
    }

    # Code returned when no supported equivalent can be found.
    _FALLBACK_LANGUAGE = 'en'

    # Unsupported code -> closest related code. Keys are lower-case and use
    # '-' as the separator. A target only takes effect when it is present in
    # SUPPORTED_LANGUAGES (see get_closest_supported_language).
    _SIMILAR_LANGUAGES = {
        'nb': 'no',     # Norwegian Bokmål -> Norwegian
        'nn': 'no',     # Norwegian Nynorsk -> Norwegian
        'zh-cn': 'zh',  # Chinese Simplified -> Chinese
        'zh-tw': 'zh',  # Chinese Traditional -> Chinese
        'hr': 'sr',     # Croatian -> Serbian (similar; 'sr' not yet supported)
        'bs': 'sr',     # Bosnian -> Serbian (similar; 'sr' not yet supported)
        'mk': 'bg',     # Macedonian -> Bulgarian (similar)
        'be': 'ru',     # Belarusian -> Russian (similar)
        'ca': 'es',     # Catalan -> Spanish (similar)
        'gl': 'pt',     # Galician -> Portuguese (similar)
        'af': 'nl',     # Afrikaans -> Dutch (similar)
    }

    @classmethod
    def get_language_name(cls, lang_code: str) -> str:
        """Return the English name for ``lang_code``.

        Args:
            lang_code: Language code to look up (exact, case-sensitive match
                against ``SUPPORTED_LANGUAGES``).

        Returns:
            The language name, or ``"Unknown"`` if the code is not supported.
        """
        return cls.SUPPORTED_LANGUAGES.get(lang_code, "Unknown")

    @classmethod
    def is_supported(cls, lang_code: str) -> bool:
        """Return True if ``lang_code`` is a key of ``SUPPORTED_LANGUAGES``.

        The check is an exact, case-sensitive match.
        """
        return lang_code in cls.SUPPORTED_LANGUAGES

    @classmethod
    def get_closest_supported_language(cls, lang_code: str) -> str:
        """Return the closest *supported* language code for ``lang_code``.

        This helps with codes that are variants of, or closely related to,
        a supported language (for example ``'nb'`` -> ``'no'`` or
        ``'zh-cn'`` -> ``'zh'``).

        Resolution order:
            1. ``lang_code`` itself, if it is supported.
            2. The normalized code (trimmed, lower-cased, ``_`` -> ``-``)
               and its entry in the similarity table.
            3. The primary subtag (text before the first ``-``) and its
               entry in the similarity table, e.g. ``'pt-BR'`` -> ``'pt'``.
            4. ``'en'`` as the final fallback.

        Args:
            lang_code: Language code, optionally with a region suffix.

        Returns:
            A code that is always a key of ``SUPPORTED_LANGUAGES``.
        """
        supported = cls.SUPPORTED_LANGUAGES
        if lang_code in supported:
            return lang_code

        # None or other non-string input cannot be normalized.
        if not isinstance(lang_code, str):
            return cls._FALLBACK_LANGUAGE

        normalized = lang_code.strip().lower().replace('_', '-')
        primary = normalized.partition('-')[0]

        candidates = (
            normalized,
            cls._SIMILAR_LANGUAGES.get(normalized),
            primary,
            cls._SIMILAR_LANGUAGES.get(primary),
        )
        for candidate in candidates:
            # Guard: never hand back a code that is not actually supported
            # (a similarity target may be missing from SUPPORTED_LANGUAGES).
            if candidate in supported:
                return candidate

        return cls._FALLBACK_LANGUAGE
    
# Create a shared module-level instance for convenient import
# (`from ... import language_processor`). LanguageUtils exposes only
# classmethods and class-level data, so the instance holds no state of its own.
language_processor = LanguageUtils()