File size: 964 Bytes
acd7cf4
 
0b9d8c7
acd7cf4
 
 
 
 
0b9d8c7
acd7cf4
0b9d8c7
acd7cf4
 
 
 
0b9d8c7
acd7cf4
 
 
 
 
 
0b9d8c7
acd7cf4
 
 
 
0b9d8c7
 
 
acd7cf4
 
 
0b9d8c7
acd7cf4
 
 
 
 
 
0b9d8c7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def detect_main_language(text):
    """
    Classify text as predominantly Chinese ('zh') or English ('en').

    Only two kinds of characters are counted: ideographs in the range
    U+4E00..U+9FFF and ASCII letters. Everything else (whitespace, digits,
    punctuation, letters of other scripts) is ignored.

    :param text: the string to classify; must be a ``str``
    :return: ``"zh"`` when at least half of the counted characters are
        Chinese, otherwise ``"en"``; ``"en"`` as well when no character
        was counted at all
    """
    assert isinstance(text, str)

    zh_hits = 0
    en_hits = 0
    # Whitespace needs no special handling: it matches neither class below.
    for ch in text:
        if "\u4e00" <= ch <= "\u9fff":
            zh_hits += 1
        elif ch.isascii() and ch.isalpha():
            en_hits += 1

    counted = zh_hits + en_hits
    if not counted:
        # Nothing recognisable to vote on: fall back to English.
        return "en"

    # An exact 50/50 split resolves to Chinese.
    return "zh" if zh_hits / counted >= 0.5 else "en"


def detect_if_chinese(text):
    """
    Report whether the text has at least one Chinese character.

    A character counts as Chinese when it lies in the range U+4E00..U+9FFF.

    :param text: the string to scan; must be a ``str``
    :return: ``True`` if such a character is present, ``False`` otherwise
    """
    assert isinstance(text, str)

    for ch in text:
        if "\u4e00" <= ch <= "\u9fff":
            # One match is enough; stop scanning.
            return True
    return False