GraphGen / graphgen /utils /detect_lang.py
github-actions[bot]
Auto-sync from demo at Thu Oct 23 11:07:54 UTC 2025
0b9d8c7
raw
history blame
964 Bytes
def detect_main_language(text):
"""
Detect the main language of the text, 'zh' for Chinese, 'en' for English
:param text:
:return:
"""
assert isinstance(text, str)
def is_chinese_char(char):
return "\u4e00" <= char <= "\u9fff"
def is_english_char(char):
return char.isascii() and char.isalpha()
text = "".join(char for char in text if char.strip())
chinese_count = sum(1 for char in text if is_chinese_char(char))
english_count = sum(1 for char in text if is_english_char(char))
total = chinese_count + english_count
if total == 0:
return "en"
chinese_ratio = chinese_count / total
if chinese_ratio >= 0.5:
return "zh"
return "en"
def detect_if_chinese(text):
"""
Detect if the text contains any Chinese characters
:param text:
:return:
"""
assert isinstance(text, str)
return any("\u4e00" <= char <= "\u9fff" for char in text)