Spaces: Running on Zero
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import re
def split_to_mini_sentence(words: list, word_limit: int = 20):
    """Split a token list into consecutive chunks of at most ``word_limit`` items.

    Returns a list of sub-lists; all chunks are full-sized except possibly
    the last one, which holds the remainder.
    """
    assert word_limit > 1
    if len(words) <= word_limit:
        return [words]
    # One stepped slice per chunk; the final slice naturally keeps the tail.
    return [words[start : start + word_limit] for start in range(0, len(words), word_limit)]
def split_words(text: str, jieba_usr_dict=None, **kwargs):
    """Tokenize ``text`` into a flat list of words.

    When ``jieba_usr_dict`` is supplied, whitespace-separated tokens are
    grouped into alternating English / non-English runs; English runs are
    kept token-by-token and non-English runs are re-segmented with the
    jieba dictionary (HMM disabled).  Without it, every non-ASCII character
    becomes its own token and consecutive ASCII characters are joined into
    a single word.
    """
    if jieba_usr_dict:
        runs = []         # completed same-language token runs
        run_langs = []    # language tag ("English"/"Chinese") per run
        current_run = []
        current_lang = None
        for tok in text.split():
            tok_is_en = isEnglish(tok)
            # Flush the pending run whenever the language switches.
            if tok_is_en and current_lang == "Chinese":
                runs.append(current_run)
                run_langs.append("Chinese")
                current_run = []
            elif not tok_is_en and current_lang == "English":
                runs.append(current_run)
                run_langs.append("English")
                current_run = []
            current_run.append(tok)
            current_lang = "English" if tok_is_en else "Chinese"
        if current_run:
            runs.append(current_run)
            run_langs.append(current_lang)
        result = []
        for run, lang in zip(runs, run_langs):
            if lang == "English":
                result.extend(run)
            else:
                # NOTE(review): jieba_usr_dict is assumed to expose jieba's
                # Tokenizer.cut interface — confirm against the caller.
                result.extend(jieba_usr_dict.cut(join_chinese_and_english(run), HMM=False))
        return result
    words = []
    for seg in text.split():
        # seg contains no whitespace; accumulate ASCII chars into one word,
        # emit every multi-byte (treated-as-Chinese) char on its own.
        ascii_buf = ""
        for ch in seg:
            if len(ch.encode()) == 1:
                ascii_buf += ch
            else:
                if ascii_buf:
                    words.append(ascii_buf)
                    ascii_buf = ""
                words.append(ch)
        if ascii_buf:
            words.append(ascii_buf)
    return words
def isEnglish(text: str):
    """Return True if ``text`` consists only of ASCII letters and apostrophes."""
    # Anchored pattern kept verbatim from the original; re.search returns a
    # Match object on success, None otherwise.
    return re.search("^[a-zA-Z']+$", text) is not None
def join_chinese_and_english(input_list):
    """Join tokens into one string: English tokens get a leading space,
    all other tokens are concatenated directly; surrounding whitespace is
    stripped from the result."""
    pieces = [(" " + tok) if isEnglish(tok) else tok for tok in input_list]
    return "".join(pieces).strip()