Spaces:

mrfakename
/

DMOSpeech2

Running on Zero

App Files Files Community

DMOSpeech2 / funasr_detach /utils /postprocess_utils.py

mrfakename

Super-squash branch 'main' using huggingface_hub

0102e16 verified 10 months ago

raw

history blame

9.23 kB

	# Copyright (c) Alibaba, Inc. and its affiliates.

	import string
	import logging
	from typing import Any, List, Union


	def isChinese(ch: str):
	    """Tell whether ``ch`` is treated as a "Chinese" token by the post-processor.

	    The comparison is a plain string comparison, so ``ch`` qualifies when it
	    sorts inside the range U+4E00..U+9FFF, inside the ASCII digit range
	    "0".."9", or when it is exactly the string "@".

	    Args:
	        ch: The string to classify (normally a single character).

	    Returns:
	        True when one of the three conditions above holds, otherwise False.
	    """
	    in_cjk_range = "\u4e00" <= ch <= "\u9fff"
	    in_digit_range = "\u0030" <= ch <= "\u0039"
	    return in_cjk_range or in_digit_range or ch == "@"


	def isAllChinese(word: Union[List[Any], str]):
	    """Check that every element of ``word`` passes ``isChinese``.

	    Each element (a list item, or a character when ``word`` is a string) first
	    has spaces and the markers "</s>", "<s>", "<unk>" and "<OOV>" removed, in
	    that order; the stripped result is then tested with ``isChinese``.

	    Args:
	        word: A list of string tokens or a single string.

	    Returns:
	        False for empty input or as soon as one stripped element fails
	        ``isChinese``; True otherwise.
	    """
	    # Removal order matters: "</s>" must be stripped before "<s>".
	    removable = (" ", "</s>", "<s>", "<unk>", "<OOV>")

	    stripped = []
	    for piece in word:
	        for marker in removable:
	            piece = piece.replace(marker, "")
	        stripped.append(piece)

	    if not stripped:
	        return False
	    return all(isChinese(piece) for piece in stripped)


	def isAllAlpha(word: Union[List[Any], str]):
	    """Check that every element of ``word`` is alphabetic and not "Chinese".

	    Each element (a list item, or a character when ``word`` is a string) first
	    has spaces and the markers "</s>", "<s>", "<unk>" and "<OOV>" removed, in
	    that order. A stripped element is accepted when it is an apostrophe, or
	    when ``str.isalpha()`` is true for it and ``isChinese`` is false.

	    Args:
	        word: A list of string tokens or a single string.

	    Returns:
	        False for empty input or when any stripped element is rejected;
	        True otherwise.
	    """
	    # Removal order matters: "</s>" must be stripped before "<s>".
	    removable = (" ", "</s>", "<s>", "<unk>", "<OOV>")

	    stripped = []
	    for piece in word:
	        for marker in removable:
	            piece = piece.replace(marker, "")
	        stripped.append(piece)

	    if not stripped:
	        return False

	    for piece in stripped:
	        if piece.isalpha():
	            # str.isalpha() is also true for CJK characters; exclude them.
	            if isChinese(piece):
	                return False
	        elif piece != "'":
	            return False
	    return True


	# def abbr_dispose(words: List[Any]) -> List[Any]:
	def abbr_dispose(words: List[Any], time_stamp: List[List] = None) -> List[Any]:
	    """Merge spelled-out abbreviations into single upper-case words.

	    A run such as ``"a", " ", "b", " ", "c"`` (single ASCII letters separated
	    by exactly one ``" "`` element) is replaced by the one word ``"ABC"``.
	    Every other element, including ``" "`` separators outside a run, is
	    copied through unchanged.

	    Args:
	        words: Sequence of string tokens, with ``" "`` used as separator.
	        time_stamp: Optional list of ``[begin, end]`` pairs, one per
	            non-space element of ``words``, in order.

	    Returns:
	        The merged word list when ``time_stamp`` is None. Otherwise a tuple
	        ``(word_list, ts_list)`` where ``ts_list`` holds one ``[begin, end]``
	        pair per non-space output word; a merged abbreviation spans from the
	        begin of its first letter to the end of its last letter.

	    Raises:
	        IndexError: If ``time_stamp`` has fewer entries than ``words`` has
	            non-space elements.
	    """
	    total = len(words)

	    def is_letter(pos):
	        # bytes.isalpha() is ASCII-only, so CJK or accented characters
	        # never count as abbreviation letters.
	        return (
	            pos < total
	            and len(words[pos]) == 1
	            and words[pos].encode("utf-8").isalpha()
	        )

	    def letter_follows(pos):
	        # True when words[pos] is followed by " " and then a single letter.
	        return pos + 1 < total and words[pos + 1] == " " and is_letter(pos + 2)

	    # Pass 1: locate abbreviation runs, stored as {first index: last index}.
	    # A dict gives O(1) lookups in pass 2 instead of scanning index lists.
	    abbr_spans = {}
	    pos = 0
	    while pos < total:
	        if is_letter(pos) and letter_follows(pos):
	            last = pos + 2
	            while letter_follows(last):
	                last += 2
	            abbr_spans[pos] = last
	            pos = last + 1
	        else:
	            pos += 1

	    # Pass 2: build the output, tracking which time_stamp entry belongs to
	    # the current non-space element.
	    word_lists = []
	    ts_lists = []
	    ts_index = 0
	    pos = 0
	    while pos < total:
	        if pos in abbr_spans:
	            last = abbr_spans[pos]
	            # Inside a run, letters sit on every second position.
	            letters = words[pos:last + 1:2]
	            word_lists.append("".join(letter.upper() for letter in letters))
	            if time_stamp is not None:
	                begin = time_stamp[ts_index][0]
	                end = time_stamp[ts_index + len(letters) - 1][1]
	                ts_lists.append([begin, end])
	            ts_index += len(letters)
	            pos = last + 1
	        else:
	            token = words[pos]
	            word_lists.append(token)
	            if token != " ":
	                if time_stamp is not None:
	                    ts_lists.append(
	                        [time_stamp[ts_index][0], time_stamp[ts_index][1]]
	                    )
	                ts_index += 1
	            pos += 1

	    if time_stamp is not None:
	        return word_lists, ts_lists
	    return word_lists


	def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
	    """Turn recognised tokens into a sentence, a word list and timestamps.

	    Tokens are decoded to ``str`` (UTF-8) when they are not already strings,
	    and the markers "<s>", "</s>", "<unk>" and "<OOV>" are dropped. The
	    remaining tokens are handled in one of three modes:

	    * all tokens pass ``isAllChinese``: tokens are kept one by one (spaces
	      removed) and ``time_stamp`` is reused as is;
	    * all tokens pass ``isAllAlpha``: tokens containing "@@" are glued to
	      the following token, and each finished word is followed by " ";
	    * otherwise: both rules are applied token by token, and the " " that
	      follows a word is removed when the next token is Chinese.

	    The result is then passed through ``abbr_dispose``.

	    Args:
	        words: Tokens as ``str`` or as objects with ``decode("utf-8")``.
	        time_stamp: Optional list of ``[begin, end]`` pairs indexed like the
	            tokens that remain after the markers are dropped.

	    Returns:
	        ``(sentence, ts_list, word_list)`` when ``time_stamp`` is given,
	        with the words joined by single spaces; otherwise
	        ``(sentence, word_list)``, with the sentence built by concatenating
	        the pieces (including their " " separators) and stripping.
	    """
	    has_ts = time_stamp is not None
	    special_tokens = ("<s>", "</s>", "<unk>", "<OOV>")

	    # Normalise every token to str and drop the special markers.
	    tokens = []
	    for raw in words:
	        text = raw if isinstance(raw, str) else raw.decode("utf-8")
	        if text not in special_tokens:
	            tokens.append(text)

	    pieces = []   # output words interleaved with " " separators
	    stamps = []   # one [begin, end] pair per finished word
	    pending = ""  # word fragments collected from "@@" tokens

	    if isAllChinese(tokens):
	        pieces = [tok.replace(" ", "") for tok in tokens]
	        if has_ts:
	            stamps = time_stamp

	    elif isAllAlpha(tokens):
	        starts_word = True
	        for idx, tok in enumerate(tokens):
	            if starts_word and has_ts:
	                begin = time_stamp[idx][0]
	                end = time_stamp[idx][1]
	            if "@@" in tok:
	                # Word continues in the next token: keep its begin time.
	                pending += tok.replace("@@", "")
	                if has_ts:
	                    starts_word = False
	                    end = time_stamp[idx][1]
	            else:
	                pieces.append(pending + tok)
	                pieces.append(" ")
	                pending = ""
	                if has_ts:
	                    starts_word = True
	                    end = time_stamp[idx][1]
	                    stamps.append([begin, end])

	    else:
	        blank_after_word = False
	        starts_word = True
	        begin = -1
	        end = -1
	        for idx, tok in enumerate(tokens):
	            if starts_word and has_ts:
	                begin, end = time_stamp[idx][0], time_stamp[idx][1]
	            if isAllChinese(tok):
	                if blank_after_word:
	                    # No space between a Latin word and a Chinese token.
	                    pieces.pop()
	                pieces.append(tok)
	                blank_after_word = False
	                if has_ts:
	                    starts_word = True
	                    stamps.append([begin, end])
	            elif "@@" in tok:
	                pending += tok.replace("@@", "")
	                blank_after_word = False
	                if has_ts:
	                    starts_word = False
	                    end = time_stamp[idx][1]
	            elif isAllAlpha(tok):
	                pieces.append(pending + tok)
	                pieces.append(" ")
	                pending = ""
	                blank_after_word = True
	                if has_ts:
	                    starts_word = True
	                    end = time_stamp[idx][1]
	                    stamps.append([begin, end])
	            else:
	                pieces.append(tok)

	    if has_ts:
	        pieces, stamps = abbr_dispose(pieces, stamps)
	        real_words = [piece for piece in pieces if piece != " "]
	        sentence = " ".join(real_words).strip()
	        return sentence, stamps, real_words

	    pieces = abbr_dispose(pieces)
	    real_words = [piece for piece in pieces if piece != " "]
	    sentence = "".join(pieces).strip()
	    return sentence, real_words


	def sentence_postprocess_sentencepiece(words):
	    """Rebuild words and a sentence from SentencePiece-style tokens.

	    Tokens are decoded to ``str`` (UTF-8) when they are not already strings,
	    and the markers "<s>", "</s>", "<unk>" and "<OOV>" are dropped. A token
	    containing "\u2581" starts a new word; any other token is appended to
	    the word being built.

	    Args:
	        words: Tokens as ``str`` or as objects with ``decode("utf-8")``.

	    Returns:
	        ``(sentence, word_list)``. ``sentence`` is the words joined by single
	        spaces. ``word_list`` holds the same words with "i", "i'm", "i've"
	        and "i'll" capitalised; the sentence itself keeps the lower-case
	        forms. Input with no usable tokens yields ``("", [])``.
	    """
	    special_tokens = ("<s>", "</s>", "<unk>", "<OOV>")
	    capitalised = {"i": "I", "i'm": "I'm", "i've": "I've", "i'll": "I'll"}

	    # Normalise every token to str and drop the special markers.
	    tokens = []
	    for raw in words:
	        text = raw if isinstance(raw, str) else raw.decode("utf-8")
	        if text not in special_tokens:
	            tokens.append(text)

	    word_lists = []
	    current = ""
	    for idx, tok in enumerate(tokens):
	        if "\u2581" in tok:
	            # A word-boundary marker closes the previous word, except at
	            # the very first token where nothing has been collected yet.
	            if idx != 0:
	                word_lists.append(current)
	                word_lists.append(" ")
	            current = tok.replace("\u2581", "")
	        else:
	            current += tok

	    # Flush the last word only when there is one. The previous
	    # ``is not None`` test was always true and appended an empty word for
	    # empty input.
	    if current:
	        word_lists.append(current)

	    real_word_lists = [
	        capitalised.get(piece, piece) for piece in word_lists if piece != " "
	    ]
	    sentence = "".join(word_lists)
	    return sentence, real_word_lists