Spaces:
Running
on
Zero
Running
on
Zero
# Referenced from https://github.com/hccho2/Tacotron-Wavenet-Vocoder-Korean
| import re | |
class KoreanCleaner:
    """Spell out ASCII digits and Latin letters in Hangul.

    All methods are classmethods, so the cleaner can be used without
    instantiation: ``KoreanCleaner.normalize_text("TV 7")``.
    """

    # Sino-Korean reading of each decimal digit.
    _NUMBER_TO_KOR = {
        "0": "영",
        "1": "일",
        "2": "이",
        "3": "삼",
        "4": "사",
        "5": "오",
        "6": "육",
        "7": "칠",
        "8": "팔",
        "9": "구",
    }

    # Korean name of each Latin letter, keyed by the upper-case letter.
    _UPPER_ALPHABET_TO_KOR = {
        "A": "에이",
        "B": "비",
        "C": "씨",
        "D": "디",
        "E": "이",
        "F": "에프",
        "G": "지",
        "H": "에이치",
        "I": "아이",
        "J": "제이",
        "K": "케이",
        "L": "엘",
        "M": "엠",
        "N": "엔",
        "O": "오",
        "P": "피",
        "Q": "큐",
        "R": "알",
        "S": "에스",
        "T": "티",
        "U": "유",
        "V": "브이",
        "W": "더블유",
        "X": "엑스",
        "Y": "와이",
        "Z": "지",
    }

    # Translation tables are built once at class creation instead of on
    # every call.
    _NUMBER_TABLE = str.maketrans(_NUMBER_TO_KOR)

    # Lower-case ASCII letters share the reading of their upper-case form.
    # Only a-z/A-Z are mapped; other cased characters pass through as-is.
    _ALPHABET_TABLE = str.maketrans(
        {
            letter: reading
            for upper, reading in _UPPER_ALPHABET_TO_KOR.items()
            for letter in (upper, upper.lower())
        }
    )

    @classmethod
    def _normalize_numbers(cls, text):
        """Replace each ASCII digit in ``text`` with its Hangul reading.

        Digits are converted one character at a time, so ``"12"`` becomes
        ``"일이"``; no place-value words are produced.

        Args:
            text: Input string.

        Returns:
            A new string with every ``0``-``9`` replaced; all other
            characters are left unchanged.
        """
        return text.translate(cls._NUMBER_TABLE)

    @classmethod
    def _normalize_english_text(cls, text):
        """Replace each ASCII letter in ``text`` with its Korean letter name.

        Matching is case-insensitive for ``a``-``z`` / ``A``-``Z`` only;
        every letter is spelled out individually (``"TV"`` -> ``"티브이"``).

        Args:
            text: Input string.

        Returns:
            A new string with ASCII letters replaced; all other characters
            are left unchanged.
        """
        return text.translate(cls._ALPHABET_TABLE)

    @classmethod
    def normalize_text(cls, text):
        """Strip ``text`` and spell out its digits and ASCII letters in Hangul.

        Args:
            text: Input string.

        Returns:
            The normalized string.
        """
        # stage 0 : text strip
        text = text.strip()
        # stage 1 : normalize numbers
        text = cls._normalize_numbers(text)
        # stage 2 : normalize english text
        text = cls._normalize_english_text(text)
        return text