Spaces:
Runtime error
Runtime error
| # Copyright (c) 2024 Amphion. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| import re | |
| """ | |
| Text cleaning utilities for French text, applied before phonemization. | |
| """ | |
# List of (compiled regular expression, replacement) pairs for French
# abbreviations, applied in list order.
#
# Every abbreviation goes through re.escape so that the dots and hyphens
# inside entries such as "N.B" or "av. J.-C" only match themselves. The
# dotted entries are tried longest-first so that "av. J.-C." is expanded
# before the shorter "av." can consume its prefix.
_abbreviations = [
    # Abbreviations written with a trailing period. They are matched
    # case-insensitively and the period is consumed by the replacement.
    (re.compile(r"\b%s\." % re.escape(abbreviation), re.IGNORECASE), expansion)
    for abbreviation, expansion in sorted(
        [
            ("M", "monsieur"),
            ("Mlle", "mademoiselle"),
            ("Mlles", "mesdemoiselles"),
            ("Mme", "Madame"),
            ("Mmes", "Mesdames"),
            ("N.B", "nota bene"),
            ("p.c.q", "parce que"),
            ("Pr", "professeur"),
            ("qqch", "quelque chose"),
            ("rdv", "rendez-vous"),
            ("max", "maximum"),
            ("min", "minimum"),
            ("no", "numéro"),
            ("adr", "adresse"),
            ("dr", "docteur"),
            ("st", "saint"),
            ("co", "compagnie"),
            ("jr", "junior"),
            ("sgt", "sergent"),
            ("capt", "capitaine"),
            ("col", "colonel"),
            ("av", "avenue"),
            ("av. J.-C", "avant Jésus-Christ"),
            ("apr. J.-C", "après Jésus-Christ"),
            ("art", "article"),
            ("boul", "boulevard"),
            # ASCII apostrophe on purpose: the punctuation table in this
            # file leaves it untouched.
            ("c.-à-d", "c'est-à-dire"),
            ("etc", "et cetera"),
            ("ex", "exemple"),
            ("excl", "exclusivement"),
        ],
        key=lambda pair: len(pair[0]),
        reverse=True,
    )
] + [
    # Titles written without a period, matched case-sensitively. The
    # trailing \b stops "Mlle"/"Mme" from matching inside "Mlles"/"Mmes".
    (re.compile(r"\b%s\b" % re.escape(abbreviation)), expansion)
    for abbreviation, expansion in [
        ("Mlle", "mademoiselle"),
        ("Mlles", "mesdemoiselles"),
        ("Mme", "Madame"),
        ("Mmes", "Mesdames"),
    ]
]
# Punctuation normalisation table: each key found in the text is replaced by
# its value, and an empty value deletes the key.
#
# Full-width forms are listed next to their ASCII counterparts (written as
# \uXXXX escapes so they cannot be confused with, or collapsed into, the
# ASCII characters) so both spellings normalise to the same result.
rep_map = {
    ":": ",",
    "\uff1a": ",",  # fullwidth colon
    ";": ",",
    "\uff1b": ",",  # fullwidth semicolon
    ",": ",",
    "\uff0c": ",",  # fullwidth comma
    "。": ".",
    "!": "!",
    "\uff01": "!",  # fullwidth exclamation mark
    "?": "?",
    "\uff1f": "?",  # fullwidth question mark
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": ".",
    "…": ".",
    "$": ".",
    "“": "",
    "”": "",
    "‘": "",
    # The typographic apostrophe marks French elisions ("l’ami", "c’est").
    # Deleting it would fuse the two words, so it is normalised to the ASCII
    # apostrophe instead.
    "\u2019": "'",
    "(": "",
    ")": "",
    "\uff08": "",  # fullwidth left parenthesis
    "\uff09": "",  # fullwidth right parenthesis
    "《": "",
    "》": "",
    "【": "",
    "】": "",
    "[": "",
    "]": "",
    "—": "",
    "~": "-",
    "\uff5e": "-",  # fullwidth tilde
    "「": "",
    "」": "",
    "¿": "",
    "¡": "",
}
def collapse_whitespace(text):
    """Squeeze every run of whitespace into one space and trim both ends."""
    whitespace_run = re.compile(r"\s+")
    squeezed = whitespace_run.sub(" ", text)
    return squeezed.strip()
def remove_punctuation_at_begin(text):
    """Drop any run of ',', '.', '!' or '?' found at the very start of *text*."""
    leading_punctuation = re.compile(r"^[,.!?]+")
    return leading_punctuation.sub("", text)
def remove_aux_symbols(text):
    """Delete angle brackets, parentheses, square brackets, double quotes and guillemets."""
    aux_symbols = re.compile(r"[\<\>\(\)\[\]\"\«\»]+")
    return aux_symbols.sub("", text)
def replace_symbols(text):
    """Rewrite ';' and ':' as commas, '-' as a space and '&' as ' et '."""
    substitutions = (
        (";", ","),
        ("-", " "),
        (":", ","),
        ("&", " et "),
    )
    for symbol, spoken in substitutions:
        text = text.replace(symbol, spoken)
    return text
def expand_abbreviations(text):
    """Apply every (pattern, replacement) pair of ``_abbreviations`` to *text*, in list order."""
    expanded = text
    for pattern, expansion in _abbreviations:
        expanded = pattern.sub(expansion, expanded)
    return expanded
def replace_punctuation(text):
    """Replace each occurrence of a ``rep_map`` key in *text* with its mapped value."""
    alternatives = [re.escape(symbol) for symbol in rep_map.keys()]
    matcher = re.compile("|".join(alternatives))

    def _substitute(match):
        # The matched text is always one of the keys the pattern was built from.
        return rep_map[match.group()]

    return matcher.sub(_substitute, text)
def text_normalize(text):
    """Normalise a French string by running the cleaning steps in a fixed order.

    The steps are: expand abbreviations, replace punctuation, replace
    symbols, remove auxiliary symbols, strip leading punctuation and finally
    collapse whitespace. Abbreviations are expanded first because the later
    steps rewrite or delete characters that appear inside some of them.

    No terminal punctuation is appended: the text ends with whatever
    character the cleaning steps leave last.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = expand_abbreviations(text)
    text = replace_punctuation(text)
    text = replace_symbols(text)
    text = remove_aux_symbols(text)
    text = remove_punctuation_at_begin(text)
    return collapse_whitespace(text)
def french_to_ipa(text, text_tokenizer):
    """Normalise French text and hand it to *text_tokenizer*.

    Args:
        text: Either a single string or an iterable of strings.
        text_tokenizer: Callable that receives the normalised string (for a
            string input) or a list of normalised strings (for any other
            input).

    Returns:
        Whatever ``text_tokenizer`` returns for the normalised input.
    """
    if isinstance(text, str):
        return text_tokenizer(text_normalize(text))
    # Build a fresh list rather than overwriting the caller's items, so the
    # input is left untouched and tuples or other iterables are accepted too.
    normalized = [text_normalize(item) for item in text]
    return text_tokenizer(normalized)