Spaces:
Running
Running
| import opencc | |
| from typing import Literal | |
| import re | |
class Corrector:
    """
    Post-process SenseVoice transcripts into written Cantonese.

    The SenseVoice model outputs Simplified Chinese only. This class converts
    that output to Traditional Chinese (OpenCC ``s2hk`` profile) and then
    fixes common Cantonese spelling errors with an ordered list of regex rules.
    """

    # (compiled pattern, replacement) rules, applied in order after the OpenCC
    # conversion. Compiled once at class creation; each instance receives its
    # own list copy in ``regular_errors``.
    _REGULAR_ERRORS = (
        # 俾 -> 畀, unless followed by 路支, 斯麥 or 益.
        (re.compile(r"俾(?!(?:路支|斯麥|益))"), r"畀"),
        # 系/繫 -> 係, unless preceded by 聯 or followed by 統.
        (re.compile(r"(?<!(?:聯))[系繫](?!(?:統))"), r"係"),
        # 噶 -> 㗎 everywhere.
        (re.compile(r"噶"), r"㗎"),
        # 咁 -> 噉 only when directly followed by one of the listed characters.
        (re.compile(r"咁(?=[我你佢就樣話係啊呀嘅,。])"), r"噉"),
        # 曬 -> 晒, unless preceded by 曝/晾 or followed by one of the listed
        # characters. The trailing guard must be a *negative lookahead*: a
        # consuming group would rewrite exactly the excluded words and also
        # delete the character that follows 曬.
        (re.compile(r"(?<![曝晾])曬(?![衣太衫褲被命嘢相])"), r"晒"),
        # 翻 -> 返 when it sits between 好 and one of 去/到/嚟.
        (re.compile(r"(?<=[好])翻(?=[去到嚟])"), r"返"),
        # Drop ``<|word|>`` marker tokens (presumably model control tags).
        (re.compile(r"<\|\w+\|>"), r""),
    )

    def __init__(self, corrector: Literal["opencc"] = "opencc"):
        """
        Set up the correction backend.

        Args:
            corrector: Name of the correction backend. Only "opencc" is
                implemented; any other value makes ``correct`` raise.
        """
        self.corrector = corrector
        self.converter = None
        # Never assigned anything else in this class; kept so existing
        # attribute access keeps working.
        self.bert_model = None
        if corrector == "opencc":
            self.converter = opencc.OpenCC("s2hk")
        # Per-instance copy so callers may extend the rules of one corrector
        # without affecting others.
        self.regular_errors: list[tuple[re.Pattern, str]] = list(
            self._REGULAR_ERRORS
        )

    def correct(self, text: str) -> str:
        """
        Correct a transcript with the configured backend.

        Args:
            text: Input text to correct. Leading and trailing whitespace is
                stripped before processing.

        Returns:
            The corrected text, or the (stripped) empty string when there is
            nothing to correct.

        Raises:
            ValueError: If the configured corrector is not "opencc".
        """
        text = text.strip()
        if not text:  # Nothing to convert; skip the backend entirely.
            return text
        if self.corrector == "opencc":
            return self.opencc_correct(text)
        raise ValueError(
            f"unsupported corrector {self.corrector!r}: only 'opencc' is available"
        )

    def opencc_correct(self, text: str) -> str:
        """
        Convert text with OpenCC, then apply the regex spelling fixes.

        Args:
            text: Input text to convert.

        Returns:
            The converted text with every rule in ``regular_errors`` applied
            in order.
        """
        opencc_text = self.converter.convert(text)
        for pattern, replacement in self.regular_errors:
            opencc_text = pattern.sub(replacement, opencc_text)
        return opencc_text