import os
from typing import Dict, List, Optional

import jieba
import nltk

# Bundled NLTK data lives next to this package under resources/nltk_data.
resource_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "resources")


class NLTKHelper:
    """Lazily loads and caches NLTK stopword lists, one list per language."""

    # Shared class-level cache: None means the list has not been loaded yet.
    _stopwords: Dict[str, Optional[List[str]]] = {
        "english": None,
        "chinese": None,
    }

    def __init__(self):
        # Build jieba's dictionary up front so the first tokenization call is fast.
        jieba.initialize()

    def get_stopwords(self, lang: str) -> List[str]:
        nltk.data.path.append(os.path.join(resource_path, "nltk_data"))
        if self._stopwords[lang] is None:
            # Download the stopwords corpus into the local resources dir if missing.
            try:
                nltk.data.find("corpora/stopwords")
            except LookupError:
                nltk.download("stopwords", download_dir=os.path.join(resource_path, "nltk_data"))
            self._stopwords[lang] = nltk.corpus.stopwords.words(lang)
        return self._stopwords[lang]


def word_tokenize(text: str, lang: str) -> List[str]:
    """Tokenize text: jieba for Chinese ("zh"), NLTK's punkt tokenizer otherwise."""
    if lang == "zh":
        return jieba.lcut(text)
    nltk.data.path.append(os.path.join(resource_path, "nltk_data"))
    # Fetch the punkt_tab tokenizer models on first use.
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        nltk.download("punkt_tab", download_dir=os.path.join(resource_path, "nltk_data"))
    return nltk.word_tokenize(text)
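
# --- Minimal usage sketch (illustrative, not part of the original file). ---
# Assumes the module-level names defined above and that the stopwords/punkt_tab
# downloads succeed; any lang other than "zh" falls through to NLTK.
if __name__ == "__main__":
    helper = NLTKHelper()
    english = word_tokenize("The quick brown fox jumps over the lazy dog.", lang="en")
    # Drop stopwords using the cached English list.
    print([t for t in english if t.lower() not in helper.get_stopwords("english")])
    # Chinese routes through jieba, so no NLTK download is triggered.
    print(word_tokenize("我们今天去公园散步", lang="zh"))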