ctr-ll4 / src /utils /text_functions.py
sanjin7's picture
Upload src/ with huggingface_hub
cea4a4b
import re
import string
from statistics import mode
import emoji
from langdetect import detect
from spellchecker import SpellChecker
def clean_text(text: str) -> str:
    """Run the enabled cleaning steps over ``text`` and return the result.

    The steps are applied in a fixed order: URLs are removed first, then
    HTML-like tags, then hashtags.

    Args:
        text: Raw input string.

    Returns:
        The string after all enabled cleaning steps have been applied.
    """
    # Steps that are currently switched off: newline-to-space replacement,
    # remove_backslashes, remove_emojis and remove_punct.
    without_urls = remove_URL(text)
    without_markup = remove_html(without_urls)
    return remove_hashtags(without_markup)
def remove_URL(text: str) -> str:
    """Delete URLs from ``text``.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character.

    Args:
        text: Input string.

    Returns:
        ``text`` with every matched URL replaced by the empty string.
    """
    url_pattern = r"https?://\S+|www\.\S+"
    return re.sub(url_pattern, "", text)
def remove_hashtags(text: str) -> str:
    """Delete hashtags from ``text``.

    A hashtag is a ``#`` followed by one or more non-whitespace characters;
    a lone ``#`` is left in place.

    Args:
        text: Input string.

    Returns:
        ``text`` with every matched hashtag replaced by the empty string.
    """
    hashtag_pattern = r"#\S+"
    return re.sub(hashtag_pattern, "", text)
def remove_html(text: str) -> str:
    """Delete HTML-like tags from ``text``.

    Anything between a ``<`` and the nearest following ``>`` on the same
    line is treated as a tag and removed; the text between tags is kept.

    Args:
        text: Input string.

    Returns:
        ``text`` with every matched tag replaced by the empty string.
    """
    tag_pattern = r"<.*?>"
    return re.sub(tag_pattern, "", text)
def remove_emojis(text: str) -> str:
    """Strip emojis from ``text``.

    Each emoji is first rewritten by ``emoji.demojize`` into a textual alias
    wrapped in a sentinel delimiter; every delimited alias is then deleted.

    Args:
        text: Input string, possibly containing emojis.

    Returns:
        ``text`` with every delimited emoji alias removed.
    """
    delimiter = "#4="
    escaped = re.escape(delimiter)
    # Raw pattern so ``\S`` is a regex class rather than an invalid string
    # escape. The quantifier is lazy so that a match stops at the nearest
    # closing delimiter; a greedy match would also swallow ordinary text
    # sitting between two emojis that are not separated by whitespace.
    alias = re.compile(rf"{escaped}\S+?{escaped}")
    # NOTE(review): the five passes are kept from the original code; a single
    # pass looks sufficient -- confirm before reducing.
    for _ in range(5):
        text = emoji.demojize(string=text, delimiters=(delimiter, delimiter))
        text = alias.sub("", text)
    return text
def remove_punct(text):
    """Delete every ASCII punctuation character from ``text``.

    The characters removed are exactly those in ``string.punctuation``.

    Args:
        text: Input string.

    Returns:
        ``text`` without any punctuation characters.
    """
    # Mapping a code point to None tells str.translate to drop it.
    drop_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_map)
def correct_spellings(text):
    """Replace words the spell checker does not know with its best correction.

    ``text`` is split on whitespace, each word flagged as unknown by
    ``SpellChecker.unknown`` is replaced with ``SpellChecker.correction``
    (when a correction is available), and the words are re-joined with
    single spaces, so the original whitespace layout is not preserved.

    Args:
        text: Input string.

    Returns:
        The corrected words joined by single spaces.
    """
    spell = SpellChecker()
    words = text.split()
    # NOTE(review): unknown() appears to return lower-cased words, so a
    # capitalised typo may never match here -- confirm against the library.
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        # Only ask for a correction when the word is actually unknown; the
        # result for known words was always discarded.
        replacement = spell.correction(word) if word in misspelled_words else None
        corrected_text.append(word if replacement is None else replacement)
    return " ".join(corrected_text)
def remove_backslashes(text: str) -> str:
    """Delete backslash-prefixed tokens from ``text``.

    A token is a backslash followed by one or more non-whitespace
    characters; a backslash followed directly by whitespace is kept.

    Args:
        text: Input string.

    Returns:
        ``text`` with every matched token replaced by the empty string.
    """
    escape_pattern = r"\\\S+"
    return re.sub(escape_pattern, "", text)
def detect_language(list_of_texts: list[str]) -> str | None:
if len(list_of_texts) == 0:
return None
languages = []
for text in list_of_texts:
try:
lan = detect(text)
languages.append(lan)
except Exception:
continue
return mode(languages) if len(languages) else None