"""Text-cleaning helpers.

Small, composable string filters (URLs, HTML tags, hashtags, emojis,
punctuation, backslash sequences), a spelling corrector and a majority-vote
language detector.
"""

import functools
import re
import string
from statistics import mode

import emoji
from langdetect import detect
from spellchecker import SpellChecker

# Patterns are compiled once at import time instead of on every call.
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
_HASHTAG_RE = re.compile(r"#\S+")
_HTML_TAG_RE = re.compile(r"<.*?>")
_BACKSLASH_RE = re.compile(r"\\\S+")

# Marker wrapped around emoji names by ``emoji.demojize`` so that the
# resulting placeholders can be located and stripped with a regex.
_EMOJI_DELIMITER = "#4="
# Lazy ``\S+?`` so that each placeholder is matched on its own; a greedy match
# would span from the first placeholder to the last one in a whitespace-free
# run and delete the ordinary text sitting between two emojis.
_EMOJI_PLACEHOLDER_RE = re.compile(
    rf"{re.escape(_EMOJI_DELIMITER)}\S+?{re.escape(_EMOJI_DELIMITER)}"
)
_EMOJI_MAX_PASSES = 5

_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text: str) -> str:
    """Apply the enabled cleaning steps to *text* and return the result.

    The steps run in order: URL removal, HTML-tag removal, hashtag removal.
    """
    # Currently disabled steps: newline flattening (text.replace("\n", " ")),
    # remove_backslashes, remove_emojis and remove_punct.
    for step in (remove_URL, remove_html, remove_hashtags):
        text = step(text)
    return text


def remove_URL(text: str) -> str:
    """Return *text* without ``http(s)://...`` and ``www....`` tokens."""
    return _URL_RE.sub("", text)


def remove_hashtags(text: str) -> str:
    """Return *text* without ``#`` followed by a run of non-whitespace."""
    return _HASHTAG_RE.sub("", text)


def remove_html(text: str) -> str:
    """Return *text* without ``<...>`` spans (shortest match, single line)."""
    return _HTML_TAG_RE.sub("", text)


def remove_emojis(text: str) -> str:
    """Return *text* with emojis removed.

    Each pass converts emojis to delimited name placeholders via
    ``emoji.demojize`` and then deletes those placeholders. Up to
    ``_EMOJI_MAX_PASSES`` passes are made.
    """
    # NOTE(review): the reason for repeating the pass is not visible here;
    # presumably it catches emojis only recognised after an earlier removal
    # -- TODO confirm.
    for _ in range(_EMOJI_MAX_PASSES):
        demojized = emoji.demojize(
            string=text, delimiters=(_EMOJI_DELIMITER, _EMOJI_DELIMITER)
        )
        stripped = _EMOJI_PLACEHOLDER_RE.sub("", demojized)
        if stripped == text:
            # Fixed point reached: further passes would not change anything.
            break
        text = stripped
    return text


def remove_punct(text: str) -> str:
    """Return *text* with every ``string.punctuation`` character deleted."""
    return text.translate(_PUNCTUATION_TABLE)


@functools.lru_cache(maxsize=1)
def _get_spell_checker() -> SpellChecker:
    """Return a shared ``SpellChecker`` so it is constructed only once."""
    return SpellChecker()


def correct_spellings(text: str) -> str:
    """Return *text* with unknown words replaced by their suggested correction.

    The text is split on whitespace and re-joined with single spaces. A word
    is replaced only if the spell checker reports it as unknown and offers a
    correction; otherwise it is kept unchanged.
    """
    spell = _get_spell_checker()
    words = text.split()
    misspelled = spell.unknown(words)

    corrected = []
    for word in words:
        # NOTE(review): membership is tested on the word exactly as written;
        # if ``unknown`` normalises case, capitalised words are never
        # corrected -- confirm whether that is intended.
        # Only ask for a correction when it can actually be used.
        suggestion = spell.correction(word) if word in misspelled else None
        corrected.append(word if suggestion is None else suggestion)
    return " ".join(corrected)


def remove_backslashes(text: str) -> str:
    """Return *text* without a backslash followed by non-whitespace runs."""
    return _BACKSLASH_RE.sub("", text)


def detect_language(list_of_texts: list[str]) -> str | None:
    """Return the most frequently detected language among *list_of_texts*.

    Texts for which detection fails are skipped. Returns ``None`` when the
    input is empty or no text could be classified.
    """
    if not list_of_texts:
        return None

    languages = []
    for text in list_of_texts:
        try:
            languages.append(detect(text))
        except Exception:
            # Deliberate best-effort: one undetectable text must not abort
            # the vote over the remaining ones.
            continue

    return mode(languages) if languages else None