File size: 1,975 Bytes
cea4a4b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import re
import string
from statistics import mode
import emoji
from langdetect import detect
from spellchecker import SpellChecker
def clean_text(text: str) -> str:
    """Run the enabled cleaning steps over *text* and return the result.

    The steps run in a fixed order: URLs are removed first, then HTML tags,
    then hashtags. Each step receives the output of the previous one.
    """
    # Currently disabled steps: newline flattening, remove_backslashes,
    # remove_emojis and remove_punct.
    pipeline = (remove_URL, remove_html, remove_hashtags)
    cleaned = text
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned
def remove_URL(text: str) -> str:
    """Return *text* with URLs deleted.

    A URL here is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)
def remove_hashtags(text: str) -> str:
    """Return *text* without hashtags.

    A hashtag is a ``#`` immediately followed by one or more non-whitespace
    characters; a ``#`` followed by whitespace is left alone.
    """
    return re.sub(r"#\S+", "", text)
def remove_html(text: str) -> str:
    """Return *text* with ``<...>`` tag-like spans deleted.

    The match is lazy, so each span ends at the nearest ``>``. Because ``.``
    does not match a newline, a span must open and close on the same line.
    """
    return re.sub(r"<.*?>", "", text)
def remove_emojis(text: str) -> str:
    """Return *text* with emojis removed.

    Each pass asks ``emoji.demojize`` to rewrite emojis as a name wrapped in
    a marker string, then deletes every marker-wrapped name.

    Args:
        text: The string to strip emojis from.

    Returns:
        The string with all marker-wrapped emoji names deleted.
    """
    delimiter = "#4="
    marker = re.escape(delimiter)
    # Raw f-string: "\S" inside a plain string literal is an invalid escape
    # sequence. The lazy quantifier stops at the nearest closing marker, so
    # ordinary text sitting between two emojis without whitespace is kept.
    marked_name = re.compile(rf"{marker}\S+?{marker}")
    # NOTE(review): the pass count of 5 is carried over unchanged; why one
    # pass would not be enough is not visible from this function - confirm.
    for _ in range(5):
        text = emoji.demojize(string=text, delimiters=(delimiter, delimiter))
        text = marked_name.sub("", text)
    return text
def remove_punct(text):
    """Return *text* with every ``string.punctuation`` character deleted."""
    # Mapping a code point to None tells str.translate to drop it.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)
def correct_spellings(text):
    """Return *text* with words flagged as unknown replaced by a suggestion.

    The text is split on whitespace. A word the spell checker reports as
    unknown is replaced by its suggested correction when one is available;
    every other word is kept as-is. The words are re-joined with single
    spaces, so the original whitespace layout is not preserved.

    Args:
        text: The string to spell-check.

    Returns:
        The corrected words joined by single spaces.
    """
    spell = SpellChecker()
    words = text.split()
    # NOTE(review): membership below uses each word's original casing;
    # confirm spell.unknown reports words in that same casing.
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        # Only ask for a correction when the word was flagged; for any other
        # word the suggestion would be discarded anyway.
        suggestion = spell.correction(word) if word in misspelled_words else None
        corrected_text.append(word if suggestion is None else suggestion)
    return " ".join(corrected_text)
def remove_backslashes(text: str) -> str:
    """Return *text* without backslash sequences.

    A sequence is a backslash followed by one or more non-whitespace
    characters; a backslash followed by whitespace or end of text is kept.
    """
    return re.sub(r"\\\S+", "", text)
def detect_language(list_of_texts: list[str]) -> str | None:
if len(list_of_texts) == 0:
return None
languages = []
for text in list_of_texts:
try:
lan = detect(text)
languages.append(lan)
except Exception:
continue
return mode(languages) if len(languages) else None
|