Spaces:
Running
Running
| import re | |
| import random | |
| import string | |
| import nltk | |
# Request the 'punkt' resource from NLTK when this module is imported;
# parse_sentences() below calls nltk.sent_tokenize().
nltk.download('punkt')
def replacement1(review, regex_list):
    """Mask every regex match in *review* with a random placeholder.

    Each pattern in *regex_list* is applied in order, case-insensitively,
    to the progressively masked text. Every non-empty match is replaced by
    a fresh 10-character alphanumeric placeholder.

    Args:
        review: The text to mask.
        regex_list: Iterable of regular-expression pattern strings.

    Returns:
        A ``(masked_review, replaced_dict)`` tuple where ``replaced_dict``
        maps each placeholder to the original text it stands for, in the
        order the placeholders were created.
    """
    replaced_dict = {}
    alphabet = string.ascii_letters + string.digits

    def _mask(match):
        # Always work on the whole match. re.findall() would hand back only
        # the capture group (possibly an empty string) for patterns that
        # contain one, and replacing '' corrupts the entire text.
        original = match.group(0)
        if not original:
            # Zero-width match: nothing to protect, leave the text alone.
            return original
        # If this match swallowed placeholders produced by earlier
        # patterns, expand them so the stored value is the true original
        # text and restoration does not depend on replacement order.
        for placeholder, earlier in replaced_dict.items():
            original = original.replace(placeholder, earlier)
        token = ''.join(random.choices(alphabet, k=10))
        # Re-roll on the (unlikely) collision with an existing placeholder
        # or with text that is already present in the review.
        while token in replaced_dict or token in review:
            token = ''.join(random.choices(alphabet, k=10))
        replaced_dict[token] = original
        return token

    for regex in regex_list:
        review = re.sub(regex, _mask, review, flags=re.IGNORECASE)
    return review, replaced_dict
def replacement2(sentences, replaced_dict):
    """Restore the original text behind each placeholder in *sentences*.

    Args:
        sentences: List of strings that may contain placeholder keys from
            *replaced_dict*. The list is updated in place.
        replaced_dict: Mapping of placeholder -> original text, in the
            order the placeholders were created.

    Returns:
        The same *sentences* list, with every placeholder substituted back.
    """
    # Undo substitutions newest-first. A later placeholder's original text
    # may itself contain an earlier placeholder (for example a quoted span
    # that enclosed an already-masked abbreviation); restoring in creation
    # order would leave that inner placeholder in the output.
    undo_order = list(replaced_dict.items())
    undo_order.reverse()
    for index, sentence in enumerate(sentences):
        for randomized, original in undo_order:
            sentence = sentence.replace(randomized, original)
        sentences[index] = sentence
    return sentences
def parse_sentences(review):
    """Split *review* into sentences without breaking on abbreviations.

    Spans that would confuse the tokenizer (quoted text and dotted
    abbreviations) are masked with placeholders via replacement1(), the
    masked text is split with nltk.sent_tokenize(), and the placeholders
    are swapped back with replacement2().

    Args:
        review: The text to split.

    Returns:
        A list of sentence strings.
    """
    # Literal dots are escaped so they match only a real '.'. Unescaped,
    # 'e.g.' also matched words such as "egg." and swallowed a genuine
    # sentence-ending period, and 'Sec.' matched "Sec " so the numbered
    # section pattern could never fire.
    # The patterns contain no capturing groups, so the whole match is
    # masked whether the masking step uses re.findall or re.sub.
    regex_list = [
        r'et al\.',
        r'".*?"',
        r"'.*?'",
        r'e\.g\.',
        r'Sec\.',
        r'Sec \d+(?:\.\d+)?\.',
        r'w\.r\.t\.',
        # NOTE(review): kept as the literal text "e.q" from the original
        # pattern; possibly "eq." was intended - confirm.
        r'e\.q',
        r'fig\.',
    ]
    review, replaced_dict = replacement1(review, regex_list)
    sentences = nltk.sent_tokenize(review)
    sentences = replacement2(sentences, replaced_dict)
    return sentences