Spaces:
Sleeping
Sleeping
| import re | |
def drop_non_relevant_text(text_list, max_word_len=14, max_long_ratio=0.5):
    """Drop paragraphs that are made up mostly of overly long words.

    Paragraphs dominated by very long "words" are treated as extraction
    errors from reading PDFs and are discarded. The default length limit
    (14 characters) was chosen by the original author from the distribution
    of word lengths in Spanish.

    Args:
        text_list: Iterable of paragraph strings.
        max_word_len: A word is counted as "too long" when its length is
            strictly greater than this value.
        max_long_ratio: A paragraph is kept only when the fraction of
            too-long words is strictly below this value; a paragraph sitting
            exactly at the ratio is dropped.

    Returns:
        A list with the surviving paragraphs, each stripped of leading and
        trailing whitespace, in their original order.
    """
    relevant_paragraphs = []
    for paragraph in text_list:
        # Splitting on a literal space (not on any whitespace) means empty
        # strings produced by consecutive spaces count as words in the
        # denominator. It also guarantees at least one element, so the
        # division below can never be by zero.
        words = paragraph.split(' ')
        # The count is computed fresh for every paragraph, so a dropped
        # paragraph can never leak its count into the next one.
        long_words = sum(1 for word in words if len(word) > max_word_len)
        if long_words / len(words) < max_long_ratio:
            # Re-joining the split words with ' ' would reproduce the
            # paragraph unchanged, so stripping the original is equivalent.
            relevant_paragraphs.append(paragraph.strip())
    return relevant_paragraphs
def preprocess_text(text):
    """Clean raw text and split it into sentence fragments.

    Steps, in order:
      1. Strip leading and trailing whitespace.
      2. Remove every hyphen character ('-').
      3. Remove every newline. Lines are joined with nothing in between, so
         a word hyphenated across a line break is glued back together.
      4. Collapse runs of spaces into a single space. This runs last so
         that deleting a newline between two spaces cannot leave a double
         space behind.
      5. Split on '.' and keep only fragments longer than one character.

    Args:
        text: The raw text to clean.

    Returns:
        A list of sentence fragments. Fragments are not stripped, so a
        fragment that followed ". " keeps its leading space.
    """
    text = text.strip()
    # NOTE(review): the original applied this same hyphen removal twice in a
    # row; the second pass had no effect, so a single pass is kept here.
    text = text.replace('-', '')
    text = text.replace('\n', '')
    text = re.sub(r' +', ' ', text)
    return [sentence for sentence in text.split('.') if len(sentence) > 1]