import re

import pymupdf
from nltk.tokenize import sent_tokenize
from docx import Document
from pptx import Presentation
from bs4 import BeautifulSoup
import pypandoc
def clean_text(text):
    # Normalize typographic characters before stripping non-ASCII,
    # otherwise the substitutions below would never match anything.
    text = re.sub(r'[\u201c\u201d]', '"', text)               # curly double quotes -> "
    text = re.sub(r"[\u2018\u2019]", "'", text)               # curly single quotes -> '
    text = text.replace('\xad', '')                           # drop soft hyphens
    text = re.sub(r'[\u2010\u2013\u2014\u2015]', '-', text)   # hyphen/dash variants -> -
    text = re.sub(r"[^\x00-\x7F]", " ", text)                 # replace remaining non-ASCII with spaces
    text = re.sub(r"[\n]", " ", text)                         # flatten newlines
    text = re.sub(r'\s+', ' ', text).strip()                  # collapse whitespace
    return text
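# Example (assumed input; illustrates the normalization order above):
#   clean_text('\u201cSmart quotes\u201d \u2013 and\xadsoft hyphens\nacross lines')
#   -> '"Smart quotes" - andsoft hyphens across lines'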
# Function to create text chunks
def segment_text(text, max_segment_length=700, batch_size=7):
    sentences = sent_tokenize(text)
    segments = []
    current_segment = ""
    for sentence in sentences:
        if len(current_segment) + len(sentence) <= max_segment_length:
            current_segment += sentence + " "
        else:
            segments.append(current_segment.strip())
            current_segment = sentence + " "
    if current_segment:
        segments.append(current_segment.strip())
    # Group segments into batches of batch_size
    batches = [segments[i:i + batch_size] for i in range(0, len(segments), batch_size)]
    return batches
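# Quick sanity check (assumption: the NLTK "punkt" tokenizer data is available,
# e.g. after running nltk.download("punkt"); sent_tokenize raises a LookupError otherwise):
#   segment_text("First sentence. Second sentence. Third sentence.",
#                max_segment_length=30, batch_size=2)
#   -> [['First sentence.', 'Second sentence.'], ['Third sentence.']]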
# Function to get text from a PDF file
def get_pdf_text(pdf_file):
    doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text
# Function to get text from a DOCX file
def get_doc_text(doc_files):
    doc = Document(doc_files)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text
# Function to get text from a PPTX file
def get_ppt_text(ppt_files):
    prs = Presentation(ppt_files)
    text = ""
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text += shape.text + "\n"
    return text
# Function to get text from an HTML file
def get_html_text(html_file):
    # The dispatcher below passes a single uploaded file object, so read it
    # directly instead of treating the argument as a list of paths on disk.
    soup = BeautifulSoup(html_file.read(), 'html.parser')
    return soup.get_text()
# Function to get text from a LaTeX file
def get_latex_text(latex_file):
    # pypandoc.convert_file expects a path on disk; for an in-memory upload,
    # convert the decoded source text instead (requires a local pandoc install).
    latex_source = latex_file.read().decode("utf-8")
    return pypandoc.convert_text(latex_source, 'plain', format='latex')
# Function to parse text from a file
def parse_text(file):
    text = file.getvalue().decode("utf-8")
    return text
# Function to get text from uploaded documents
def get_text_from_document(file):
    content = ""
    if file is not None:
        # Dispatch on the uploaded file's extension. Note that python-docx and
        # python-pptx only parse the XML-based .docx/.pptx formats, so legacy
        # .doc/.ppt files would need to be converted before upload.
        if file.name.endswith('.pdf'):
            content += get_pdf_text(file)
        elif file.name.endswith('.docx'):
            content += get_doc_text(file)
        elif file.name.endswith('.pptx'):
            content += get_ppt_text(file)
        elif file.name.endswith('.html'):
            content += get_html_text(file)
        elif file.name.endswith('.tex'):
            content += get_latex_text(file)
        elif file.name.endswith('.txt'):
            content += parse_text(file)
    return content
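# --- Usage sketch (assumption: the app hands these helpers Streamlit-style uploaded
# files exposing .name, .read() and .getvalue(); an io.BytesIO with a .name attribute
# stands in here so the example runs without a web front end; requires NLTK punkt data) ---
if __name__ == "__main__":
    import io

    fake_upload = io.BytesIO("First sentence. Second sentence. Third sentence.".encode("utf-8"))
    fake_upload.name = "demo.txt"  # hypothetical filename, used only for dispatching
    raw_text = get_text_from_document(fake_upload)
    batches = segment_text(clean_text(raw_text), max_segment_length=40, batch_size=2)
    print(batches)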