import re

import pymupdf
from nltk.tokenize import sent_tokenize
from docx import Document
from pptx import Presentation
from bs4 import BeautifulSoup
import pypandoc
def clean_text(text):
    # Normalize typographic characters before stripping non-ASCII,
    # otherwise the substitutions below would never match anything.
    text = re.sub(r'[\u201c\u201d]', '"', text)               # curly double quotes -> "
    text = re.sub(r"[\u2018\u2019]", "'", text)               # curly single quotes -> '
    text = text.replace('\xad', '')                           # drop soft hyphens
    text = re.sub(r'[\u2010\u2013\u2014\u2015]', '-', text)   # hyphen/dash variants -> -
    text = re.sub(r"[^\x00-\x7F]", " ", text)                 # replace remaining non-ASCII with spaces
    text = re.sub(r"[\n]", " ", text)                         # flatten newlines
    text = re.sub(r'\s+', ' ', text).strip()                  # collapse whitespace
    return text
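# Example (assumed input; illustrates the normalization order above):
#   clean_text('\u201cSmart quotes\u201d \u2013 and\xadsoft hyphens\nacross lines')
#   -> '"Smart quotes" - andsoft hyphens across lines'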
# Function to create text chunks
def segment_text(text, max_segment_length=700, batch_size=7):
    sentences = sent_tokenize(text)
    segments = []
    current_segment = ""
    for sentence in sentences:
        if len(current_segment) + len(sentence) <= max_segment_length:
            current_segment += sentence + " "
        else:
            segments.append(current_segment.strip())
            current_segment = sentence + " "
    if current_segment:
        segments.append(current_segment.strip())
    # Group segments into batches of batch_size
    batches = [segments[i:i + batch_size] for i in range(0, len(segments), batch_size)]
    return batches
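# Quick sanity check (assumption: the NLTK "punkt" tokenizer data is available,
# e.g. after running nltk.download("punkt"); sent_tokenize raises a LookupError otherwise):
#   segment_text("First sentence. Second sentence. Third sentence.",
#                max_segment_length=30, batch_size=2)
#   -> [['First sentence.', 'Second sentence.'], ['Third sentence.']]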
# Function to get text from a PDF file
def get_pdf_text(pdf_file):
    doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text
# Function to get text from a DOCX file
def get_doc_text(doc_files):
    doc = Document(doc_files)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text
# Function to get text from a PPTX file
def get_ppt_text(ppt_files):
    prs = Presentation(ppt_files)
    text = ""
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text += shape.text + "\n"
    return text
# Function to get text from an HTML file
def get_html_text(html_file):
    # The dispatcher below passes a single uploaded file object, so read it
    # directly instead of treating the argument as a list of paths on disk.
    soup = BeautifulSoup(html_file.read(), 'html.parser')
    return soup.get_text()
# Function to get text from a LaTeX file
def get_latex_text(latex_file):
    # pypandoc.convert_file expects a path on disk; for an in-memory upload,
    # convert the decoded source text instead (requires a local pandoc install).
    latex_source = latex_file.read().decode("utf-8")
    return pypandoc.convert_text(latex_source, 'plain', format='latex')
# Function to parse text from a file
def parse_text(file):
    text = file.getvalue().decode("utf-8")
    return text
# Function to get text from uploaded documents
def get_text_from_document(file):
    content = ""
    if file is not None:
        # Dispatch on the uploaded file's extension. Note that python-docx and
        # python-pptx only parse the XML-based .docx/.pptx formats, so legacy
        # .doc/.ppt files would need to be converted before upload.
        if file.name.endswith('.pdf'):
            content += get_pdf_text(file)
        elif file.name.endswith('.docx'):
            content += get_doc_text(file)
        elif file.name.endswith('.pptx'):
            content += get_ppt_text(file)
        elif file.name.endswith('.html'):
            content += get_html_text(file)
        elif file.name.endswith('.tex'):
            content += get_latex_text(file)
        elif file.name.endswith('.txt'):
            content += parse_text(file)
    return content
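# --- Usage sketch (assumption: the app hands these helpers Streamlit-style uploaded
# files exposing .name, .read() and .getvalue(); an io.BytesIO with a .name attribute
# stands in here so the example runs without a web front end; requires NLTK punkt data) ---
if __name__ == "__main__":
    import io

    fake_upload = io.BytesIO("First sentence. Second sentence. Third sentence.".encode("utf-8"))
    fake_upload.name = "demo.txt"  # hypothetical filename, used only for dispatching
    raw_text = get_text_from_document(fake_upload)
    batches = segment_text(clean_text(raw_text), max_segment_length=40, batch_size=2)
    print(batches)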