Spaces:

venni16
/

material-summeraizer

Running

App Files Files Community

material-summeraizer / document_parser.py

venni16

Upload 7 files

5335722 verified 7 days ago

raw

history blame

4.49 kB

	import os
	import logging
	from typing import Optional
	import pdfplumber
	from docx import Document
	import PyPDF2
	from pptx import Presentation

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)

	def parse_document(file_path: str, file_extension: str) -> str:
	    """
	    Parse a document and return its extracted text.

	    The extension is matched case-insensitively, surrounding whitespace is
	    ignored and a missing leading dot is tolerated, so '.PDF', 'pdf' and
	    ' .pdf ' all select the PDF parser.

	    Args:
	        file_path: Path of the file to read.
	        file_extension: Extension identifying the format, e.g. '.pdf'.

	    Returns:
	        The text returned by the format-specific parser.

	    Raises:
	        ValueError: If the extension is not one of the supported formats.
	        Exception: Any error raised by the selected parser is logged here
	            and re-raised unchanged.
	    """
	    try:
	        # Normalise first: an extension taken from a file name such as
	        # "REPORT.PDF" would otherwise be rejected as unsupported.
	        extension = str(file_extension or '').strip().lower()
	        if extension and not extension.startswith('.'):
	            extension = '.' + extension

	        if extension == '.pdf':
	            return parse_pdf(file_path)
	        elif extension in ('.docx', '.doc'):
	            return parse_docx(file_path)
	        elif extension in ('.pptx', '.ppt'):
	            return parse_pptx(file_path)
	        elif extension == '.txt':
	            return parse_txt(file_path)
	        else:
	            # Report the value exactly as the caller supplied it.
	            raise ValueError(f"Unsupported file format: {file_extension}")
	    except Exception as e:
	        logger.error(f"Error parsing document {file_path}: {e}")
	        raise

	def parse_pdf(file_path: str) -> str:
	    """
	    Pull the text out of a PDF, trying pdfplumber first and PyPDF2 second.

	    PyPDF2 is consulted only when pdfplumber yielded nothing but whitespace.
	    A failure in either library is logged as a warning instead of being
	    raised; ValueError is raised only when no text was obtained at all.
	    """
	    pieces = []

	    # First attempt: pdfplumber, page by page.
	    try:
	        with pdfplumber.open(file_path) as document:
	            for pdf_page in document.pages:
	                extracted = pdf_page.extract_text()
	                if extracted:
	                    pieces.append(extracted + "\n")
	    except Exception as e:
	        logger.warning(f"pdfplumber failed: {e}")

	    # Second attempt: PyPDF2, only if nothing usable has been collected.
	    if not "".join(pieces).strip():
	        try:
	            with open(file_path, 'rb') as stream:
	                reader = PyPDF2.PdfReader(stream)
	                for pdf_page in reader.pages:
	                    extracted = pdf_page.extract_text()
	                    if extracted:
	                        pieces.append(extracted + "\n")
	        except Exception as e:
	            logger.warning(f"PyPDF2 failed: {e}")

	    combined = "".join(pieces)
	    if not combined.strip():
	        raise ValueError("Could not extract text from PDF")

	    return clean_text(combined)

	def parse_docx(file_path: str) -> str:
	    """
	    Extract text from a DOCX file.

	    All non-blank body paragraphs are emitted first, followed by the text of
	    every non-blank table cell. A merged table cell is emitted once rather
	    than once per grid position it spans.

	    NOTE(review): python-docx reads the Open XML (.docx) format; legacy
	    binary .doc files routed here are expected to fail inside Document()
	    and surface as the re-raised error below -- confirm with callers.

	    Args:
	        file_path: Path of the document to read.

	    Returns:
	        The extracted text after clean_text() normalisation.

	    Raises:
	        Exception: Any error raised while opening or reading the document
	            is logged and re-raised unchanged.
	    """
	    try:
	        doc = Document(file_path)

	        # Body paragraphs, skipping blank ones.
	        parts = [
	            paragraph.text
	            for paragraph in doc.paragraphs
	            if paragraph.text.strip()
	        ]

	        for table in doc.tables:
	            # python-docx returns the same underlying cell for every grid
	            # position covered by a merged cell. Track the XML elements
	            # already emitted (keyed by id, with the element kept alive as
	            # the value so ids cannot be reused) to avoid repeating text.
	            emitted = {}
	            for row in table.rows:
	                for cell in row.cells:
	                    # Fall back to the cell object itself if the private
	                    # element attribute is unavailable; that simply
	                    # disables de-duplication.
	                    element = getattr(cell, "_tc", cell)
	                    if id(element) in emitted:
	                        continue
	                    emitted[id(element)] = element
	                    if cell.text.strip():
	                        parts.append(cell.text)

	        return clean_text("".join(part + "\n" for part in parts))
	    except Exception as e:
	        logger.error(f"Error parsing DOCX file: {e}")
	        raise

	def _iter_shape_text(shapes):
	    """Yield the text of each shape, descending into groups and tables."""
	    for shape in shapes:
	        if hasattr(shape, "text"):
	            yield shape.text
	        elif hasattr(shape, "shapes"):
	            # Group shape: the text lives on its member shapes.
	            yield from _iter_shape_text(shape.shapes)
	        elif getattr(shape, "has_table", False):
	            # Table graphic frame: text lives on the individual cells.
	            for row in shape.table.rows:
	                for cell in row.cells:
	                    yield cell.text


	def parse_pptx(file_path: str) -> str:
	    """
	    Extract text from a PowerPoint presentation.

	    Slides are visited in order and, within each slide, shapes in the order
	    python-pptx returns them. Besides shapes that expose text directly, text
	    inside grouped shapes and table cells is collected as well. Blank
	    entries are skipped.

	    NOTE(review): python-pptx reads the Open XML (.pptx) format; legacy
	    binary .ppt files routed here are expected to fail inside
	    Presentation() and surface as the re-raised error below -- confirm.

	    Args:
	        file_path: Path of the presentation to read.

	    Returns:
	        The extracted text after clean_text() normalisation.

	    Raises:
	        Exception: Any error raised while opening or reading the
	            presentation is logged and re-raised unchanged.
	    """
	    try:
	        prs = Presentation(file_path)
	        parts = []

	        for slide in prs.slides:
	            parts.extend(
	                shape_text + "\n"
	                for shape_text in _iter_shape_text(slide.shapes)
	                if shape_text.strip()
	            )

	        return clean_text("".join(parts))
	    except Exception as e:
	        logger.error(f"Error parsing PPTX file: {e}")
	        raise

	def parse_txt(file_path: str) -> str:
	    """
	    Extract text from a plain text file.

	    Encodings are tried in order:

	    1. 'utf-8-sig' -- UTF-8, dropping a leading byte-order mark if present
	       so it does not end up in the extracted text.
	    2. 'cp1252'    -- Windows Western European; must come before latin-1
	       because latin-1 accepts every byte and would otherwise shadow it.
	    3. 'latin-1'   -- last resort; maps each byte to one code point.

	    Args:
	        file_path: Path of the text file to read.

	    Returns:
	        The file content after clean_text() normalisation.

	    Raises:
	        ValueError: If no encoding could decode the file. With latin-1 in
	            the list this is a safety net rather than an expected outcome.
	        OSError: If the file cannot be opened or read.
	    """
	    for encoding in ('utf-8-sig', 'cp1252', 'latin-1'):
	        try:
	            with open(file_path, 'r', encoding=encoding) as file:
	                text = file.read()
	        except UnicodeDecodeError:
	            # Wrong guess; move on to the next candidate encoding.
	            continue
	        return clean_text(text)

	    raise ValueError("Could not decode text file with any encoding")
	def clean_text(text: str) -> str:
	    """
	    Clean and normalize extracted text.

	    The input is split on '\\n', each line is stripped of surrounding
	    whitespace, and lines of two characters or fewer (including blank
	    lines) are dropped as likely formatting artifacts. The surviving lines
	    are joined with single newlines, so the result never contains blank
	    lines or leading/trailing whitespace.

	    Args:
	        text: Raw text as produced by one of the parsers.

	    Returns:
	        The normalised text; an empty string if no line survives.
	    """
	    stripped_lines = (line.strip() for line in text.split('\n'))

	    # Requiring more than two characters also removes empty lines, and
	    # joining with a single '\n' cannot create runs of newlines, so no
	    # further collapsing or trimming is needed afterwards.
	    return '\n'.join(line for line in stripped_lines if len(line) > 2)