# material-summeraizer / document_parser.py
# venni16's picture
# Upload 7 files
# 5335722 verified
# raw
# history blame
# 4.49 kB
import os
import logging
from typing import Optional
import pdfplumber
from docx import Document
import PyPDF2
from pptx import Presentation
# Module-level logger named after this module; used by the parsers below.
logger = logging.getLogger(__name__)
def parse_document(file_path: str, file_extension: str) -> str:
    """
    Parse a document and return its extracted text.

    The parser is selected from ``file_extension``. Matching ignores case,
    surrounding whitespace and a missing leading dot, so ``".PDF"``,
    ``"pdf"`` and ``" .Pdf "`` all select the PDF parser.

    Args:
        file_path: Path of the document to read.
        file_extension: Extension identifying the format, e.g. ``".pdf"``,
            ``".docx"``, ``".pptx"`` or ``".txt"``.

    Returns:
        The text produced by the format-specific parser.

    Raises:
        ValueError: If the extension does not match any supported format.
        Exception: Any error raised by the selected parser is logged and
            re-raised unchanged.

    NOTE(review): ``.doc`` and ``.ppt`` are routed to the same parsers as
    ``.docx`` / ``.pptx``; those libraries are presumably limited to the
    newer formats, so legacy binary files likely fail there - confirm.
    """
    # str() keeps non-string input (e.g. None) on the "unsupported" path
    # below instead of failing with AttributeError during normalization.
    normalized = str(file_extension).strip().lower()
    if normalized and not normalized.startswith('.'):
        normalized = '.' + normalized

    try:
        if normalized == '.pdf':
            return parse_pdf(file_path)
        if normalized in ('.docx', '.doc'):
            return parse_docx(file_path)
        if normalized in ('.pptx', '.ppt'):
            return parse_pptx(file_path)
        if normalized == '.txt':
            return parse_txt(file_path)
        # Report the extension exactly as the caller supplied it.
        raise ValueError(f"Unsupported file format: {file_extension}")
    except Exception as e:
        logger.error(f"Error parsing document {file_path}: {e}")
        raise
def parse_pdf(file_path: str) -> str:
    """
    Pull the text out of a PDF, trying two extraction libraries in turn.

    pdfplumber is tried first. PyPDF2 is used only when the first pass
    produced nothing but whitespace. A failure in either library is logged
    as a warning rather than raised; text gathered before the failure is
    kept.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The extracted text after ``clean_text`` normalization.

    Raises:
        ValueError: If neither library yields any non-whitespace text.
    """
    chunks = []

    # First pass: pdfplumber.
    try:
        with pdfplumber.open(file_path) as plumber_doc:
            for plumber_page in plumber_doc.pages:
                extracted = plumber_page.extract_text()
                if extracted:
                    chunks.append(extracted + "\n")
    except Exception as e:
        logger.warning(f"pdfplumber failed: {e}")

    # Second pass: PyPDF2, only when the first pass found no real text.
    if not "".join(chunks).strip():
        try:
            with open(file_path, 'rb') as handle:
                reader = PyPDF2.PdfReader(handle)
                for reader_page in reader.pages:
                    extracted = reader_page.extract_text()
                    if extracted:
                        chunks.append(extracted + "\n")
        except Exception as e:
            logger.warning(f"PyPDF2 failed: {e}")

    combined = "".join(chunks)
    if not combined.strip():
        raise ValueError("Could not extract text from PDF")
    return clean_text(combined)
def parse_docx(file_path: str) -> str:
    """
    Pull the text out of a Word document.

    All non-blank body paragraphs are collected first, followed by the
    non-blank text of every table cell (table by table, row by row).

    Args:
        file_path: Path of the document to open with ``Document``.

    Returns:
        The collected text after ``clean_text`` normalization.

    Raises:
        Exception: Any failure while opening or reading the document is
            logged and re-raised unchanged.
    """
    try:
        word_doc = Document(file_path)
        fragments = []

        # Body paragraphs, skipping blank ones.
        for para in word_doc.paragraphs:
            para_text = para.text
            if para_text.strip():
                fragments.append(para_text)

        # Table cells, skipping blank ones.
        for tbl in word_doc.tables:
            for tbl_row in tbl.rows:
                for tbl_cell in tbl_row.cells:
                    cell_text = tbl_cell.text
                    if cell_text.strip():
                        fragments.append(cell_text)

        # One trailing newline per fragment, as separate lines.
        return clean_text("".join(fragment + "\n" for fragment in fragments))
    except Exception as e:
        logger.error(f"Error parsing DOCX file: {e}")
        raise
def parse_pptx(file_path: str) -> str:
    """
    Pull the text out of a PowerPoint presentation.

    Every shape on every slide that exposes a ``text`` attribute with
    non-blank content contributes one entry to the result.

    Args:
        file_path: Path of the presentation to open with ``Presentation``.

    Returns:
        The collected text after ``clean_text`` normalization.

    Raises:
        Exception: Any failure while opening or reading the presentation
            is logged and re-raised unchanged.
    """
    try:
        deck = Presentation(file_path)
        fragments = []
        for deck_slide in deck.slides:
            for slide_shape in deck_slide.shapes:
                # Shapes without a text attribute carry nothing to extract.
                if not hasattr(slide_shape, "text"):
                    continue
                shape_text = slide_shape.text
                if shape_text.strip():
                    fragments.append(shape_text)
        return clean_text("".join(fragment + "\n" for fragment in fragments))
    except Exception as e:
        logger.error(f"Error parsing PPTX file: {e}")
        raise
def parse_txt(file_path: str) -> str:
    """
    Extract text from a plain text file.

    The file is decoded with the first encoding that succeeds, tried in
    this order:

    1. ``utf-8-sig`` - UTF-8, dropping a leading byte-order mark if present.
    2. ``cp1252``    - Windows Western European.
    3. ``latin-1``   - final catch-all.

    Args:
        file_path: Path of the text file.

    Returns:
        The decoded text after ``clean_text`` normalization.

    Raises:
        ValueError: If no candidate encoding can decode the file. Kept as
            a safeguard; not expected in practice because ``latin-1``
            accepts every byte sequence.
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    # Order matters: latin-1 maps every possible byte, so it must come last
    # or the encodings listed after it would never be tried.
    for encoding in ('utf-8-sig', 'cp1252', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                text = file.read()
        except UnicodeDecodeError:
            continue
        return clean_text(text)
    raise ValueError("Could not decode text file with any encoding")
def clean_text(text: str) -> str:
    """
    Clean and normalize extracted text.

    The input is split on ``'\\n'``; each line is stripped of surrounding
    whitespace, and lines that are empty or at most two characters long
    after stripping are dropped as likely formatting artifacts. The
    remaining lines are joined with single newlines.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text; an empty string when no line survives.
    """
    stripped_lines = (line.strip() for line in text.split('\n'))
    # len > 2 also rejects empty lines. Because every kept line is non-empty
    # and already stripped, the joined result cannot contain blank runs or
    # outer whitespace, so no further collapsing or stripping is needed.
    return '\n'.join(line for line in stripped_lines if len(line) > 2)