Spaces:
Running
Running
File size: 4,488 Bytes
5335722 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import os
import logging
from typing import Optional
import pdfplumber
from docx import Document
import PyPDF2
from pptx import Presentation
logger = logging.getLogger(__name__)
def parse_document(file_path: str, file_extension: str) -> str:
    """
    Parse a document and return the text extracted from it.

    The extension is compared case-insensitively and with surrounding
    whitespace removed, so '.PDF' and '.pdf' select the same parser.

    Args:
        file_path: Path of the document to read.
        file_extension: Extension including the leading dot (e.g. '.pdf').

    Returns:
        The text returned by the format-specific parser.

    Raises:
        ValueError: If the extension matches none of the supported formats.
        Exception: Any error raised by a parser is logged and re-raised.
    """
    try:
        # str() keeps a non-string extension (e.g. None) on the
        # "unsupported format" path instead of failing on .lower().
        normalized = str(file_extension).strip().lower()

        if normalized == '.pdf':
            return parse_pdf(file_path)
        # NOTE(review): '.doc' and '.ppt' are routed to the same parsers as
        # '.docx' and '.pptx'; confirm those libraries can open the legacy
        # binary formats.
        if normalized in ('.docx', '.doc'):
            return parse_docx(file_path)
        if normalized in ('.pptx', '.ppt'):
            return parse_pptx(file_path)
        if normalized == '.txt':
            return parse_txt(file_path)

        # Report the value exactly as the caller supplied it.
        raise ValueError(f"Unsupported file format: {file_extension}")
    except Exception as e:
        logger.error("Error parsing document %s: %s", file_path, e)
        raise
def parse_pdf(file_path: str) -> str:
    """
    Pull the text out of a PDF file.

    pdfplumber is tried first. PyPDF2 is used only when pdfplumber produced
    nothing but whitespace. A failure in either library is logged as a
    warning and does not abort the function by itself.

    Raises:
        ValueError: If neither library yields any non-whitespace text.
    """
    chunks = []

    # First pass: pdfplumber.
    try:
        with pdfplumber.open(file_path) as document:
            for pdf_page in document.pages:
                extracted = pdf_page.extract_text()
                if not extracted:
                    continue
                chunks.append(extracted + "\n")
    except Exception as exc:
        logger.warning(f"pdfplumber failed: {exc}")

    # Second pass: PyPDF2, only if the first pass came back empty.
    if not "".join(chunks).strip():
        try:
            with open(file_path, 'rb') as handle:
                reader = PyPDF2.PdfReader(handle)
                for pdf_page in reader.pages:
                    extracted = pdf_page.extract_text()
                    if not extracted:
                        continue
                    chunks.append(extracted + "\n")
        except Exception as exc:
            logger.warning(f"PyPDF2 failed: {exc}")

    combined = "".join(chunks)
    if not combined.strip():
        raise ValueError("Could not extract text from PDF")
    return clean_text(combined)
def parse_docx(file_path: str) -> str:
    """
    Read a Word document and return its text.

    All non-blank paragraphs are collected first, followed by the text of
    every non-blank table cell. Each fragment is placed on its own line
    before the result is passed through clean_text.

    Raises:
        Exception: Any error is logged and re-raised unchanged.
    """
    try:
        document = Document(file_path)

        # Body paragraphs come first ...
        fragments = [para.text for para in document.paragraphs if para.text.strip()]

        # ... then every table cell, row by row.
        for table in document.tables:
            for row in table.rows:
                fragments.extend(
                    cell.text for cell in row.cells if cell.text.strip()
                )

        combined = "".join(fragment + "\n" for fragment in fragments)
        return clean_text(combined)
    except Exception as exc:
        logger.error(f"Error parsing DOCX file: {exc}")
        raise
def parse_pptx(file_path: str) -> str:
    """
    Read a PowerPoint presentation and return its text.

    Every shape on every slide that exposes a non-blank ``text`` attribute
    contributes one line. The combined text is passed through clean_text.

    Raises:
        Exception: Any error is logged and re-raised unchanged.
    """
    try:
        deck = Presentation(file_path)
        snippets = []

        for slide in deck.slides:
            for shape in slide.shapes:
                # Shapes without a text attribute contribute nothing.
                if not hasattr(shape, "text"):
                    continue
                if shape.text.strip():
                    snippets.append(shape.text)

        combined = "".join(snippet + "\n" for snippet in snippets)
        return clean_text(combined)
    except Exception as exc:
        logger.error(f"Error parsing PPTX file: {exc}")
        raise
def parse_txt(file_path: str) -> str:
    """
    Extract text from a plain text file, trying several encodings.

    Encodings are attempted in this order:

    1. 'utf-8-sig' - reads UTF-8 with or without a byte-order mark and
       drops the mark, so it does not end up in the returned text.
    2. 'cp1252'    - Windows-1252; rejects the few byte values it leaves
       undefined.
    3. 'latin-1'   - maps every byte to a character, so it must come last;
       placed earlier it would make every later fallback unreachable.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file content after clean_text normalization.

    Raises:
        ValueError: If no encoding can decode the file.
        OSError: If the file cannot be opened or read.
    """
    for encoding in ('utf-8-sig', 'cp1252', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                text = file.read()
        except UnicodeDecodeError:
            # Wrong guess; move on to the next, more permissive encoding.
            continue
        return clean_text(text)

    # Defensive guard: 'latin-1' accepts any byte sequence, so this is not
    # expected to be reached while it stays in the list above.
    raise ValueError("Could not decode text file with any encoding")
def clean_text(text: str, *, min_line_length: int = 3) -> str:
    """
    Clean and normalize extracted text.

    Each line is stripped of leading and trailing whitespace. Blank lines
    and lines shorter than ``min_line_length`` characters are dropped, and
    the remaining lines are joined with single newlines.

    Args:
        text: Raw extracted text. Empty or None input yields ''.
        min_line_length: Minimum length (after stripping) a line must have
            to be kept. The default of 3 discards lines of one or two
            characters, which are treated as formatting artifacts.

    Returns:
        The cleaned text, with no blank lines and no surrounding whitespace.
    """
    if not text:
        return ""

    stripped_lines = (line.strip() for line in text.split('\n'))

    # Blank lines are always removed, whatever the threshold, so the joined
    # result can never contain consecutive newlines.
    return '\n'.join(
        line
        for line in stripped_lines
        if line and len(line) >= min_line_length
    )