Spaces:
Running
Running
| import os | |
| import logging | |
| from typing import Optional | |
| import pdfplumber | |
| from docx import Document | |
| import PyPDF2 | |
| from pptx import Presentation | |
| logger = logging.getLogger(__name__) | |
def parse_document(file_path: str, file_extension: str) -> str:
    """
    Parse different document formats and extract text.

    The extension is compared case-insensitively and with surrounding
    whitespace ignored, so ``".PDF"`` is handled the same way as ``".pdf"``.

    Args:
        file_path: Path of the document to read.
        file_extension: Extension including the leading dot, e.g. ``".pdf"``.

    Returns:
        The text produced by the format-specific parser.

    Raises:
        ValueError: If the extension is not one of the supported formats.
        Exception: Any error raised by the selected parser is logged and
            re-raised unchanged.
    """
    try:
        # Extensions taken from file names can be upper- or mixed-case;
        # normalise once so the comparisons below do not reject them.
        # ``or ""`` keeps a None extension on the "unsupported" path.
        extension = (file_extension or "").strip().lower()
        if extension == '.pdf':
            return parse_pdf(file_path)
        if extension in ('.docx', '.doc'):
            return parse_docx(file_path)
        if extension in ('.pptx', '.ppt'):
            return parse_pptx(file_path)
        if extension == '.txt':
            return parse_txt(file_path)
        # Report the caller's original value, not the normalised one.
        raise ValueError(f"Unsupported file format: {file_extension}")
    except Exception as e:
        logger.error(f"Error parsing document {file_path}: {e}")
        raise
def parse_pdf(file_path: str) -> str:
    """
    Pull the text out of a PDF, trying pdfplumber first and PyPDF2 second.

    PyPDF2 is consulted only when pdfplumber contributed nothing but
    whitespace. A failure inside either library is logged as a warning
    instead of being raised; whatever was collected before the failure
    is kept.

    Raises:
        ValueError: when no non-blank text could be obtained at all.
    """
    pieces = []

    # First attempt: pdfplumber.
    try:
        with pdfplumber.open(file_path) as document:
            for pdf_page in document.pages:
                extracted = pdf_page.extract_text()
                if extracted:
                    pieces.append(extracted + "\n")
    except Exception as e:
        logger.warning(f"pdfplumber failed: {e}")

    # Second attempt: PyPDF2, only if nothing usable was gathered so far.
    if not "".join(pieces).strip():
        try:
            with open(file_path, 'rb') as handle:
                reader = PyPDF2.PdfReader(handle)
                for pdf_page in reader.pages:
                    extracted = pdf_page.extract_text()
                    if extracted:
                        pieces.append(extracted + "\n")
        except Exception as e:
            logger.warning(f"PyPDF2 failed: {e}")

    combined = "".join(pieces)
    if not combined.strip():
        raise ValueError("Could not extract text from PDF")
    return clean_text(combined)
def parse_docx(file_path: str) -> str:
    """
    Extract text from DOCX/DOC files.

    Non-blank body paragraphs are collected first, followed by the
    non-blank cells of each table in ``doc.tables``. The combined text is
    passed through ``clean_text`` before being returned.

    NOTE(review): this relies on python-docx ``Document``, which targets the
    .docx format; legacy binary .doc input will presumably fail to open and
    surface as the re-raised exception below - confirm with callers.

    Args:
        file_path: Path of the Word document to read.

    Returns:
        The cleaned text of the document.

    Raises:
        Exception: Any error from opening or reading the document is logged
            and re-raised unchanged.
    """
    try:
        doc = Document(file_path)
        parts = [
            paragraph.text
            for paragraph in doc.paragraphs
            if paragraph.text.strip()
        ]

        for table in doc.tables:
            # A merged cell is reported once for every grid position it
            # covers, which would repeat its text. Track the underlying
            # cell element so each physical cell is emitted only once.
            # The element is stored as the dict value to keep it alive,
            # so its id() cannot be recycled while the table is scanned.
            seen_cells = {}
            for row in table.rows:
                for cell in row.cells:
                    element = getattr(cell, "_tc", cell)
                    if id(element) in seen_cells:
                        continue
                    seen_cells[id(element)] = element
                    cell_text = cell.text
                    if cell_text.strip():
                        parts.append(cell_text)

        return clean_text("".join(part + "\n" for part in parts))
    except Exception as e:
        logger.error(f"Error parsing DOCX file: {e}")
        raise
def parse_pptx(file_path: str) -> str:
    """
    Gather the text of every slide shape in a PowerPoint file.

    Shapes that expose no ``text`` attribute, or whose text is blank, are
    skipped. The collected text is run through ``clean_text``. Any failure
    is logged and then re-raised to the caller.
    """
    try:
        deck = Presentation(file_path)
        fragments = []
        for slide in deck.slides:
            for shape in slide.shapes:
                # Not every shape carries text; ignore those that do not.
                if not hasattr(shape, "text"):
                    continue
                shape_text = shape.text
                if shape_text.strip():
                    fragments.append(shape_text + "\n")
        return clean_text("".join(fragments))
    except Exception as e:
        logger.error(f"Error parsing PPTX file: {e}")
        raise
def parse_txt(file_path: str) -> str:
    """
    Extract text from plain text files.

    The file is decoded with the first encoding in the candidate list that
    succeeds, and the result is passed through ``clean_text``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The cleaned text content of the file.

    Raises:
        ValueError: If none of the candidate encodings can decode the file.
        OSError: If the file cannot be opened or read.
    """
    # Order matters: strict decoders must come first.
    #   utf-8-sig - UTF-8 with or without a BOM; the BOM is dropped instead
    #               of leaking into the text as U+FEFF.
    #   cp1252    - rejects the few bytes it leaves undefined, so it can
    #               still fall through.
    #   latin-1   - maps every byte value and therefore never fails; it has
    #               to be last or the candidates after it are unreachable.
    for encoding in ('utf-8-sig', 'cp1252', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                text = file.read()
        except UnicodeDecodeError:
            continue
        return clean_text(text)
    # Kept as a safety net in case the candidate list is changed later.
    raise ValueError("Could not decode text file with any encoding")
def clean_text(text: str, min_line_length: int = 3) -> str:
    """
    Clean and normalize extracted text.

    Each line is stripped of leading/trailing whitespace; blank lines and
    lines shorter than ``min_line_length`` characters (treated as likely
    formatting artifacts) are dropped. The surviving lines are joined with
    single newlines, so the result never contains blank lines or
    leading/trailing whitespace.

    Args:
        text: Raw text to clean. ``None`` or an empty string yields ``""``.
        min_line_length: Minimum stripped length a line needs to be kept.
            The default of 3 drops lines of one or two characters.

    Returns:
        The cleaned text, or ``""`` when nothing survives.
    """
    if not text:
        return ""
    stripped_lines = (line.strip() for line in text.split('\n'))
    # Blank lines are always removed, even when min_line_length is 0 or less.
    return '\n'.join(
        line
        for line in stripped_lines
        if line and len(line) >= min_line_length
    )