Spaces:

venni16
/

material-summeraizer

Running

File size: 4,488 Bytes
import os
import logging
from typing import Optional
import pdfplumber
from docx import Document
import PyPDF2
from pptx import Presentation

logger = logging.getLogger(__name__)

def parse_document(file_path: str, file_extension: str) -> str:
    """
    Parse a document and return its extracted text.

    The extension is matched case-insensitively and with surrounding
    whitespace ignored, so ``'.PDF'`` and ``'.pdf'`` select the same parser.

    Args:
        file_path: Path of the file to read.
        file_extension: File extension including the leading dot
            (for example ``'.pdf'``).

    Returns:
        The text returned by the format-specific parser.

    Raises:
        ValueError: If the extension is not one of the handled formats.
        Exception: Any error raised by the selected parser is logged here
            and re-raised unchanged.
    """
    try:
        # Extensions taken from file names are frequently upper- or
        # mixed-case; normalise once so the comparisons below stay simple.
        extension = (file_extension or "").strip().lower()

        if extension == '.pdf':
            return parse_pdf(file_path)
        # NOTE(review): legacy binary .doc/.ppt are routed to the same
        # parsers as .docx/.pptx; whether those libraries can open the
        # legacy formats should be confirmed.
        if extension in ('.docx', '.doc'):
            return parse_docx(file_path)
        if extension in ('.pptx', '.ppt'):
            return parse_pptx(file_path)
        if extension == '.txt':
            return parse_txt(file_path)

        # Report the value exactly as the caller supplied it.
        raise ValueError(f"Unsupported file format: {file_extension}")
    except Exception as e:
        logger.error("Error parsing document %s: %s", file_path, e)
        raise

def parse_pdf(file_path: str) -> str:
    """
    Pull the text out of a PDF, trying pdfplumber first and PyPDF2 second.

    PyPDF2 is only consulted when pdfplumber produced nothing but
    whitespace (or nothing at all). A failure in either library is logged
    as a warning rather than raised.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The extracted text after it has been passed through ``clean_text``.

    Raises:
        ValueError: If neither library yielded any non-whitespace text.
    """
    chunks = []

    # First attempt: pdfplumber. Pages collected before a failure are kept.
    try:
        with pdfplumber.open(file_path) as document:
            for page in document.pages:
                extracted = page.extract_text()
                if extracted:
                    chunks.append(extracted + "\n")
    except Exception as e:
        logger.warning(f"pdfplumber failed: {e}")

    # Second attempt: PyPDF2, only when nothing usable has been found yet.
    if not "".join(chunks).strip():
        try:
            with open(file_path, 'rb') as handle:
                reader = PyPDF2.PdfReader(handle)
                for page in reader.pages:
                    extracted = page.extract_text()
                    if extracted:
                        chunks.append(extracted + "\n")
        except Exception as e:
            logger.warning(f"PyPDF2 failed: {e}")

    combined = "".join(chunks)
    if not combined.strip():
        raise ValueError("Could not extract text from PDF")

    return clean_text(combined)

def parse_docx(file_path: str) -> str:
    """
    Collect the text of a Word document.

    All non-blank paragraphs are gathered first, followed by all non-blank
    table cells (table by table, row by row), one entry per line.

    Args:
        file_path: Path of the document to open.

    Returns:
        The collected text after it has been passed through ``clean_text``.

    Raises:
        Exception: Any error raised while opening or reading the document
            is logged and re-raised.
    """
    try:
        document = Document(file_path)
        pieces = []

        # Body paragraphs, skipping blank ones.
        for para in document.paragraphs:
            if not para.text.strip():
                continue
            pieces.append(para.text + "\n")

        # Table contents, appended after all paragraphs.
        for tbl in document.tables:
            for tbl_row in tbl.rows:
                for tbl_cell in tbl_row.cells:
                    if not tbl_cell.text.strip():
                        continue
                    pieces.append(tbl_cell.text + "\n")

        return clean_text("".join(pieces))
    except Exception as e:
        logger.error(f"Error parsing DOCX file: {e}")
        raise

def parse_pptx(file_path: str) -> str:
    """
    Collect the text of a PowerPoint presentation.

    Every shape on every slide that exposes a ``text`` attribute with
    non-blank content contributes one entry, in slide and shape order.

    Args:
        file_path: Path of the presentation to open.

    Returns:
        The collected text after it has been passed through ``clean_text``.

    Raises:
        Exception: Any error raised while opening or reading the
            presentation is logged and re-raised.
    """
    try:
        presentation = Presentation(file_path)
        pieces = []

        for current_slide in presentation.slides:
            for item in current_slide.shapes:
                # Shapes without a text attribute carry nothing to extract.
                if not hasattr(item, "text"):
                    continue
                if item.text.strip():
                    pieces.append(item.text + "\n")

        return clean_text("".join(pieces))
    except Exception as e:
        logger.error(f"Error parsing PPTX file: {e}")
        raise

def parse_txt(file_path: str) -> str:
    """
    Read a plain text file, trying several encodings in turn.

    Encodings are tried from strictest to most permissive:

    * ``utf-8-sig`` decodes UTF-8 with or without a byte-order mark and
      drops the mark, so it does not leak into the first line.
    * ``cp1252`` rejects the few byte values it leaves undefined, so it
      must be tried before ``latin-1``.
    * ``latin-1`` maps every byte value to a character and therefore acts
      as the final catch-all.

    Args:
        file_path: Path of the text file.

    Returns:
        The file content after it has been passed through ``clean_text``.

    Raises:
        ValueError: If no encoding could decode the file. With ``latin-1``
            in the list this is a safeguard only; it would become reachable
            if the catch-all were removed.
        OSError: If the file cannot be opened or read.
    """
    for encoding in ('utf-8-sig', 'cp1252', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                text = file.read()
        except UnicodeDecodeError:
            # Wrong guess; fall through to the next, more permissive codec.
            continue
        return clean_text(text)

    raise ValueError("Could not decode text file with any encoding")

def clean_text(text: str) -> str:
    """
    Clean and normalize extracted text.

    Each line is stripped of leading and trailing whitespace; lines that
    are then two characters or shorter (including empty lines) are dropped
    as likely formatting artifacts. The remaining lines are joined with a
    single newline.

    Args:
        text: Raw text, with lines separated by ``'\\n'``.

    Returns:
        The cleaned text, or an empty string if no line survives.
    """
    stripped_lines = (line.strip() for line in text.split('\n'))

    # Every kept line is non-empty and already stripped, so the joined
    # result cannot contain blank lines or surrounding whitespace; no
    # further newline collapsing or stripping is needed.
    return '\n'.join(line for line in stripped_lines if len(line) > 2)