Spaces:

ghost-logic
/

DnD_Campaign_Manager

Sleeping

File size: 3,421 Bytes

71b378e

"""
File Parsing Utilities for Session Notes Upload
Supports: .txt, .md, .docx, .pdf
"""
from pathlib import Path
from typing import Optional


def parse_uploaded_file(file_path: str) -> str:
    """
    Parse an uploaded session notes file and extract its text content.

    The format is chosen from the file extension (case-insensitive):
    - .txt - Plain text
    - .md - Markdown
    - .docx - Microsoft Word (requires python-docx)
    - .pdf - PDF documents (requires PyPDF2)

    Args:
        file_path: Path to uploaded file

    Returns:
        Extracted text content as string

    Raises:
        ValueError: If file type is not supported
        ImportError: If the optional library for .docx/.pdf is not installed
        Exception: If .docx/.pdf parsing fails (the original error is chained)
    """
    extension = Path(file_path).suffix.lower()

    if extension in ('.txt', '.md'):
        return _read_plain_text(file_path)
    if extension == '.docx':
        return _read_docx(file_path)
    if extension == '.pdf':
        return _read_pdf(file_path)

    raise ValueError(
        f"Unsupported file type: {extension}. "
        f"Supported formats: .txt, .md, .docx, .pdf"
    )


def _read_plain_text(file_path: str) -> str:
    """Read a text file as UTF-8 (dropping a leading BOM), else as latin-1."""
    try:
        # utf-8-sig reads ordinary UTF-8 unchanged but strips a leading BOM,
        # so the marker does not leak into the extracted text as '\ufeff'.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return f.read()
    except UnicodeDecodeError:
        # latin-1 maps every byte to a character, so this read cannot hit
        # another decode error.
        with open(file_path, 'r', encoding='latin-1') as f:
            return f.read()


def _read_docx(file_path: str) -> str:
    """Return the paragraphs of a .docx file joined by newlines."""
    # Only the import is guarded by the ImportError handler, so an error raised
    # while parsing is never misreported as "python-docx is not installed".
    try:
        import docx
    except ImportError as e:
        raise ImportError(
            "python-docx is required to parse .docx files. "
            "Install with: pip install python-docx"
        ) from e

    try:
        doc = docx.Document(file_path)
        return '\n'.join(para.text for para in doc.paragraphs)
    except Exception as e:
        raise Exception(f"Error parsing .docx file: {str(e)}") from e


def _read_pdf(file_path: str) -> str:
    """Return the text of every page of a PDF joined by newlines."""
    try:
        import PyPDF2
    except ImportError as e:
        raise ImportError(
            "PyPDF2 is required to parse .pdf files. "
            "Install with: pip install PyPDF2"
        ) from e

    try:
        with open(file_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # Pages whose extraction yields no text are skipped.
            page_texts = [
                text
                for text in (page.extract_text() for page in pdf_reader.pages)
                if text
            ]
        return '\n'.join(page_texts)
    except Exception as e:
        raise Exception(f"Error parsing .pdf file: {str(e)}") from e


def validate_file_size(file_path: str, max_size_mb: int = 10) -> bool:
    """
    Validate that a file exists and its size is within acceptable limits.

    Args:
        file_path: Path to file
        max_size_mb: Maximum file size in megabytes (default: 10 MB)

    Returns:
        True if the path is a regular file whose size is at most
        max_size_mb; False if it is too large, missing, not a regular
        file (e.g. a directory), or cannot be inspected.
    """
    path = Path(file_path)

    try:
        if not path.is_file():
            return False
        size_bytes = path.stat().st_size
    except (OSError, ValueError):
        # The file vanished or became unreadable between the check and the
        # stat call, or the path itself is invalid: treat it as not acceptable.
        return False

    # Compare in bytes so the limit check does not depend on float division.
    return size_bytes <= max_size_mb * 1024 * 1024


def get_file_info(file_path: str) -> dict:
    """
    Get information about uploaded file.

    Args:
        file_path: Path to file

    Returns:
        Dictionary with keys:
        - 'name': file name including extension
        - 'extension': lower-cased suffix, including the leading dot
        - 'size_bytes': file size in bytes
        - 'size_mb': file size in megabytes, rounded to 2 decimals

    Raises:
        FileNotFoundError: If the file does not exist (raised by stat)
    """
    path = Path(file_path)
    # Stat once so both size fields describe the same snapshot of the file.
    size_bytes = path.stat().st_size

    return {
        'name': path.name,
        'extension': path.suffix.lower(),
        'size_bytes': size_bytes,
        'size_mb': round(size_bytes / (1024 * 1024), 2),
    }