""" File Parsing Utilities for Session Notes Upload Supports: .txt, .md, .docx, .pdf """ from pathlib import Path from typing import Optional def parse_uploaded_file(file_path: str) -> str: """ Parse uploaded session notes file and extract text content. Supported formats: - .txt - Plain text - .md - Markdown - .docx - Microsoft Word (requires python-docx) - .pdf - PDF documents (requires PyPDF2) Args: file_path: Path to uploaded file Returns: Extracted text content as string Raises: ValueError: If file type is not supported Exception: If file parsing fails """ path = Path(file_path) extension = path.suffix.lower() # Plain text and markdown (simple read) if extension in ['.txt', '.md']: try: with open(file_path, 'r', encoding='utf-8') as f: return f.read() except UnicodeDecodeError: # Try with different encoding if UTF-8 fails with open(file_path, 'r', encoding='latin-1') as f: return f.read() # Microsoft Word documents elif extension == '.docx': try: import docx doc = docx.Document(file_path) paragraphs = [para.text for para in doc.paragraphs] return '\n'.join(paragraphs) except ImportError: raise ImportError( "python-docx is required to parse .docx files. " "Install with: pip install python-docx" ) except Exception as e: raise Exception(f"Error parsing .docx file: {str(e)}") # PDF documents elif extension == '.pdf': try: import PyPDF2 text_content = [] with open(file_path, 'rb') as f: pdf_reader = PyPDF2.PdfReader(f) for page in pdf_reader.pages: text = page.extract_text() if text: text_content.append(text) return '\n'.join(text_content) except ImportError: raise ImportError( "PyPDF2 is required to parse .pdf files. " "Install with: pip install PyPDF2" ) except Exception as e: raise Exception(f"Error parsing .pdf file: {str(e)}") else: raise ValueError( f"Unsupported file type: {extension}. " f"Supported formats: .txt, .md, .docx, .pdf" ) def validate_file_size(file_path: str, max_size_mb: int = 10) -> bool: """ Validate that file size is within acceptable limits. Args: file_path: Path to file max_size_mb: Maximum file size in megabytes (default: 10 MB) Returns: True if file size is acceptable, False otherwise """ path = Path(file_path) if not path.exists(): return False file_size_mb = path.stat().st_size / (1024 * 1024) return file_size_mb <= max_size_mb def get_file_info(file_path: str) -> dict: """ Get information about uploaded file. Args: file_path: Path to file Returns: Dictionary with file information """ path = Path(file_path) return { 'name': path.name, 'extension': path.suffix.lower(), 'size_bytes': path.stat().st_size, 'size_mb': round(path.stat().st_size / (1024 * 1024), 2) }