Spaces:
Sleeping
Sleeping
| """ | |
| File Parsing Utilities for Session Notes Upload | |
| Supports: .txt, .md, .docx, .pdf | |
| """ | |
| from pathlib import Path | |
| from typing import Optional | |
def _read_text_file(file_path: str) -> str:
    """Read a plain-text/markdown file, trying UTF-8 first, then Latin-1."""
    try:
        # 'utf-8-sig' decodes ordinary UTF-8 identically but also drops a
        # leading byte-order mark, so it does not leak into the notes text.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return f.read()
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this read cannot hit
        # a decode error; it is the last-resort fallback.
        with open(file_path, 'r', encoding='latin-1') as f:
            return f.read()


def _read_docx_file(file_path: str) -> str:
    """Return the paragraphs of a .docx file joined by newlines."""
    # Import separately from parsing so the "install python-docx" hint is
    # only shown when the library itself is missing.
    try:
        import docx
    except ImportError as e:
        raise ImportError(
            "python-docx is required to parse .docx files. "
            "Install with: pip install python-docx"
        ) from e

    try:
        doc = docx.Document(file_path)
        return '\n'.join(para.text for para in doc.paragraphs)
    except Exception as e:
        # Keep the generic Exception type callers already handle, but chain
        # the original error so the root cause stays in the traceback.
        raise Exception(f"Error parsing .docx file: {e}") from e


def _read_pdf_file(file_path: str) -> str:
    """Return the text of every PDF page that yields text, newline-joined."""
    try:
        import PyPDF2
    except ImportError as e:
        raise ImportError(
            "PyPDF2 is required to parse .pdf files. "
            "Install with: pip install PyPDF2"
        ) from e

    try:
        text_content = []
        with open(file_path, 'rb') as f:
            for page in PyPDF2.PdfReader(f).pages:
                text = page.extract_text()
                # Skip pages for which no text was extracted.
                if text:
                    text_content.append(text)
        return '\n'.join(text_content)
    except Exception as e:
        raise Exception(f"Error parsing .pdf file: {e}") from e


def parse_uploaded_file(file_path: str) -> str:
    """
    Parse uploaded session notes file and extract text content.

    The format is chosen from the file extension (case-insensitive).

    Supported formats:
    - .txt - Plain text
    - .md - Markdown
    - .docx - Microsoft Word (requires python-docx)
    - .pdf - PDF documents (requires PyPDF2)

    Args:
        file_path: Path to uploaded file

    Returns:
        Extracted text content as string

    Raises:
        ValueError: If file type is not supported
        ImportError: If the optional library for .docx/.pdf is not installed
        Exception: If .docx/.pdf parsing fails (original error is chained)
    """
    extension = Path(file_path).suffix.lower()

    # Plain text and markdown (simple read)
    if extension in ('.txt', '.md'):
        return _read_text_file(file_path)

    # Microsoft Word documents
    if extension == '.docx':
        return _read_docx_file(file_path)

    # PDF documents
    if extension == '.pdf':
        return _read_pdf_file(file_path)

    raise ValueError(
        f"Unsupported file type: {extension}. "
        "Supported formats: .txt, .md, .docx, .pdf"
    )
def validate_file_size(file_path: str, max_size_mb: int = 10) -> bool:
    """
    Validate that file size is within acceptable limits.

    Args:
        file_path: Path to file
        max_size_mb: Maximum file size in megabytes (default: 10 MB),
            where one megabyte is 1024 * 1024 bytes

    Returns:
        True if the file can be stat'ed and its size is at most
        max_size_mb, False otherwise (including when the file is missing)
    """
    # A single stat() call instead of exists() + stat(): a file removed
    # between the two calls would otherwise raise instead of returning False.
    try:
        size_bytes = Path(file_path).stat().st_size
    except (OSError, ValueError):
        return False

    file_size_mb = size_bytes / (1024 * 1024)
    return file_size_mb <= max_size_mb
def get_file_info(file_path: str) -> dict:
    """
    Get information about uploaded file.

    Args:
        file_path: Path to file

    Returns:
        Dictionary with file information:
        - 'name': file name including extension
        - 'extension': lower-cased suffix, including the leading dot
        - 'size_bytes': size in bytes
        - 'size_mb': size in megabytes (1024 * 1024 bytes), rounded to
          two decimal places

    Raises:
        OSError: If the file cannot be stat'ed (e.g. FileNotFoundError
            when it does not exist)
    """
    path = Path(file_path)
    # Stat once so both size fields describe the same snapshot of the file.
    size_bytes = path.stat().st_size
    return {
        'name': path.name,
        'extension': path.suffix.lower(),
        'size_bytes': size_bytes,
        'size_mb': round(size_bytes / (1024 * 1024), 2),
    }