File size: 3,421 Bytes
71b378e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
File Parsing Utilities for Session Notes Upload
Supports: .txt, .md, .docx, .pdf
"""
from pathlib import Path
from typing import Optional


def _read_text_file(file_path: str) -> str:
    """Read a plain-text file, preferring UTF-8 and falling back to Latin-1."""
    try:
        # 'utf-8-sig' decodes exactly like UTF-8 but also drops a leading
        # byte-order mark, so U+FEFF does not leak into the returned text.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return f.read()
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a code point, so this fallback read
        # cannot itself fail with a decode error.
        with open(file_path, 'r', encoding='latin-1') as f:
            return f.read()


def _read_docx_file(file_path: str) -> str:
    """Return the paragraph text of a .docx file, one paragraph per line."""
    # Import separately from the parsing work so that only a genuinely
    # missing package produces the "install python-docx" message.
    try:
        import docx
    except ImportError as e:
        raise ImportError(
            "python-docx is required to parse .docx files. "
            "Install with: pip install python-docx"
        ) from e

    try:
        doc = docx.Document(file_path)
        return '\n'.join(para.text for para in doc.paragraphs)
    except Exception as e:
        # Chain the cause so the original traceback stays available.
        raise Exception(f"Error parsing .docx file: {e}") from e


def _read_pdf_file(file_path: str) -> str:
    """Return the extracted text of a PDF, one non-empty page per line group."""
    # Import separately from the parsing work so that only a genuinely
    # missing package produces the "install PyPDF2" message.
    try:
        import PyPDF2
    except ImportError as e:
        raise ImportError(
            "PyPDF2 is required to parse .pdf files. "
            "Install with: pip install PyPDF2"
        ) from e

    try:
        with open(file_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # Pages for which extract_text() yields nothing are skipped.
            page_texts = [
                text
                for text in (page.extract_text() for page in pdf_reader.pages)
                if text
            ]
        return '\n'.join(page_texts)
    except Exception as e:
        # Chain the cause so the original traceback stays available.
        raise Exception(f"Error parsing .pdf file: {e}") from e


def parse_uploaded_file(file_path: str) -> str:
    """
    Parse uploaded session notes file and extract text content.

    The format is chosen from the file extension (case-insensitive).

    Supported formats:
    - .txt - Plain text
    - .md - Markdown
    - .docx - Microsoft Word (requires python-docx)
    - .pdf - PDF documents (requires PyPDF2)

    Text files are decoded as UTF-8 (a leading byte-order mark is dropped);
    if that fails, they are re-read as Latin-1.

    Args:
        file_path: Path to uploaded file

    Returns:
        Extracted text content as string

    Raises:
        ValueError: If file type is not supported
        ImportError: If the optional library for .docx/.pdf is not installed
        Exception: If .docx or .pdf parsing fails (original error is chained)
    """
    extension = Path(file_path).suffix.lower()

    # Plain text and markdown (simple read)
    if extension in ('.txt', '.md'):
        return _read_text_file(file_path)

    # Microsoft Word documents
    if extension == '.docx':
        return _read_docx_file(file_path)

    # PDF documents
    if extension == '.pdf':
        return _read_pdf_file(file_path)

    raise ValueError(
        f"Unsupported file type: {extension}. "
        "Supported formats: .txt, .md, .docx, .pdf"
    )


def validate_file_size(file_path: str, max_size_mb: int = 10) -> bool:
    """
    Validate that file size is within acceptable limits.

    Args:
        file_path: Path to file
        max_size_mb: Maximum file size in megabytes (default: 10 MB)

    Returns:
        True if the path is a regular file whose size is at most
        max_size_mb; False if it is larger, is not a regular file
        (e.g. a directory), or cannot be inspected (e.g. does not exist).
    """
    from stat import S_ISREG

    # A single stat() call avoids the race between an existence check and
    # the size lookup: a file that vanishes in between yields False here
    # instead of an exception.
    try:
        file_stat = Path(file_path).stat()
    except OSError:
        return False

    # Directories and other non-regular entries are not valid uploads.
    if not S_ISREG(file_stat.st_mode):
        return False

    # Compare in bytes (1 MB = 1024 * 1024 bytes).
    return file_stat.st_size <= max_size_mb * 1024 * 1024


def get_file_info(file_path: str) -> dict:
    """
    Get information about uploaded file.

    Args:
        file_path: Path to file

    Returns:
        Dictionary with keys:
        - 'name': file name including extension
        - 'extension': lower-cased suffix including the leading dot
        - 'size_bytes': size in bytes
        - 'size_mb': size in megabytes, rounded to 2 decimal places

    Raises:
        OSError: If the file cannot be stat'ed (e.g. FileNotFoundError
            when it does not exist)
    """
    path = Path(file_path)

    # Stat once so both size fields describe the same snapshot of the file.
    size_bytes = path.stat().st_size

    return {
        'name': path.name,
        'extension': path.suffix.lower(),
        'size_bytes': size_bytes,
        'size_mb': round(size_bytes / (1024 * 1024), 2),
    }