File size: 3,958 Bytes
18352e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python3
"""
MonkeyOCR Command Line Interface
Process documents using MonkeyOCR with MLX-VLM optimization
"""

import sys
import os
import argparse
import time
from pathlib import Path
from loguru import logger

def _build_arg_parser():
    """Create the argument parser describing the MonkeyOCR command-line options."""
    parser = argparse.ArgumentParser(
        description="MonkeyOCR: Advanced OCR with MLX-VLM optimization for Apple Silicon"
    )
    parser.add_argument("input_path", help="Path to PDF or image file to process")
    parser.add_argument(
        "-o", "--output",
        help="Output directory (default: same as input file)",
        default=None,
    )
    parser.add_argument(
        "-c", "--config",
        help="Config file path",
        default="model_configs_mps.yaml",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging",
    )
    return parser


def _configure_logging(verbose):
    """Send log output to stderr at DEBUG level when *verbose*, else INFO."""
    # loguru ships with a pre-installed DEBUG-level stderr sink. If it is left
    # in place, every message is printed twice and the chosen level is moot.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if verbose else "INFO")


def _log_preview(markdown_content, max_lines=10):
    """Log the non-blank lines found within the first *max_lines* lines."""
    all_lines = markdown_content.splitlines()
    logger.info("📋 Preview:")
    for line in all_lines[:max_lines]:
        if line.strip():
            logger.info(f"   {line}")

    # Only signal truncation when there really is content past the preview.
    if len(all_lines) > max_lines:
        logger.info("   ...")


def main(argv=None):
    """Run the MonkeyOCR command-line interface.

    Parses the command line, validates the input file, initializes the model
    through ``app.initialize_model``, converts the document through
    ``app.process_document`` and writes the resulting markdown to
    ``<output_dir>/<input stem>.md``.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            the arguments are taken from ``sys.argv[1:]``.

    Raises:
        SystemExit: with status 1 when the input is missing or unsupported,
            the output directory cannot be created, processing fails, or the
            user interrupts the run. argparse also exits on bad usage.
    """
    args = _build_arg_parser().parse_args(argv)
    _configure_logging(args.verbose)

    # The input must be an existing regular file; a directory is rejected too.
    input_path = Path(args.input_path)
    if not input_path.is_file():
        logger.error(f"Input file not found: {input_path}")
        sys.exit(1)

    # Check file extension (case-insensitive).
    supported_extensions = {'.pdf', '.png', '.jpg', '.jpeg'}
    if input_path.suffix.lower() not in supported_extensions:
        logger.error(f"Unsupported file type: {input_path.suffix}")
        # Sorted so the message is stable from run to run.
        logger.info(f"Supported formats: {', '.join(sorted(supported_extensions))}")
        sys.exit(1)

    # Results go next to the input file unless an output directory was given.
    output_dir = Path(args.output) if args.output else input_path.parent
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        logger.error(f"Cannot create output directory {output_dir}: {e}")
        sys.exit(1)

    logger.info("🚀 Starting MonkeyOCR processing...")
    logger.info(f"📄 Input: {input_path}")
    logger.info(f"📁 Output: {output_dir}")
    logger.info(f"⚙️ Config: {args.config}")

    try:
        # Imported here so argument and path errors are reported before the
        # application module is loaded.
        from app import process_document, initialize_model

        # Initialize model
        logger.info("🔧 Initializing MonkeyOCR model...")
        start_time = time.perf_counter()
        # NOTE(review): process_document() is not handed the model, so app
        # presumably keeps it as module state — confirm. The reference is held
        # for the duration of the run either way.
        _model = initialize_model(args.config)
        init_time = time.perf_counter() - start_time
        logger.info(f"✅ Model initialized in {init_time:.2f}s")

        # Process document
        logger.info("📊 Processing document...")
        process_start = time.perf_counter()

        markdown_content, layout_pdf_path = process_document(str(input_path))

        process_time = time.perf_counter() - process_start
        logger.info(f"⚡ Document processed in {process_time:.2f}s")

        # Save results
        markdown_file = output_dir / f"{input_path.stem}.md"
        with open(markdown_file, 'w', encoding='utf-8') as f:
            f.write(markdown_content)

        logger.info(f"📝 Markdown saved: {markdown_file}")

        # The layout PDF is only reported, and only if it exists on disk.
        if layout_pdf_path and os.path.exists(layout_pdf_path):
            logger.info(f"🎨 Layout PDF: {layout_pdf_path}")

        # Summary
        logger.info("🎉 Processing completed successfully!")
        logger.info(f"⏱️ Total time: {time.perf_counter() - start_time:.2f}s")

        # Print first few lines of markdown for preview
        _log_preview(markdown_content)

    except KeyboardInterrupt:
        logger.warning("⚠️ Processing interrupted by user")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero. The full
        # traceback is shown only in verbose mode.
        logger.error(f"❌ Processing failed: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()