Your Name
Initial commit with working MLX-VLM configuration
18352e1
#!/usr/bin/env python3
"""
MonkeyOCR Command Line Interface
Process documents using MonkeyOCR with MLX-VLM optimization
"""
import sys
import os
import argparse
import time
from pathlib import Path
from loguru import logger
# File types accepted by the CLI. A tuple keeps the order stable so the
# "Supported formats" message reads the same on every run.
_SUPPORTED_EXTENSIONS = (".pdf", ".png", ".jpg", ".jpeg")

# Number of leading Markdown lines inspected for the log preview.
_PREVIEW_LINE_LIMIT = 10


def _build_parser():
    """Build the argument parser for the MonkeyOCR command line."""
    parser = argparse.ArgumentParser(
        description="MonkeyOCR: Advanced OCR with MLX-VLM optimization for Apple Silicon"
    )
    parser.add_argument("input_path", help="Path to PDF or image file to process")
    parser.add_argument(
        "-o", "--output",
        help="Output directory (default: same as input file)",
        default=None,
    )
    parser.add_argument(
        "-c", "--config",
        help="Config file path",
        default="model_configs_mps.yaml",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging",
    )
    return parser


def _configure_logging(verbose):
    """Send loguru output to stderr at DEBUG (verbose) or INFO level."""
    # loguru starts with a default stderr sink at DEBUG level. Without
    # removing it first, every message is emitted twice and the INFO
    # threshold chosen below has no visible effect.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if verbose else "INFO")


def _log_preview(markdown_content, limit=_PREVIEW_LINE_LIMIT):
    """Log the non-blank lines among the first *limit* lines of the Markdown."""
    all_lines = markdown_content.split('\n')
    logger.info("📋 Preview:")
    for line in all_lines[:limit]:
        if line.strip():
            logger.info(f" {line}")
    # Compare against the full line count: checking the truncated slice would
    # print the ellipsis even when the document has exactly `limit` lines.
    if len(all_lines) > limit:
        logger.info(" ...")


def main(argv=None):
    """Run the MonkeyOCR command line interface.

    Parses the arguments, validates the input file, initializes the model,
    processes the document and writes ``<input stem>.md`` into the output
    directory. A short preview of the Markdown is logged at the end.

    Args:
        argv: Optional list of command line arguments (without the program
            name). When ``None``, ``sys.argv[1:]`` is used.

    Raises:
        SystemExit: With status 1 when the input is missing or unsupported,
            the output directory cannot be created, processing fails, or the
            user interrupts the run; with argparse's own status on usage
            errors and ``--help``.
    """
    args = _build_parser().parse_args(argv)
    _configure_logging(args.verbose)

    # Validate the input before loading anything expensive.
    input_path = Path(args.input_path)
    if not input_path.is_file():
        logger.error(f"Input file not found: {input_path}")
        sys.exit(1)

    if input_path.suffix.lower() not in _SUPPORTED_EXTENSIONS:
        logger.error(f"Unsupported file type: {input_path.suffix}")
        logger.info(f"Supported formats: {', '.join(_SUPPORTED_EXTENSIONS)}")
        sys.exit(1)

    # Default to writing next to the input file.
    output_dir = Path(args.output) if args.output else input_path.parent
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        logger.error(f"Cannot create output directory {output_dir}: {exc}")
        sys.exit(1)

    logger.info("🚀 Starting MonkeyOCR processing...")
    logger.info(f"📄 Input: {input_path}")
    logger.info(f"📁 Output: {output_dir}")
    logger.info(f"⚙️ Config: {args.config}")

    try:
        # Imported lazily so argument and path errors are reported without
        # first importing the application module.
        from app import process_document, initialize_model

        logger.info("🔧 Initializing MonkeyOCR model...")
        start_time = time.perf_counter()
        # The return value is not passed to process_document(), so it is not
        # kept here. NOTE(review): presumably `app` keeps the initialized
        # model in module state — confirm against app.py.
        initialize_model(args.config)
        init_time = time.perf_counter() - start_time
        logger.info(f"✅ Model initialized in {init_time:.2f}s")

        logger.info("📊 Processing document...")
        process_start = time.perf_counter()
        markdown_content, layout_pdf_path = process_document(str(input_path))
        process_time = time.perf_counter() - process_start
        logger.info(f"⚑ Document processed in {process_time:.2f}s")

        # Save the Markdown next to (or in place of) the chosen output dir.
        markdown_file = output_dir / f"{input_path.stem}.md"
        markdown_file.write_text(markdown_content, encoding="utf-8")
        logger.info(f"📝 Markdown saved: {markdown_file}")

        if layout_pdf_path and os.path.exists(layout_pdf_path):
            logger.info(f"🎨 Layout PDF: {layout_pdf_path}")

        logger.info("🎉 Processing completed successfully!")
        logger.info(f"⏱️ Total time: {time.perf_counter() - start_time:.2f}s")

        _log_preview(markdown_content)
    except KeyboardInterrupt:
        logger.warning("⚠️ Processing interrupted by user")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero; the full
        # traceback is shown only in verbose mode.
        logger.error(f"❌ Processing failed: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()