"""
MonkeyOCR Command Line Interface

Process documents using MonkeyOCR with MLX-VLM optimization
"""

import argparse
import os
import sys
import time
import traceback
from pathlib import Path

from loguru import logger


# File types accepted as input, compared against the lower-cased suffix.
_SUPPORTED_EXTENSIONS = frozenset({'.pdf', '.png', '.jpg', '.jpeg'})

# Maximum number of markdown lines echoed to the log as a preview.
_PREVIEW_LINE_LIMIT = 10


def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the MonkeyOCR command line."""
    parser = argparse.ArgumentParser(
        description="MonkeyOCR: Advanced OCR with MLX-VLM optimization for Apple Silicon"
    )
    parser.add_argument("input_path", help="Path to PDF or image file to process")
    parser.add_argument(
        "-o", "--output",
        help="Output directory (default: same as input file)",
        default=None,
    )
    parser.add_argument(
        "-c", "--config",
        help="Config file path",
        default="model_configs_mps.yaml",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging",
    )
    return parser


def _configure_logging(verbose: bool) -> None:
    """Install a single stderr sink at DEBUG (verbose) or INFO level."""
    # loguru ships with a pre-installed stderr handler. Without removing it,
    # every message is emitted twice and the level chosen here does not
    # filter anything, because the default handler keeps logging.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if verbose else "INFO")


def _log_preview(markdown_content: str) -> None:
    """Log the first non-blank lines of the generated markdown."""
    all_lines = markdown_content.split('\n')
    logger.info("π Preview:")
    for line in all_lines[:_PREVIEW_LINE_LIMIT]:
        if line.strip():
            logger.info(f" {line}")
    # Only signal truncation when lines were actually left out; a document
    # with exactly _PREVIEW_LINE_LIMIT lines is shown in full.
    if len(all_lines) > _PREVIEW_LINE_LIMIT:
        logger.info(" ...")


def main() -> None:
    """Run the MonkeyOCR command line interface.

    Parses the command line, validates the input file, initializes the model,
    processes the document and writes the resulting markdown to
    ``<output_dir>/<input stem>.md``.

    Exits the process with status 1 when the input is missing or has an
    unsupported extension, when processing raises, or when the user
    interrupts the run.
    """
    args = _build_parser().parse_args()
    _configure_logging(args.verbose)

    input_path = Path(args.input_path)
    # is_file() also rejects directories, which exists() would let through.
    if not input_path.is_file():
        logger.error(f"Input file not found: {input_path}")
        sys.exit(1)

    if input_path.suffix.lower() not in _SUPPORTED_EXTENSIONS:
        logger.error(f"Unsupported file type: {input_path.suffix}")
        # Sorted so the message is stable between runs (set order is not).
        logger.info(f"Supported formats: {', '.join(sorted(_SUPPORTED_EXTENSIONS))}")
        sys.exit(1)

    output_dir = Path(args.output) if args.output else input_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the leading symbols in the log messages below look like
    # mis-decoded emoji; restore them from the upstream file if available.
    logger.info("π Starting MonkeyOCR processing...")
    logger.info(f"π Input: {input_path}")
    logger.info(f"π Output: {output_dir}")
    logger.info(f"βοΈ Config: {args.config}")

    try:
        # Imported inside the try block so an import failure is reported
        # through the error handler below.
        from app import process_document, initialize_model

        logger.info("π§ Initializing MonkeyOCR model...")
        start_time = time.time()
        # The return value is not used here: process_document() is called
        # without a model argument.
        initialize_model(args.config)
        init_time = time.time() - start_time
        logger.info(f"β Model initialized in {init_time:.2f}s")

        logger.info("π Processing document...")
        process_start = time.time()
        markdown_content, layout_pdf_path = process_document(str(input_path))
        process_time = time.time() - process_start
        logger.info(f"β‘ Document processed in {process_time:.2f}s")

        markdown_file = output_dir / f"{input_path.stem}.md"
        markdown_file.write_text(markdown_content, encoding='utf-8')
        logger.info(f"π Markdown saved: {markdown_file}")

        if layout_pdf_path and os.path.exists(layout_pdf_path):
            logger.info(f"π¨ Layout PDF: {layout_pdf_path}")

        logger.info("π Processing completed successfully!")
        logger.info(f"β±οΈ Total time: {time.time() - start_time:.2f}s")

        _log_preview(markdown_content)

    except KeyboardInterrupt:
        logger.warning("β οΈ Processing interrupted by user")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary of the CLI: report the failure and exit non-zero.
        logger.error(f"β Processing failed: {e}")
        if args.verbose:
            traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    # Run the CLI only when executed as a script, not when imported.
    main()