Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| PDF to Markdown Converter using MinerU (vendor/mineru) | |
| This is the main conversion script that uses the local MinerU installation | |
| """ | |
| import os | |
| import sys | |
| import logging | |
| import argparse | |
| from pathlib import Path | |
| import subprocess | |
# Configure logging: every record goes both to the console and to a log file
# in the current working directory.
# The file handler is opened as UTF-8 explicitly: log messages in this module
# contain non-ASCII symbols, and the platform default encoding (e.g. cp1252 on
# Windows) cannot represent all of them.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('pdf_converter.log', encoding='utf-8'),
    ],
)
logger = logging.getLogger(__name__)
class PdfConverterResult:
    """Outcome of a single PDF conversion attempt.

    Plain data holder with the following attributes:

    * ``pdf_path``   - path of the source PDF, as given by the caller
    * ``success``    - whether the conversion succeeded
    * ``md_path``    - path of the generated Markdown file, if any
    * ``time_taken`` - elapsed seconds reported by the caller (default 0)
    * ``error``      - error description, if any
    """

    def __init__(self, pdf_path: str, success: bool, md_path: str = None,
                 time_taken: float = 0, error: str = None):
        self.pdf_path, self.success = pdf_path, success
        self.md_path, self.time_taken, self.error = md_path, time_taken, error

    def __str__(self):
        """Return a one-line, human-readable summary of the outcome."""
        # Guard clause: report the failure first, success is the fall-through.
        if not self.success:
            return f"β Failed to convert {self.pdf_path}: {self.error}"
        return f"β Successfully converted {self.pdf_path} in {self.time_taken:.2f}s"
class MineruPdfConverter:
    """PDF to Markdown converter that shells out to the ``mineru`` command.

    Args:
        output_dir: Root directory for conversion output. It is created on
            construction if it does not exist. Each PDF is written to a
            sub-directory named after the PDF's file stem.
    """

    def __init__(self, output_dir: str = "output"):
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def convert_file(self, pdf_path: str, delete_after: bool = False) -> PdfConverterResult:
        """Convert a single PDF file to Markdown using MinerU.

        Args:
            pdf_path: Path of the PDF to convert.
            delete_after: When true, delete the source PDF after a Markdown
                file has been located for it.

        Returns:
            A ``PdfConverterResult``. Errors are never raised to the caller;
            they are logged and reported through ``success=False`` and
            ``error``. ``time_taken`` is filled in for both successful and
            failed attempts.
        """
        import time

        # perf_counter is monotonic, so the measured duration is not skewed
        # by wall-clock adjustments while the subprocess runs.
        started = time.perf_counter()

        def failed(message: str) -> PdfConverterResult:
            # Build a failure result that still carries the elapsed time.
            return PdfConverterResult(
                str(pdf_path),
                False,
                time_taken=time.perf_counter() - started,
                error=message,
            )

        try:
            pdf_path = Path(pdf_path)
            if not pdf_path.exists():
                return failed(f"File not found: {pdf_path}")

            logger.info(f"Processing: {pdf_path}")

            # Prepare output directory
            pdf_output_dir = os.path.join(self.output_dir, pdf_path.stem)

            # Run MinerU command
            cmd = [
                "mineru",
                "-p", str(pdf_path),
                "-o", pdf_output_dir,
                "-m", "txt",    # Use text mode
                "-f", "false",  # Disable formula parsing for speed
                "-t", "false",  # Disable table parsing for speed
            ]
            logger.info(f"Running command: {' '.join(cmd)}")

            # Execute MinerU (argument list, no shell involved)
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode != 0:
                stderr_text = (result.stderr or "").strip()
                # An empty stderr would otherwise hide why the run failed,
                # so surface the exit code instead.
                return failed(
                    stderr_text
                    or f"Unknown error (exit code {result.returncode})"
                )

            # Find the generated markdown file: first at the expected
            # location, then anywhere below this PDF's output directory.
            md_path = None
            expected_md = (
                Path(pdf_output_dir) / pdf_path.stem / "txt" / f"{pdf_path.stem}.md"
            )
            if expected_md.exists():
                md_path = str(expected_md)
                logger.info(f"β Markdown file created: {md_path}")
            else:
                # Fall back to the first .md file found in the output directory
                fallback = next(Path(pdf_output_dir).rglob("*.md"), None)
                if fallback is not None:
                    md_path = str(fallback)
                    logger.info(f"β Found markdown file: {md_path}")

            if not md_path:
                return failed("No markdown file generated")

            # Delete original PDF if requested
            if delete_after and pdf_path.exists():
                pdf_path.unlink()
                logger.info(f"ποΈ Deleted original PDF: {pdf_path}")

            return PdfConverterResult(
                str(pdf_path),
                True,
                md_path=md_path,
                time_taken=time.perf_counter() - started,
            )
        except Exception as e:
            # logger.exception records the traceback through the logging
            # handlers, so it also reaches the log file rather than only
            # being printed to stderr.
            logger.exception(f"Error processing {pdf_path}: {e}")
            return failed(str(e))
class BatchProcessor:
    """Convert every PDF found under a directory, one file at a time.

    Args:
        batch_dir: Directory that is searched recursively for PDF files.
        output_dir: Output root passed to ``MineruPdfConverter``.
        workers: Stored on the instance but not used by this class; files
            are processed sequentially.
        delete_after: Forwarded to ``convert_file`` for every PDF.
    """

    def __init__(self, batch_dir: str = "batch-files", output_dir: str = "output",
                 workers: int = 1, delete_after: bool = False):
        self.batch_dir = batch_dir
        self.output_dir = output_dir
        self.workers = workers
        self.delete_after = delete_after
        self.converter = MineruPdfConverter(output_dir)

    def find_pdf_files(self) -> list[Path]:
        """Find all PDF files in the batch directory.

        Returns:
            A sorted list of regular files below ``batch_dir`` whose suffix
            is ``.pdf`` in any letter case. The list is empty when the
            directory does not exist.
        """
        batch_path = Path(self.batch_dir)
        if not batch_path.exists():
            logger.warning(f"Batch directory not found: {self.batch_dir}")
            return []

        # Match the suffix case-insensitively (e.g. "scan.PDF"), skip
        # directories that merely end in ".pdf", and sort so that runs are
        # reproducible regardless of filesystem enumeration order.
        pdf_files = sorted(
            candidate
            for candidate in batch_path.rglob("*")
            if candidate.suffix.lower() == ".pdf" and candidate.is_file()
        )
        logger.info(f"Found {len(pdf_files)} PDF files in {self.batch_dir}")
        return pdf_files

    def process_batch(self) -> tuple[int, int]:
        """Process all PDFs in the batch directory.

        Returns:
            ``(successful, failed)`` counts; ``(0, 0)`` when no PDF is found.
        """
        pdf_files = self.find_pdf_files()
        if not pdf_files:
            logger.info("No PDF files found to process")
            return 0, 0

        successful = 0
        failed = 0
        logger.info(f"Starting batch processing of {len(pdf_files)} files...")

        # Process files sequentially (MinerU already handles parallelism internally)
        for pdf_file in pdf_files:
            result = self.converter.convert_file(str(pdf_file), self.delete_after)
            if result.success:
                successful += 1
                logger.info(f"β {result}")
            else:
                failed += 1
                logger.error(f"β {result}")

        return successful, failed
def main(argv=None):
    """Command-line entry point: parse arguments, run the command, and exit.

    Args:
        argv: Optional list of command-line arguments, without the program
            name. Defaults to ``sys.argv[1:]``.

    The function always terminates through ``sys.exit``: status 0 when the
    conversion (or every conversion of a batch) succeeded, 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Convert PDF files to Markdown using MinerU",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Convert a single PDF
  %(prog)s convert path/to/file.pdf

  # Batch convert all PDFs in batch-files directory
  %(prog)s batch

  # Batch convert with custom settings
  %(prog)s batch --batch-dir /path/to/pdfs --output-dir /path/to/output --workers 4

  # Delete PDFs after successful conversion
  %(prog)s batch --delete-after
"""
    )
    subparsers = parser.add_subparsers(dest='command', help='Command to run')

    # Convert command
    convert_parser = subparsers.add_parser('convert', help='Convert a single PDF file')
    convert_parser.add_argument('pdf_file', help='Path to PDF file')
    convert_parser.add_argument('--output-dir', default='output', help='Output directory')
    convert_parser.add_argument('--delete-after', action='store_true',
                                help='Delete PDF after successful conversion')

    # Batch command
    batch_parser = subparsers.add_parser('batch', help='Batch convert PDF files')
    batch_parser.add_argument('--batch-dir', default='batch-files',
                              help='Directory containing PDF files')
    batch_parser.add_argument('--output-dir', default='output',
                              help='Output directory')
    batch_parser.add_argument('--workers', type=int, default=1,
                              help='Number of parallel workers (accepted for '
                                   'compatibility; files are currently '
                                   'processed sequentially)')
    batch_parser.add_argument('--delete-after', action='store_true',
                              help='Delete PDFs after successful conversion')

    # Auto-detect the command BEFORE parsing. argparse rejects a first
    # positional argument that is not a known sub-command, so detection done
    # after parse_args() could never see a bare "file.pdf".
    known_commands = ('convert', 'batch')
    cli_args = list(sys.argv[1:] if argv is None else argv)
    if not cli_args:
        # No arguments at all: default to batch mode with default settings
        cli_args = ['batch']
    else:
        first = cli_args[0]
        if first not in known_commands and not first.startswith('-'):
            # If the first argument looks like a file, assume convert command
            if first.endswith('.pdf') or Path(first).exists():
                cli_args.insert(0, 'convert')

    args = parser.parse_args(cli_args)

    # Execute command
    if args.command == 'convert':
        converter = MineruPdfConverter(args.output_dir)
        result = converter.convert_file(args.pdf_file, args.delete_after)
        print(result)
        sys.exit(0 if result.success else 1)
    elif args.command == 'batch':
        processor = BatchProcessor(
            args.batch_dir,
            args.output_dir,
            args.workers,
            args.delete_after
        )
        successful, failed = processor.process_batch()
        print(f"\nπ Batch processing complete:")
        print(f" β Successful: {successful}")
        print(f" β Failed: {failed}")
        print(f" π Output directory: {args.output_dir}")
        sys.exit(0 if failed == 0 else 1)
    else:
        # Defensive fallback: reached only if no sub-command was resolved
        parser.print_help()
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()