Spaces:

marcosremar2
/

mineru2

Running

App Files Files Community

mineru2 / pdf_converter_mineru.py

marcosremar2

Add PDF conversion API endpoints

550ec39 5 months ago

raw

history blame

9.44 kB

	#!/usr/bin/env python3
	"""
	PDF to Markdown Converter using MinerU (vendor/mineru)
	This is the main conversion script that uses the local MinerU installation
	"""

	import os
	import sys
	import logging
	import argparse
	from pathlib import Path
	import subprocess

	# Root logging setup: INFO and above is emitted both to the console stream
	# and to 'pdf_converter.log' (a relative path, so it is opened in the
	# current working directory when this module is loaded).
	logging.basicConfig(
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	    handlers=[logging.StreamHandler(), logging.FileHandler('pdf_converter.log')],
	)

	# Module-level logger shared by everything in this file.
	logger = logging.getLogger(__name__)


	class PdfConverterResult:
	    """Outcome of converting one PDF file.

	    Attributes:
	        pdf_path: Path of the source PDF, as given by the caller.
	        success: True when the conversion succeeded.
	        md_path: Path of the generated Markdown file, or None if not set.
	        time_taken: Duration of the conversion in seconds (0 if not set).
	        error: Failure description, or None if not set.
	    """

	    # The optional arguments are annotated as explicit "str | None" strings:
	    # a bare `str = None` relies on implicit Optional, which PEP 484 no
	    # longer allows, and the quoted form is not evaluated at runtime so it
	    # is safe on interpreters that lack the `X | Y` syntax.
	    def __init__(self, pdf_path: str, success: bool, md_path: "str | None" = None,
	                 time_taken: float = 0, error: "str | None" = None):
	        self.pdf_path = pdf_path
	        self.success = success
	        self.md_path = md_path
	        self.time_taken = time_taken
	        self.error = error

	    def __repr__(self) -> str:
	        """Return an unambiguous, constructor-like representation for debugging."""
	        return (
	            f"{type(self).__name__}("
	            f"pdf_path={self.pdf_path!r}, success={self.success!r}, "
	            f"md_path={self.md_path!r}, time_taken={self.time_taken!r}, "
	            f"error={self.error!r})"
	        )

	    def __str__(self) -> str:
	        """Return a one-line, human-readable summary of the result."""
	        if self.success:
	            return f"✅ Successfully converted {self.pdf_path} in {self.time_taken:.2f}s"
	        return f"❌ Failed to convert {self.pdf_path}: {self.error}"


	class MineruPdfConverter:
	    """PDF to Markdown converter that runs the ``mineru`` command-line tool."""

	    def __init__(self, output_dir: str = "output"):
	        """Remember the output root and create it if it does not exist.

	        Args:
	            output_dir: Directory under which per-PDF output folders are placed.
	        """
	        self.output_dir = output_dir
	        os.makedirs(output_dir, exist_ok=True)

	    def _find_markdown(self, pdf_output_dir: Path, stem: str) -> "str | None":
	        """Locate the Markdown file produced for a PDF, or return None."""
	        # pdf_output_dir already ends with the PDF stem; the expected location
	        # repeats the stem below it, followed by the "txt" method folder.
	        # NOTE(review): presumably this mirrors MinerU's
	        # <out>/<name>/<method>/<name>.md layout - confirm against the
	        # installed MinerU version.
	        expected_md = pdf_output_dir / stem / "txt" / f"{stem}.md"
	        if expected_md.exists():
	            logger.info(f"✅ Markdown file created: {expected_md}")
	            return str(expected_md)

	        # Fall back to any .md file below the output directory. Sorting makes
	        # the choice deterministic when more than one candidate exists.
	        candidates = sorted(pdf_output_dir.rglob("*.md"))
	        if candidates:
	            logger.info(f"✅ Found markdown file: {candidates[0]}")
	            return str(candidates[0])
	        return None

	    def convert_file(self, pdf_path: str, delete_after: bool = False) -> PdfConverterResult:
	        """Convert a single PDF file to Markdown using MinerU.

	        Args:
	            pdf_path: Path of the PDF to convert.
	            delete_after: When True, remove the source PDF after a successful
	                conversion.

	        Returns:
	            A PdfConverterResult. Failures are reported through the result
	            (``success=False`` plus ``error``) rather than raised; every
	            result, including failures, carries the elapsed time.
	        """
	        import time

	        started = time.monotonic()

	        def failure(message: str) -> PdfConverterResult:
	            # Build a failed result that still records how long the attempt took.
	            return PdfConverterResult(
	                str(pdf_path), False,
	                time_taken=time.monotonic() - started, error=message,
	            )

	        try:
	            pdf_path = Path(pdf_path)
	            if not pdf_path.exists():
	                return failure(f"File not found: {pdf_path}")

	            logger.info(f"Processing: {pdf_path}")

	            # Each PDF gets its own folder below the output root.
	            pdf_output_dir = os.path.join(self.output_dir, pdf_path.stem)

	            cmd = [
	                "mineru",
	                "-p", str(pdf_path),
	                "-o", pdf_output_dir,
	                "-m", "txt",    # Use text mode
	                "-f", "false",  # Disable formula parsing for speed
	                "-t", "false",  # Disable table parsing for speed
	            ]
	            logger.info(f"Running command: {' '.join(cmd)}")

	            # Argument list (no shell), so paths need no quoting.
	            result = subprocess.run(cmd, capture_output=True, text=True)
	            if result.returncode != 0:
	                return failure((result.stderr or "").strip() or "Unknown error")

	            md_path = self._find_markdown(Path(pdf_output_dir), pdf_path.stem)
	            if md_path is None:
	                return failure("No markdown file generated")

	            if delete_after and pdf_path.exists():
	                # The conversion already succeeded; a failed clean-up must
	                # not turn it into a reported failure.
	                try:
	                    pdf_path.unlink()
	                except OSError as exc:
	                    logger.warning(f"Could not delete original PDF {pdf_path}: {exc}")
	                else:
	                    logger.info(f"🗑️ Deleted original PDF: {pdf_path}")

	            return PdfConverterResult(
	                str(pdf_path), True, md_path=md_path,
	                time_taken=time.monotonic() - started,
	            )

	        except Exception as e:
	            # Boundary handler: log with traceback (console and log file) and
	            # report the problem through the result instead of raising.
	            logger.exception(f"Error processing {pdf_path}: {e}")
	            return failure(str(e))


	class BatchProcessor:
	    """Convert every PDF found below a directory, one file at a time."""

	    def __init__(self, batch_dir: str = "batch-files", output_dir: str = "output",
	                 workers: int = 1, delete_after: bool = False):
	        """Store the batch settings and create the converter.

	        Args:
	            batch_dir: Directory searched recursively for PDF files.
	            output_dir: Output root handed to the converter.
	            workers: Stored for interface compatibility; processing in this
	                class is sequential and does not use it.
	            delete_after: Forwarded to the converter for every file.
	        """
	        self.batch_dir = batch_dir
	        self.output_dir = output_dir
	        self.workers = workers
	        self.delete_after = delete_after
	        self.converter = MineruPdfConverter(output_dir)

	    def find_pdf_files(self) -> list[Path]:
	        """Return all PDF files below the batch directory, sorted by path.

	        The extension match is case-insensitive (``.pdf`` and ``.PDF``) and
	        only regular files are returned, so a directory whose name happens to
	        end in ``.pdf`` is not handed to the converter.
	        """
	        batch_path = Path(self.batch_dir)

	        if not batch_path.exists():
	            logger.warning(f"Batch directory not found: {self.batch_dir}")
	            return []

	        # Sorted so that processing order is stable between runs.
	        pdf_files = sorted(
	            path for path in batch_path.rglob("*")
	            if path.suffix.lower() == ".pdf" and path.is_file()
	        )
	        logger.info(f"Found {len(pdf_files)} PDF files in {self.batch_dir}")

	        return pdf_files

	    def process_batch(self) -> tuple[int, int]:
	        """Convert all PDFs in the batch directory.

	        Returns:
	            A ``(successful, failed)`` pair of counts; ``(0, 0)`` when no PDF
	            files were found.
	        """
	        pdf_files = self.find_pdf_files()

	        if not pdf_files:
	            logger.info("No PDF files found to process")
	            return 0, 0

	        if self.workers > 1:
	            # Make the ignored setting visible instead of silently dropping it.
	            logger.warning(
	                f"workers={self.workers} requested, but files are processed sequentially"
	            )

	        successful = 0
	        failed = 0

	        logger.info(f"Starting batch processing of {len(pdf_files)} files...")

	        # Process files sequentially (MinerU already handles parallelism internally)
	        for pdf_file in pdf_files:
	            result = self.converter.convert_file(str(pdf_file), self.delete_after)

	            # str(result) already starts with its own status emoji, so it is
	            # logged as-is rather than prefixed with a second one.
	            if result.success:
	                successful += 1
	                logger.info(f"{result}")
	            else:
	                failed += 1
	                logger.error(f"{result}")

	        return successful, failed


	def main(argv=None):
	    """Command-line entry point.

	    Args:
	        argv: Argument list without the program name. Defaults to
	            ``sys.argv[1:]`` when omitted.

	    Command shortcuts (applied before argparse sees the arguments):
	        * no arguments at all      -> ``batch`` with default settings
	        * first argument is a path -> ``convert <path> ...`` (the argument
	          ends in ``.pdf`` or names something that exists on disk)

	    Always terminates through ``sys.exit``: 0 when everything converted,
	    1 when a conversion failed, 2 (from argparse) on a usage error.
	    """
	    cli_args = list(sys.argv[1:] if argv is None else argv)

	    parser = argparse.ArgumentParser(
	        description="Convert PDF files to Markdown using MinerU",
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	        # Built from explicit literals so the help text does not depend on
	        # how this source file happens to be indented.
	        epilog=(
	            "\n"
	            "Examples:\n"
	            "  # Convert a single PDF\n"
	            "  %(prog)s convert path/to/file.pdf\n"
	            "\n"
	            "  # Batch convert all PDFs in batch-files directory\n"
	            "  %(prog)s batch\n"
	            "\n"
	            "  # Batch convert with custom settings\n"
	            "  %(prog)s batch --batch-dir /path/to/pdfs --output-dir /path/to/output --workers 4\n"
	            "\n"
	            "  # Delete PDFs after successful conversion\n"
	            "  %(prog)s batch --delete-after\n"
	        ),
	    )

	    subparsers = parser.add_subparsers(dest='command', help='Command to run')

	    # Convert command
	    convert_parser = subparsers.add_parser('convert', help='Convert a single PDF file')
	    convert_parser.add_argument('pdf_file', help='Path to PDF file')
	    convert_parser.add_argument('--output-dir', default='output', help='Output directory')
	    convert_parser.add_argument('--delete-after', action='store_true',
	                                help='Delete PDF after successful conversion')

	    # Batch command
	    batch_parser = subparsers.add_parser('batch', help='Batch convert PDF files')
	    batch_parser.add_argument('--batch-dir', default='batch-files',
	                              help='Directory containing PDF files')
	    batch_parser.add_argument('--output-dir', default='output',
	                              help='Output directory')
	    batch_parser.add_argument('--workers', type=int, default=1,
	                              help='Number of parallel workers')
	    batch_parser.add_argument('--delete-after', action='store_true',
	                              help='Delete PDFs after successful conversion')

	    # Auto-detect the command BEFORE parsing. Doing it afterwards cannot
	    # work: argparse rejects an unknown first positional ("invalid choice")
	    # and exits before any post-processing code gets a chance to run.
	    if not cli_args:
	        cli_args = ['batch']
	    else:
	        first = cli_args[0]
	        is_known = first in subparsers.choices or first.startswith('-')
	        if not is_known and (first.lower().endswith('.pdf') or Path(first).exists()):
	            cli_args.insert(0, 'convert')

	    args = parser.parse_args(cli_args)

	    # Execute command
	    if args.command == 'convert':
	        converter = MineruPdfConverter(args.output_dir)
	        result = converter.convert_file(args.pdf_file, args.delete_after)
	        print(result)
	        sys.exit(0 if result.success else 1)

	    elif args.command == 'batch':
	        processor = BatchProcessor(
	            args.batch_dir,
	            args.output_dir,
	            args.workers,
	            args.delete_after
	        )
	        successful, failed = processor.process_batch()

	        print("\n📊 Batch processing complete:")
	        print(f" ✅ Successful: {successful}")
	        print(f" ❌ Failed: {failed}")
	        print(f" 📁 Output directory: {args.output_dir}")

	        sys.exit(0 if failed == 0 else 1)

	    else:
	        # Defensive fallback: no sub-command was selected.
	        parser.print_help()
	        sys.exit(1)


	if __name__ == "__main__":
	    main()