Spaces:

milwright
/

historical-ocr

Running

App Files Files Community

historical-ocr / utils /file_utils.py

milwright

Rolling out modular v2

c04ffe5 8 months ago

raw

history blame contribute delete

3.39 kB

	"""
	File utility functions for historical OCR processing.
	"""
	import base64
	import logging
	from pathlib import Path

	# Module-level logger used by the helpers in this file. It is registered
	# under the fixed name "utils" and its threshold is set to INFO here.
	logger = logging.getLogger("utils")
	logger.setLevel(logging.INFO)

	def get_base64_from_image(image_path) -> str:
	    """
	    Get base64 data URL from image file with proper MIME type.

	    The MIME type is chosen from the file extension, compared
	    case-insensitively: ``.png``, ``.gif``, ``.jpg``/``.jpeg`` and ``.pdf``
	    are recognised; every other extension falls back to ``image/jpeg``.

	    Args:
	        image_path: Path to the image file (anything ``pathlib.Path`` accepts)

	    Returns:
	        Base64 data URL with appropriate MIME type prefix, or an empty
	        string if the file could not be read or encoded. Failures are
	        logged and never raised to the caller.
	    """
	    # Extension -> MIME type; anything not listed is sent as image/jpeg.
	    mime_by_suffix = {
	        '.png': 'image/png',
	        '.gif': 'image/gif',
	        '.jpg': 'image/jpeg',
	        '.jpeg': 'image/jpeg',
	        '.pdf': 'application/pdf',
	    }

	    try:
	        # Convert to Path object so suffix handling and reading are uniform
	        path_obj = Path(image_path)
	        mime_type = mime_by_suffix.get(path_obj.suffix.lower(), 'image/jpeg')

	        # Read the whole file and encode it
	        encoded = base64.b64encode(path_obj.read_bytes()).decode('utf-8')
	    except Exception as e:
	        # Best-effort contract: callers receive "" instead of an exception.
	        logger.error(f"Error encoding file to base64: {e}")
	        return ""

	    return f"data:{mime_type};base64,{encoded}"

	def get_base64_from_bytes(file_bytes, mime_type=None, file_name=None) -> str:
	    """
	    Get base64 data URL from file bytes with proper MIME type.

	    If ``mime_type`` is missing (``None`` or an empty string), it is derived
	    from the extension of ``file_name``, compared case-insensitively:
	    ``.png``, ``.gif``, ``.jpg``/``.jpeg`` and ``.pdf`` are recognised.
	    Unknown extensions, or no ``file_name`` at all, fall back to
	    ``image/jpeg``.

	    Args:
	        file_bytes: Binary file data
	        mime_type: MIME type of the file (optional)
	        file_name: Original file name for MIME type detection (optional)

	    Returns:
	        Base64 data URL with appropriate MIME type prefix, or an empty
	        string if encoding failed. Failures are logged and never raised to
	        the caller.
	    """
	    # Extension -> MIME type used when the caller did not supply one.
	    mime_by_suffix = {
	        '.png': 'image/png',
	        '.gif': 'image/gif',
	        '.jpg': 'image/jpeg',
	        '.jpeg': 'image/jpeg',
	        '.pdf': 'application/pdf',
	    }

	    try:
	        # An empty mime_type is treated like None; otherwise the result
	        # would be "data:;base64,..." with no media type at all.
	        if not mime_type:
	            suffix = Path(file_name).suffix.lower() if file_name is not None else ''
	            # Default to image/jpeg instead of application/octet-stream
	            # to ensure compatibility with Mistral AI OCR API
	            mime_type = mime_by_suffix.get(suffix, 'image/jpeg')

	        # Encode the payload
	        encoded = base64.b64encode(file_bytes).decode('utf-8')
	    except Exception as e:
	        # Best-effort contract: callers receive "" instead of an exception.
	        logger.error(f"Error encoding bytes to base64: {e}")
	        return ""

	    return f"data:{mime_type};base64,{encoded}"

	def handle_temp_files(temp_file_paths):
	    """
	    Clean up temporary files

	    Each path is removed on a best-effort basis: a path that does not exist
	    is skipped silently, a successful removal is logged at INFO, and any
	    other failure is logged at WARNING without interrupting the remaining
	    removals.

	    Args:
	        temp_file_paths: List of temporary file paths to clean up; ``None``
	            is accepted and treated as nothing to clean up
	    """
	    import os

	    if temp_file_paths is None:
	        return

	    for temp_path in temp_file_paths:
	        try:
	            # Attempt the removal directly rather than checking existence
	            # first, so a file that disappears in between is not an error.
	            os.unlink(temp_path)
	        except FileNotFoundError:
	            # Already gone: nothing to remove and nothing to report.
	            continue
	        except Exception as e:
	            logger.warning(f"Failed to remove temporary file {temp_path}: {e}")
	        else:
	            logger.info(f"Removed temporary file: {temp_path}")