Spaces:
Running
Running
Enhance handwritten document processing for improved OCR accuracy
Browse files
- config.py +12 -1
- ocr_utils.py +239 -50
config.py
CHANGED
|
@@ -45,7 +45,18 @@ IMAGE_PREPROCESSING = {
|
|
| 45 |
"denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
|
| 46 |
"max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "12.0")), # Increased size limit for better quality
|
| 47 |
"target_dpi": int(os.environ.get("TARGET_DPI", "300")), # Target DPI for scaling
|
| 48 |
-
"compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "95")) # Higher quality for better OCR results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
}
|
| 50 |
|
| 51 |
# OCR settings optimized for single-page performance
|
|
|
|
| 45 |
"denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
|
| 46 |
"max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "12.0")), # Increased size limit for better quality
|
| 47 |
"target_dpi": int(os.environ.get("TARGET_DPI", "300")), # Target DPI for scaling
|
| 48 |
+
"compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "95")), # Higher quality for better OCR results
|
| 49 |
+
# Enhanced settings for handwritten documents
|
| 50 |
+
"handwritten": {
|
| 51 |
+
"contrast": float(os.environ.get("HANDWRITTEN_CONTRAST", "1.2")), # Lower contrast for handwritten text
|
| 52 |
+
"block_size": int(os.environ.get("HANDWRITTEN_BLOCK_SIZE", "21")), # Larger block size for adaptive thresholding
|
| 53 |
+
"constant": int(os.environ.get("HANDWRITTEN_CONSTANT", "5")), # Lower constant for adaptive thresholding
|
| 54 |
+
"use_dilation": os.environ.get("HANDWRITTEN_DILATION", "True").lower() in ("true", "1", "yes"), # Connect broken strokes
|
| 55 |
+
"clahe_limit": float(os.environ.get("HANDWRITTEN_CLAHE_LIMIT", "2.0")), # CLAHE limit for local contrast
|
| 56 |
+
"bilateral_d": int(os.environ.get("HANDWRITTEN_BILATERAL_D", "5")), # Bilateral filter window size
|
| 57 |
+
"bilateral_sigma1": int(os.environ.get("HANDWRITTEN_BILATERAL_SIGMA1", "25")), # Color sigma
|
| 58 |
+
"bilateral_sigma2": int(os.environ.get("HANDWRITTEN_BILATERAL_SIGMA2", "45")) # Space sigma
|
| 59 |
+
}
|
| 60 |
}
|
| 61 |
|
| 62 |
# OCR settings optimized for single-page performance
|
ocr_utils.py
CHANGED
|
@@ -565,6 +565,26 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
|
|
| 565 |
logger.debug(f"Document type detection for {image_file.name}: " +
|
| 566 |
f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
|
| 567 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 568 |
# Special processing for very large images (newspapers and large documents)
|
| 569 |
if is_newspaper:
|
| 570 |
# For newspaper format, we need more specialized processing
|
|
@@ -601,6 +621,34 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
|
|
| 601 |
# Also enhance saturation to make colored text more visible
|
| 602 |
enhancer_sat = ImageEnhance.Color(processed_img)
|
| 603 |
processed_img = enhancer_sat.enhance(1.2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 604 |
|
| 605 |
# Standard processing for other large images
|
| 606 |
elif file_size_mb > IMAGE_PREPROCESSING["max_size_mb"] or max(width, height) > 3000:
|
|
@@ -778,7 +826,7 @@ def _detect_document_type_impl(img_hash=None) -> bool:
|
|
| 778 |
# Documents typically have high variance (text on background)
|
| 779 |
# Handwritten documents may have less contrast than printed text
|
| 780 |
std_dev = np.std(img_np)
|
| 781 |
-
if std_dev >
|
| 782 |
return True
|
| 783 |
|
| 784 |
# 2. Quick check using downsampled image for edges
|
|
@@ -789,38 +837,63 @@ def _detect_document_type_impl(img_hash=None) -> bool:
|
|
| 789 |
else:
|
| 790 |
small_img = img_np
|
| 791 |
|
| 792 |
-
#
|
| 793 |
-
#
|
| 794 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 795 |
edge_ratio = np.count_nonzero(edges) / edges.size
|
| 796 |
|
| 797 |
-
#
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 802 |
|
| 803 |
dark_ratio = np.count_nonzero(dark_mask) / img_np.size
|
|
|
|
| 804 |
light_ratio = np.count_nonzero(light_mask) / img_np.size
|
| 805 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 806 |
# Special analysis for handwritten documents
|
| 807 |
-
#
|
| 808 |
-
if
|
| 809 |
-
|
| 810 |
-
lines = cv2.HoughLinesP(edges, 1, np.pi/180,
|
| 811 |
-
threshold=50, # Lower threshold for detection
|
| 812 |
-
minLineLength=30, # Shorter lines for handwriting
|
| 813 |
-
maxLineGap=20) # Larger gap for discontinuous handwriting
|
| 814 |
-
|
| 815 |
-
# If we find enough line segments, it's likely a document with text
|
| 816 |
-
if lines is not None and len(lines) > 10:
|
| 817 |
-
return True
|
| 818 |
|
| 819 |
-
# Combine heuristics for final decision
|
| 820 |
-
# Documents typically have both dark (text) and light (background) regions,
|
| 821 |
-
# and/or well-defined edges
|
| 822 |
# Lower thresholds for handwritten documents
|
| 823 |
-
return (dark_ratio > 0.
|
| 824 |
|
| 825 |
# Removed caching to fix unhashable type error
|
| 826 |
def preprocess_document_image(img: Image.Image) -> Image.Image:
|
|
@@ -1010,26 +1083,53 @@ def _preprocess_document_image_impl() -> Image.Image:
|
|
| 1010 |
img_np = np.array(enhanced)
|
| 1011 |
|
| 1012 |
if is_handwritten:
|
| 1013 |
-
#
|
| 1014 |
-
#
|
| 1015 |
-
# Guided filter works well for handwriting by preserving stroke details
|
| 1016 |
if img_size > 3000000: # Large images - downsample first
|
| 1017 |
scale_factor = 0.5
|
| 1018 |
small_img = cv2.resize(img_np, None, fx=scale_factor, fy=scale_factor,
|
| 1019 |
interpolation=cv2.INTER_AREA)
|
| 1020 |
-
|
| 1021 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1022 |
# Resize back
|
| 1023 |
filtered = cv2.resize(filtered, (width, height), interpolation=cv2.INTER_LINEAR)
|
| 1024 |
else:
|
| 1025 |
-
#
|
| 1026 |
-
|
| 1027 |
-
|
| 1028 |
-
|
| 1029 |
-
|
|
|
|
|
|
|
| 1030 |
|
| 1031 |
-
#
|
| 1032 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1033 |
|
| 1034 |
else:
|
| 1035 |
# Standard document processing - optimized for printed text
|
|
@@ -1557,6 +1657,7 @@ def serialize_ocr_object(obj):
|
|
| 1557 |
def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str = None) -> str:
|
| 1558 |
"""
|
| 1559 |
Attempt to use local pytesseract OCR as a fallback when API fails
|
|
|
|
| 1560 |
|
| 1561 |
Args:
|
| 1562 |
image_path: Path to the image file
|
|
@@ -1582,27 +1683,115 @@ def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str =
|
|
| 1582 |
image_path = Path(image_path) if isinstance(image_path, str) else image_path
|
| 1583 |
image = Image.open(image_path)
|
| 1584 |
|
| 1585 |
-
#
|
| 1586 |
-
|
| 1587 |
-
image = image.convert('RGB')
|
| 1588 |
-
|
| 1589 |
-
# Apply image enhancements for better OCR
|
| 1590 |
-
# Convert to grayscale for better text recognition
|
| 1591 |
-
image = image.convert('L')
|
| 1592 |
|
| 1593 |
-
#
|
| 1594 |
-
|
| 1595 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1596 |
|
| 1597 |
-
# Run OCR
|
| 1598 |
-
ocr_text = pytesseract.image_to_string(image,
|
| 1599 |
|
| 1600 |
if ocr_text and len(ocr_text.strip()) > 50:
|
| 1601 |
logger.info(f"Local OCR successful: extracted {len(ocr_text)} characters")
|
| 1602 |
return ocr_text
|
| 1603 |
else:
|
| 1604 |
-
|
| 1605 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1606 |
except ImportError:
|
| 1607 |
logger.warning("Pytesseract not installed - local OCR not available")
|
| 1608 |
return None
|
|
|
|
| 565 |
logger.debug(f"Document type detection for {image_file.name}: " +
|
| 566 |
f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
|
| 567 |
|
| 568 |
+
# Check for handwritten document characteristics
|
| 569 |
+
is_handwritten = False
|
| 570 |
+
if CV2_AVAILABLE and not is_newspaper:
|
| 571 |
+
# Use more advanced detection for handwritten content
|
| 572 |
+
try:
|
| 573 |
+
gray_np = np.array(img.convert('L'))
|
| 574 |
+
# Higher variance in edge strengths can indicate handwriting
|
| 575 |
+
edges = cv2.Canny(gray_np, 30, 100)
|
| 576 |
+
if np.count_nonzero(edges) / edges.size > 0.02: # Low edge threshold for handwriting
|
| 577 |
+
# Additional check with gradient magnitudes
|
| 578 |
+
sobelx = cv2.Sobel(gray_np, cv2.CV_64F, 1, 0, ksize=3)
|
| 579 |
+
sobely = cv2.Sobel(gray_np, cv2.CV_64F, 0, 1, ksize=3)
|
| 580 |
+
magnitude = np.sqrt(sobelx**2 + sobely**2)
|
| 581 |
+
# Handwriting typically has more variation in gradient magnitudes
|
| 582 |
+
if np.std(magnitude) > 20:
|
| 583 |
+
is_handwritten = True
|
| 584 |
+
logger.info(f"Handwritten document detected: {image_file.name}")
|
| 585 |
+
except Exception as e:
|
| 586 |
+
logger.debug(f"Handwriting detection error: {str(e)}")
|
| 587 |
+
|
| 588 |
# Special processing for very large images (newspapers and large documents)
|
| 589 |
if is_newspaper:
|
| 590 |
# For newspaper format, we need more specialized processing
|
|
|
|
| 621 |
# Also enhance saturation to make colored text more visible
|
| 622 |
enhancer_sat = ImageEnhance.Color(processed_img)
|
| 623 |
processed_img = enhancer_sat.enhance(1.2)
|
| 624 |
+
# Special processing for handwritten documents
|
| 625 |
+
elif is_handwritten:
|
| 626 |
+
logger.info(f"Processing handwritten document: {width}x{height}")
|
| 627 |
+
|
| 628 |
+
# For handwritten text, we need to preserve stroke details
|
| 629 |
+
# Use gentle scaling to maintain handwriting characteristics
|
| 630 |
+
max_dimension = max(width, height)
|
| 631 |
+
|
| 632 |
+
if max_dimension > 4000: # Large handwritten document
|
| 633 |
+
scale_factor = 0.6 # Less aggressive reduction for handwriting
|
| 634 |
+
else:
|
| 635 |
+
scale_factor = 0.8 # Minimal reduction for moderate size
|
| 636 |
+
|
| 637 |
+
# Calculate new dimensions
|
| 638 |
+
new_width = int(width * scale_factor)
|
| 639 |
+
new_height = int(height * scale_factor)
|
| 640 |
+
|
| 641 |
+
# Use high-quality resampling to preserve handwriting details
|
| 642 |
+
processed_img = img.resize((new_width, new_height), Image.LANCZOS)
|
| 643 |
+
|
| 644 |
+
# Lower contrast enhancement for handwriting to preserve stroke details
|
| 645 |
+
if img.mode in ('RGB', 'RGBA'):
|
| 646 |
+
# Convert to grayscale for better text processing
|
| 647 |
+
processed_img = processed_img.convert('L')
|
| 648 |
+
|
| 649 |
+
# Use reduced contrast enhancement to preserve subtle strokes
|
| 650 |
+
enhancer = ImageEnhance.Contrast(processed_img)
|
| 651 |
+
processed_img = enhancer.enhance(1.2) # Lower contrast value for handwriting
|
| 652 |
|
| 653 |
# Standard processing for other large images
|
| 654 |
elif file_size_mb > IMAGE_PREPROCESSING["max_size_mb"] or max(width, height) > 3000:
|
|
|
|
| 826 |
# Documents typically have high variance (text on background)
|
| 827 |
# Handwritten documents may have less contrast than printed text
|
| 828 |
std_dev = np.std(img_np)
|
| 829 |
+
if std_dev > 40: # Further lowered threshold to better detect handwritten documents with low contrast
|
| 830 |
return True
|
| 831 |
|
| 832 |
# 2. Quick check using downsampled image for edges
|
|
|
|
| 837 |
else:
|
| 838 |
small_img = img_np
|
| 839 |
|
| 840 |
+
# Enhanced edge detection for handwritten documents
|
| 841 |
+
# Use multiple Canny thresholds to better capture both faint and bold strokes
|
| 842 |
+
edges_low = cv2.Canny(small_img, 20, 110, L2gradient=False) # For faint handwriting
|
| 843 |
+
edges_high = cv2.Canny(small_img, 30, 150, L2gradient=False) # For standard text
|
| 844 |
+
|
| 845 |
+
# Combine edge detection results
|
| 846 |
+
edges = cv2.bitwise_or(edges_low, edges_high)
|
| 847 |
edge_ratio = np.count_nonzero(edges) / edges.size
|
| 848 |
|
| 849 |
+
# Special handling for potential handwritten content - more sensitive detection
|
| 850 |
+
handwritten_indicator = False
|
| 851 |
+
if edge_ratio > 0.015: # Lower threshold specifically for handwritten content
|
| 852 |
+
try:
|
| 853 |
+
# Look for handwriting stroke characteristics using gradient analysis
|
| 854 |
+
# Compute gradient magnitudes and directions
|
| 855 |
+
sobelx = cv2.Sobel(small_img, cv2.CV_64F, 1, 0, ksize=3)
|
| 856 |
+
sobely = cv2.Sobel(small_img, cv2.CV_64F, 0, 1, ksize=3)
|
| 857 |
+
magnitude = np.sqrt(sobelx**2 + sobely**2)
|
| 858 |
+
|
| 859 |
+
# Handwriting typically has higher variation in gradient magnitudes
|
| 860 |
+
if np.std(magnitude) > 18: # Lower threshold for more sensitivity
|
| 861 |
+
# Handwriting is indicated if we also have some line structure
|
| 862 |
+
# Try to find line segments that could indicate text lines
|
| 863 |
+
lines = cv2.HoughLinesP(edges, 1, np.pi/180,
|
| 864 |
+
threshold=45, # Lower threshold for handwriting
|
| 865 |
+
minLineLength=25, # Shorter minimum line length
|
| 866 |
+
maxLineGap=25) # Larger gap for disconnected handwriting
|
| 867 |
+
|
| 868 |
+
if lines is not None and len(lines) > 8: # Fewer line segments needed
|
| 869 |
+
handwritten_indicator = True
|
| 870 |
+
except Exception:
|
| 871 |
+
# If analysis fails, continue with other checks
|
| 872 |
+
pass
|
| 873 |
+
|
| 874 |
+
# 3. Enhanced histogram analysis for handwritten content
|
| 875 |
+
# Use more granular bins for better detection of varying stroke densities
|
| 876 |
+
dark_mask = img_np < 65 # Increased threshold to capture lighter handwritten text
|
| 877 |
+
medium_mask = (img_np >= 65) & (img_np < 170) # Medium gray range for handwriting
|
| 878 |
+
light_mask = img_np > 175 # Slightly adjusted for aged paper
|
| 879 |
|
| 880 |
dark_ratio = np.count_nonzero(dark_mask) / img_np.size
|
| 881 |
+
medium_ratio = np.count_nonzero(medium_mask) / img_np.size
|
| 882 |
light_ratio = np.count_nonzero(light_mask) / img_np.size
|
| 883 |
|
| 884 |
+
# Handwritten documents often have more medium-gray content than printed text
|
| 885 |
+
# This helps detect pencil or faded ink handwriting
|
| 886 |
+
if medium_ratio > 0.3 and edge_ratio > 0.015:
|
| 887 |
+
return True
|
| 888 |
+
|
| 889 |
# Special analysis for handwritten documents
|
| 890 |
+
# Return true immediately if handwriting characteristics detected
|
| 891 |
+
if handwritten_indicator:
|
| 892 |
+
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 893 |
|
| 894 |
+
# Combine heuristics for final decision with improved sensitivity
|
|
|
|
|
|
|
| 895 |
# Lower thresholds for handwritten documents
|
| 896 |
+
return (dark_ratio > 0.025 and light_ratio > 0.2) or edge_ratio > 0.025
|
| 897 |
|
| 898 |
# Removed caching to fix unhashable type error
|
| 899 |
def preprocess_document_image(img: Image.Image) -> Image.Image:
|
|
|
|
| 1083 |
img_np = np.array(enhanced)
|
| 1084 |
|
| 1085 |
if is_handwritten:
|
| 1086 |
+
# Enhanced processing for handwritten documents
|
| 1087 |
+
# Optimized for better stroke preservation and readability
|
|
|
|
| 1088 |
if img_size > 3000000: # Large images - downsample first
|
| 1089 |
scale_factor = 0.5
|
| 1090 |
small_img = cv2.resize(img_np, None, fx=scale_factor, fy=scale_factor,
|
| 1091 |
interpolation=cv2.INTER_AREA)
|
| 1092 |
+
|
| 1093 |
+
# Apply CLAHE for better local contrast in handwriting
|
| 1094 |
+
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
| 1095 |
+
enhanced_img = clahe.apply(small_img)
|
| 1096 |
+
|
| 1097 |
+
# Apply bilateral filter with parameters optimized for handwriting
|
| 1098 |
+
# Lower sigma values to preserve more detail
|
| 1099 |
+
filtered = cv2.bilateralFilter(enhanced_img, 7, 30, 50)
|
| 1100 |
+
|
| 1101 |
# Resize back
|
| 1102 |
filtered = cv2.resize(filtered, (width, height), interpolation=cv2.INTER_LINEAR)
|
| 1103 |
else:
|
| 1104 |
+
# For smaller handwritten images
|
| 1105 |
+
# Apply CLAHE for better local contrast
|
| 1106 |
+
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
| 1107 |
+
enhanced_img = clahe.apply(img_np)
|
| 1108 |
+
|
| 1109 |
+
# Apply bilateral filter with parameters optimized for handwriting
|
| 1110 |
+
filtered = cv2.bilateralFilter(enhanced_img, 5, 25, 45)
|
| 1111 |
|
| 1112 |
+
# Adaptive thresholding specific to handwriting
|
| 1113 |
+
try:
|
| 1114 |
+
# Use larger block size and lower constant for better stroke preservation
|
| 1115 |
+
binary = cv2.adaptiveThreshold(
|
| 1116 |
+
filtered, 255,
|
| 1117 |
+
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
| 1118 |
+
cv2.THRESH_BINARY,
|
| 1119 |
+
21, # Larger block size for handwriting
|
| 1120 |
+
5 # Lower constant for better stroke preservation
|
| 1121 |
+
)
|
| 1122 |
+
|
| 1123 |
+
# Apply slight dilation to connect broken strokes
|
| 1124 |
+
kernel = np.ones((2, 2), np.uint8)
|
| 1125 |
+
binary = cv2.dilate(binary, kernel, iterations=1)
|
| 1126 |
+
|
| 1127 |
+
# Convert back to PIL Image
|
| 1128 |
+
return Image.fromarray(binary)
|
| 1129 |
+
except Exception as e:
|
| 1130 |
+
logger.debug(f"Adaptive threshold for handwriting failed: {str(e)}")
|
| 1131 |
+
# Convert filtered image to PIL and return as fallback
|
| 1132 |
+
return Image.fromarray(filtered)
|
| 1133 |
|
| 1134 |
else:
|
| 1135 |
# Standard document processing - optimized for printed text
|
|
|
|
| 1657 |
def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str = None) -> str:
|
| 1658 |
"""
|
| 1659 |
Attempt to use local pytesseract OCR as a fallback when API fails
|
| 1660 |
+
With enhanced processing optimized for handwritten content
|
| 1661 |
|
| 1662 |
Args:
|
| 1663 |
image_path: Path to the image file
|
|
|
|
| 1683 |
image_path = Path(image_path) if isinstance(image_path, str) else image_path
|
| 1684 |
image = Image.open(image_path)
|
| 1685 |
|
| 1686 |
+
# Auto-detect if this appears to be handwritten
|
| 1687 |
+
is_handwritten = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1688 |
|
| 1689 |
+
# Use OpenCV for better detection and preprocessing if available
|
| 1690 |
+
if CV2_AVAILABLE:
|
| 1691 |
+
try:
|
| 1692 |
+
# Convert image to numpy array
|
| 1693 |
+
img_np = np.array(image.convert('L'))
|
| 1694 |
+
|
| 1695 |
+
# Check for handwritten characteristics
|
| 1696 |
+
edges = cv2.Canny(img_np, 30, 100)
|
| 1697 |
+
edge_ratio = np.count_nonzero(edges) / edges.size
|
| 1698 |
+
|
| 1699 |
+
# Typical handwritten documents have more varied edge patterns
|
| 1700 |
+
if edge_ratio > 0.02:
|
| 1701 |
+
# Additional check with gradient magnitudes
|
| 1702 |
+
sobelx = cv2.Sobel(img_np, cv2.CV_64F, 1, 0, ksize=3)
|
| 1703 |
+
sobely = cv2.Sobel(img_np, cv2.CV_64F, 0, 1, ksize=3)
|
| 1704 |
+
magnitude = np.sqrt(sobelx**2 + sobely**2)
|
| 1705 |
+
# Handwriting typically has more variation in gradient magnitudes
|
| 1706 |
+
if np.std(magnitude) > 20:
|
| 1707 |
+
is_handwritten = True
|
| 1708 |
+
logger.info("Detected handwritten content for local OCR")
|
| 1709 |
+
|
| 1710 |
+
# Enhanced preprocessing based on document type
|
| 1711 |
+
if is_handwritten:
|
| 1712 |
+
# Process for handwritten content
|
| 1713 |
+
# Apply CLAHE for better local contrast
|
| 1714 |
+
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
| 1715 |
+
img_np = clahe.apply(img_np)
|
| 1716 |
+
|
| 1717 |
+
# Apply adaptive thresholding with optimized parameters for handwriting
|
| 1718 |
+
binary = cv2.adaptiveThreshold(
|
| 1719 |
+
img_np, 255,
|
| 1720 |
+
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
| 1721 |
+
cv2.THRESH_BINARY,
|
| 1722 |
+
21, # Larger block size for handwriting
|
| 1723 |
+
5 # Lower constant for better stroke preservation
|
| 1724 |
+
)
|
| 1725 |
+
|
| 1726 |
+
# Optional: apply dilation to thicken strokes slightly
|
| 1727 |
+
kernel = np.ones((2, 2), np.uint8)
|
| 1728 |
+
binary = cv2.dilate(binary, kernel, iterations=1)
|
| 1729 |
+
|
| 1730 |
+
# Convert back to PIL Image for tesseract
|
| 1731 |
+
image = Image.fromarray(binary)
|
| 1732 |
+
|
| 1733 |
+
# Set tesseract options for handwritten content
|
| 1734 |
+
custom_config = r'--oem 1 --psm 6 -l eng'
|
| 1735 |
+
else:
|
| 1736 |
+
# Process for printed content
|
| 1737 |
+
# Apply CLAHE for better contrast
|
| 1738 |
+
clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8, 8))
|
| 1739 |
+
img_np = clahe.apply(img_np)
|
| 1740 |
+
|
| 1741 |
+
# Apply bilateral filter to reduce noise while preserving edges
|
| 1742 |
+
img_np = cv2.bilateralFilter(img_np, 9, 75, 75)
|
| 1743 |
+
|
| 1744 |
+
# Apply Otsu's thresholding for printed text
|
| 1745 |
+
_, binary = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
| 1746 |
+
|
| 1747 |
+
# Convert back to PIL Image for tesseract
|
| 1748 |
+
image = Image.fromarray(binary)
|
| 1749 |
+
|
| 1750 |
+
# Set tesseract options for printed content
|
| 1751 |
+
custom_config = r'--oem 3 --psm 6 -l eng'
|
| 1752 |
+
except Exception as e:
|
| 1753 |
+
logger.warning(f"OpenCV preprocessing failed: {str(e)}. Using PIL fallback.")
|
| 1754 |
+
|
| 1755 |
+
# Normalize to RGB first if not already; the image is converted to grayscale for OCR in the next step
|
| 1756 |
+
if image.mode != 'RGB':
|
| 1757 |
+
image = image.convert('RGB')
|
| 1758 |
+
|
| 1759 |
+
# Apply basic image enhancements
|
| 1760 |
+
image = image.convert('L')
|
| 1761 |
+
enhancer = ImageEnhance.Contrast(image)
|
| 1762 |
+
image = enhancer.enhance(2.0)
|
| 1763 |
+
custom_config = r'--oem 3 --psm 6 -l eng'
|
| 1764 |
+
else:
|
| 1765 |
+
# PIL-only path without OpenCV
|
| 1766 |
+
# Normalize to RGB first if not already; the image is converted to grayscale for OCR in the next step
|
| 1767 |
+
if image.mode != 'RGB':
|
| 1768 |
+
image = image.convert('RGB')
|
| 1769 |
+
|
| 1770 |
+
# Apply basic image enhancements
|
| 1771 |
+
image = image.convert('L')
|
| 1772 |
+
enhancer = ImageEnhance.Contrast(image)
|
| 1773 |
+
image = enhancer.enhance(2.0)
|
| 1774 |
+
custom_config = r'--oem 3 --psm 6 -l eng'
|
| 1775 |
|
| 1776 |
+
# Run OCR with appropriate config
|
| 1777 |
+
ocr_text = pytesseract.image_to_string(image, config=custom_config)
|
| 1778 |
|
| 1779 |
if ocr_text and len(ocr_text.strip()) > 50:
|
| 1780 |
logger.info(f"Local OCR successful: extracted {len(ocr_text)} characters")
|
| 1781 |
return ocr_text
|
| 1782 |
else:
|
| 1783 |
+
# Try another psm mode as fallback
|
| 1784 |
+
logger.warning("First OCR attempt produced minimal text, trying another mode")
|
| 1785 |
+
# Try PSM mode 4 (assume single column of text)
|
| 1786 |
+
fallback_config = r'--oem 3 --psm 4 -l eng'
|
| 1787 |
+
ocr_text = pytesseract.image_to_string(image, config=fallback_config)
|
| 1788 |
+
|
| 1789 |
+
if ocr_text and len(ocr_text.strip()) > 50:
|
| 1790 |
+
logger.info(f"Local OCR fallback successful: extracted {len(ocr_text)} characters")
|
| 1791 |
+
return ocr_text
|
| 1792 |
+
else:
|
| 1793 |
+
logger.warning("Local OCR produced minimal or no text")
|
| 1794 |
+
return None
|
| 1795 |
except ImportError:
|
| 1796 |
logger.warning("Pytesseract not installed - local OCR not available")
|
| 1797 |
return None
|