Spaces:

milwright
/

historical-ocr

Running

App Files Community

milwright committed on Apr 22

Commit

88d3e04

1 Parent(s): 70727c4

Enhance handwritten document processing for improved OCR accuracy

Browse files

Files changed (2) hide show

config.py +12 -1
ocr_utils.py +239 -50

config.py CHANGED Viewed

@@ -45,7 +45,18 @@ IMAGE_PREPROCESSING = {
     "denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
     "max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "12.0")),    # Increased size limit for better quality
     "target_dpi": int(os.environ.get("TARGET_DPI", "300")),               # Target DPI for scaling
-    "compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "95"))  # Higher quality for better OCR results
 }
 # OCR settings optimized for single-page performance

     "denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
     "max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "12.0")),    # Increased size limit for better quality
     "target_dpi": int(os.environ.get("TARGET_DPI", "300")),               # Target DPI for scaling
+    "compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "95")),  # Higher quality for better OCR results
+    # Enhanced settings for handwritten documents
+    "handwritten": {
+        "contrast": float(os.environ.get("HANDWRITTEN_CONTRAST", "1.2")),  # Lower contrast for handwritten text
+        "block_size": int(os.environ.get("HANDWRITTEN_BLOCK_SIZE", "21")), # Larger block size for adaptive thresholding
+        "constant": int(os.environ.get("HANDWRITTEN_CONSTANT", "5")),      # Lower constant for adaptive thresholding
+        "use_dilation": os.environ.get("HANDWRITTEN_DILATION", "True").lower() in ("true", "1", "yes"),  # Connect broken strokes
+        "clahe_limit": float(os.environ.get("HANDWRITTEN_CLAHE_LIMIT", "2.0")),  # CLAHE limit for local contrast
+        "bilateral_d": int(os.environ.get("HANDWRITTEN_BILATERAL_D", "5")), # Bilateral filter window size
+        "bilateral_sigma1": int(os.environ.get("HANDWRITTEN_BILATERAL_SIGMA1", "25")),  # Color sigma
+        "bilateral_sigma2": int(os.environ.get("HANDWRITTEN_BILATERAL_SIGMA2", "45"))   # Space sigma
+    }
 }
 # OCR settings optimized for single-page performance

ocr_utils.py CHANGED Viewed

@@ -565,6 +565,26 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
                 logger.debug(f"Document type detection for {image_file.name}: " +
                            f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
             # Special processing for very large images (newspapers and large documents)
             if is_newspaper:
                 # For newspaper format, we need more specialized processing
@@ -601,6 +621,34 @@ def preprocess_image_for_ocr(image_path: Union[str, Path]) -> Tuple[Image.Image,
                     # Also enhance saturation to make colored text more visible
                     enhancer_sat = ImageEnhance.Color(processed_img)
                     processed_img = enhancer_sat.enhance(1.2)
             # Standard processing for other large images
             elif file_size_mb > IMAGE_PREPROCESSING["max_size_mb"] or max(width, height) > 3000:
@@ -778,7 +826,7 @@ def _detect_document_type_impl(img_hash=None) -> bool:
     # Documents typically have high variance (text on background)
     # Handwritten documents may have less contrast than printed text
     std_dev = np.std(img_np)
-    if std_dev > 45:  # Lowered threshold to better detect handwritten documents
         return True
     # 2. Quick check using downsampled image for edges
@@ -789,38 +837,63 @@ def _detect_document_type_impl(img_hash=None) -> bool:
     else:
         small_img = img_np
-    # Use adaptive edge detection parameters for handwritten documents
-    # Lowered threshold to better detect fainter handwritten text
-    edges = cv2.Canny(small_img, 30, 130, L2gradient=False)
     edge_ratio = np.count_nonzero(edges) / edges.size
-    # 3. Fast histogram approximation using bins
-    # Instead of calculating full histogram, use bins for dark and light regions
-    # Adjusted for handwritten documents which may have more gray values
-    dark_mask = img_np < 60  # Increased threshold to capture lighter handwritten text
-    light_mask = img_np > 180  # Lowered threshold to account for aged paper
     dark_ratio = np.count_nonzero(dark_mask) / img_np.size
     light_ratio = np.count_nonzero(light_mask) / img_np.size
     # Special analysis for handwritten documents
-    # Check for line-like structures typical in handwritten text
-    if CV2_AVAILABLE and edge_ratio > 0.02:  # Lower threshold to capture handwritten documents
-        # Try to find line segments that could indicate text lines
-        lines = cv2.HoughLinesP(edges, 1, np.pi/180,
-                               threshold=50,  # Lower threshold for detection
-                               minLineLength=30,  # Shorter lines for handwriting
-                               maxLineGap=20)   # Larger gap for discontinuous handwriting
-        # If we find enough line segments, it's likely a document with text
-        if lines is not None and len(lines) > 10:
-            return True
-    # Combine heuristics for final decision
-    # Documents typically have both dark (text) and light (background) regions,
-    # and/or well-defined edges
     # Lower thresholds for handwritten documents
-    return (dark_ratio > 0.03 and light_ratio > 0.25) or edge_ratio > 0.03
 # Removed caching to fix unhashable type error
 def preprocess_document_image(img: Image.Image) -> Image.Image:
@@ -1010,26 +1083,53 @@ def _preprocess_document_image_impl() -> Image.Image:
             img_np = np.array(enhanced)
             if is_handwritten:
-                # Special treatment for handwritten documents
-                # Use guided filter which preserves edges better than NLMeans
-                # Guided filter works well for handwriting by preserving stroke details
                 if img_size > 3000000:  # Large images - downsample first
                     scale_factor = 0.5
                     small_img = cv2.resize(img_np, None, fx=scale_factor, fy=scale_factor,
                                           interpolation=cv2.INTER_AREA)
-                    # Apply bilateral filter which preserves edges while smoothing
-                    filtered = cv2.bilateralFilter(small_img, 9, 75, 75)
                     # Resize back
                     filtered = cv2.resize(filtered, (width, height), interpolation=cv2.INTER_LINEAR)
                 else:
-                    # Use bilateral filter directly for smaller images
-                    filtered = cv2.bilateralFilter(img_np, 7, 50, 50)
-                # Convert back to PIL Image
-                enhanced = Image.fromarray(filtered)
-                # For handwritten docs, avoid binary thresholding which can destroy subtle strokes
-                return enhanced
             else:
                 # Standard document processing - optimized for printed text
@@ -1557,6 +1657,7 @@ def serialize_ocr_object(obj):
 def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str = None) -> str:
     """
     Attempt to use local pytesseract OCR as a fallback when API fails
     Args:
         image_path: Path to the image file
@@ -1582,27 +1683,115 @@ def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str =
             image_path = Path(image_path) if isinstance(image_path, str) else image_path
             image = Image.open(image_path)
-        # Convert to RGB if not already (pytesseract works best with RGB)
-        if image.mode != 'RGB':
-            image = image.convert('RGB')
-        # Apply image enhancements for better OCR
-        # Convert to grayscale for better text recognition
-        image = image.convert('L')
-        # Enhance contrast
-        enhancer = ImageEnhance.Contrast(image)
-        image = enhancer.enhance(2.0)  # Higher contrast for better OCR
-        # Run OCR
-        ocr_text = pytesseract.image_to_string(image, lang='eng')
         if ocr_text and len(ocr_text.strip()) > 50:
             logger.info(f"Local OCR successful: extracted {len(ocr_text)} characters")
             return ocr_text
         else:
-            logger.warning("Local OCR produced minimal or no text")
-            return None
     except ImportError:
         logger.warning("Pytesseract not installed - local OCR not available")
         return None

                 logger.debug(f"Document type detection for {image_file.name}: " +
                            f"{'newspaper' if is_newspaper else 'document' if is_document else 'photo'}")
+            # Check for handwritten document characteristics
+            is_handwritten = False
+            if CV2_AVAILABLE and not is_newspaper:
+                # Use more advanced detection for handwritten content
+                try:
+                    gray_np = np.array(img.convert('L'))
+                    # Higher variance in edge strengths can indicate handwriting
+                    edges = cv2.Canny(gray_np, 30, 100)
+                    if np.count_nonzero(edges) / edges.size > 0.02:  # Low edge threshold for handwriting
+                        # Additional check with gradient magnitudes
+                        sobelx = cv2.Sobel(gray_np, cv2.CV_64F, 1, 0, ksize=3)
+                        sobely = cv2.Sobel(gray_np, cv2.CV_64F, 0, 1, ksize=3)
+                        magnitude = np.sqrt(sobelx**2 + sobely**2)
+                        # Handwriting typically has more variation in gradient magnitudes
+                        if np.std(magnitude) > 20:
+                            is_handwritten = True
+                            logger.info(f"Handwritten document detected: {image_file.name}")
+                except Exception as e:
+                    logger.debug(f"Handwriting detection error: {str(e)}")
             # Special processing for very large images (newspapers and large documents)
             if is_newspaper:
                 # For newspaper format, we need more specialized processing
                     # Also enhance saturation to make colored text more visible
                     enhancer_sat = ImageEnhance.Color(processed_img)
                     processed_img = enhancer_sat.enhance(1.2)
+            # Special processing for handwritten documents
+            elif is_handwritten:
+                logger.info(f"Processing handwritten document: {width}x{height}")
+                # For handwritten text, we need to preserve stroke details
+                # Use gentle scaling to maintain handwriting characteristics
+                max_dimension = max(width, height)
+                if max_dimension > 4000:  # Large handwritten document
+                    scale_factor = 0.6   # Less aggressive reduction for handwriting
+                else:
+                    scale_factor = 0.8   # Minimal reduction for moderate size
+                # Calculate new dimensions
+                new_width = int(width * scale_factor)
+                new_height = int(height * scale_factor)
+                # Use high-quality resampling to preserve handwriting details
+                processed_img = img.resize((new_width, new_height), Image.LANCZOS)
+                # Lower contrast enhancement for handwriting to preserve stroke details
+                if img.mode in ('RGB', 'RGBA'):
+                    # Convert to grayscale for better text processing
+                    processed_img = processed_img.convert('L')
+                    # Use reduced contrast enhancement to preserve subtle strokes
+                    enhancer = ImageEnhance.Contrast(processed_img)
+                    processed_img = enhancer.enhance(1.2)  # Lower contrast value for handwriting
             # Standard processing for other large images
             elif file_size_mb > IMAGE_PREPROCESSING["max_size_mb"] or max(width, height) > 3000:
     # Documents typically have high variance (text on background)
     # Handwritten documents may have less contrast than printed text
     std_dev = np.std(img_np)
+    if std_dev > 40:  # Further lowered threshold to better detect handwritten documents with low contrast
         return True
     # 2. Quick check using downsampled image for edges
     else:
         small_img = img_np
+    # Enhanced edge detection for handwritten documents
+    # Use multiple Canny thresholds to better capture both faint and bold strokes
+    edges_low = cv2.Canny(small_img, 20, 110, L2gradient=False)  # For faint handwriting
+    edges_high = cv2.Canny(small_img, 30, 150, L2gradient=False) # For standard text
+    # Combine edge detection results
+    edges = cv2.bitwise_or(edges_low, edges_high)
     edge_ratio = np.count_nonzero(edges) / edges.size
+    # Special handling for potential handwritten content - more sensitive detection
+    handwritten_indicator = False
+    if edge_ratio > 0.015:  # Lower threshold specifically for handwritten content
+        try:
+            # Look for handwriting stroke characteristics using gradient analysis
+            # Compute gradient magnitudes and directions
+            sobelx = cv2.Sobel(small_img, cv2.CV_64F, 1, 0, ksize=3)
+            sobely = cv2.Sobel(small_img, cv2.CV_64F, 0, 1, ksize=3)
+            magnitude = np.sqrt(sobelx**2 + sobely**2)
+            # Handwriting typically has higher variation in gradient magnitudes
+            if np.std(magnitude) > 18:  # Lower threshold for more sensitivity
+                # Handwriting is indicated if we also have some line structure
+                # Try to find line segments that could indicate text lines
+                lines = cv2.HoughLinesP(edges, 1, np.pi/180,
+                                      threshold=45,  # Lower threshold for handwriting
+                                      minLineLength=25,  # Shorter minimum line length
+                                      maxLineGap=25)   # Larger gap for disconnected handwriting
+                if lines is not None and len(lines) > 8:  # Fewer line segments needed
+                    handwritten_indicator = True
+        except Exception:
+            # If analysis fails, continue with other checks
+            pass
+    # 3. Enhanced histogram analysis for handwritten content
+    # Use more granular bins for better detection of varying stroke densities
+    dark_mask = img_np < 65  # Increased threshold to capture lighter handwritten text
+    medium_mask = (img_np >= 65) & (img_np < 170)  # Medium gray range for handwriting
+    light_mask = img_np > 175  # Slightly adjusted for aged paper
     dark_ratio = np.count_nonzero(dark_mask) / img_np.size
+    medium_ratio = np.count_nonzero(medium_mask) / img_np.size
     light_ratio = np.count_nonzero(light_mask) / img_np.size
+    # Handwritten documents often have more medium-gray content than printed text
+    # This helps detect pencil or faded ink handwriting
+    if medium_ratio > 0.3 and edge_ratio > 0.015:
+        return True
     # Special analysis for handwritten documents
+    # Return true immediately if handwriting characteristics detected
+    if handwritten_indicator:
+        return True
+    # Combine heuristics for final decision with improved sensitivity
     # Lower thresholds for handwritten documents
+    return (dark_ratio > 0.025 and light_ratio > 0.2) or edge_ratio > 0.025
 # Removed caching to fix unhashable type error
 def preprocess_document_image(img: Image.Image) -> Image.Image:
             img_np = np.array(enhanced)
             if is_handwritten:
+                # Enhanced processing for handwritten documents
+                # Optimized for better stroke preservation and readability
                 if img_size > 3000000:  # Large images - downsample first
                     scale_factor = 0.5
                     small_img = cv2.resize(img_np, None, fx=scale_factor, fy=scale_factor,
                                           interpolation=cv2.INTER_AREA)
+                    # Apply CLAHE for better local contrast in handwriting
+                    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+                    enhanced_img = clahe.apply(small_img)
+                    # Apply bilateral filter with parameters optimized for handwriting
+                    # Lower sigma values to preserve more detail
+                    filtered = cv2.bilateralFilter(enhanced_img, 7, 30, 50)
                     # Resize back
                     filtered = cv2.resize(filtered, (width, height), interpolation=cv2.INTER_LINEAR)
                 else:
+                    # For smaller handwritten images
+                    # Apply CLAHE for better local contrast
+                    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+                    enhanced_img = clahe.apply(img_np)
+                    # Apply bilateral filter with parameters optimized for handwriting
+                    filtered = cv2.bilateralFilter(enhanced_img, 5, 25, 45)
+                # Adaptive thresholding specific to handwriting
+                try:
+                    # Use larger block size and lower constant for better stroke preservation
+                    binary = cv2.adaptiveThreshold(
+                        filtered, 255,
+                        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                        cv2.THRESH_BINARY,
+                        21,  # Larger block size for handwriting
+                        5    # Lower constant for better stroke preservation
+                    )
+                    # Apply slight dilation to connect broken strokes
+                    kernel = np.ones((2, 2), np.uint8)
+                    binary = cv2.dilate(binary, kernel, iterations=1)
+                    # Convert back to PIL Image
+                    return Image.fromarray(binary)
+                except Exception as e:
+                    logger.debug(f"Adaptive threshold for handwriting failed: {str(e)}")
+                    # Convert filtered image to PIL and return as fallback
+                    return Image.fromarray(filtered)
             else:
                 # Standard document processing - optimized for printed text
 def try_local_ocr_fallback(image_path: Union[str, Path], base64_data_url: str = None) -> str:
     """
     Attempt to use local pytesseract OCR as a fallback when API fails
+    With enhanced processing optimized for handwritten content
     Args:
         image_path: Path to the image file
             image_path = Path(image_path) if isinstance(image_path, str) else image_path
             image = Image.open(image_path)
+        # Auto-detect if this appears to be handwritten
+        is_handwritten = False
+        # Use OpenCV for better detection and preprocessing if available
+        if CV2_AVAILABLE:
+            try:
+                # Convert image to numpy array
+                img_np = np.array(image.convert('L'))
+                # Check for handwritten characteristics
+                edges = cv2.Canny(img_np, 30, 100)
+                edge_ratio = np.count_nonzero(edges) / edges.size
+                # Typical handwritten documents have more varied edge patterns
+                if edge_ratio > 0.02:
+                    # Additional check with gradient magnitudes
+                    sobelx = cv2.Sobel(img_np, cv2.CV_64F, 1, 0, ksize=3)
+                    sobely = cv2.Sobel(img_np, cv2.CV_64F, 0, 1, ksize=3)
+                    magnitude = np.sqrt(sobelx**2 + sobely**2)
+                    # Handwriting typically has more variation in gradient magnitudes
+                    if np.std(magnitude) > 20:
+                        is_handwritten = True
+                        logger.info("Detected handwritten content for local OCR")
+                # Enhanced preprocessing based on document type
+                if is_handwritten:
+                    # Process for handwritten content
+                    # Apply CLAHE for better local contrast
+                    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+                    img_np = clahe.apply(img_np)
+                    # Apply adaptive thresholding with optimized parameters for handwriting
+                    binary = cv2.adaptiveThreshold(
+                        img_np, 255,
+                        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                        cv2.THRESH_BINARY,
+                        21,  # Larger block size for handwriting
+                        5    # Lower constant for better stroke preservation
+                    )
+                    # Optional: apply dilation to thicken strokes slightly
+                    kernel = np.ones((2, 2), np.uint8)
+                    binary = cv2.dilate(binary, kernel, iterations=1)
+                    # Convert back to PIL Image for tesseract
+                    image = Image.fromarray(binary)
+                    # Set tesseract options for handwritten content
+                    custom_config = r'--oem 1 --psm 6 -l eng'
+                else:
+                    # Process for printed content
+                    # Apply CLAHE for better contrast
+                    clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8, 8))
+                    img_np = clahe.apply(img_np)
+                    # Apply bilateral filter to reduce noise while preserving edges
+                    img_np = cv2.bilateralFilter(img_np, 9, 75, 75)
+                    # Apply Otsu's thresholding for printed text
+                    _, binary = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+                    # Convert back to PIL Image for tesseract
+                    image = Image.fromarray(binary)
+                    # Set tesseract options for printed content
+                    custom_config = r'--oem 3 --psm 6 -l eng'
+            except Exception as e:
+                logger.warning(f"OpenCV preprocessing failed: {str(e)}. Using PIL fallback.")
+                # Convert to RGB if not already (pytesseract works best with RGB)
+                if image.mode != 'RGB':
+                    image = image.convert('RGB')
+                # Apply basic image enhancements
+                image = image.convert('L')
+                enhancer = ImageEnhance.Contrast(image)
+                image = enhancer.enhance(2.0)
+                custom_config = r'--oem 3 --psm 6 -l eng'
+        else:
+            # PIL-only path without OpenCV
+            # Convert to RGB if not already (pytesseract works best with RGB)
+            if image.mode != 'RGB':
+                image = image.convert('RGB')
+            # Apply basic image enhancements
+            image = image.convert('L')
+            enhancer = ImageEnhance.Contrast(image)
+            image = enhancer.enhance(2.0)
+            custom_config = r'--oem 3 --psm 6 -l eng'
+        # Run OCR with appropriate config
+        ocr_text = pytesseract.image_to_string(image, config=custom_config)
         if ocr_text and len(ocr_text.strip()) > 50:
             logger.info(f"Local OCR successful: extracted {len(ocr_text)} characters")
             return ocr_text
         else:
+            # Try another psm mode as fallback
+            logger.warning("First OCR attempt produced minimal text, trying another mode")
+            # Try PSM mode 4 (assume single column of text)
+            fallback_config = r'--oem 3 --psm 4 -l eng'
+            ocr_text = pytesseract.image_to_string(image, config=fallback_config)
+            if ocr_text and len(ocr_text.strip()) > 50:
+                logger.info(f"Local OCR fallback successful: extracted {len(ocr_text)} characters")
+                return ocr_text
+            else:
+                logger.warning("Local OCR produced minimal or no text")
+                return None
     except ImportError:
         logger.warning("Pytesseract not installed - local OCR not available")
         return None