# Source: Hugging Face Space "DawnC/Pixcribe" (Space status when captured: Sleeping)
# File size: 4,891 bytes
# Revision: 6a3bd1f

import torch
import easyocr
import numpy as np
import cv2
from PIL import Image
from typing import List, Dict
import re

class OCREngineManager:
    """Text extraction using EasyOCR with brand-optimized preprocessing.

    The EasyOCR reader is created once in ``__init__`` (GPU when CUDA is
    usable, CPU otherwise) and reused by every ``extract_text`` call.
    """

    # Languages loaded into the reader: English + Traditional Chinese.
    LANGUAGES = ('en', 'ch_tra')

    # Everything that is not a word character, whitespace, or a CJK unified
    # ideograph is stripped. (For Python 3 ``str`` patterns ``\w`` already
    # matches CJK characters; the explicit range documents the intent.)
    _STRIP_PATTERN = re.compile(r'[^\w\s\u4e00-\u9fff]')

    # ``readtext`` tuning for ordinary images.
    _DEFAULT_READ_OPTIONS = {
        'min_size': 20,
        'text_threshold': 0.7,
        'link_threshold': 0.4,
    }

    # More permissive ``readtext`` tuning used together with the brand
    # preprocessing pipeline (smaller boxes and lower text/link thresholds
    # than the default profile above).
    _BRAND_READ_OPTIONS = {
        'min_size': 10,          # accept smaller text boxes
        'text_threshold': 0.5,   # accept lower-confidence text regions
        'link_threshold': 0.3,
        'contrast_ths': 0.1,     # see EasyOCR docs for low-contrast handling
        'adjust_contrast': 0.8,
    }

    # PIL modes whose ``np.array`` form is passed on unchanged. Any other
    # mode (palette, bilevel, LA, CMYK, ...) is converted to RGB first.
    _PASSTHROUGH_MODES = ('RGB', 'RGBA', 'L')

    def __init__(self):
        """Create the EasyOCR reader, preferring GPU and falling back to CPU.

        Only the CUDA probe and the GPU construction are guarded. If the CPU
        reader itself cannot be created, that exception propagates to the
        caller instead of being reported as a GPU failure and retried.
        """
        print("Loading EasyOCR (English + Traditional Chinese)...")

        languages = list(self.LANGUAGES)
        reader = None
        gpu_error = None

        try:
            if torch.cuda.is_available():
                print("  Attempting GPU initialization...")
                reader = easyocr.Reader(languages, gpu=True)
        except Exception as e:
            # Best-effort GPU attempt: remember why it failed, use CPU below.
            gpu_error = e

        if reader is not None:
            print("  ✓ EasyOCR loaded with GPU")
        elif gpu_error is not None:
            print(f"  ⚠️ GPU initialization failed: {gpu_error}")
            print("  Falling back to CPU...")
            reader = easyocr.Reader(languages, gpu=False)
            print("  ✓ EasyOCR loaded with CPU (fallback)")
        else:
            print("  CUDA not available, using CPU...")
            reader = easyocr.Reader(languages, gpu=False)
            print("  ✓ EasyOCR loaded with CPU")

        self.reader = reader
        print("✓ EasyOCR loaded")

    @classmethod
    def _normalize_mode(cls, image):
        """Return ``image`` in a mode that converts cleanly to a uint8 array.

        RGB, RGBA and L images are returned untouched, as is any object that
        has no ``mode`` attribute (e.g. an array passed by a caller). Every
        other PIL mode is converted to RGB so that palette indices, boolean
        pixels or unusual channel counts never reach OpenCV / EasyOCR.
        """
        mode = getattr(image, 'mode', None)
        if mode is None or mode in cls._PASSTHROUGH_MODES:
            return image
        return image.convert('RGB')

    def extract_text(self, image: "Image.Image", use_brand_preprocessing: bool = False) -> List[Dict]:
        """Extract text from image with optional brand-optimized preprocessing.

        Args:
            image: PIL image to read.
            use_brand_preprocessing: When True, the image is first run through
                ``preprocess_for_brand_ocr`` and read with the more permissive
                brand option set.

        Returns:
            One dict per detection returned by the reader, with keys
            ``bbox``, ``text`` (cleaned and upper-cased), ``confidence`` and
            ``raw_text`` (the reader's unmodified string).
        """
        image = self._normalize_mode(image)

        if use_brand_preprocessing:
            img_array = np.array(self.preprocess_for_brand_ocr(image))
            read_options = self._BRAND_READ_OPTIONS
        else:
            img_array = np.array(image)
            read_options = self._DEFAULT_READ_OPTIONS

        # detail=1 with paragraph=False yields (bbox, text, confidence) triples.
        results = self.reader.readtext(
            img_array,
            detail=1,
            paragraph=False,
            **read_options
        )

        return [
            {
                'bbox': bbox,
                'text': self.clean_and_normalize(text),
                'confidence': confidence,
                'raw_text': text,
            }
            for bbox, text, confidence in results
        ]

    def clean_and_normalize(self, text: str) -> str:
        """Strip punctuation/symbols, collapse whitespace and upper-case.

        Word characters (including underscores and Traditional Chinese
        characters) and whitespace are kept; runs of whitespace collapse to a
        single space and leading/trailing whitespace is removed.
        """
        stripped = self._STRIP_PATTERN.sub('', text)
        return ' '.join(stripped.split()).upper()

    def preprocess_for_brand_ocr(self, image_region: "Image.Image") -> "Image.Image":
        """
        Preprocess image for brand OCR recognition
        Optimizes for detecting brand logos and text on products (especially metallic logos)

        Pipeline: grayscale -> CLAHE -> non-local-means denoise -> adaptive
        threshold -> morphological close -> sharpen kernel.

        Args:
            image_region: PIL Image (typically a cropped region)

        Returns:
            Preprocessed single-channel PIL Image
        """
        # Normalise here as well: this method is public and may be called
        # directly, not only through extract_text.
        img_array = np.array(self._normalize_mode(image_region))

        # Convert to grayscale (already 2-D for mode "L" input)
        if img_array.ndim == 3:
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        else:
            gray = img_array

        # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
        # clipLimit raised for metallic logos (2.0 -> 3.0)
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

        # Denoise (slightly reduced strength to preserve logo edges)
        denoised = cv2.fastNlMeansDenoising(enhanced, None, h=8, templateWindowSize=7, searchWindowSize=21)

        # Adaptive thresholding to handle varying lighting
        # blockSize adjusted for better logo detection (11 -> 15)
        binary = cv2.adaptiveThreshold(
            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 15, 2
        )

        # Morphological closing to connect broken characters
        # Slightly larger kernel for logo text (2x2 -> 3x3)
        kernel = np.ones((3, 3), np.uint8)
        morph = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

        # Sharpen to enhance edges (center weight 9 -> 11)
        # NOTE(review): the input at this point holds only 0 and 255 and the
        # kernel weights sum to 3, so after uint8 saturation this filter
        # appears to return its input unchanged. Kept to preserve the existing
        # pipeline - confirm before removing or moving it ahead of the
        # threshold step.
        kernel_sharp = np.array([[-1, -1, -1], [-1, 11, -1], [-1, -1, -1]])
        sharpened = cv2.filter2D(morph, -1, kernel_sharp)

        # Convert back to PIL Image
        return Image.fromarray(sharpened)

# Module-level status message: runs once when this file is executed or imported,
# after the class definition above has been processed.
print("✓ OCREngineManager (with brand OCR preprocessing) defined")