# File size: 4,891 Bytes
# 6a3bd1f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import torch
import easyocr
import numpy as np
import cv2
from PIL import Image
from typing import List, Dict
import re

class OCREngineManager:
    """Text extraction using EasyOCR with brand-optimized preprocessing.

    A single EasyOCR reader (English + Traditional Chinese) is created in
    ``__init__`` and reused by every ``extract_text`` call.
    """

    # Languages handed to the EasyOCR reader.
    _LANGUAGES = ('en', 'ch_tra')

    # PIL modes whose ``np.array`` form is already usable pixel data for the
    # code below. Any other mode is converted to RGB before use.
    _ARRAY_SAFE_MODES = ('L', 'RGB', 'RGBA')

    # Characters that are neither word characters nor whitespace are removed.
    # The explicit CJK range is kept from the original pattern.
    _DISALLOWED_CHARS = re.compile(r'[^\w\s\u4e00-\u9fff]')

    # Detection settings for ordinary text.
    _DEFAULT_READ_PARAMS = {
        'min_size': 20,
        'text_threshold': 0.7,
        'link_threshold': 0.4,
    }

    # More aggressive detection settings used together with brand preprocessing.
    _BRAND_READ_PARAMS = {
        'min_size': 10,  # lower, to catch small brand text
        'text_threshold': 0.5,  # lower threshold for brand logos
        'link_threshold': 0.3,
        'contrast_ths': 0.1,  # chosen for metallic/reflective text
        'adjust_contrast': 0.8,  # contrast target chosen for logos
    }

    def __init__(self):
        """Create the EasyOCR reader, preferring GPU and falling back to CPU.

        Raises:
            Exception: whatever ``easyocr.Reader`` raises when the CPU reader
                itself cannot be created; there is nothing left to fall back to.
        """
        print("Loading EasyOCR (English + Traditional Chinese)...")

        languages = list(self._LANGUAGES)

        if torch.cuda.is_available():
            print("  Attempting GPU initialization...")
            # Only the GPU attempt is guarded, so the "GPU initialization
            # failed" message can never be printed for a CPU-side failure.
            try:
                self.reader = easyocr.Reader(languages, gpu=True)
                print("  ✓ EasyOCR loaded with GPU")
            except Exception as e:
                print(f"  ⚠️ GPU initialization failed: {e}")
                print("  Falling back to CPU...")
                self.reader = easyocr.Reader(languages, gpu=False)
                print("  ✓ EasyOCR loaded with CPU (fallback)")
        else:
            print("  CUDA not available, using CPU...")
            self.reader = easyocr.Reader(languages, gpu=False)
            print("  ✓ EasyOCR loaded with CPU")

        print("✓ EasyOCR loaded")

    @classmethod
    def _coerce_supported_mode(cls, image):
        """Return ``image`` in a mode that converts to real pixel data.

        Objects without a ``mode`` attribute (for example an array passed
        instead of a PIL image) and images already in ``L``/``RGB``/``RGBA``
        are returned unchanged. Every other mode is converted to ``RGB`` so
        that palette indices, bilevel data or non-RGB colour spaces are never
        interpreted as grey/RGB pixel values.
        """
        mode = getattr(image, 'mode', None)
        if mode is None or mode in cls._ARRAY_SAFE_MODES:
            return image
        return image.convert('RGB')

    def extract_text(self, image: "Image.Image", use_brand_preprocessing: bool = False) -> List[Dict]:
        """Extract text from an image with optional brand-optimized preprocessing.

        Args:
            image: PIL image to read.
            use_brand_preprocessing: when True, the image first goes through
                ``preprocess_for_brand_ocr`` and the more aggressive brand
                detection settings are used.

        Returns:
            One dict per detection with the keys ``bbox``, ``text`` (cleaned
            and upper-cased), ``confidence`` and ``raw_text`` (as returned by
            the reader).
        """
        if use_brand_preprocessing:
            prepared = self.preprocess_for_brand_ocr(image)
            read_params = self._BRAND_READ_PARAMS
        else:
            prepared = self._coerce_supported_mode(image)
            read_params = self._DEFAULT_READ_PARAMS

        results = self.reader.readtext(
            np.array(prepared),
            detail=1,
            paragraph=False,
            **read_params
        )

        return [
            {
                'bbox': bbox,
                'text': self.clean_and_normalize(text),
                'confidence': confidence,
                'raw_text': text,
            }
            for bbox, text, confidence in results
        ]

    def clean_and_normalize(self, text: str) -> str:
        """Strip punctuation, collapse whitespace runs and upper-case ``text``.

        Word characters (including Traditional Chinese characters and
        underscores) are kept; leading and trailing whitespace is dropped.
        """
        stripped = self._DISALLOWED_CHARS.sub('', text)
        return ' '.join(stripped.split()).upper()

    def preprocess_for_brand_ocr(self, image_region: "Image.Image") -> "Image.Image":
        """
        Preprocess image for brand OCR recognition
        Optimizes for detecting brand logos and text on products (especially metallic logos)

        Args:
            image_region: PIL Image (typically a cropped region)

        Returns:
            Preprocessed single-channel PIL Image
        """
        # Normalise exotic PIL modes first so the array holds real pixel values
        img_array = np.array(self._coerce_supported_mode(image_region))

        # Convert to grayscale (2-D arrays are already single-channel)
        if img_array.ndim == 3:
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        else:
            gray = img_array

        # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
        # Increased clipLimit for metallic logos (2.0 -> 3.0)
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

        # Denoise (slightly reduced strength to preserve logo edges)
        denoised = cv2.fastNlMeansDenoising(enhanced, None, h=8, templateWindowSize=7, searchWindowSize=21)

        # Adaptive thresholding to handle varying lighting
        # Adjusted blockSize for better logo detection (11 -> 15)
        binary = cv2.adaptiveThreshold(
            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 15, 2
        )

        # Morphological operations to connect broken characters
        # Slightly larger kernel for logo text (2x2 -> 3x3)
        kernel = np.ones((3, 3), np.uint8)
        morph = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

        # Sharpen to enhance edges (increased center weight 9 -> 11)
        # NOTE(review): the input here is 0/255 only, and with saturation this
        # kernel appears to map 0 -> 0 and 255 -> 255, so this step may be a
        # no-op - confirm before tuning it further.
        kernel_sharp = np.array([[-1, -1, -1], [-1, 11, -1], [-1, -1, -1]])
        sharpened = cv2.filter2D(morph, -1, kernel_sharp)

        # Convert back to PIL Image
        return Image.fromarray(sharpened)

# Import-time confirmation that the module finished defining OCREngineManager.
print("✓ OCREngineManager (with brand OCR preprocessing) defined")