|
|
import re |
|
|
import numpy as np |
|
|
import cv2 |
|
|
from PIL import Image |
|
|
import random |
|
|
import torch |
|
|
import torchvision.transforms as T |
|
|
from torchvision.transforms.functional import InterpolationMode |
|
|
from difflib import SequenceMatcher |
|
|
from nltk.metrics.distance import edit_distance |
|
|
import nltk |
|
|
|
|
|
|
|
|
# Make sure the NLTK resources this module asks for are available locally;
# each one is downloaded only when the lookup raises LookupError.
for _resource_path, _package_name in (
    ('corpora/words.zip', 'words'),
    ('tokenizers/punkt', 'punkt'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)
|
|
|
|
|
from nltk.corpus import words |
|
|
|
|
|
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also sets ``torch.backends.cudnn.deterministic`` to ``True`` and
    ``torch.backends.cudnn.benchmark`` to ``False``.

    Args:
        seed: Value handed to ``random.seed``, ``np.random.seed`` and
            ``torch.manual_seed``. Defaults to 42.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
|
|
|
|
|
def build_transform(input_size=448):
    """Build the image preprocessing pipeline.

    The returned ``T.Compose`` applies, in order: conversion to RGB, a
    bicubic resize to ``(input_size, input_size)``, ``ToTensor`` and a
    per-channel ``Normalize``.

    Args:
        input_size: Side length of the square output. Defaults to 448.

    Returns:
        A ``torchvision.transforms.Compose`` instance.
    """
    # NOTE(review): these look like the usual ImageNet channel statistics —
    # confirm against the model this pipeline feeds.
    channel_mean = (0.485, 0.456, 0.406)
    channel_std = (0.229, 0.224, 0.225)
    target_size = (input_size, input_size)

    steps = [
        T.Lambda(lambda img: img.convert('RGB')),
        T.Resize(target_size, interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=channel_mean, std=channel_std),
    ]
    return T.Compose(steps)
|
|
|
|
|
def get_roi(image_path_or_obj, *roi):
    """Crop a region of interest out of an image.

    Args:
        image_path_or_obj: Either a filesystem path (``str`` or any
            ``os.PathLike`` such as ``pathlib.Path``) that is opened with
            PIL, or an object that already exposes ``convert`` (for
            example a loaded PIL image).
        *roi: Four fractions ``(x_start, y_start, x_end, y_end)``. The x
            values are multiplied by the image width and the y values by
            the image height, then truncated with ``int``. Values beyond
            the fourth are ignored.

    Returns:
        The result of ``crop`` on the RGB-converted image.

    Raises:
        IndexError: If fewer than four ROI values are supplied.
    """
    import os  # local import: only needed for the path-type check below

    if isinstance(image_path_or_obj, (str, os.PathLike)):
        # convert() returns a new image, so the file handle can be
        # released as soon as the conversion is done.
        with Image.open(image_path_or_obj) as opened:
            image = opened.convert('RGB')
    else:
        image = image_path_or_obj.convert('RGB')

    width, height = image.size

    left = int(width * roi[0])
    upper = int(height * roi[1])
    right = int(width * roi[2])
    lower = int(height * roi[3])

    return image.crop((left, upper, right, lower))
|
|
|
|
|
def clean_text(text):
    """Return *text* reduced to its ASCII letters and digits, lowercased.

    Every character outside ``a-z``, ``A-Z`` and ``0-9`` is removed before
    the result is stripped and lowercased.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9]', '', text)
    return alnum_only.strip().lower()
|
|
|
|
|
def are_strings_similar(str1, str2, max_distance=3, max_length_diff=2):
    """Decide whether two strings are close enough to count as the same.

    Args:
        str1: First string.
        str2: Second string.
        max_distance: Largest edit distance still treated as similar.
        max_length_diff: Largest allowed difference in length; beyond it
            the strings are rejected without computing the edit distance.

    Returns:
        ``True`` for equal strings, or when the length difference is at most
        ``max_length_diff`` and ``edit_distance(str1, str2)`` is at most
        ``max_distance``; ``False`` otherwise.
    """
    if str1 == str2:
        return True

    length_gap = abs(len(str1) - len(str2))
    return (
        length_gap <= max_length_diff
        and edit_distance(str1, str2) <= max_distance
    )
|
|
|
|
|
def blur_image(image, strength):
    """Return a Gaussian-blurred copy of *image* as a PIL image.

    Args:
        image: Input convertible with ``np.array`` (e.g. a PIL image).
        strength: Blur amount; it is multiplied by 50 and truncated to get
            the kernel size.

    Returns:
        A new PIL image built from the blurred array.
    """
    pixels = np.array(image)

    # `| 1` forces the kernel size to be odd; max() keeps it at least 1.
    kernel = max(1, int(strength * 50) | 1)

    blurred = cv2.GaussianBlur(pixels, (kernel, kernel), 0)
    return Image.fromarray(blurred)
|
|
|
|
|
def is_blank(text, limit=15):
    """Return ``True`` when *text* is shorter than *limit* characters."""
    return limit > len(text)
|
|
|
|
|
def string_similarity(a, b):
    """Return the case-insensitive ``SequenceMatcher`` ratio of *a* and *b*.

    Both strings are lowercased before comparison.
    """
    matcher = SequenceMatcher(None, a.lower(), b.lower())
    return matcher.ratio()
|
|
|
|
|
def find_similar_substring(text, keyword, threshold=0.9):
    """Check whether *keyword* occurs in *text*, exactly or approximately.

    Both inputs are lowercased. An exact substring hit returns ``True``
    immediately. Otherwise every run of consecutive whitespace-separated
    words in *text* that has as many words as *keyword* is compared with
    ``string_similarity``.

    Args:
        text: Text to search in.
        keyword: Word or phrase to look for.
        threshold: Minimum similarity for an approximate hit.

    Returns:
        ``True`` on an exact or approximate match, ``False`` otherwise.
    """
    haystack = text.lower()
    needle = keyword.lower()

    if needle in haystack:
        return True

    tokens = haystack.split()
    window = len(needle.split())

    # Lazily build each same-length word window; any() stops at the first hit.
    candidates = (
        ' '.join(tokens[start:start + window])
        for start in range(len(tokens) - window + 1)
    )
    return any(
        string_similarity(phrase, needle) >= threshold
        for phrase in candidates
    )
|
|
|
|
|
def destroy_text_roi(image, *roi_params):
    """Obscure a rectangular region with a heavy blur plus random noise.

    The work is done on an array copy made with ``np.array``, and a new PIL
    image is returned.

    Args:
        image: Input convertible with ``np.array`` (e.g. a PIL image). The
            array may be 2-D or have a trailing channel axis.
        *roi_params: Four fractions ``(x1, y1, x2, y2)``. The x values are
            multiplied by the image width and the y values by the image
            height, then truncated with ``int``.

    Returns:
        A PIL image in which the ROI has been blurred with a 75x75 Gaussian
        kernel and had noise in ``[0, 50)`` added to every channel. If the
        ROI covers no pixels, the image is returned unchanged.

    Raises:
        IndexError: If fewer than four ROI values are supplied.
    """
    image_np = np.array(image)

    # Only the first two axes are spatial; a channel axis may be absent.
    h, w = image_np.shape[:2]
    x1 = int(roi_params[0] * w)
    y1 = int(roi_params[1] * h)
    x2 = int(roi_params[2] * w)
    y2 = int(roi_params[3] * h)

    roi = image_np[y1:y2, x1:x2]
    if roi.size == 0:
        # Nothing to obscure, and cv2.GaussianBlur rejects empty input.
        return Image.fromarray(image_np)

    blurred_roi = cv2.GaussianBlur(roi, (75, 75), 0)

    # Shape the noise like the ROI itself so inputs with any channel count
    # work, not just 3-channel images.
    noise = np.random.randint(0, 50, blurred_roi.shape, dtype=np.uint8)

    # cv2.add saturates instead of wrapping around on overflow.
    noisy_blurred_roi = cv2.add(blurred_roi, noise)
    image_np[y1:y2, x1:x2] = noisy_blurred_roi
    return Image.fromarray(image_np)
|
|
|
|
|
def is_english(text):
    """Report whether *text* uses only characters from a fixed allow-list.

    Allowed characters are ASCII letters, the Devanagari digits U+0966 to
    U+096F, the character U+0930, whitespace, and the punctuation
    ``. , ! ? - ; : " ' ( )``. The empty string is accepted.

    NOTE(review): ASCII digits 0-9 are rejected while Devanagari digits are
    accepted — confirm this is intended for a function named ``is_english``.

    Returns:
        ``True`` if every character of *text* is allowed, else ``False``.
    """
    allowed_chars = r'^[a-zA-Z०-९\u0930\s\.,!?\-;:"\'()]*$'
    return re.match(allowed_chars, text) is not None
|
|
|
|
|
def is_valid_english(text):
    """Check that every word in *text* appears in the NLTK ``words`` corpus.

    Non-alphanumeric characters are treated as separators and the remaining
    tokens are lowercased before lookup. Text with no tokens (empty or only
    punctuation) yields ``True``.

    The vocabulary set is built from ``words.words()`` on the first call and
    cached on the function object as ``_vocabulary``, so later calls do not
    rebuild it.

    NOTE(review): tokens are lowercased but corpus entries are used as-is,
    so any entry containing uppercase letters can never match — confirm
    this is intended.

    Returns:
        ``True`` if all tokens are in the vocabulary, else ``False``.
    """
    vocabulary = getattr(is_valid_english, '_vocabulary', None)
    if vocabulary is None:
        vocabulary = frozenset(words.words())
        is_valid_english._vocabulary = vocabulary

    tokens = ''.join(c.lower() if c.isalnum() else ' ' for c in text).split()
    return all(token in vocabulary for token in tokens)
|
|
|