VilanOCR-Urdu-English-Chinese

Runtime error

App Files Files Community

VilanOCR-Urdu-English-Chinese / app.py

musadac

Update app.py

a4172ab over 2 years ago

raw

history blame contribute delete

3.7 kB

	import streamlit as st
	import torch
	from PIL import Image
	from huggingface_hub import hf_hub_download
	from transformers import VisionEncoderDecoderModel


	import warnings
	from contextlib import contextmanager
	from transformers import MBartTokenizer, ViTImageProcessor, XLMRobertaTokenizer
	from transformers import ProcessorMixin


	class CustomOCRProcessor(ProcessorMixin):
	attributes = ["image_processor", "tokenizer"]
	image_processor_class = "AutoImageProcessor"
	tokenizer_class = "AutoTokenizer"

	def __init__(self, image_processor=None, tokenizer=None, **kwargs):
	if "feature_extractor" in kwargs:
	warnings.warn(
	"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
	" instead.",
	FutureWarning,
	)
	feature_extractor = kwargs.pop("feature_extractor")

	image_processor = image_processor if image_processor is not None else feature_extractor
	if image_processor is None:
	raise ValueError("You need to specify an `image_processor`.")
	if tokenizer is None:
	raise ValueError("You need to specify a `tokenizer`.")

	super().__init__(image_processor, tokenizer)
	self.current_processor = self.image_processor
	self._in_target_context_manager = False

	def __call__(self, args, *kwargs):
	# For backward compatibility
	if self._in_target_context_manager:
	return self.current_processor(args, *kwargs)

	images = kwargs.pop("images", None)
	text = kwargs.pop("text", None)
	if len(args) > 0:
	images = args[0]
	args = args[1:]

	if images is None and text is None:
	raise ValueError("You need to specify either an `images` or `text` input to process.")

	if images is not None:
	inputs = self.image_processor(images, args, *kwargs)
	if text is not None:
	encodings = self.tokenizer(text, **kwargs)

	if text is None:
	return inputs
	elif images is None:
	return encodings
	else:
	inputs["labels"] = encodings["input_ids"]
	return inputs

	def batch_decode(self, args, *kwargs):
	return self.tokenizer.batch_decode(args, *kwargs)

	def decode(self, args, *kwargs):
	return self.tokenizer.decode(args, *kwargs)


	image_processor = ViTImageProcessor.from_pretrained(
	'microsoft/swin-base-patch4-window12-384-in22k'
	)
	tokenizer = MBartTokenizer.from_pretrained(
	'facebook/mbart-large-50'
	)
	processortext2 = CustomOCRProcessor(image_processor,tokenizer)

	import os
	huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
	model = {}
	model['single-urdu'] = "musadac/vilanocr-single-urdu"
	model['multi-urdu'] = "musadac/ViLanOCR"
	model['medical'] = "musadac/vilanocr-multi-medical"
	model['chinese'] = "musadac/vilanocr-single-chinese"

	st.title("Image OCR with musadac/vilanocr")
	model_name = st.selectbox("Choose an OCR model", ["single-urdu", "multi-urdu", "medical","chinese" ])
	uploaded_file = st.file_uploader("Choose an image", type=["jpg", "jpeg", "png"])
	if uploaded_file is not None:
	model2 = VisionEncoderDecoderModel.from_pretrained(model[model_name], use_auth_token=huggingface_token)
	img = Image.open(uploaded_file).convert("RGB")
	pixel_values = processortext2(img.convert("RGB"), return_tensors="pt").pixel_values

	with torch.no_grad():
	generated_ids = model2.generate(pixel_values)

	result = processortext2.batch_decode(generated_ids, skip_special_tokens=True)[0]
	st.write("OCR Result:")
	st.write(result)