OCR / app.py
cafierom's picture
Update app.py
533ddd5 verified
import spaces
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
import gradio as gr
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint identifier shared by the processor and the model.
MODEL_PATH = "zai-org/GLM-OCR"

# Both objects are created once at import time and reused by every request.
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForImageTextToText.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    torch_dtype="auto",
    device_map="auto",
)
model = model.to(device)
@spaces.GPU
def read_img(img):
    """
    Takes in an image file and returns the text recognized from the image.
    An error is reported to the caller when no image is supplied.

    Args:
        img: the input image file (a path to the image on disk)
    Returns:
        output_text: a string of the text recognized from the image
    """
    if not img:
        # The image component yields no value when nothing has been uploaded;
        # fail with a readable message instead of an opaque processor error.
        raise gr.Error("Please provide an image before submitting.")

    # Single-turn chat: the image followed by the recognition prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "url": img},
                {"type": "text", "text": "Text Recognition:"},
            ],
        }
    ]

    # Place the inputs on whatever device the model actually lives on, so the
    # two cannot drift apart when device_map chooses the placement.
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # token_type_ids are not forwarded to generate(); drop them if present.
    inputs.pop("token_type_ids", None)

    generated_ids = model.generate(**inputs, max_new_tokens=8192)

    # The generated sequence starts with the prompt; keep only the new tokens.
    prompt_length = inputs["input_ids"].shape[1]
    output_text = processor.decode(
        generated_ids[0][prompt_length:],
        skip_special_tokens=False,  # special tokens are kept in the returned text
    )
    return output_text
# Build the web UI: a title, an (currently unwired) agent selector, the image
# input next to the recognized-text output, and Submit / Clear buttons.
with gr.Blocks() as imgsmiles:
    top = gr.Markdown("\n# OCR with ZAI GLM\n")

    # This selector is displayed but is not passed to any handler below.
    agent_flag_choice = gr.Radio(
        label="Are you an Agent?",
        choices=['True', 'False'],
        value='False',
        interactive=True,
        scale=2,
    )

    with gr.Row():
        # The image is handed to the handler as a path on disk.
        inputs = gr.Image(type="filepath")
        text_out = gr.Textbox(label="Text Output", lines=2)

    submit_button = gr.Button("Submit")
    clear_button = gr.ClearButton(value="Clear", components=[inputs, text_out])

    # Agent-only entry point, currently disabled:
    # agent_button = gr.Button("Agent use only")
    # agent_button.click(agent_read_img, [agent_flag_choice, inputs], [text_out, None])

    submit_button.click(fn=read_img, inputs=[inputs], outputs=[text_out])

# Start the app and also expose it as an MCP server.
imgsmiles.launch(mcp_server=True)