import spaces  # Hugging Face Spaces ZeroGPU helper; keep this import at the top
import torch
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText

device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_PATH = "zai-org/GLM-OCR"

processor = AutoProcessor.from_pretrained(MODEL_PATH)
# device_map="auto" lets accelerate place the weights, so the model should not
# also be moved with .to(device): calling .to() on an accelerate-dispatched
# model can raise an error.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_PATH,
    torch_dtype="auto",
    device_map="auto",
)
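# A minimal single-device alternative (a sketch, assuming the whole model fits
# on one GPU; not needed when device_map="auto" is used as above):
#
# model = AutoModelForImageTextToText.from_pretrained(
#     MODEL_PATH, torch_dtype="auto"
# ).to(device)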


@spaces.GPU
def read_img(img):
    """
    Takes in an image file and returns the text recognized from the image.

    Args:
        img: the input image file

    Returns:
        output_text: a string of the text recognized from the image
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "url": img},
                {"type": "text", "text": "Text Recognition:"},
            ],
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # Some processors return token_type_ids, which generate() does not accept.
    inputs.pop("token_type_ids", None)
    generated_ids = model.generate(**inputs, max_new_tokens=8192)
    # Decode only the newly generated tokens (everything after the prompt) and
    # skip special tokens so the textbox shows clean text.
    output_text = processor.decode(
        generated_ids[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True,
    )

    return output_text
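# A quick local smoke test that bypasses the Gradio UI (the file name is
# hypothetical; @spaces.GPU is a no-op outside Spaces, so this should also run
# locally):
#
# if __name__ == "__main__":
#     print(read_img("sample_page.png"))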

with gr.Blocks() as imgsmiles:
    top = gr.Markdown("# OCR with ZAI GLM")

    # Shown to users/agents; not currently wired into any event handler.
    agent_flag_choice = gr.Radio(
        choices=["True", "False"],
        label="Are you an Agent?",
        interactive=True,
        value="False",
        scale=2,
    )
    with gr.Row():
        img_in = gr.Image(type="filepath")
        text_out = gr.Textbox(lines=2, label="Text Output")

    submit_button = gr.Button("Submit")
    clear_button = gr.ClearButton([img_in, text_out], value="Clear")

    submit_button.click(read_img, [img_in], [text_out])
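# mcp_server=True also exposes read_img as an MCP tool; its docstring supplies
# the tool description (requires the gradio[mcp] extra to be installed).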
imgsmiles.launch(mcp_server=True)