import gradio as gr
from PIL import Image
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from peft import PeftModel
import gc
import os

# Force synchronous CUDA kernel launches so errors surface at the failing call (debugging aid)
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# --- Configuration ---
base_model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
adapter_id = "joeee2321512/Qwen2.5-VL-3B-Instruct-finetuned"

# --- Model Loading ---
print("Loading base model...")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    token=os.getenv("token_HF")
)

print("Loading processor...")
processor = AutoProcessor.from_pretrained(
    base_model_id,
    token=os.getenv("token_HF")
)
processor.tokenizer.padding_side = "right"

print("Loading and applying adapter...")
model = PeftModel.from_pretrained(model, adapter_id)
print("Model loaded successfully!")


# --- The Inference Function ---
def perform_ocr_on_image(image_input: Image.Image) -> str:
    """
    Core function that Gradio will call.
    Takes a PIL image and returns the transcribed text string.
    """
    if image_input is None:
        return "Please upload an image."

    try:
        # Format the prompt using the chat template
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image_input},
                    {"type": "text", "text": (
                        "Analyze the input image and detect all Arabic text. "
                        "Output only the extracted text—verbatim and in its original script—"
                        "without any added commentary, translation, punctuation or formatting. "
                        "Present each line of text as plain UTF-8 strings, with no extra characters or words."
                    )},
                ],
            }
        ]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # Prepare inputs for the model
        inputs = processor(text=text, images=image_input, return_tensors="pt").to(model.device)

        # Generate prediction
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=512)

        # Decode the output
        full_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # --- FIX: Post-process the response to remove the prompt ---
        # The model's actual output starts after the "assistant" marker.
        # We split the full response by this marker and take the last part.
        parts = full_response.split("assistant")
        if len(parts) > 1:
            # Take the last part and strip leading/trailing whitespace
            cleaned_response = parts[-1].strip()
        else:
            # If the marker isn't found, return the full response as a fallback
            cleaned_response = full_response
        # --- END OF FIX ---

        # Clean up memory
        gc.collect()
        torch.cuda.empty_cache()

        return cleaned_response

    except Exception as e:
        print(f"An error occurred during inference: {e}")
        return f"An error occurred: {str(e)}"


# --- Create and Launch the Gradio Interface ---
demo = gr.Interface(
    fn=perform_ocr_on_image,
    inputs=gr.Image(type="pil", label="Upload Arabic Document Image"),
    outputs=gr.Textbox(label="Transcription", lines=10, show_copy_button=True),
    title="Basira: Fine-Tuned Qwen-VL for Arabic OCR",
    description="A demo for the Qwen2.5-VL (3B) model, fine-tuned for enhanced Arabic OCR. Upload an image to see the transcription.",
    allow_flagging="never"
)

if __name__ == "__main__":
    demo.launch()
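
# Optional sanity check: a minimal sketch for exercising the inference function
# directly, without launching the Gradio UI. The image path below is a
# hypothetical placeholder; replace it with a real Arabic document image.
#
#   from PIL import Image
#   print(perform_ocr_on_image(Image.open("sample_arabic_page.png")))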