import gradio as gr
from PIL import Image
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from peft import PeftModel
import gc
import os

# Force synchronous CUDA kernel launches so errors surface at the failing call (debugging aid)
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# --- Configuration ---
base_model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
adapter_id = "joeee2321512/Qwen2.5-VL-3B-Instruct-finetuned"

# --- Model Loading ---
print("Loading base model...")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    token=os.getenv("token_HF")
)

print("Loading processor...")
processor = AutoProcessor.from_pretrained(
    base_model_id,
    token=os.getenv("token_HF")
)
processor.tokenizer.padding_side = "right"

print("Loading and applying adapter...")
model = PeftModel.from_pretrained(model, adapter_id)
print("Model loaded successfully!")


# --- The Inference Function ---
def perform_ocr_on_image(image_input: Image.Image) -> str:
    """
    Core function that Gradio will call.
    Takes a PIL image and returns the transcribed text string.
    """
    if image_input is None:
        return "Please upload an image."

    try:
        # Format the prompt using the chat template
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image_input},
                    {"type": "text", "text": (
                        "Analyze the input image and detect all Arabic text. "
                        "Output only the extracted text—verbatim and in its original script—"
                        "without any added commentary, translation, punctuation or formatting. "
                        "Present each line of text as plain UTF-8 strings, with no extra characters or words."
                    )},
                ],
            }
        ]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # Prepare inputs for the model
        inputs = processor(text=text, images=image_input, return_tensors="pt").to(model.device)

        # Generate prediction
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=512)

        # Decode the output
        full_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # --- FIX: Post-process the response to remove the prompt ---
        # The model's actual output starts after the "assistant" marker.
        # We split the full response by this marker and take the last part.
        parts = full_response.split("assistant")
        if len(parts) > 1:
            # Take the last part and strip leading/trailing whitespace
            cleaned_response = parts[-1].strip()
        else:
            # If the marker isn't found, return the full response as a fallback
            cleaned_response = full_response
        # --- END OF FIX ---

        # Clean up memory
        gc.collect()
        torch.cuda.empty_cache()

        return cleaned_response

    except Exception as e:
        print(f"An error occurred during inference: {e}")
        return f"An error occurred: {str(e)}"


# --- Create and Launch the Gradio Interface ---
demo = gr.Interface(
    fn=perform_ocr_on_image,
    inputs=gr.Image(type="pil", label="Upload Arabic Document Image"),
    outputs=gr.Textbox(label="Transcription", lines=10, show_copy_button=True),
    title="Basira: Fine-Tuned Qwen-VL for Arabic OCR",
    description="A demo for the Qwen2.5-VL (3B) model, fine-tuned for enhanced Arabic OCR. Upload an image to see the transcription.",
    allow_flagging="never"
)

if __name__ == "__main__":
    demo.launch()
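
# Optional sanity check: a minimal sketch for exercising the inference function
# directly, without launching the Gradio UI. The image path below is a
# hypothetical placeholder; replace it with a real Arabic document image.
#
#   from PIL import Image
#   print(perform_ocr_on_image(Image.open("sample_arabic_page.png")))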