Spaces:

seanpedrickcase
/

document_redaction_vlm

Running on Zero

File size: 31,432 Bytes

d864d45

import os
import sys
from threading import Thread

import gradio as gr
import spaces
from PIL import Image

from tools.config import (
    LOAD_PADDLE_AT_STARTUP,
    MAX_NEW_TOKENS,
    MAX_SPACES_GPU_RUN_TIME,
    PADDLE_DET_DB_UNCLIP_RATIO,
    PADDLE_FONT_PATH,
    PADDLE_MODEL_PATH,
    PADDLE_USE_TEXTLINE_ORIENTATION,
    QUANTISE_VLM_MODELS,
    REPORT_VLM_OUTPUTS_TO_GUI,
    SHOW_VLM_MODEL_OPTIONS,
    USE_FLASH_ATTENTION,
    VLM_DEFAULT_DO_SAMPLE,
    VLM_DEFAULT_MIN_P,
    VLM_DEFAULT_PRESENCE_PENALTY,
    VLM_DEFAULT_REPETITION_PENALTY,
    VLM_DEFAULT_TEMPERATURE,
    VLM_DEFAULT_TOP_K,
    VLM_DEFAULT_TOP_P,
    VLM_MAX_IMAGE_SIZE,
    VLM_MIN_IMAGE_SIZE,
    VLM_SEED,
)
from tools.helper_functions import get_system_font_path

# Optional PaddleOCR warm-up, executed once at import time when
# LOAD_PADDLE_AT_STARTUP is True. On success the name `PaddleOCR` is bound to the
# imported class; when the import (or a DLL-related initialisation) fails it is
# bound to None. When LOAD_PADDLE_AT_STARTUP is not True, `PaddleOCR` is not bound
# by this block at all.
if LOAD_PADDLE_AT_STARTUP is True:
    # Set PaddleOCR environment variables BEFORE importing PaddleOCR
    # This ensures fonts are configured before the package loads

    # Set PaddleOCR model directory environment variable (only if specified).
    if PADDLE_MODEL_PATH and PADDLE_MODEL_PATH.strip():
        os.environ["PADDLEOCR_MODEL_DIR"] = PADDLE_MODEL_PATH
        print(f"Setting PaddleOCR model path to: {PADDLE_MODEL_PATH}")
    else:
        print("Using default PaddleOCR model storage location")

    # Set PaddleOCR font path to use system fonts instead of downloading simfang.ttf/PingFang-SC-Regular.ttf
    # This MUST be set before importing PaddleOCR to prevent font downloads
    if (
        PADDLE_FONT_PATH
        and PADDLE_FONT_PATH.strip()
        and os.path.exists(PADDLE_FONT_PATH)
    ):
        os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = PADDLE_FONT_PATH
        print(f"Setting PaddleOCR font path to configured font: {PADDLE_FONT_PATH}")
    else:
        # No usable configured font: fall back to whatever the helper can find.
        system_font_path = get_system_font_path()
        if system_font_path:
            os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = system_font_path
            print(f"Setting PaddleOCR font path to system font: {system_font_path}")
        else:
            print(
                "Warning: No suitable system font found. PaddleOCR may download default fonts."
            )

    try:
        from paddleocr import PaddleOCR

        print("PaddleOCR imported successfully")

        # Default paddle configuration used for the start-up instantiation.
        paddle_kwargs = {
            "det_db_unclip_ratio": PADDLE_DET_DB_UNCLIP_RATIO,
            "use_textline_orientation": PADDLE_USE_TEXTLINE_ORIENTATION,
            "use_doc_orientation_classify": False,
            "use_doc_unwarping": False,
            "lang": "en",
        }

        try:
            # The instance is discarded; only the side effects of constructing it
            # at start-up are wanted here.
            PaddleOCR(**paddle_kwargs)
        except Exception as e:
            _paddle_error_text = str(e)
            _paddle_error_text_lower = _paddle_error_text.lower()
            # Handle DLL loading errors (common on Windows with GPU version)
            if (
                "WinError 127" in _paddle_error_text
                or "could not be found" in _paddle_error_text_lower
                or "dll" in _paddle_error_text_lower
            ):
                print(
                    f"Warning: GPU initialization failed (likely missing CUDA/cuDNN dependencies): {e}"
                )
                print("PaddleOCR will not be available. To fix GPU issues:")
                print("1. Install Visual C++ Redistributables (latest version)")
                print("2. Ensure CUDA runtime libraries are in your PATH")
                print(
                    "3. Or reinstall paddlepaddle CPU version: pip install paddlepaddle"
                )
                # Converted to ImportError so the outer handler below disables
                # PaddleOCR instead of aborting the import of this module.
                # NOTE(review): that handler then also prints the generic
                # "PaddleOCR not found" message, which is misleading in this case.
                raise ImportError(
                    f"Error initializing PaddleOCR: {e}. Please install it using 'pip install paddleocr paddlepaddle' in your python environment and retry."
                ) from e
            # Anything else is unexpected: re-raise with the original traceback.
            raise

    except ImportError:
        PaddleOCR = None
        print(
            "PaddleOCR not found. Please install it using 'pip install paddleocr paddlepaddle' in your python environment and retry."
        )


# Module-level generation defaults. They are bound unconditionally so that they
# can always be imported, and may be rebound with model-specific values inside
# the SHOW_VLM_MODEL_OPTIONS block when that block runs.
# A config value of None is passed through unchanged and means "not configured".
model_default_prompt = "Read all the text in the image."
model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
model_default_top_p = VLM_DEFAULT_TOP_P
model_default_min_p = VLM_DEFAULT_MIN_P
model_default_top_k = VLM_DEFAULT_TOP_K
model_default_temperature = VLM_DEFAULT_TEMPERATURE
model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
# The token budget is the only default coerced to int here.
model_default_max_new_tokens = int(MAX_NEW_TOKENS)
model_default_seed = VLM_SEED


# Load the configured vision-language model (VLM) once, at import time.
# The whole block is skipped unless SHOW_VLM_MODEL_OPTIONS is True; in that case
# `torch`, `device`, `processor`, `model`, `TextIteratorStreamer` and
# `model_supports_presence_penalty` are never bound by this module.
if SHOW_VLM_MODEL_OPTIONS is True:
    import torch
    from huggingface_hub import snapshot_download
    from transformers import (
        AutoModelForCausalLM,
        AutoProcessor,
        BitsAndBytesConfig,
        Qwen2_5_VLForConditionalGeneration,
        Qwen3VLForConditionalGeneration,
        TextIteratorStreamer,
    )

    from tools.config import (
        MAX_NEW_TOKENS,
        MODEL_CACHE_PATH,
        QUANTISE_VLM_MODELS,
        SELECTED_MODEL,
        USE_FLASH_ATTENTION,
        VLM_DEFAULT_DO_SAMPLE,
        VLM_DEFAULT_MIN_P,
        VLM_DEFAULT_PRESENCE_PENALTY,
        VLM_DEFAULT_REPETITION_PENALTY,
        VLM_DEFAULT_TEMPERATURE,
        VLM_DEFAULT_TOP_K,
        VLM_DEFAULT_TOP_P,
        VLM_SEED,
    )

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Diagnostic output describing the torch / CUDA environment.
    print("torch.__version__ =", torch.__version__)
    print("torch.version.cuda =", torch.version.cuda)
    print("cuda available:", torch.cuda.is_available())
    print("cuda device count:", torch.cuda.device_count())
    if torch.cuda.is_available():
        print("current device:", torch.cuda.current_device())
        print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

    print("Using device:", device)

    CACHE_PATH = MODEL_CACHE_PATH
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(CACHE_PATH, exist_ok=True)

    # Initialize model and processor variables
    processor = None
    model = None

    # Generation defaults start from the config values (None means "not
    # configured"). A model branch below may replace them with model-specific
    # values, and any non-None config value is re-applied after the branches.
    model_default_prompt = """Read all the text in the image."""
    model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
    model_default_top_p = VLM_DEFAULT_TOP_P
    model_default_min_p = VLM_DEFAULT_MIN_P
    model_default_top_k = VLM_DEFAULT_TOP_K
    model_default_temperature = VLM_DEFAULT_TEMPERATURE
    model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
    model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
    # Coerced to int once here. The model branches deliberately do not rebind
    # this name to the raw MAX_NEW_TOKENS value, so the coercion is never lost.
    model_default_max_new_tokens = int(MAX_NEW_TOKENS)
    # Whether presence_penalty is forwarded to generate(); no branch enables it.
    model_supports_presence_penalty = False
    model_default_seed = VLM_SEED

    attn_implementation = (
        "flash_attention_2" if USE_FLASH_ATTENTION is True else "eager"
    )

    # Setup quantisation config if enabled
    quantization_config = None
    if QUANTISE_VLM_MODELS is True:
        if torch.cuda.is_available():
            try:
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4",
                )
                print("4-bit quantization enabled using bitsandbytes")
            except Exception as e:
                # Best effort: carry on unquantised rather than abort start-up.
                print(f"Warning: Could not setup bitsandbytes quantization: {e}")
                print("Falling back to loading models without quantization")
                quantization_config = None
        else:
            print(
                "Warning: 4-bit quantisation requires CUDA, but CUDA is not available."
            )
            print("Falling back to loading models without quantisation")

    print(f"Loading vision model: {SELECTED_MODEL}")

    # The Qwen3-VL checkpoints share one loading recipe and one set of defaults.
    _QWEN3_VL_MODELS = (
        "Qwen3-VL-2B-Instruct",
        "Qwen3-VL-4B-Instruct",
        "Qwen3-VL-8B-Instruct",
        "Qwen3-VL-30B-A3B-Instruct",
    )

    # Load only the selected model based on configuration
    if SELECTED_MODEL == "Nanonets-OCR2-3B":
        MODEL_ID = "nanonets/Nanonets-OCR2-3B"
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        load_kwargs = {
            "trust_remote_code": True,
        }
        if quantization_config is not None:
            load_kwargs["quantization_config"] = quantization_config
            load_kwargs["device_map"] = "auto"
        else:
            load_kwargs["torch_dtype"] = torch.float16
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            MODEL_ID, **load_kwargs
        ).eval()
        # Without quantisation no device_map is passed, so move the model manually.
        if quantization_config is None:
            model = model.to(device)

        model_default_prompt = """Extract the text from the above document as if you were reading it naturally."""

    elif SELECTED_MODEL == "Dots.OCR":
        # Download and patch Dots.OCR model
        model_path_d_local = snapshot_download(
            repo_id="rednote-hilab/dots.ocr",
            local_dir=os.path.join(CACHE_PATH, "dots.ocr"),
            max_workers=20,
            local_dir_use_symlinks=False,
        )

        config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")

        if os.path.exists(config_file_path):
            # Explicit encoding so the patch does not depend on the platform's
            # default locale encoding.
            with open(config_file_path, "r", encoding="utf-8") as f:
                input_code = f.read()

            lines = input_code.splitlines()
            # Patch only when the class is present and no "attributes = " line
            # exists yet, so running this again leaves the file unchanged.
            if "class DotsVLProcessor" in input_code and not any(
                "attributes = " in line for line in lines
            ):
                output_lines = []
                for line in lines:
                    output_lines.append(line)
                    if line.strip().startswith("class DotsVLProcessor"):
                        # Insert the class attribute directly below the class line.
                        output_lines.append(
                            '    attributes = ["image_processor", "tokenizer"]'
                        )

                with open(config_file_path, "w", encoding="utf-8") as f:
                    f.write("\n".join(output_lines))
                print("Patched configuration_dots.py successfully.")

        # Make the downloaded model directory importable, without adding
        # duplicate entries to sys.path.
        if model_path_d_local not in sys.path:
            sys.path.append(model_path_d_local)

        MODEL_ID = model_path_d_local
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        load_kwargs = {
            "attn_implementation": attn_implementation,
            "device_map": "auto",
            "trust_remote_code": True,
        }
        if quantization_config is not None:
            load_kwargs["quantization_config"] = quantization_config
        else:
            load_kwargs["torch_dtype"] = torch.bfloat16
        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs).eval()

        model_default_prompt = """Extract the text content from this image."""

    elif SELECTED_MODEL in _QWEN3_VL_MODELS:
        # For these models the hub repository id is "Qwen/" + the config name.
        MODEL_ID = f"Qwen/{SELECTED_MODEL}"
        if SELECTED_MODEL == "Qwen3-VL-30B-A3B-Instruct":
            # Imported only for this checkpoint, which uses the Moe model class.
            from transformers import Qwen3VLMoeForConditionalGeneration

            qwen3_model_class = Qwen3VLMoeForConditionalGeneration
        else:
            qwen3_model_class = Qwen3VLForConditionalGeneration

        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        load_kwargs = {
            "device_map": "auto",
            "trust_remote_code": True,
        }
        # The 2B checkpoint has always been loaded without an explicit
        # attn_implementation; that behaviour is preserved here.
        # NOTE(review): confirm whether the omission is intentional.
        if SELECTED_MODEL != "Qwen3-VL-2B-Instruct":
            load_kwargs["attn_implementation"] = attn_implementation

        # budget for image processor, since the compression ratio is 32 for Qwen3-VL, we can set the number of visual tokens of a single image to 256-1280
        # processor.image_processor.size = {
        #     "longest_edge": VLM_MAX_IMAGE_SIZE,
        #     "shortest_edge": VLM_MIN_IMAGE_SIZE,
        # }

        if quantization_config is not None:
            load_kwargs["quantization_config"] = quantization_config
        else:
            load_kwargs["dtype"] = "auto"
        model = qwen3_model_class.from_pretrained(MODEL_ID, **load_kwargs).eval()

        model_default_prompt = """Read all the text in the image."""
        model_default_do_sample = False
        model_default_top_p = 0.8
        model_default_min_p = 0.0
        model_default_top_k = 20
        model_default_temperature = 0.7
        model_default_repetition_penalty = 1.0
        model_default_presence_penalty = 1.5
        model_supports_presence_penalty = (
            False  # I found that this doesn't work when using transformers
        )

    elif SELECTED_MODEL == "PaddleOCR-VL":
        MODEL_ID = "PaddlePaddle/PaddleOCR-VL"
        load_kwargs = {
            "trust_remote_code": True,
        }
        if quantization_config is not None:
            load_kwargs["quantization_config"] = quantization_config
            load_kwargs["device_map"] = "auto"
        else:
            load_kwargs["torch_dtype"] = torch.bfloat16
        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs).eval()
        # Without quantisation no device_map is passed, so move the model manually.
        if quantization_config is None:
            model = model.to(device)
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

        model_default_prompt = """OCR:"""

    elif SELECTED_MODEL == "None":
        # Explicit "no model" configuration: leave both unset.
        model = None
        processor = None

    else:
        raise ValueError(
            f"Invalid model selected: {SELECTED_MODEL}. Valid options are: Nanonets-OCR2-3B, Dots.OCR, Qwen3-VL-2B-Instruct, Qwen3-VL-4B-Instruct, Qwen3-VL-8B-Instruct, Qwen3-VL-30B-A3B-Instruct, PaddleOCR-VL, None"
        )

    # Override model defaults with user-provided config values if they are set
    # Priority: user config value > model default
    if VLM_DEFAULT_DO_SAMPLE is not None:
        model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
    if VLM_DEFAULT_TOP_P is not None:
        model_default_top_p = VLM_DEFAULT_TOP_P
    if VLM_DEFAULT_MIN_P is not None:
        model_default_min_p = VLM_DEFAULT_MIN_P
    if VLM_DEFAULT_TOP_K is not None:
        model_default_top_k = VLM_DEFAULT_TOP_K
    if VLM_DEFAULT_TEMPERATURE is not None:
        model_default_temperature = VLM_DEFAULT_TEMPERATURE
    if VLM_DEFAULT_REPETITION_PENALTY is not None:
        model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
    if VLM_DEFAULT_PRESENCE_PENALTY is not None:
        model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
    if VLM_SEED is not None:
        model_default_seed = VLM_SEED

    if model is None:
        print("No VLM model loaded (SELECTED_MODEL is 'None')")
    else:
        print(f"Successfully loaded {SELECTED_MODEL}")


def _first_not_none(*candidates):
    """Return the first candidate that is not None, or None if all are None."""
    for candidate in candidates:
        if candidate is not None:
            return candidate
    return None


@spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
def extract_text_from_image_vlm(
    text: str,
    image: Image.Image,
    max_new_tokens: int = None,
    temperature: float = None,
    top_p: float = None,
    min_p: float = None,
    top_k: int = None,
    repetition_penalty: float = None,
    do_sample: bool = None,
    presence_penalty: float = None,
    seed: int = None,
    model_default_prompt: str = None,
):
    """
    Generate a text response from the configured vision model for one image.

    Generated text is printed to the console as it streams in and, when
    REPORT_VLM_OUTPUTS_TO_GUI is truthy, every completed non-empty line is also
    shown with ``gr.Info``. The complete text is returned only at the end.

    Each generation parameter is resolved with the priority: function argument
    (if not None) > module-level model default (which may already include a
    config override) > hard-coded fallback.

    Args:
        text (str): The text prompt to send to the vision model. If empty or
            whitespace-only, the default prompt is used (see
            ``model_default_prompt``).
        image (Image.Image): The PIL Image to process. If None, a message
            asking for an upload is returned and the model is not run.
        max_new_tokens (int, optional): Maximum number of new tokens to generate.
            Falls back to the module-level default, then MAX_NEW_TOKENS from config.
        temperature (float, optional): Sampling temperature for generation.
            Falls back to the module-level default, then 0.1.
        top_p (float, optional): Nucleus sampling parameter (top-p).
            Falls back to the module-level default, then 0.8.
        min_p (float, optional): Minimum probability threshold for token sampling.
            Falls back to the module-level default, then 0.0.
        top_k (int, optional): Top-k sampling parameter.
            Falls back to the module-level default, then 20.
        repetition_penalty (float, optional): Penalty for token repetition.
            Falls back to the module-level default, then 1.0.
        do_sample (bool, optional): If True, sampling is enabled; if False,
            sampling is disabled. Falls back to the module-level default,
            then True.
        presence_penalty (float, optional): Penalty for token presence. It is
            only forwarded to ``generate`` when the module-level flag
            ``model_supports_presence_penalty`` is truthy; this module sets that
            flag to False for every model it loads.
        seed (int, optional): Random seed. Falls back to the module-level
            default seed; if the result is None no seed is set.
        model_default_prompt (str, optional): Prompt to use when ``text`` is
            empty. If None, the module-level ``model_default_prompt`` set for
            the loaded model is used, then "Read all the text in the image.".

    Returns:
        str: The complete generated text with any "<|im_end|>" marker removed,
        or "Please upload an image." when ``image`` is None.

    Raises:
        RuntimeError: If no model or processor has been loaded by this module.
        Exception: Any exception raised by ``model.generate`` in the worker
            thread is re-raised here once streaming has stopped.
    """
    if image is None:
        return "Please upload an image."

    # `processor` and `model` are only bound when the SHOW_VLM_MODEL_OPTIONS
    # block ran, and are None when SELECTED_MODEL is "None". Fail with a clear
    # message instead of a NameError / AttributeError further down.
    vlm_processor = globals().get("processor")
    vlm_model = globals().get("model")
    if vlm_processor is None or vlm_model is None:
        raise RuntimeError(
            "No vision model is loaded. Set SHOW_VLM_MODEL_OPTIONS to True and "
            "select a SELECTED_MODEL other than 'None' before calling "
            "extract_text_from_image_vlm."
        )

    # Text/prompt handling. The `model_default_prompt` parameter shadows the
    # module-level variable of the same name, so the model-specific prompt has to
    # be looked up through globals() to be usable as a fallback.
    if text and text.strip():
        actual_text = text
    else:
        actual_text = _first_not_none(
            model_default_prompt,
            globals().get("model_default_prompt"),
            "Read all the text in the image.",  # General default
        )

    # Priority order: function argument (if not None) > model default > general default
    actual_max_new_tokens = _first_not_none(
        max_new_tokens, model_default_max_new_tokens, MAX_NEW_TOKENS
    )
    actual_temperature = _first_not_none(temperature, model_default_temperature, 0.1)
    actual_top_p = _first_not_none(top_p, model_default_top_p, 0.8)
    actual_min_p = _first_not_none(min_p, model_default_min_p, 0.0)
    actual_top_k = _first_not_none(top_k, model_default_top_k, 20)
    actual_repetition_penalty = _first_not_none(
        repetition_penalty, model_default_repetition_penalty, 1.0
    )
    actual_do_sample = _first_not_none(do_sample, model_default_do_sample, True)
    # These two have no general default: None means "leave unset".
    actual_presence_penalty = _first_not_none(
        presence_penalty, model_default_presence_penalty
    )
    actual_seed = _first_not_none(seed, model_default_seed)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": actual_text},
            ],
        }
    ]
    prompt_full = vlm_processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = vlm_processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
        min_pixels=VLM_MIN_IMAGE_SIZE,
        max_pixels=VLM_MAX_IMAGE_SIZE,
    ).to(device)

    streamer = TextIteratorStreamer(
        vlm_processor, skip_prompt=True, skip_special_tokens=True
    )

    # Set random seed if specified
    if actual_seed is not None:
        torch.manual_seed(actual_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(actual_seed)

    # Build generation kwargs with resolved parameters
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": actual_max_new_tokens,
        "do_sample": actual_do_sample,
        "temperature": actual_temperature,
        "top_p": actual_top_p,
        "min_p": actual_min_p,
        "top_k": actual_top_k,
        "repetition_penalty": actual_repetition_penalty,
    }

    # Add presence_penalty only if it's set and the model is flagged as supporting it
    if actual_presence_penalty is not None and model_supports_presence_penalty:
        generation_kwargs["presence_penalty"] = actual_presence_penalty

    generation_errors = []

    def _run_generation():
        # Runs in the worker thread. If generate() fails, record the error and
        # signal end-of-stream so the consumer loop below stops instead of
        # waiting forever for text that will never arrive.
        try:
            vlm_model.generate(**generation_kwargs)
        except Exception as exc:
            generation_errors.append(exc)
            streamer.end()

    thread = Thread(target=_run_generation)
    thread.start()

    buffer = ""
    line_buffer = ""  # Accumulate text for the current line
    for new_text in streamer:
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")
        line_buffer += new_text

        # Print to console as it streams
        print(new_text, end="", flush=True)

        # If we hit a newline, report the entire accumulated line(s) to GUI
        if REPORT_VLM_OUTPUTS_TO_GUI and "\n" in new_text:
            # Everything before the last newline is complete; the remainder is
            # kept as the start of the next line.
            *completed_lines, line_buffer = line_buffer.split("\n")
            for line in completed_lines:
                if line.strip():  # Only report non-empty lines
                    gr.Info(line, duration=2)

    thread.join()

    # Print final newline after streaming is complete
    print()

    # Surface a failure from the worker thread rather than silently returning
    # whatever partial text was produced.
    if generation_errors:
        raise generation_errors[0]

    # Return the complete text only at the end
    return buffer


# Prompt constants for full-page detection tasks. Each asks the model to return a
# list of entries of the form {'bb': [x1, y1, x2, y2], 'text': ...}.
# NOTE(review): presumably these are passed as the `text` prompt of
# extract_text_from_image_vlm; their callers are not visible in this section.
# NOTE(review): the last line of every prompt starts with "# " INSIDE the string
# literal, so the hash character is sent to the model as part of the prompt -
# confirm that this is intentional.

# Line-level OCR: one entry per horizontal line of text, with its bounding box.
full_page_ocr_vlm_prompt = """Spot all the text in the image at line-level, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': 'identified text'}, ...].

IMPORTANT: Extract each horizontal line of text separately. Do NOT combine multiple lines into paragraphs. Each line that appears on a separate horizontal row in the image should be a separate entry.

Rules:
- Each line must be on a separate horizontal row in the image
- Even if a sentence is split over multiple horizontal lines, it should be split into separate entries (one per line)
- If text spans multiple horizontal lines, split it into separate entries (one per line)
- Do NOT combine lines that appear on different horizontal rows
- Each bounding box should tightly fit around a single horizontal line of text
- Empty lines should be skipped

# Only return valid JSON, no additional text or explanation."""

# Face detection: one entry per photo of a person's face; 'text' is always the
# fixed placeholder '[PERSON]', and an empty array is requested when none exist.
full_page_ocr_people_vlm_prompt = """Spot all photos of people's faces in the image, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[PERSON]'}, ...].

Always return the JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[PERSON]'}, ...].

Rules:
- Each photo of a person's face must be a separate entry.
- Do NOT combine multiple photos into a single entry.
- Each photo of a person's face that appears in the image should be a separate entry.
- 'text' should always be exactly '[PERSON]'.
- Do NOT include any other text or information in the JSON.
- If there are no photos of people's faces in the image, return an empty JSON array.

# Only return valid JSON, no additional text or explanation."""

# Signature detection: one entry per signature; 'text' is always the fixed
# placeholder '[SIGNATURE]', and an empty array is requested when none exist.
full_page_ocr_signature_vlm_prompt = """Spot all signatures in the image, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[SIGNATURE]'}, ...].

Always return the JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[SIGNATURE]'}, ...].

Rules:
- Each signature must be a separate entry.
- Do NOT combine multiple signatures into a single entry.
- Each signature that appears in the image should be a separate entry.
- 'text' should always be exactly '[SIGNATURE]'.
- Do NOT include any other text or information in the JSON.
- If there are no signatures in the image, return an empty JSON array.

# Only return valid JSON, no additional text or explanation."""

# Test for word-level OCR with VLMs - makes some mistakes but not bad
# full_page_ocr_vlm_prompt = """Spot all the text in the image at word-level, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': 'identified word'}, ...].

# IMPORTANT: Extract each word in the image separately. Do NOT combine words into longer fragments, sentences, or paragraphs. Each entry must correspond to a single, individual word as visually separated in the image.

# Rules:
# - Each entry should correspond to a single distinct word (not groups of words, not whole lines).
# - For each word, provide a tight bounding box [x1, y1, x2, y2] around just that word.
# - Do not merge words. Do not split words into letters. Only return one entry per word.
# - Maintain the order of words as they appear spatially from top to bottom, left to right.
# - Skip any empty or whitespace-only entries.
# - Do not include extraneous text, explanations, or formatting beyond the required JSON.

# Only return valid JSON, no additional text or explanation."""