import os
import sys
from threading import Thread
import gradio as gr
import spaces
from PIL import Image
from tools.config import (
LOAD_PADDLE_AT_STARTUP,
MAX_NEW_TOKENS,
MAX_SPACES_GPU_RUN_TIME,
PADDLE_DET_DB_UNCLIP_RATIO,
PADDLE_FONT_PATH,
PADDLE_MODEL_PATH,
PADDLE_USE_TEXTLINE_ORIENTATION,
QUANTISE_VLM_MODELS,
REPORT_VLM_OUTPUTS_TO_GUI,
SHOW_VLM_MODEL_OPTIONS,
USE_FLASH_ATTENTION,
VLM_DEFAULT_DO_SAMPLE,
VLM_DEFAULT_MIN_P,
VLM_DEFAULT_PRESENCE_PENALTY,
VLM_DEFAULT_REPETITION_PENALTY,
VLM_DEFAULT_TEMPERATURE,
VLM_DEFAULT_TOP_K,
VLM_DEFAULT_TOP_P,
VLM_MAX_IMAGE_SIZE,
VLM_MIN_IMAGE_SIZE,
VLM_SEED,
)
from tools.helper_functions import get_system_font_path
if LOAD_PADDLE_AT_STARTUP is True:
# Set PaddleOCR environment variables BEFORE importing PaddleOCR
# This ensures fonts are configured before the package loads
# Set PaddleOCR model directory environment variable (only if specified).
if PADDLE_MODEL_PATH and PADDLE_MODEL_PATH.strip():
os.environ["PADDLEOCR_MODEL_DIR"] = PADDLE_MODEL_PATH
print(f"Setting PaddleOCR model path to: {PADDLE_MODEL_PATH}")
else:
print("Using default PaddleOCR model storage location")
# Set PaddleOCR font path to use system fonts instead of downloading simfang.ttf/PingFang-SC-Regular.ttf
# This MUST be set before importing PaddleOCR to prevent font downloads
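    # Illustrative, system-dependent example (hypothetical value): on Debian/Ubuntu a
    # typical choice is PADDLE_FONT_PATH=/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf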
if (
PADDLE_FONT_PATH
and PADDLE_FONT_PATH.strip()
and os.path.exists(PADDLE_FONT_PATH)
):
os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = PADDLE_FONT_PATH
print(f"Setting PaddleOCR font path to configured font: {PADDLE_FONT_PATH}")
else:
system_font_path = get_system_font_path()
if system_font_path:
os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = system_font_path
print(f"Setting PaddleOCR font path to system font: {system_font_path}")
else:
print(
"Warning: No suitable system font found. PaddleOCR may download default fonts."
)
try:
from paddleocr import PaddleOCR
print("PaddleOCR imported successfully")
        # Default Paddle configuration (English, no document-level preprocessing)
        paddle_kwargs = {
            "det_db_unclip_ratio": PADDLE_DET_DB_UNCLIP_RATIO,
            "use_textline_orientation": PADDLE_USE_TEXTLINE_ORIENTATION,
            "use_doc_orientation_classify": False,
            "use_doc_unwarping": False,
            "lang": "en",
        }
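        # Hypothetical tweak (not part of the defaults above): to expand detected
        # boxes further around each text region, the unclip ratio could be raised
        # before initialisation, e.g.:
        # paddle_kwargs["det_db_unclip_ratio"] = 2.0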
try:
PaddleOCR(**paddle_kwargs)
except Exception as e:
# Handle DLL loading errors (common on Windows with GPU version)
if (
"WinError 127" in str(e)
or "could not be found" in str(e).lower()
or "dll" in str(e).lower()
):
print(
f"Warning: GPU initialization failed (likely missing CUDA/cuDNN dependencies): {e}"
)
print("PaddleOCR will not be available. To fix GPU issues:")
print("1. Install Visual C++ Redistributables (latest version)")
print("2. Ensure CUDA runtime libraries are in your PATH")
print(
"3. Or reinstall paddlepaddle CPU version: pip install paddlepaddle"
)
                raise ImportError(
                    f"Error initializing PaddleOCR: {e}. Please install it using 'pip install paddleocr paddlepaddle' in your Python environment and retry."
                )
else:
raise e
except ImportError:
PaddleOCR = None
        print(
            "PaddleOCR not found. Please install it using 'pip install paddleocr paddlepaddle' in your Python environment and retry."
        )
# Define module-level defaults for model parameters (always available for import)
# These will be overridden inside the SHOW_VLM_MODEL_OPTIONS block if enabled
model_default_prompt = """Read all the text in the image."""
model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
model_default_top_p = VLM_DEFAULT_TOP_P
model_default_min_p = VLM_DEFAULT_MIN_P
model_default_top_k = VLM_DEFAULT_TOP_K
model_default_temperature = VLM_DEFAULT_TEMPERATURE
model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
model_default_max_new_tokens = int(MAX_NEW_TOKENS)
model_default_seed = VLM_SEED
# Track whether the loaded model supports presence_penalty (set per-model below;
# defined here so it is always available for import)
model_supports_presence_penalty = False
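# Resolution example: with VLM_DEFAULT_TEMPERATURE=0.2 set in config, 0.2 takes
# precedence over any per-model default assigned below; with it unset (None), the
# per-model default applies (e.g. 0.7 for Qwen3-VL), then a hard-coded fallback.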
if SHOW_VLM_MODEL_OPTIONS is True:
import torch
from huggingface_hub import snapshot_download
from transformers import (
AutoModelForCausalLM,
AutoProcessor,
BitsAndBytesConfig,
Qwen2_5_VLForConditionalGeneration,
Qwen3VLForConditionalGeneration,
TextIteratorStreamer,
)
    # Only the names not already imported at module level are needed here
    from tools.config import MODEL_CACHE_PATH, SELECTED_MODEL
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("torch.__version__ =", torch.__version__)
print("torch.version.cuda =", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
print("cuda device count:", torch.cuda.device_count())
if torch.cuda.is_available():
print("current device:", torch.cuda.current_device())
print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
print("Using device:", device)
CACHE_PATH = MODEL_CACHE_PATH
if not os.path.exists(CACHE_PATH):
os.makedirs(CACHE_PATH)
# Initialize model and processor variables
processor = None
model = None
# Initialize model-specific generation parameters (will be set by specific models if needed)
# If config values are provided, use them; otherwise leave as None to use model defaults
model_default_prompt = """Read all the text in the image."""
    model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
    model_default_top_p = VLM_DEFAULT_TOP_P
    model_default_min_p = VLM_DEFAULT_MIN_P
    model_default_top_k = VLM_DEFAULT_TOP_K
    model_default_temperature = VLM_DEFAULT_TEMPERATURE
    model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
    model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
    model_default_max_new_tokens = int(MAX_NEW_TOKENS)
    # Track whether the loaded model supports presence_penalty (currently none
    # do when generating via transformers; see the per-model notes below)
    model_supports_presence_penalty = False
    model_default_seed = VLM_SEED
if USE_FLASH_ATTENTION is True:
attn_implementation = "flash_attention_2"
else:
attn_implementation = "eager"
# Setup quantisation config if enabled
quantization_config = None
if QUANTISE_VLM_MODELS is True:
if not torch.cuda.is_available():
print(
"Warning: 4-bit quantisation requires CUDA, but CUDA is not available."
)
print("Falling back to loading models without quantisation")
quantization_config = None
else:
try:
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
)
print("4-bit quantization enabled using bitsandbytes")
except Exception as e:
print(f"Warning: Could not setup bitsandbytes quantization: {e}")
print("Falling back to loading models without quantization")
quantization_config = None
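    # Rough illustration of the saving: a 3B-parameter model holds about 6 GB of
    # weights in fp16; nf4 4-bit quantisation cuts that to roughly 1.5-2 GB
    # (plus activation and overhead memory).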
print(f"Loading vision model: {SELECTED_MODEL}")
# Load only the selected model based on configuration
if SELECTED_MODEL == "Nanonets-OCR2-3B":
MODEL_ID = "nanonets/Nanonets-OCR2-3B"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
load_kwargs = {
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
load_kwargs["device_map"] = "auto"
else:
load_kwargs["torch_dtype"] = torch.float16
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
MODEL_ID, **load_kwargs
).eval()
if quantization_config is None:
model = model.to(device)
model_default_prompt = """Extract the text from the above document as if you were reading it naturally."""
elif SELECTED_MODEL == "Dots.OCR":
# Download and patch Dots.OCR model
model_path_d_local = snapshot_download(
repo_id="rednote-hilab/dots.ocr",
local_dir=os.path.join(CACHE_PATH, "dots.ocr"),
max_workers=20,
local_dir_use_symlinks=False,
)
config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
if os.path.exists(config_file_path):
with open(config_file_path, "r") as f:
input_code = f.read()
lines = input_code.splitlines()
if "class DotsVLProcessor" in input_code and not any(
"attributes = " in line for line in lines
):
output_lines = []
for line in lines:
output_lines.append(line)
if line.strip().startswith("class DotsVLProcessor"):
output_lines.append(
' attributes = ["image_processor", "tokenizer"]'
)
with open(config_file_path, "w") as f:
f.write("\n".join(output_lines))
print("Patched configuration_dots.py successfully.")
sys.path.append(model_path_d_local)
MODEL_ID = model_path_d_local
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
load_kwargs = {
"attn_implementation": attn_implementation,
"device_map": "auto",
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
else:
load_kwargs["torch_dtype"] = torch.bfloat16
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs).eval()
model_default_prompt = """Extract the text content from this image."""
model_default_max_new_tokens = MAX_NEW_TOKENS
elif SELECTED_MODEL == "Qwen3-VL-2B-Instruct":
MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
load_kwargs = {
"device_map": "auto",
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
else:
load_kwargs["dtype"] = "auto"
model = Qwen3VLForConditionalGeneration.from_pretrained(
MODEL_ID, **load_kwargs
).eval()
model_default_prompt = """Read all the text in the image."""
model_default_do_sample = False
model_default_top_p = 0.8
model_default_min_p = 0.0
model_default_top_k = 20
model_default_temperature = 0.7
model_default_repetition_penalty = 1.0
model_default_presence_penalty = 1.5
model_default_max_new_tokens = MAX_NEW_TOKENS
        model_supports_presence_penalty = False  # presence_penalty does not work with transformers' generate()
elif SELECTED_MODEL == "Qwen3-VL-4B-Instruct":
MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
load_kwargs = {
"attn_implementation": attn_implementation,
"device_map": "auto",
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
else:
load_kwargs["dtype"] = "auto"
model = Qwen3VLForConditionalGeneration.from_pretrained(
MODEL_ID, **load_kwargs
).eval()
model_default_prompt = """Read all the text in the image."""
model_default_do_sample = False
model_default_top_p = 0.8
model_default_min_p = 0.0
model_default_top_k = 20
model_default_temperature = 0.7
model_default_repetition_penalty = 1.0
model_default_presence_penalty = 1.5
model_default_max_new_tokens = MAX_NEW_TOKENS
        model_supports_presence_penalty = False  # presence_penalty does not work with transformers' generate()
elif SELECTED_MODEL == "Qwen3-VL-8B-Instruct":
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
load_kwargs = {
"attn_implementation": attn_implementation,
"device_map": "auto",
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
else:
load_kwargs["dtype"] = "auto"
model = Qwen3VLForConditionalGeneration.from_pretrained(
MODEL_ID, **load_kwargs
).eval()
model_default_prompt = """Read all the text in the image."""
model_default_do_sample = False
model_default_top_p = 0.8
model_default_min_p = 0.0
model_default_top_k = 20
model_default_temperature = 0.7
model_default_repetition_penalty = 1.0
model_default_presence_penalty = 1.5
model_default_max_new_tokens = MAX_NEW_TOKENS
        model_supports_presence_penalty = False  # presence_penalty does not work with transformers' generate()
elif SELECTED_MODEL == "Qwen3-VL-30B-A3B-Instruct":
MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
from transformers import Qwen3VLMoeForConditionalGeneration
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
load_kwargs = {
"attn_implementation": attn_implementation,
"device_map": "auto",
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
else:
load_kwargs["dtype"] = "auto"
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
MODEL_ID, **load_kwargs
).eval()
model_default_prompt = """Read all the text in the image."""
model_default_do_sample = False
model_default_top_p = 0.8
model_default_min_p = 0.0
model_default_top_k = 20
model_default_temperature = 0.7
model_default_repetition_penalty = 1.0
model_default_presence_penalty = 1.5
model_default_max_new_tokens = MAX_NEW_TOKENS
        model_supports_presence_penalty = False  # presence_penalty does not work with transformers' generate()
elif SELECTED_MODEL == "Qwen3-VL-235B-A22B-Instruct":
MODEL_ID = "Qwen/Qwen3-VL-235B-A22B-Instruct"
from transformers import Qwen3VLMoeForConditionalGeneration
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
load_kwargs = {
"attn_implementation": attn_implementation,
"device_map": "auto",
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
else:
load_kwargs["dtype"] = "auto"
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
MODEL_ID, **load_kwargs
).eval()
model_default_prompt = """Read all the text in the image."""
model_default_do_sample = False
model_default_top_p = 0.8
model_default_min_p = 0.0
model_default_top_k = 20
model_default_temperature = 0.7
model_default_repetition_penalty = 1.0
model_default_presence_penalty = 1.5
model_default_max_new_tokens = MAX_NEW_TOKENS
        model_supports_presence_penalty = False  # presence_penalty does not work with transformers' generate()
elif SELECTED_MODEL == "PaddleOCR-VL":
MODEL_ID = "PaddlePaddle/PaddleOCR-VL"
load_kwargs = {
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
load_kwargs["device_map"] = "auto"
else:
load_kwargs["torch_dtype"] = torch.bfloat16
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs).eval()
if quantization_config is None:
model = model.to(device)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model_default_prompt = """OCR:"""
model_default_max_new_tokens = MAX_NEW_TOKENS
elif SELECTED_MODEL == "None":
model = None
processor = None
else:
        raise ValueError(
            f"Invalid model selected: {SELECTED_MODEL}. Valid options are: Nanonets-OCR2-3B, Dots.OCR, Qwen3-VL-2B-Instruct, Qwen3-VL-4B-Instruct, Qwen3-VL-8B-Instruct, Qwen3-VL-30B-A3B-Instruct, Qwen3-VL-235B-A22B-Instruct, PaddleOCR-VL, None"
        )
# Override model defaults with user-provided config values if they are set
# Priority: user config value > model default
if VLM_DEFAULT_DO_SAMPLE is not None:
model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
if VLM_DEFAULT_TOP_P is not None:
model_default_top_p = VLM_DEFAULT_TOP_P
if VLM_DEFAULT_MIN_P is not None:
model_default_min_p = VLM_DEFAULT_MIN_P
if VLM_DEFAULT_TOP_K is not None:
model_default_top_k = VLM_DEFAULT_TOP_K
if VLM_DEFAULT_TEMPERATURE is not None:
model_default_temperature = VLM_DEFAULT_TEMPERATURE
if VLM_DEFAULT_REPETITION_PENALTY is not None:
model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
if VLM_DEFAULT_PRESENCE_PENALTY is not None:
model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
if VLM_SEED is not None:
model_default_seed = VLM_SEED
print(f"Successfully loaded {SELECTED_MODEL}")
@spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
def extract_text_from_image_vlm(
text: str,
image: Image.Image,
max_new_tokens: int = None,
temperature: float = None,
top_p: float = None,
min_p: float = None,
top_k: int = None,
repetition_penalty: float = None,
do_sample: bool = None,
presence_penalty: float = None,
seed: int = None,
model_default_prompt: str = None,
):
"""
Generates responses using the configured vision model for image input.
Streams text to console and returns complete text only at the end.
    Parameter resolution priority: an explicit function argument wins, then the
    model-specific default set during model initialisation, then a sensible
    general fallback.
Args:
text (str): The text prompt to send to the vision model. If empty and model
has a default prompt, the model default will be used.
image (Image.Image): The PIL Image to process. Must not be None.
        max_new_tokens (int, optional): Maximum number of new tokens to generate.
            Defaults to the model-specific value, falling back to MAX_NEW_TOKENS from config.
        temperature (float, optional): Sampling temperature for generation.
            Defaults to model-specific value (0.7 for Qwen3-VL models), with a
            final fallback of 0.1.
        top_p (float, optional): Nucleus sampling parameter (top-p).
            Defaults to model-specific value (0.8 for Qwen3-VL models), with a
            final fallback of 0.8.
        min_p (float, optional): Minimum probability threshold for token sampling.
            Defaults to model-specific value or 0.0.
        top_k (int, optional): Top-k sampling parameter.
            Defaults to model-specific value (20 for Qwen3-VL models), with a
            final fallback of 20.
        repetition_penalty (float, optional): Penalty for token repetition.
            Defaults to model-specific value (1.0 for Qwen3-VL models), with a
            final fallback of 1.0.
        do_sample (bool, optional): If True, use sampling; if False, use greedy
            decoding. If None, defaults to False (greedy) for Qwen3-VL models,
            or True (sampling) for other models.
presence_penalty (float, optional): Penalty for token presence.
Defaults to model-specific value (1.5 for Qwen3-VL models) or None.
Note: Not all models support this parameter.
seed (int, optional): Random seed for generation. If None, uses VLM_SEED
from config if set, otherwise no seed is set (non-deterministic).
        model_default_prompt (str, optional): The default prompt to use if no text is provided.
            Defaults to the model-specific prompt set at load time (e.g. "Read all the
            text in the image." for Qwen3-VL models, "OCR:" for PaddleOCR-VL).
Returns:
str: The complete generated text response from the model.
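
    Example:
        Hypothetical call (assumes a model and processor were loaded at startup
        via SHOW_VLM_MODEL_OPTIONS):

            page = Image.open("page.png")  # hypothetical input image
            text = extract_text_from_image_vlm("Read all the text in the image.", page)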
"""
if image is None:
return "Please upload an image."
    # Resolve each parameter with priority: function argument (if not None) > model default > general default
# Text/prompt handling
if text and text.strip():
actual_text = text
elif model_default_prompt is not None:
actual_text = model_default_prompt
else:
actual_text = "Read all the text in the image." # General default
# max_new_tokens: function arg > model default > general default
if max_new_tokens is not None:
actual_max_new_tokens = max_new_tokens
elif model_default_max_new_tokens is not None:
actual_max_new_tokens = model_default_max_new_tokens
else:
actual_max_new_tokens = MAX_NEW_TOKENS # General default (from config)
# temperature: function arg > model default (which may include config override)
if temperature is not None:
actual_temperature = temperature
elif model_default_temperature is not None:
actual_temperature = model_default_temperature
else:
# Fallback to a sensible default if neither function arg nor model default is set
actual_temperature = 0.1
# top_p: function arg > model default (which may include config override)
if top_p is not None:
actual_top_p = top_p
elif model_default_top_p is not None:
actual_top_p = model_default_top_p
else:
# Fallback to a sensible default if neither function arg nor model default is set
actual_top_p = 0.8
# min_p: function arg > model default (which may include config override)
if min_p is not None:
actual_min_p = min_p
elif model_default_min_p is not None:
actual_min_p = model_default_min_p
else:
# Fallback to a sensible default if neither function arg nor model default is set
actual_min_p = 0.0
# top_k: function arg > model default (which may include config override)
if top_k is not None:
actual_top_k = top_k
elif model_default_top_k is not None:
actual_top_k = model_default_top_k
else:
# Fallback to a sensible default if neither function arg nor model default is set
actual_top_k = 20
# repetition_penalty: function arg > model default (which may include config override)
if repetition_penalty is not None:
actual_repetition_penalty = repetition_penalty
elif model_default_repetition_penalty is not None:
actual_repetition_penalty = model_default_repetition_penalty
else:
# Fallback to a sensible default if neither function arg nor model default is set
actual_repetition_penalty = 1.0
# do_sample: function arg > model default (which may include config override)
if do_sample is not None:
actual_do_sample = do_sample
elif model_default_do_sample is not None:
actual_do_sample = model_default_do_sample
else:
# Fallback to a sensible default if neither function arg nor model default is set
actual_do_sample = True
# presence_penalty: function arg > model default (which may include config override) > None
actual_presence_penalty = None
if presence_penalty is not None:
actual_presence_penalty = presence_penalty
elif model_default_presence_penalty is not None:
actual_presence_penalty = model_default_presence_penalty
# seed: function arg > model default (which may include config override)
actual_seed = None
if seed is not None:
actual_seed = seed
elif model_default_seed is not None:
actual_seed = model_default_seed
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": actual_text},
],
}
]
prompt_full = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
inputs = processor(
text=[prompt_full],
images=[image],
return_tensors="pt",
padding=True,
min_pixels=VLM_MIN_IMAGE_SIZE,
max_pixels=VLM_MAX_IMAGE_SIZE,
).to(device)
streamer = TextIteratorStreamer(
processor, skip_prompt=True, skip_special_tokens=True
)
# Set random seed if specified
if actual_seed is not None:
torch.manual_seed(actual_seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(actual_seed)
# Build generation kwargs with resolved parameters
generation_kwargs = {
**inputs,
"streamer": streamer,
"max_new_tokens": actual_max_new_tokens,
"do_sample": actual_do_sample,
"temperature": actual_temperature,
"top_p": actual_top_p,
"min_p": actual_min_p,
"top_k": actual_top_k,
"repetition_penalty": actual_repetition_penalty,
}
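    # Note: when actual_do_sample is False, transformers performs greedy decoding
    # and the sampling parameters (temperature, top_p, min_p, top_k) are ignored.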
    # Add presence_penalty only if it is set and the loaded model supports it
    # (currently no model does when generating via transformers)
if actual_presence_penalty is not None and model_supports_presence_penalty:
generation_kwargs["presence_penalty"] = actual_presence_penalty
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
buffer = ""
line_buffer = "" # Accumulate text for the current line
for new_text in streamer:
buffer += new_text
buffer = buffer.replace("<|im_end|>", "")
line_buffer += new_text
# Print to console as it streams
print(new_text, end="", flush=True)
# If we hit a newline, report the entire accumulated line to GUI
if REPORT_VLM_OUTPUTS_TO_GUI and "\n" in new_text:
# Split by newline to handle the line(s) we just completed
parts = line_buffer.split("\n")
# Report all complete lines (everything except the last part which may be incomplete)
for line in parts[:-1]:
if line.strip(): # Only report non-empty lines
gr.Info(line, duration=2)
# Keep the last part (after the last newline) for the next line
line_buffer = parts[-1] if parts else ""
    # Print a final newline after streaming completes
    print()
# Return the complete text only at the end
return buffer
full_page_ocr_vlm_prompt = """Spot all the text in the image at line-level, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': 'identified text'}, ...].
IMPORTANT: Extract each horizontal line of text separately. Do NOT combine multiple lines into paragraphs. Each line that appears on a separate horizontal row in the image should be a separate entry.
Rules:
- Each line must be on a separate horizontal row in the image
- Even if a sentence is split over multiple horizontal lines, it should be split into separate entries (one per line)
- If text spans multiple horizontal lines, split it into separate entries (one per line)
- Do NOT combine lines that appear on different horizontal rows
- Each bounding box should tightly fit around a single horizontal line of text
- Empty lines should be skipped
# Only return valid JSON, no additional text or explanation."""
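# Illustrative (hypothetical) model output for the line-level prompt above:
# [{'bb': [34, 52, 412, 74], 'text': 'Invoice #1234'},
#  {'bb': [34, 80, 298, 100], 'text': 'Date: 01/02/2024'}]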
full_page_ocr_people_vlm_prompt = """Spot all photos of people's faces in the image, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[PERSON]'}, ...].
Always return the JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[PERSON]'}, ...].
Rules:
- Each photo of a person's face must be a separate entry.
- Do NOT combine multiple photos into a single entry.
- Each photo of a person's face that appears in the image should be a separate entry.
- 'text' should always be exactly '[PERSON]'.
- Do NOT include any other text or information in the JSON.
- If there are no photos of people's faces in the image, return an empty JSON array.
# Only return valid JSON, no additional text or explanation."""
full_page_ocr_signature_vlm_prompt = """Spot all signatures in the image, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[SIGNATURE]'}, ...].
Always return the JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[SIGNATURE]'}, ...].
Rules:
- Each signature must be a separate entry.
- Do NOT combine multiple signatures into a single entry.
- Each signature that appears in the image should be a separate entry.
- 'text' should always be exactly '[SIGNATURE]'.
- Do NOT include any other text or information in the JSON.
- If there are no signatures in the image, return an empty JSON array.
# Only return valid JSON, no additional text or explanation."""
# Test for word-level OCR with VLMs - makes some mistakes but not bad
# full_page_ocr_vlm_prompt = """Spot all the text in the image at word-level, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': 'identified word'}, ...].
# IMPORTANT: Extract each word in the image separately. Do NOT combine words into longer fragments, sentences, or paragraphs. Each entry must correspond to a single, individual word as visually separated in the image.
# Rules:
# - Each entry should correspond to a single distinct word (not groups of words, not whole lines).
# - For each word, provide a tight bounding box [x1, y1, x2, y2] around just that word.
# - Do not merge words. Do not split words into letters. Only return one entry per word.
# - Maintain the order of words as they appear spatially from top to bottom, left to right.
# - Skip any empty or whitespace-only entries.
# - Do not include extraneous text, explanations, or formatting beyond the required JSON.
# Only return valid JSON, no additional text or explanation."""