import os
import sys
from threading import Thread
import gradio as gr
import spaces
from PIL import Image
from tools.config import (
LOAD_PADDLE_AT_STARTUP,
MAX_NEW_TOKENS,
MAX_SPACES_GPU_RUN_TIME,
PADDLE_DET_DB_UNCLIP_RATIO,
PADDLE_FONT_PATH,
PADDLE_MODEL_PATH,
PADDLE_USE_TEXTLINE_ORIENTATION,
QUANTISE_VLM_MODELS,
REPORT_VLM_OUTPUTS_TO_GUI,
SHOW_VLM_MODEL_OPTIONS,
USE_FLASH_ATTENTION,
VLM_DEFAULT_DO_SAMPLE,
VLM_DEFAULT_MIN_P,
VLM_DEFAULT_PRESENCE_PENALTY,
VLM_DEFAULT_REPETITION_PENALTY,
VLM_DEFAULT_TEMPERATURE,
VLM_DEFAULT_TOP_K,
VLM_DEFAULT_TOP_P,
VLM_MAX_IMAGE_SIZE,
VLM_MIN_IMAGE_SIZE,
VLM_SEED,
)
from tools.helper_functions import get_system_font_path
if LOAD_PADDLE_AT_STARTUP is True:
# Set PaddleOCR environment variables BEFORE importing PaddleOCR
# This ensures fonts are configured before the package loads
# Set PaddleOCR model directory environment variable (only if specified).
if PADDLE_MODEL_PATH and PADDLE_MODEL_PATH.strip():
os.environ["PADDLEOCR_MODEL_DIR"] = PADDLE_MODEL_PATH
print(f"Setting PaddleOCR model path to: {PADDLE_MODEL_PATH}")
else:
print("Using default PaddleOCR model storage location")
# Set PaddleOCR font path to use system fonts instead of downloading simfang.ttf/PingFang-SC-Regular.ttf
# This MUST be set before importing PaddleOCR to prevent font downloads
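    # Illustrative, system-dependent example (hypothetical value): on Debian/Ubuntu a
    # typical choice is PADDLE_FONT_PATH=/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf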
if (
PADDLE_FONT_PATH
and PADDLE_FONT_PATH.strip()
and os.path.exists(PADDLE_FONT_PATH)
):
os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = PADDLE_FONT_PATH
print(f"Setting PaddleOCR font path to configured font: {PADDLE_FONT_PATH}")
else:
system_font_path = get_system_font_path()
if system_font_path:
os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = system_font_path
print(f"Setting PaddleOCR font path to system font: {system_font_path}")
else:
print(
"Warning: No suitable system font found. PaddleOCR may download default fonts."
)
try:
from paddleocr import PaddleOCR
print("PaddleOCR imported successfully")
        # Default Paddle configuration (English, no document-level preprocessing)
        paddle_kwargs = {
            "det_db_unclip_ratio": PADDLE_DET_DB_UNCLIP_RATIO,
            "use_textline_orientation": PADDLE_USE_TEXTLINE_ORIENTATION,
            "use_doc_orientation_classify": False,
            "use_doc_unwarping": False,
            "lang": "en",
        }
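        # Hypothetical tweak (not part of the defaults above): to expand detected
        # boxes further around each text region, the unclip ratio could be raised
        # before initialisation, e.g.:
        # paddle_kwargs["det_db_unclip_ratio"] = 2.0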
try:
PaddleOCR(**paddle_kwargs)
except Exception as e:
# Handle DLL loading errors (common on Windows with GPU version)
if (
"WinError 127" in str(e)
or "could not be found" in str(e).lower()
or "dll" in str(e).lower()
):
print(
f"Warning: GPU initialization failed (likely missing CUDA/cuDNN dependencies): {e}"
)
print("PaddleOCR will not be available. To fix GPU issues:")
print("1. Install Visual C++ Redistributables (latest version)")
print("2. Ensure CUDA runtime libraries are in your PATH")
print(
"3. Or reinstall paddlepaddle CPU version: pip install paddlepaddle"
)
                raise ImportError(
                    f"Error initializing PaddleOCR: {e}. Please install it using 'pip install paddleocr paddlepaddle' in your Python environment and retry."
                )
else:
raise e
except ImportError:
PaddleOCR = None
        print(
            "PaddleOCR not found. Please install it using 'pip install paddleocr paddlepaddle' in your Python environment and retry."
        )
# Define module-level defaults for model parameters (always available for import)
# These will be overridden inside the SHOW_VLM_MODEL_OPTIONS block if enabled
model_default_prompt = """Read all the text in the image."""
model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
model_default_top_p = VLM_DEFAULT_TOP_P
model_default_min_p = VLM_DEFAULT_MIN_P
model_default_top_k = VLM_DEFAULT_TOP_K
model_default_temperature = VLM_DEFAULT_TEMPERATURE
model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
model_default_max_new_tokens = int(MAX_NEW_TOKENS)
model_default_seed = VLM_SEED
# Track whether the loaded model supports presence_penalty (set per-model below;
# defined here so it is always available for import)
model_supports_presence_penalty = False
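# Resolution example: with VLM_DEFAULT_TEMPERATURE=0.2 set in config, 0.2 takes
# precedence over any per-model default assigned below; with it unset (None), the
# per-model default applies (e.g. 0.7 for Qwen3-VL), then a hard-coded fallback.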
if SHOW_VLM_MODEL_OPTIONS is True:
import torch
from huggingface_hub import snapshot_download
from transformers import (
AutoModelForCausalLM,
AutoProcessor,
BitsAndBytesConfig,
Qwen2_5_VLForConditionalGeneration,
Qwen3VLForConditionalGeneration,
TextIteratorStreamer,
)
    # Only the names not already imported at module level are needed here
    from tools.config import MODEL_CACHE_PATH, SELECTED_MODEL
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("torch.__version__ =", torch.__version__)
print("torch.version.cuda =", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
print("cuda device count:", torch.cuda.device_count())
if torch.cuda.is_available():
print("current device:", torch.cuda.current_device())
print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
print("Using device:", device)
CACHE_PATH = MODEL_CACHE_PATH
if not os.path.exists(CACHE_PATH):
os.makedirs(CACHE_PATH)
# Initialize model and processor variables
processor = None
model = None
# Initialize model-specific generation parameters (will be set by specific models if needed)
# If config values are provided, use them; otherwise leave as None to use model defaults
model_default_prompt = """Read all the text in the image."""
    model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
    model_default_top_p = VLM_DEFAULT_TOP_P
    model_default_min_p = VLM_DEFAULT_MIN_P
    model_default_top_k = VLM_DEFAULT_TOP_K
    model_default_temperature = VLM_DEFAULT_TEMPERATURE
    model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
    model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
    model_default_max_new_tokens = int(MAX_NEW_TOKENS)
    # Track whether the loaded model supports presence_penalty (currently none
    # do when generating via transformers; see the per-model notes below)
    model_supports_presence_penalty = False
    model_default_seed = VLM_SEED
if USE_FLASH_ATTENTION is True:
attn_implementation = "flash_attention_2"
else:
attn_implementation = "eager"
# Setup quantisation config if enabled
quantization_config = None
if QUANTISE_VLM_MODELS is True:
if not torch.cuda.is_available():
print(
"Warning: 4-bit quantisation requires CUDA, but CUDA is not available."
)
print("Falling back to loading models without quantisation")
quantization_config = None
else:
try:
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
)
print("4-bit quantization enabled using bitsandbytes")
except Exception as e:
print(f"Warning: Could not setup bitsandbytes quantization: {e}")
print("Falling back to loading models without quantization")
quantization_config = None
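    # Rough illustration of the saving: a 3B-parameter model holds about 6 GB of
    # weights in fp16; nf4 4-bit quantisation cuts that to roughly 1.5-2 GB
    # (plus activation and overhead memory).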
print(f"Loading vision model: {SELECTED_MODEL}")
# Load only the selected model based on configuration
if SELECTED_MODEL == "Nanonets-OCR2-3B":
MODEL_ID = "nanonets/Nanonets-OCR2-3B"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
load_kwargs = {
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
load_kwargs["device_map"] = "auto"
else:
load_kwargs["torch_dtype"] = torch.float16
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
MODEL_ID, **load_kwargs
).eval()
if quantization_config is None:
model = model.to(device)
model_default_prompt = """Extract the text from the above document as if you were reading it naturally."""
elif SELECTED_MODEL == "Dots.OCR":
# Download and patch Dots.OCR model
model_path_d_local = snapshot_download(
repo_id="rednote-hilab/dots.ocr",
local_dir=os.path.join(CACHE_PATH, "dots.ocr"),
max_workers=20,
local_dir_use_symlinks=False,
)
config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
if os.path.exists(config_file_path):
with open(config_file_path, "r") as f:
input_code = f.read()
lines = input_code.splitlines()
if "class DotsVLProcessor" in input_code and not any(
"attributes = " in line for line in lines
):
output_lines = []
for line in lines:
output_lines.append(line)
if line.strip().startswith("class DotsVLProcessor"):
output_lines.append(
' attributes = ["image_processor", "tokenizer"]'
)
with open(config_file_path, "w") as f:
f.write("\n".join(output_lines))
print("Patched configuration_dots.py successfully.")
sys.path.append(model_path_d_local)
MODEL_ID = model_path_d_local
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
load_kwargs = {
"attn_implementation": attn_implementation,
"device_map": "auto",
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
else:
load_kwargs["torch_dtype"] = torch.bfloat16
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs).eval()
model_default_prompt = """Extract the text content from this image."""
model_default_max_new_tokens = MAX_NEW_TOKENS
elif SELECTED_MODEL == "Qwen3-VL-2B-Instruct":
MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
load_kwargs = {
"device_map": "auto",
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
else:
load_kwargs["dtype"] = "auto"
model = Qwen3VLForConditionalGeneration.from_pretrained(
MODEL_ID, **load_kwargs
).eval()
model_default_prompt = """Read all the text in the image."""
model_default_do_sample = False
model_default_top_p = 0.8
model_default_min_p = 0.0
model_default_top_k = 20
model_default_temperature = 0.7
model_default_repetition_penalty = 1.0
model_default_presence_penalty = 1.5
model_default_max_new_tokens = MAX_NEW_TOKENS
        model_supports_presence_penalty = False  # presence_penalty does not work with transformers' generate()
elif SELECTED_MODEL == "Qwen3-VL-4B-Instruct":
MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
load_kwargs = {
"attn_implementation": attn_implementation,
"device_map": "auto",
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
else:
load_kwargs["dtype"] = "auto"
model = Qwen3VLForConditionalGeneration.from_pretrained(
MODEL_ID, **load_kwargs
).eval()
model_default_prompt = """Read all the text in the image."""
model_default_do_sample = False
model_default_top_p = 0.8
model_default_min_p = 0.0
model_default_top_k = 20
model_default_temperature = 0.7
model_default_repetition_penalty = 1.0
model_default_presence_penalty = 1.5
model_default_max_new_tokens = MAX_NEW_TOKENS
        model_supports_presence_penalty = False  # presence_penalty does not work with transformers' generate()
elif SELECTED_MODEL == "Qwen3-VL-8B-Instruct":
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
load_kwargs = {
"attn_implementation": attn_implementation,
"device_map": "auto",
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
else:
load_kwargs["dtype"] = "auto"
model = Qwen3VLForConditionalGeneration.from_pretrained(
MODEL_ID, **load_kwargs
).eval()
model_default_prompt = """Read all the text in the image."""
model_default_do_sample = False
model_default_top_p = 0.8
model_default_min_p = 0.0
model_default_top_k = 20
model_default_temperature = 0.7
model_default_repetition_penalty = 1.0
model_default_presence_penalty = 1.5
model_default_max_new_tokens = MAX_NEW_TOKENS
        model_supports_presence_penalty = False  # presence_penalty does not work with transformers' generate()
elif SELECTED_MODEL == "Qwen3-VL-30B-A3B-Instruct":
MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
from transformers import Qwen3VLMoeForConditionalGeneration
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
load_kwargs = {
"attn_implementation": attn_implementation,
"device_map": "auto",
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
else:
load_kwargs["dtype"] = "auto"
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
MODEL_ID, **load_kwargs
).eval()
model_default_prompt = """Read all the text in the image."""
model_default_do_sample = False
model_default_top_p = 0.8
model_default_min_p = 0.0
model_default_top_k = 20
model_default_temperature = 0.7
model_default_repetition_penalty = 1.0
model_default_presence_penalty = 1.5
model_default_max_new_tokens = MAX_NEW_TOKENS
        model_supports_presence_penalty = False  # presence_penalty does not work with transformers' generate()
elif SELECTED_MODEL == "Qwen3-VL-235B-A22B-Instruct":
MODEL_ID = "Qwen/Qwen3-VL-235B-A22B-Instruct"
from transformers import Qwen3VLMoeForConditionalGeneration
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
load_kwargs = {
"attn_implementation": attn_implementation,
"device_map": "auto",
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
else:
load_kwargs["dtype"] = "auto"
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
MODEL_ID, **load_kwargs
).eval()
model_default_prompt = """Read all the text in the image."""
model_default_do_sample = False
model_default_top_p = 0.8
model_default_min_p = 0.0
model_default_top_k = 20
model_default_temperature = 0.7
model_default_repetition_penalty = 1.0
model_default_presence_penalty = 1.5
model_default_max_new_tokens = MAX_NEW_TOKENS
        model_supports_presence_penalty = False  # presence_penalty does not work with transformers' generate()
elif SELECTED_MODEL == "PaddleOCR-VL":
MODEL_ID = "PaddlePaddle/PaddleOCR-VL"
load_kwargs = {
"trust_remote_code": True,
}
if quantization_config is not None:
load_kwargs["quantization_config"] = quantization_config
load_kwargs["device_map"] = "auto"
else:
load_kwargs["torch_dtype"] = torch.bfloat16
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs).eval()
if quantization_config is None:
model = model.to(device)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model_default_prompt = """OCR:"""
model_default_max_new_tokens = MAX_NEW_TOKENS
elif SELECTED_MODEL == "None":
model = None
processor = None
else:
        raise ValueError(
            f"Invalid model selected: {SELECTED_MODEL}. Valid options are: Nanonets-OCR2-3B, Dots.OCR, Qwen3-VL-2B-Instruct, Qwen3-VL-4B-Instruct, Qwen3-VL-8B-Instruct, Qwen3-VL-30B-A3B-Instruct, Qwen3-VL-235B-A22B-Instruct, PaddleOCR-VL, None"
        )
# Override model defaults with user-provided config values if they are set
# Priority: user config value > model default
if VLM_DEFAULT_DO_SAMPLE is not None:
model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
if VLM_DEFAULT_TOP_P is not None:
model_default_top_p = VLM_DEFAULT_TOP_P
if VLM_DEFAULT_MIN_P is not None:
model_default_min_p = VLM_DEFAULT_MIN_P
if VLM_DEFAULT_TOP_K is not None:
model_default_top_k = VLM_DEFAULT_TOP_K
if VLM_DEFAULT_TEMPERATURE is not None:
model_default_temperature = VLM_DEFAULT_TEMPERATURE
if VLM_DEFAULT_REPETITION_PENALTY is not None:
model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
if VLM_DEFAULT_PRESENCE_PENALTY is not None:
model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
if VLM_SEED is not None:
model_default_seed = VLM_SEED
print(f"Successfully loaded {SELECTED_MODEL}")
@spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
def extract_text_from_image_vlm(
text: str,
image: Image.Image,
max_new_tokens: int = None,
temperature: float = None,
top_p: float = None,
min_p: float = None,
top_k: int = None,
repetition_penalty: float = None,
do_sample: bool = None,
presence_penalty: float = None,
seed: int = None,
model_default_prompt: str = None,
):
"""
Generates responses using the configured vision model for image input.
Streams text to console and returns complete text only at the end.
    Parameter resolution priority: an explicit function argument wins, then the
    model-specific default set during model initialisation, then a sensible
    general fallback.
Args:
text (str): The text prompt to send to the vision model. If empty and model
has a default prompt, the model default will be used.
image (Image.Image): The PIL Image to process. Must not be None.
        max_new_tokens (int, optional): Maximum number of new tokens to generate.
            Defaults to the model-specific value, falling back to MAX_NEW_TOKENS from config.
        temperature (float, optional): Sampling temperature for generation.
            Defaults to model-specific value (0.7 for Qwen3-VL models), with a
            final fallback of 0.1.
        top_p (float, optional): Nucleus sampling parameter (top-p).
            Defaults to model-specific value (0.8 for Qwen3-VL models), with a
            final fallback of 0.8.
        min_p (float, optional): Minimum probability threshold for token sampling.
            Defaults to model-specific value or 0.0.
        top_k (int, optional): Top-k sampling parameter.
            Defaults to model-specific value (20 for Qwen3-VL models), with a
            final fallback of 20.
        repetition_penalty (float, optional): Penalty for token repetition.
            Defaults to model-specific value (1.0 for Qwen3-VL models), with a
            final fallback of 1.0.
        do_sample (bool, optional): If True, use sampling; if False, use greedy
            decoding. If None, defaults to False (greedy) for Qwen3-VL models,
            or True (sampling) for other models.
presence_penalty (float, optional): Penalty for token presence.
Defaults to model-specific value (1.5 for Qwen3-VL models) or None.
Note: Not all models support this parameter.
seed (int, optional): Random seed for generation. If None, uses VLM_SEED
from config if set, otherwise no seed is set (non-deterministic).
        model_default_prompt (str, optional): The default prompt to use if no text is provided.
            Defaults to the model-specific prompt set at load time (e.g. "Read all the
            text in the image." for Qwen3-VL models, "OCR:" for PaddleOCR-VL).
Returns:
str: The complete generated text response from the model.
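
    Example:
        Hypothetical call (assumes a model and processor were loaded at startup
        via SHOW_VLM_MODEL_OPTIONS):

            page = Image.open("page.png")  # hypothetical input image
            text = extract_text_from_image_vlm("Read all the text in the image.", page)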
"""
if image is None:
return "Please upload an image."
    # Resolve each parameter with priority: function argument (if not None) > model default > general default
# Text/prompt handling
if text and text.strip():
actual_text = text
elif model_default_prompt is not None:
actual_text = model_default_prompt
else:
actual_text = "Read all the text in the image." # General default
# max_new_tokens: function arg > model default > general default
if max_new_tokens is not None:
actual_max_new_tokens = max_new_tokens
elif model_default_max_new_tokens is not None:
actual_max_new_tokens = model_default_max_new_tokens
else:
actual_max_new_tokens = MAX_NEW_TOKENS # General default (from config)
# temperature: function arg > model default (which may include config override)
if temperature is not None:
actual_temperature = temperature
elif model_default_temperature is not None:
actual_temperature = model_default_temperature
else:
# Fallback to a sensible default if neither function arg nor model default is set
actual_temperature = 0.1
# top_p: function arg > model default (which may include config override)
if top_p is not None:
actual_top_p = top_p
elif model_default_top_p is not None:
actual_top_p = model_default_top_p
else:
# Fallback to a sensible default if neither function arg nor model default is set
actual_top_p = 0.8
# min_p: function arg > model default (which may include config override)
if min_p is not None:
actual_min_p = min_p
elif model_default_min_p is not None:
actual_min_p = model_default_min_p
else:
# Fallback to a sensible default if neither function arg nor model default is set
actual_min_p = 0.0
# top_k: function arg > model default (which may include config override)
if top_k is not None:
actual_top_k = top_k
elif model_default_top_k is not None:
actual_top_k = model_default_top_k
else:
# Fallback to a sensible default if neither function arg nor model default is set
actual_top_k = 20
# repetition_penalty: function arg > model default (which may include config override)
if repetition_penalty is not None:
actual_repetition_penalty = repetition_penalty
elif model_default_repetition_penalty is not None:
actual_repetition_penalty = model_default_repetition_penalty
else:
# Fallback to a sensible default if neither function arg nor model default is set
actual_repetition_penalty = 1.0
# do_sample: function arg > model default (which may include config override)
if do_sample is not None:
actual_do_sample = do_sample
elif model_default_do_sample is not None:
actual_do_sample = model_default_do_sample
else:
# Fallback to a sensible default if neither function arg nor model default is set
actual_do_sample = True
# presence_penalty: function arg > model default (which may include config override) > None
actual_presence_penalty = None
if presence_penalty is not None:
actual_presence_penalty = presence_penalty
elif model_default_presence_penalty is not None:
actual_presence_penalty = model_default_presence_penalty
# seed: function arg > model default (which may include config override)
actual_seed = None
if seed is not None:
actual_seed = seed
elif model_default_seed is not None:
actual_seed = model_default_seed
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": actual_text},
],
}
]
prompt_full = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
inputs = processor(
text=[prompt_full],
images=[image],
return_tensors="pt",
padding=True,
min_pixels=VLM_MIN_IMAGE_SIZE,
max_pixels=VLM_MAX_IMAGE_SIZE,
).to(device)
streamer = TextIteratorStreamer(
processor, skip_prompt=True, skip_special_tokens=True
)
# Set random seed if specified
if actual_seed is not None:
torch.manual_seed(actual_seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(actual_seed)
# Build generation kwargs with resolved parameters
generation_kwargs = {
**inputs,
"streamer": streamer,
"max_new_tokens": actual_max_new_tokens,
"do_sample": actual_do_sample,
"temperature": actual_temperature,
"top_p": actual_top_p,
"min_p": actual_min_p,
"top_k": actual_top_k,
"repetition_penalty": actual_repetition_penalty,
}
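    # Note: when actual_do_sample is False, transformers performs greedy decoding
    # and the sampling parameters (temperature, top_p, min_p, top_k) are ignored.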
    # Add presence_penalty only if it is set and the loaded model supports it
    # (currently no model does when generating via transformers)
if actual_presence_penalty is not None and model_supports_presence_penalty:
generation_kwargs["presence_penalty"] = actual_presence_penalty
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
buffer = ""
line_buffer = "" # Accumulate text for the current line
for new_text in streamer:
buffer += new_text
buffer = buffer.replace("<|im_end|>", "")
line_buffer += new_text
# Print to console as it streams
print(new_text, end="", flush=True)
# If we hit a newline, report the entire accumulated line to GUI
if REPORT_VLM_OUTPUTS_TO_GUI and "\n" in new_text:
# Split by newline to handle the line(s) we just completed
parts = line_buffer.split("\n")
# Report all complete lines (everything except the last part which may be incomplete)
for line in parts[:-1]:
if line.strip(): # Only report non-empty lines
gr.Info(line, duration=2)
# Keep the last part (after the last newline) for the next line
line_buffer = parts[-1] if parts else ""
    # Print a final newline after streaming completes
    print()
# Return the complete text only at the end
return buffer
full_page_ocr_vlm_prompt = """Spot all the text in the image at line-level, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': 'identified text'}, ...].
IMPORTANT: Extract each horizontal line of text separately. Do NOT combine multiple lines into paragraphs. Each line that appears on a separate horizontal row in the image should be a separate entry.
Rules:
- Each line must be on a separate horizontal row in the image
- Even if a sentence is split over multiple horizontal lines, it should be split into separate entries (one per line)
- If text spans multiple horizontal lines, split it into separate entries (one per line)
- Do NOT combine lines that appear on different horizontal rows
- Each bounding box should tightly fit around a single horizontal line of text
- Empty lines should be skipped
# Only return valid JSON, no additional text or explanation."""
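# Illustrative (hypothetical) model output for the line-level prompt above:
# [{'bb': [34, 52, 412, 74], 'text': 'Invoice #1234'},
#  {'bb': [34, 80, 298, 100], 'text': 'Date: 01/02/2024'}]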
full_page_ocr_people_vlm_prompt = """Spot all photos of people's faces in the image, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[PERSON]'}, ...].
Always return the JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[PERSON]'}, ...].
Rules:
- Each photo of a person's face must be a separate entry.
- Do NOT combine multiple photos into a single entry.
- Each photo of a person's face that appears in the image should be a separate entry.
- 'text' should always be exactly '[PERSON]'.
- Do NOT include any other text or information in the JSON.
- If there are no photos of people's faces in the image, return an empty JSON array.
# Only return valid JSON, no additional text or explanation."""
full_page_ocr_signature_vlm_prompt = """Spot all signatures in the image, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[SIGNATURE]'}, ...].
Always return the JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[SIGNATURE]'}, ...].
Rules:
- Each signature must be a separate entry.
- Do NOT combine multiple signatures into a single entry.
- Each signature that appears in the image should be a separate entry.
- 'text' should always be exactly '[SIGNATURE]'.
- Do NOT include any other text or information in the JSON.
- If there are no signatures in the image, return an empty JSON array.
# Only return valid JSON, no additional text or explanation."""
# Test for word-level OCR with VLMs - makes some mistakes but not bad
# full_page_ocr_vlm_prompt = """Spot all the text in the image at word-level, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': 'identified word'}, ...].
# IMPORTANT: Extract each word in the image separately. Do NOT combine words into longer fragments, sentences, or paragraphs. Each entry must correspond to a single, individual word as visually separated in the image.
# Rules:
# - Each entry should correspond to a single distinct word (not groups of words, not whole lines).
# - For each word, provide a tight bounding box [x1, y1, x2, y2] around just that word.
# - Do not merge words. Do not split words into letters. Only return one entry per word.
# - Maintain the order of words as they appear spatially from top to bottom, left to right.
# - Skip any empty or whitespace-only entries.
# - Do not include extraneous text, explanations, or formatting beyond the required JSON.
# Only return valid JSON, no additional text or explanation."""