import os
import sys
from threading import Thread

import gradio as gr
import spaces
from PIL import Image

from tools.config import (
    LOAD_PADDLE_AT_STARTUP,
    MAX_NEW_TOKENS,
    MAX_SPACES_GPU_RUN_TIME,
    PADDLE_DET_DB_UNCLIP_RATIO,
    PADDLE_FONT_PATH,
    PADDLE_MODEL_PATH,
    PADDLE_USE_TEXTLINE_ORIENTATION,
    QUANTISE_VLM_MODELS,
    REPORT_VLM_OUTPUTS_TO_GUI,
    SHOW_VLM_MODEL_OPTIONS,
    USE_FLASH_ATTENTION,
    VLM_DEFAULT_DO_SAMPLE,
    VLM_DEFAULT_MIN_P,
    VLM_DEFAULT_PRESENCE_PENALTY,
    VLM_DEFAULT_REPETITION_PENALTY,
    VLM_DEFAULT_TEMPERATURE,
    VLM_DEFAULT_TOP_K,
    VLM_DEFAULT_TOP_P,
    VLM_MAX_IMAGE_SIZE,
    VLM_MIN_IMAGE_SIZE,
    VLM_SEED,
)
from tools.helper_functions import get_system_font_path

if LOAD_PADDLE_AT_STARTUP is True:
    # Set PaddleOCR environment variables BEFORE importing PaddleOCR.
    # This ensures fonts are configured before the package loads.

    # Set the PaddleOCR model directory environment variable (only if specified).
    if PADDLE_MODEL_PATH and PADDLE_MODEL_PATH.strip():
        os.environ["PADDLEOCR_MODEL_DIR"] = PADDLE_MODEL_PATH
        print(f"Setting PaddleOCR model path to: {PADDLE_MODEL_PATH}")
    else:
        print("Using default PaddleOCR model storage location")

    # Point PaddleOCR at a system font instead of letting it download
    # simfang.ttf/PingFang-SC-Regular.ttf. This MUST be set before importing
    # PaddleOCR to prevent font downloads.
    if (
        PADDLE_FONT_PATH
        and PADDLE_FONT_PATH.strip()
        and os.path.exists(PADDLE_FONT_PATH)
    ):
        os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = PADDLE_FONT_PATH
        print(f"Setting PaddleOCR font path to configured font: {PADDLE_FONT_PATH}")
    else:
        system_font_path = get_system_font_path()
        if system_font_path:
            os.environ["PADDLE_PDX_LOCAL_FONT_FILE_PATH"] = system_font_path
            print(f"Setting PaddleOCR font path to system font: {system_font_path}")
        else:
            print(
                "Warning: No suitable system font found. PaddleOCR may download default fonts."
            )

    try:
        from paddleocr import PaddleOCR

        print("PaddleOCR imported successfully")

        # Default paddle configuration (no per-call overrides at startup)
        paddle_kwargs = {
            "det_db_unclip_ratio": PADDLE_DET_DB_UNCLIP_RATIO,
            "use_textline_orientation": PADDLE_USE_TEXTLINE_ORIENTATION,
            "use_doc_orientation_classify": False,
            "use_doc_unwarping": False,
            "lang": "en",
        }

        try:
            PaddleOCR(**paddle_kwargs)
        except Exception as e:
            # Handle DLL loading errors (common on Windows with the GPU build)
            if (
                "WinError 127" in str(e)
                or "could not be found" in str(e).lower()
                or "dll" in str(e).lower()
            ):
                print(
                    f"Warning: GPU initialization failed (likely missing CUDA/cuDNN dependencies): {e}"
                )
                print("PaddleOCR will not be available. To fix GPU issues:")
                print("1. Install Visual C++ Redistributables (latest version)")
                print("2. Ensure CUDA runtime libraries are in your PATH")
                print(
                    "3. Or reinstall the paddlepaddle CPU version: pip install paddlepaddle"
                )
                raise ImportError(
                    f"Error initializing PaddleOCR: {e}. Please install it using 'pip install paddleocr paddlepaddle' in your python environment and retry."
                )
            else:
                raise
    except ImportError:
        PaddleOCR = None
        print(
            "PaddleOCR not found. Please install it using 'pip install paddleocr paddlepaddle' in your python environment and retry."
        )
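
    # Minimal usage sketch (assumption: the PaddleOCR 3.x `predict` API, and a
    # hypothetical local file "page.png"); the real pipeline builds its own instance:
    #
    #     ocr = PaddleOCR(**paddle_kwargs)
    #     for res in ocr.predict("page.png"):
    #         print(res)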

# Define module-level defaults for model parameters (always available for import).
# These will be overridden inside the SHOW_VLM_MODEL_OPTIONS block if enabled.
# Each VLM_DEFAULT_* config value may be None, in which case the per-model
# defaults set below (or the function-level fallbacks) apply.
model_default_prompt = """Read all the text in the image."""
model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
model_default_top_p = VLM_DEFAULT_TOP_P
model_default_min_p = VLM_DEFAULT_MIN_P
model_default_top_k = VLM_DEFAULT_TOP_K
model_default_temperature = VLM_DEFAULT_TEMPERATURE
model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
model_default_max_new_tokens = int(MAX_NEW_TOKENS)
model_default_seed = VLM_SEED

if SHOW_VLM_MODEL_OPTIONS is True:
    import torch
    from huggingface_hub import snapshot_download
    from transformers import (
        AutoModelForCausalLM,
        AutoProcessor,
        BitsAndBytesConfig,
        Qwen2_5_VLForConditionalGeneration,
        Qwen3VLForConditionalGeneration,
        TextIteratorStreamer,
    )

    # Only these two names are new here; the generation defaults were already
    # imported at module level.
    from tools.config import MODEL_CACHE_PATH, SELECTED_MODEL

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("torch.__version__ =", torch.__version__)
    print("torch.version.cuda =", torch.version.cuda)
    print("cuda available:", torch.cuda.is_available())
    print("cuda device count:", torch.cuda.device_count())
    if torch.cuda.is_available():
        print("current device:", torch.cuda.current_device())
        print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
    print("Using device:", device)

    CACHE_PATH = MODEL_CACHE_PATH
    os.makedirs(CACHE_PATH, exist_ok=True)

    # Initialize model and processor variables
    processor = None
    model = None

    # Model-specific generation parameters. If config values are provided, use
    # them; otherwise leave as None so the per-model defaults set below apply.
    model_default_prompt = """Read all the text in the image."""
    model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
    model_default_top_p = VLM_DEFAULT_TOP_P
    model_default_min_p = VLM_DEFAULT_MIN_P
    model_default_top_k = VLM_DEFAULT_TOP_K
    model_default_temperature = VLM_DEFAULT_TEMPERATURE
    model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
    model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
    model_default_max_new_tokens = int(MAX_NEW_TOKENS)
    # Track which models support presence_penalty (only Qwen3-VL models currently)
    model_supports_presence_penalty = False
    model_default_seed = VLM_SEED

    if USE_FLASH_ATTENTION is True:
        attn_implementation = "flash_attention_2"
    else:
        attn_implementation = "eager"
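    # Note: "flash_attention_2" requires the separate flash-attn package and a
    # supported CUDA GPU; transformers raises at model load time if it is missing.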

    # Setup quantisation config if enabled
    quantization_config = None
    if QUANTISE_VLM_MODELS is True:
        if not torch.cuda.is_available():
            print(
                "Warning: 4-bit quantisation requires CUDA, but CUDA is not available."
            )
            print("Falling back to loading models without quantisation")
            quantization_config = None
        else:
            try:
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4",
                )
                print("4-bit quantisation enabled using bitsandbytes")
            except Exception as e:
                print(f"Warning: Could not setup bitsandbytes quantisation: {e}")
                print("Falling back to loading models without quantisation")
                quantization_config = None
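
    # Rough guide (estimate, not measured here): nf4 stores weights at ~0.5 bytes
    # per parameter, so a 3B-parameter model needs roughly 2 GB of VRAM for weights
    # versus roughly 6 GB in fp16, before activation memory.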
| print(f"Loading vision model: {SELECTED_MODEL}") | |
| # Load only the selected model based on configuration | |
| if SELECTED_MODEL == "Nanonets-OCR2-3B": | |
| MODEL_ID = "nanonets/Nanonets-OCR2-3B" | |
| processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) | |
| load_kwargs = { | |
| "trust_remote_code": True, | |
| } | |
| if quantization_config is not None: | |
| load_kwargs["quantization_config"] = quantization_config | |
| load_kwargs["device_map"] = "auto" | |
| else: | |
| load_kwargs["torch_dtype"] = torch.float16 | |
| model = Qwen2_5_VLForConditionalGeneration.from_pretrained( | |
| MODEL_ID, **load_kwargs | |
| ).eval() | |
| if quantization_config is None: | |
| model = model.to(device) | |
| model_default_prompt = """Extract the text from the above document as if you were reading it naturally.""" | |
    elif SELECTED_MODEL == "Dots.OCR":
        # Download and patch the Dots.OCR model
        model_path_d_local = snapshot_download(
            repo_id="rednote-hilab/dots.ocr",
            local_dir=os.path.join(CACHE_PATH, "dots.ocr"),
            max_workers=20,
            local_dir_use_symlinks=False,
        )
        config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
        if os.path.exists(config_file_path):
            with open(config_file_path, "r") as f:
                input_code = f.read()
            lines = input_code.splitlines()
            if "class DotsVLProcessor" in input_code and not any(
                "attributes = " in line for line in lines
            ):
                output_lines = []
                for line in lines:
                    output_lines.append(line)
                    if line.strip().startswith("class DotsVLProcessor"):
                        output_lines.append(
                            '    attributes = ["image_processor", "tokenizer"]'
                        )
                with open(config_file_path, "w") as f:
                    f.write("\n".join(output_lines))
                print("Patched configuration_dots.py successfully.")
        sys.path.append(model_path_d_local)
        MODEL_ID = model_path_d_local
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        load_kwargs = {
            "attn_implementation": attn_implementation,
            "device_map": "auto",
            "trust_remote_code": True,
        }
        if quantization_config is not None:
            load_kwargs["quantization_config"] = quantization_config
        else:
            load_kwargs["torch_dtype"] = torch.bfloat16
        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs).eval()
        model_default_prompt = """Extract the text content from this image."""
        model_default_max_new_tokens = MAX_NEW_TOKENS
    elif SELECTED_MODEL == "Qwen3-VL-2B-Instruct":
        MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct"
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        load_kwargs = {
            "device_map": "auto",
            "trust_remote_code": True,
        }
        if quantization_config is not None:
            load_kwargs["quantization_config"] = quantization_config
        else:
            load_kwargs["dtype"] = "auto"
        model = Qwen3VLForConditionalGeneration.from_pretrained(
            MODEL_ID, **load_kwargs
        ).eval()
        model_default_prompt = """Read all the text in the image."""
        model_default_do_sample = False
        model_default_top_p = 0.8
        model_default_min_p = 0.0
        model_default_top_k = 20
        model_default_temperature = 0.7
        model_default_repetition_penalty = 1.0
        model_default_presence_penalty = 1.5
        model_default_max_new_tokens = MAX_NEW_TOKENS
        # I found that presence_penalty doesn't work when using transformers
        model_supports_presence_penalty = False
    elif SELECTED_MODEL == "Qwen3-VL-4B-Instruct":
        MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        load_kwargs = {
            "attn_implementation": attn_implementation,
            "device_map": "auto",
            "trust_remote_code": True,
        }
        if quantization_config is not None:
            load_kwargs["quantization_config"] = quantization_config
        else:
            load_kwargs["dtype"] = "auto"
        model = Qwen3VLForConditionalGeneration.from_pretrained(
            MODEL_ID, **load_kwargs
        ).eval()
        model_default_prompt = """Read all the text in the image."""
        model_default_do_sample = False
        model_default_top_p = 0.8
        model_default_min_p = 0.0
        model_default_top_k = 20
        model_default_temperature = 0.7
        model_default_repetition_penalty = 1.0
        model_default_presence_penalty = 1.5
        model_default_max_new_tokens = MAX_NEW_TOKENS
        # I found that presence_penalty doesn't work when using transformers
        model_supports_presence_penalty = False
    elif SELECTED_MODEL == "Qwen3-VL-8B-Instruct":
        MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        load_kwargs = {
            "attn_implementation": attn_implementation,
            "device_map": "auto",
            "trust_remote_code": True,
        }
        if quantization_config is not None:
            load_kwargs["quantization_config"] = quantization_config
        else:
            load_kwargs["dtype"] = "auto"
        model = Qwen3VLForConditionalGeneration.from_pretrained(
            MODEL_ID, **load_kwargs
        ).eval()
        model_default_prompt = """Read all the text in the image."""
        model_default_do_sample = False
        model_default_top_p = 0.8
        model_default_min_p = 0.0
        model_default_top_k = 20
        model_default_temperature = 0.7
        model_default_repetition_penalty = 1.0
        model_default_presence_penalty = 1.5
        model_default_max_new_tokens = MAX_NEW_TOKENS
        # I found that presence_penalty doesn't work when using transformers
        model_supports_presence_penalty = False
    elif SELECTED_MODEL == "Qwen3-VL-30B-A3B-Instruct":
        MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
        from transformers import Qwen3VLMoeForConditionalGeneration

        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        load_kwargs = {
            "attn_implementation": attn_implementation,
            "device_map": "auto",
            "trust_remote_code": True,
        }
        # Visual-token budget for the image processor: the compression ratio is 32
        # for Qwen3-VL, so the number of visual tokens for a single image can be
        # capped at 256-1280:
        # processor.image_processor.size = {
        #     "longest_edge": VLM_MAX_IMAGE_SIZE,
        #     "shortest_edge": VLM_MIN_IMAGE_SIZE,
        # }
        if quantization_config is not None:
            load_kwargs["quantization_config"] = quantization_config
        else:
            load_kwargs["dtype"] = "auto"
        model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
            MODEL_ID, **load_kwargs
        ).eval()
        model_default_prompt = """Read all the text in the image."""
        model_default_do_sample = False
        model_default_top_p = 0.8
        model_default_min_p = 0.0
        model_default_top_k = 20
        model_default_temperature = 0.7
        model_default_repetition_penalty = 1.0
        model_default_presence_penalty = 1.5
        model_default_max_new_tokens = MAX_NEW_TOKENS
        # I found that presence_penalty doesn't work when using transformers
        model_supports_presence_penalty = False
    elif SELECTED_MODEL == "PaddleOCR-VL":
        MODEL_ID = "PaddlePaddle/PaddleOCR-VL"
        load_kwargs = {
            "trust_remote_code": True,
        }
        if quantization_config is not None:
            load_kwargs["quantization_config"] = quantization_config
            load_kwargs["device_map"] = "auto"
        else:
            load_kwargs["torch_dtype"] = torch.bfloat16
        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs).eval()
        if quantization_config is None:
            model = model.to(device)
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        model_default_prompt = """OCR:"""
        model_default_max_new_tokens = MAX_NEW_TOKENS
    elif SELECTED_MODEL == "None":
        model = None
        processor = None
    else:
        raise ValueError(
            f"Invalid model selected: {SELECTED_MODEL}. Valid options are: "
            "Nanonets-OCR2-3B, Dots.OCR, Qwen3-VL-2B-Instruct, Qwen3-VL-4B-Instruct, "
            "Qwen3-VL-8B-Instruct, Qwen3-VL-30B-A3B-Instruct, PaddleOCR-VL, None"
        )

    # Override model defaults with user-provided config values if they are set.
    # Priority: user config value > model default.
    if VLM_DEFAULT_DO_SAMPLE is not None:
        model_default_do_sample = VLM_DEFAULT_DO_SAMPLE
    if VLM_DEFAULT_TOP_P is not None:
        model_default_top_p = VLM_DEFAULT_TOP_P
    if VLM_DEFAULT_MIN_P is not None:
        model_default_min_p = VLM_DEFAULT_MIN_P
    if VLM_DEFAULT_TOP_K is not None:
        model_default_top_k = VLM_DEFAULT_TOP_K
    if VLM_DEFAULT_TEMPERATURE is not None:
        model_default_temperature = VLM_DEFAULT_TEMPERATURE
    if VLM_DEFAULT_REPETITION_PENALTY is not None:
        model_default_repetition_penalty = VLM_DEFAULT_REPETITION_PENALTY
    if VLM_DEFAULT_PRESENCE_PENALTY is not None:
        model_default_presence_penalty = VLM_DEFAULT_PRESENCE_PENALTY
    if VLM_SEED is not None:
        model_default_seed = VLM_SEED

    print(f"Successfully loaded {SELECTED_MODEL}")

def extract_text_from_image_vlm(
    text: str,
    image: Image.Image,
    max_new_tokens: int = None,
    temperature: float = None,
    top_p: float = None,
    min_p: float = None,
    top_k: int = None,
    repetition_penalty: float = None,
    do_sample: bool = None,
    presence_penalty: float = None,
    seed: int = None,
    model_default_prompt: str = None,
):
    """
    Generates responses using the configured vision model for image input.
    Streams text to the console and returns the complete text only at the end.

    Uses model-specific defaults if they were set during model initialisation,
    falling back to function arguments if provided, and finally to sensible
    general defaults if neither is available.

    Args:
        text (str): The text prompt to send to the vision model. If empty and the
            model has a default prompt, the model default will be used.
        image (Image.Image): The PIL Image to process. Must not be None.
        max_new_tokens (int, optional): Maximum number of new tokens to generate.
            Defaults to the model-specific value or MAX_NEW_TOKENS from config.
        temperature (float, optional): Sampling temperature for generation.
            Defaults to the model-specific value (0.7 for Qwen3-VL models) or 0.1.
        top_p (float, optional): Nucleus sampling parameter (top-p).
            Defaults to the model-specific value (0.8 for Qwen3-VL models) or 0.8.
        min_p (float, optional): Minimum probability threshold for token sampling.
            Defaults to the model-specific value or 0.0.
        top_k (int, optional): Top-k sampling parameter.
            Defaults to the model-specific value (20 for Qwen3-VL models) or 20.
        repetition_penalty (float, optional): Penalty for token repetition.
            Defaults to the model-specific value (1.0 for Qwen3-VL models) or 1.0.
        do_sample (bool, optional): If True, use sampling; if False, use greedy
            decoding. If None, defaults to False (greedy) for Qwen3-VL models,
            or True (sampling) for other models.
        presence_penalty (float, optional): Penalty for token presence.
            Defaults to the model-specific value (1.5 for Qwen3-VL models) or None.
            Note: not all models support this parameter.
        seed (int, optional): Random seed for generation. If None, uses VLM_SEED
            from config if set; otherwise no seed is set (non-deterministic).
        model_default_prompt (str, optional): The default prompt to use if no text
            is provided. Defaults to the model-specific value, or
            "Read all the text in the image."

    Returns:
        str: The complete generated text response from the model.
    """
    if image is None:
        return "Please upload an image."

    # Resolve each parameter with priority:
    # function argument (if not None) > model default > general default.

    # Text/prompt handling
    if text and text.strip():
        actual_text = text
    elif model_default_prompt is not None:
        actual_text = model_default_prompt
    else:
        actual_text = "Read all the text in the image."  # General default

    # max_new_tokens
    if max_new_tokens is not None:
        actual_max_new_tokens = max_new_tokens
    elif model_default_max_new_tokens is not None:
        actual_max_new_tokens = model_default_max_new_tokens
    else:
        actual_max_new_tokens = MAX_NEW_TOKENS  # General default (from config)

    # temperature
    if temperature is not None:
        actual_temperature = temperature
    elif model_default_temperature is not None:
        actual_temperature = model_default_temperature
    else:
        actual_temperature = 0.1

    # top_p
    if top_p is not None:
        actual_top_p = top_p
    elif model_default_top_p is not None:
        actual_top_p = model_default_top_p
    else:
        actual_top_p = 0.8

    # min_p
    if min_p is not None:
        actual_min_p = min_p
    elif model_default_min_p is not None:
        actual_min_p = model_default_min_p
    else:
        actual_min_p = 0.0

    # top_k
    if top_k is not None:
        actual_top_k = top_k
    elif model_default_top_k is not None:
        actual_top_k = model_default_top_k
    else:
        actual_top_k = 20

    # repetition_penalty
    if repetition_penalty is not None:
        actual_repetition_penalty = repetition_penalty
    elif model_default_repetition_penalty is not None:
        actual_repetition_penalty = model_default_repetition_penalty
    else:
        actual_repetition_penalty = 1.0

    # do_sample
    if do_sample is not None:
        actual_do_sample = do_sample
    elif model_default_do_sample is not None:
        actual_do_sample = model_default_do_sample
    else:
        actual_do_sample = True

    # presence_penalty: function arg > model default > None
    actual_presence_penalty = None
    if presence_penalty is not None:
        actual_presence_penalty = presence_penalty
    elif model_default_presence_penalty is not None:
        actual_presence_penalty = model_default_presence_penalty

    # seed: function arg > model default
    actual_seed = None
    if seed is not None:
        actual_seed = seed
    elif model_default_seed is not None:
        actual_seed = model_default_seed

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": actual_text},
            ],
        }
    ]
    prompt_full = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
        min_pixels=VLM_MIN_IMAGE_SIZE,
        max_pixels=VLM_MAX_IMAGE_SIZE,
    ).to(device)

    streamer = TextIteratorStreamer(
        processor, skip_prompt=True, skip_special_tokens=True
    )

    # Set the random seed if specified
    if actual_seed is not None:
        torch.manual_seed(actual_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(actual_seed)
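    # Seeding only matters when do_sample=True; greedy decoding is already
    # deterministic for a fixed model and input.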

    # Build generation kwargs with the resolved parameters
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": actual_max_new_tokens,
        "do_sample": actual_do_sample,
        "temperature": actual_temperature,
        "top_p": actual_top_p,
        "min_p": actual_min_p,
        "top_k": actual_top_k,
        "repetition_penalty": actual_repetition_penalty,
    }
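    # Note: `min_p` needs a reasonably recent transformers release (added around
    # v4.41, to the best of my knowledge); older versions may reject it as an
    # unknown generation argument.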

    # Add presence_penalty if it's set and the model supports it
    # (only Qwen3-VL models currently support presence_penalty)
    if actual_presence_penalty is not None and model_supports_presence_penalty:
        generation_kwargs["presence_penalty"] = actual_presence_penalty

    # Run generation on a background thread so the streamer can be consumed here
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    line_buffer = ""  # Accumulate text for the current line
    for new_text in streamer:
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")
        line_buffer += new_text

        # Print to the console as the text streams
        print(new_text, end="", flush=True)

        # If we hit a newline, report the accumulated line(s) to the GUI
        if REPORT_VLM_OUTPUTS_TO_GUI and "\n" in new_text:
            # Split by newline to handle the line(s) just completed
            parts = line_buffer.split("\n")
            # Report all complete lines (everything except the last part,
            # which may be incomplete)
            for line in parts[:-1]:
                if line.strip():  # Only report non-empty lines
                    gr.Info(line, duration=2)
            # Keep the text after the last newline for the next line
            line_buffer = parts[-1] if parts else ""
        # time.sleep(0.01)

    # Print a final newline after streaming is complete
    print()

    # Return the complete text only at the end
    return buffer
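
# Minimal usage sketch (assumes SHOW_VLM_MODEL_OPTIONS was enabled so that a model
# and processor are loaded, and a hypothetical local image "page.png"):
#
#     from PIL import Image
#     page = Image.open("page.png").convert("RGB")
#     extracted = extract_text_from_image_vlm("Read all the text in the image.", page)
#     print(extracted)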

full_page_ocr_vlm_prompt = """Spot all the text in the image at line-level, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': 'identified text'}, ...].
IMPORTANT: Extract each horizontal line of text separately. Do NOT combine multiple lines into paragraphs. Each line that appears on a separate horizontal row in the image should be a separate entry.
Rules:
- Each line must be on a separate horizontal row in the image
- If a sentence or any other text spans multiple horizontal lines, split it into separate entries (one per line)
- Do NOT combine lines that appear on different horizontal rows
- Each bounding box should tightly fit around a single horizontal line of text
- Empty lines should be skipped
# Only return valid JSON, no additional text or explanation."""
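
# Illustrative shape of the expected line-level output (coordinates and text invented):
# [{"bb": [34, 20, 410, 44], "text": "Invoice #1024"},
#  {"bb": [34, 52, 388, 76], "text": "Date: 12 March 2024"}]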

full_page_ocr_people_vlm_prompt = """Spot all photos of people's faces in the image, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[PERSON]'}, ...].
Always return the JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[PERSON]'}, ...].
Rules:
- Each photo of a person's face that appears in the image must be a separate entry.
- Do NOT combine multiple photos into a single entry.
- 'text' should always be exactly '[PERSON]'.
- Do NOT include any other text or information in the JSON.
- If there are no photos of people's faces in the image, return an empty JSON array.
# Only return valid JSON, no additional text or explanation."""

full_page_ocr_signature_vlm_prompt = """Spot all signatures in the image, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[SIGNATURE]'}, ...].
Always return the JSON format as [{'bb': [x1, y1, x2, y2], 'text': '[SIGNATURE]'}, ...].
Rules:
- Each signature that appears in the image must be a separate entry.
- Do NOT combine multiple signatures into a single entry.
- 'text' should always be exactly '[SIGNATURE]'.
- Do NOT include any other text or information in the JSON.
- If there are no signatures in the image, return an empty JSON array.
# Only return valid JSON, no additional text or explanation."""

# Test for word-level OCR with VLMs - makes some mistakes but not bad
# full_page_ocr_vlm_prompt = """Spot all the text in the image at word-level, and output in JSON format as [{'bb': [x1, y1, x2, y2], 'text': 'identified word'}, ...].
# IMPORTANT: Extract each word in the image separately. Do NOT combine words into longer fragments, sentences, or paragraphs. Each entry must correspond to a single, individual word as visually separated in the image.
# Rules:
# - Each entry should correspond to a single distinct word (not groups of words, not whole lines).
# - For each word, provide a tight bounding box [x1, y1, x2, y2] around just that word.
# - Do not merge words. Do not split words into letters. Only return one entry per word.
# - Maintain the order of words as they appear spatially from top to bottom, left to right.
# - Skip any empty or whitespace-only entries.
# - Do not include extraneous text, explanations, or formatting beyond the required JSON.
# Only return valid JSON, no additional text or explanation."""