import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

MODEL_ID = "ducviet00/Florence-2-large-hf"

# Global variables for lazy loading
_model = None
_processor = None


def _load_model():
    """Load the model and processor lazily on first use."""
    global _model, _processor
    if _model is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        print(f"Loading model {MODEL_ID} on {device}...")
        # Load the model in the chosen dtype and move it to the target device.
        _model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, torch_dtype=torch_dtype, trust_remote_code=True
        ).to(device)
        _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        print("Model loaded successfully!")
    return _model, _processor


def get_task_response(task_prompt: str, image: Image.Image, text_input=None):
    """Return the parsed response for the given Florence-2 task.

    task_prompt is a Florence-2 task token, e.g.:
        '<CAPTION>'
        '<OD>'
        '<OCR>'
    """
    # Lazy load model only when needed
    model, processor = _load_model()

    if text_input is None:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input

    # Ensure image is in RGB mode
    if image.mode != "RGB":
        image = image.convert("RGB")

    if processor is None:
        raise ValueError("processor is None")

    # Tokenize the prompt and preprocess the image
    inputs = processor(text=prompt, images=image, return_tensors="pt")

    # Move inputs to the model's device; cast floating-point tensors
    # (pixel_values) to the model's dtype so fp16 inference works on CUDA.
    device = next(model.parameters()).device
    dtype = next(model.parameters()).dtype
    inputs = {
        k: v.to(device=device, dtype=dtype) if v.is_floating_point() else v.to(device)
        for k, v in inputs.items()
    }

    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

    # Post-process the raw generation into the task-specific structured output
    parsed_answer = processor.post_process_generation(
        generated_text, task=task_prompt, image_size=(image.width, image.height)
    )
    return parsed_answer[task_prompt]
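

# --- Usage sketch ---
# A minimal example of calling get_task_response, assuming a local image file
# named "example.jpg" (a hypothetical placeholder path). The task tokens shown
# here ('<CAPTION>', '<CAPTION_TO_PHRASE_GROUNDING>') are standard Florence-2
# tasks; substitute whichever task token your application needs.
if __name__ == "__main__":
    sample = Image.open("example.jpg")

    # Plain captioning: no extra text input is needed.
    caption = get_task_response("<CAPTION>", sample)
    print(caption)

    # Phrase grounding takes an additional text_input describing what to locate.
    grounding = get_task_response(
        "<CAPTION_TO_PHRASE_GROUNDING>", sample, text_input="a photo of a dog"
    )
    print(grounding)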