| | """ |
| | Oculus Processor |
| | |
| | Handles image and text preprocessing for the Oculus model. |
| | """ |
| |
|
| | from typing import Optional, Union, List, Dict, Any |
| | from PIL import Image |
| | import numpy as np |
| |
|
| | from transformers import ProcessorMixin, BatchFeature |
| | from transformers.image_utils import ImageInput |
| |
|
| |
|
class OculusProcessor(ProcessorMixin):
    """
    Processor for the Oculus model.

    Wraps an image processor and a tokenizer behind a single ``__call__``
    and prepends Oculus-specific control tokens (output mode, optional
    reasoning marker) to every text prompt before tokenization.

    Either component may be ``None``; the methods of this class then fall
    back to passing the raw images / formatted prompt through unchanged.

    Usage:
        ```python
        processor = OculusProcessor.from_pretrained("OceanirAI/oculus-0.2")

        # Process inputs
        inputs = processor(
            images=image,
            text="What is in this image?",
            mode="text",
            return_tensors="pt"
        )
        ```
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        **kwargs
    ):
        """
        Store the two components and the special-token configuration.

        Args:
            image_processor: Object called on images in ``__call__``; may be None.
            tokenizer: Object called on prompts in ``__call__``; may be None.
            **kwargs: Optional overrides for ``thinking_token``,
                ``thinking_end_token``, ``focus_token`` and ``focus_end_token``.
                Any other keys are ignored.
        """
        # NOTE(review): some transformers releases type-check these arguments
        # in ProcessorMixin.__init__ and may reject None -- confirm against
        # the pinned transformers version.
        super().__init__(image_processor, tokenizer)
        self.image_processor = image_processor
        self.tokenizer = tokenizer

        # Markers delimiting the reasoning trace and focus regions.
        self.thinking_token = kwargs.get("thinking_token", "<think>")
        self.thinking_end_token = kwargs.get("thinking_end_token", "</think>")
        self.focus_token = kwargs.get("focus_token", "<focus>")
        self.focus_end_token = kwargs.get("focus_end_token", "</focus>")

        # Prompt prefix emitted for each supported output mode.
        self.mode_tokens = {
            "text": "<text>",
            "point": "<point>",
            "box": "<box>",
            "polygon": "<polygon>",
        }

    def __call__(
        self,
        images: ImageInput = None,
        text: Optional[Union[str, List[str]]] = None,
        mode: str = "text",
        think: bool = False,
        return_tensors: Optional[str] = None,
        **kwargs
    ) -> BatchFeature:
        """
        Process images and text for the Oculus model.

        Args:
            images: Input image(s); skipped when None.
            text: Input text prompt(s); skipped when None.
            mode: Output mode ("text", "point", "box", "polygon").
            think: Enable reasoning mode (prepends the thinking token).
            return_tensors: Tensor format ("pt", "np", etc.).
            **kwargs: Forwarded to the tokenizer. ``padding`` and
                ``truncation`` default to True but can be overridden here.

        Returns:
            BatchFeature holding the image features, the text features, and
            the plain (non-tensor) ``mode`` and ``think`` values.
        """
        if images is None:
            image_features = {}
        elif self.image_processor is not None:
            image_features = self.image_processor(images, return_tensors=return_tensors)
        else:
            # No image processor: pass the images through, wrapping a single
            # PIL image in a list.
            # NOTE(review): raw PIL images presumably cannot be converted when
            # return_tensors is set -- confirm whether this path needs one.
            if isinstance(images, Image.Image):
                images = [images]
            image_features = {"pixel_values": images}

        text_features = {}
        # Values that must never go through tensor conversion (strings/flags).
        plain_values = {}

        if text is not None:
            processed_text = self._format_prompt(text, mode, think)

            if self.tokenizer is not None:
                # Defaults first so that caller-supplied padding/truncation
                # override them instead of raising a duplicate-keyword error.
                tokenizer_kwargs = {"padding": True, "truncation": True, **kwargs}
                text_features = self.tokenizer(
                    processed_text,
                    return_tensors=return_tensors,
                    **tokenizer_kwargs
                )
            else:
                plain_values["text"] = processed_text

        plain_values["mode"] = mode
        plain_values["think"] = think

        features = BatchFeature(
            data={**image_features, **text_features},
            tensor_type=return_tensors
        )
        # Attached after construction: BatchFeature tries to convert every
        # value it is given to `tensor_type`, which cannot work for strings.
        for key, value in plain_values.items():
            features[key] = value
        return features

    def _format_prompt(
        self,
        text: Union[str, List[str]],
        mode: str,
        think: bool
    ) -> Union[str, List[str]]:
        """
        Format prompt(s) with special tokens.

        Each prompt becomes ``"<mode token> <thinking token> prompt"``, joined
        by single spaces. The mode token is omitted when ``mode`` is not a key
        of ``self.mode_tokens``; the thinking token is omitted unless ``think``
        is true. A ``str`` input yields a ``str``; any other iterable yields a
        list.
        """

        def format_single(t: str) -> str:
            parts = []

            if mode in self.mode_tokens:
                parts.append(self.mode_tokens[mode])

            if think:
                parts.append(self.thinking_token)

            parts.append(t)

            return " ".join(parts)

        if isinstance(text, str):
            return format_single(text)
        return [format_single(t) for t in text]

    def decode(
        self,
        token_ids,
        skip_special_tokens: bool = True,
        **kwargs
    ) -> "tuple[str, Optional[str]]":
        """
        Decode token IDs to text and split off the reasoning trace.

        Args:
            token_ids: IDs handed to ``tokenizer.decode``; stringified with
                ``str()`` when no tokenizer is set.
            skip_special_tokens: Forwarded to ``tokenizer.decode``.
            **kwargs: Forwarded to ``tokenizer.decode``.

        Returns:
            ``(text, thinking_trace)``. When the decoded string contains the
            thinking token followed by the thinking end token,
            ``thinking_trace`` is the stripped text between them and ``text``
            is the stripped remainder after the end token. Otherwise
            ``thinking_trace`` is None and ``text`` is the full decoded string.
        """
        if self.tokenizer is not None:
            text = self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs)
        else:
            text = str(token_ids)

        # NOTE(review): if the thinking markers are registered as special
        # tokens, skip_special_tokens=True presumably removes them before this
        # split -- confirm against the tokenizer configuration.
        thinking_trace = None
        open_at = text.find(self.thinking_token)
        if open_at != -1:
            trace_start = open_at + len(self.thinking_token)
            # Search only after the opening marker so a stray end marker
            # earlier in the string cannot produce an inverted slice.
            close_at = text.find(self.thinking_end_token, trace_start)
            if close_at != -1:
                thinking_trace = text[trace_start:close_at].strip()
                text = text[close_at + len(self.thinking_end_token):].strip()

        return text, thinking_trace

    def batch_decode(
        self,
        token_ids,
        skip_special_tokens: bool = True,
        **kwargs
    ) -> "List[tuple[str, Optional[str]]]":
        """
        Decode a batch of token ID sequences.

        Returns:
            One ``(text, thinking_trace)`` tuple per sequence, as produced by
            ``decode``.
        """
        return [
            self.decode(ids, skip_special_tokens=skip_special_tokens, **kwargs)
            for ids in token_ids
        ]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
        """
        Load the processor from a pretrained checkpoint (best effort).

        Loads the image processor and tokenizer through the Auto classes,
        passing ``**kwargs`` to both loaders and to the constructor. If
        anything in that sequence raises an ``Exception``, a warning is logged
        and a processor without components is built from ``**kwargs`` alone.
        """
        import logging

        try:
            from transformers import AutoImageProcessor, AutoTokenizer

            image_processor = AutoImageProcessor.from_pretrained(
                pretrained_model_name_or_path, **kwargs
            )
            tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs
            )
            return cls(image_processor=image_processor, tokenizer=tokenizer, **kwargs)
        except Exception as exc:
            # Deliberate best-effort fallback, but never silent, and
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            logging.getLogger(__name__).warning(
                "Could not load processor components from %r (%s); "
                "falling back to a processor without image processor/tokenizer.",
                pretrained_model_name_or_path,
                exc,
            )
            return cls(**kwargs)

    def save_pretrained(self, save_directory: str, **kwargs):
        """
        Save the processor components to a directory.

        Calls ``save_pretrained(save_directory)`` on the image processor and
        the tokenizer, skipping whichever is None. ``**kwargs`` is accepted
        but not used.
        """
        if self.image_processor is not None:
            self.image_processor.save_pretrained(save_directory)
        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(save_directory)
| |
|