# coding=utf-8
# Copyright 2025 The SMB Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Configuration classes for the SMB vision encoder, predictor and composite model."""

from transformers.configuration_utils import PretrainedConfig


def _resolve_sub_config(config_cls, value):
    """Turn ``value`` into a sub-configuration object.

    Args:
        config_cls: Config class used to build the object when ``value`` is
            ``None`` or a ``dict``.
        value: ``None`` (build ``config_cls`` with its defaults), a ``dict`` of
            keyword arguments for ``config_cls``, or an already constructed
            ``PretrainedConfig`` instance (returned unchanged).

    Returns:
        The resolved configuration object.

    Raises:
        TypeError: If ``value`` is none of the supported kinds.
    """
    if value is None:
        return config_cls()
    if isinstance(value, dict):
        return config_cls(**value)
    if isinstance(value, PretrainedConfig):
        # Already a config object: keep it rather than silently dropping it.
        return value
    raise TypeError(
        f"Expected None, a dict or a PretrainedConfig instance for "
        f"{config_cls.__name__}, got {type(value).__name__}."
    )


class SMBVisionConfig(PretrainedConfig):
    """Configuration for the ``smb_vision_encoder`` model type.

    Each constructor argument is stored on the instance under the same name;
    any remaining keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.

    NOTE(review): the parameter meanings below are inferred from their names
    only; confirm them against the modeling code.

    Args:
        depth: Presumably the number of encoder layers. Defaults to 27.
        hidden_size: Presumably the encoder hidden width. Defaults to 1152.
        hidden_act: Activation function name. Defaults to
            ``"gelu_pytorch_tanh"``.
        intermediate_size: Presumably the MLP inner width. Defaults to 4304.
        num_heads: Presumably the attention head count. Defaults to 16.
        in_channels: Presumably the input channel count. Defaults to 3.
        patch_size: Defaults to 16.
        spatial_merge_size: Defaults to 2.
        temporal_patch_size: Defaults to 2.
        out_hidden_size: Defaults to 3584.
        num_position_embeddings: Defaults to 2304.
        deepstack_visual_indexes: Iterable of integer indexes, or ``None``.
            A non-``None`` value is copied into a new list so configs never
            share one list object. Defaults to ``(8, 16, 24)``.
        initializer_range: Defaults to 0.02.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "smb_vision_encoder"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=27,
        hidden_size=1152,
        hidden_act="gelu_pytorch_tanh",
        intermediate_size=4304,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        out_hidden_size=3584,
        num_position_embeddings=2304,
        deepstack_visual_indexes=(8, 16, 24),
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.out_hidden_size = out_hidden_size
        self.num_position_embeddings = num_position_embeddings
        self.initializer_range = initializer_range
        # Store a fresh list so instances never alias the default or the
        # caller's object; an explicit None is passed through unchanged.
        self.deepstack_visual_indexes = (
            None if deepstack_visual_indexes is None else list(deepstack_visual_indexes)
        )


class SMBVisionPredictorConfig(PretrainedConfig):
    """Configuration for the ``smb_vision_predictor`` model type.

    Each constructor argument is stored on the instance under the same name;
    any remaining keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.

    NOTE(review): the parameter meanings below are inferred from their names
    only; confirm them against the modeling code.

    Args:
        depth: Presumably the number of predictor layers. Defaults to 27.
        in_hidden_size: Presumably the width of the incoming features.
            Defaults to 1152.
        hidden_size: Presumably the predictor hidden width. Defaults to 512.
        hidden_act: Activation function name. Defaults to
            ``"gelu_pytorch_tanh"``.
        intermediate_size: Presumably the MLP inner width. Defaults to 1536.
        num_heads: Presumably the attention head count. Defaults to 16.
        in_channels: Defaults to 1.
        initializer_range: Defaults to 0.02.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "smb_vision_predictor"
    base_config_key = "predictor_config"

    def __init__(
        self,
        depth=27,
        in_hidden_size=1152,
        hidden_size=512,
        hidden_act="gelu_pytorch_tanh",
        intermediate_size=1536,
        num_heads=16,
        in_channels=1,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.in_hidden_size = in_hidden_size
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.initializer_range = initializer_range


class SMBVisionModelConfig(PretrainedConfig):
    """Composite configuration for the ``smb_vision_model`` model type.

    Holds one ``vision_config`` and one ``predictor_config`` sub-configuration,
    built from the classes registered in ``sub_configs``.

    Args:
        vision_config: ``None`` (use the defaults of the registered vision
            config class), a ``dict`` of keyword arguments for that class, or
            an already constructed config instance.
        predictor_config: Same conventions as ``vision_config``, for the
            registered predictor config class.
        hidden_size: Stored as ``self.hidden_size``. Defaults to 1152.
        masking_ratio: Stored as ``self.masking_ratio``. Defaults to 0.1.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        TypeError: If a sub-configuration argument is not ``None``, a ``dict``
            or a ``PretrainedConfig`` instance.
    """

    model_type = "smb_vision_model"
    sub_configs = {"vision_config": SMBVisionConfig, "predictor_config": SMBVisionPredictorConfig}

    def __init__(
        self,
        vision_config=None,
        predictor_config=None,
        hidden_size=1152,
        masking_ratio=0.1,
        **kwargs,
    ):
        # Look the classes up through self.sub_configs so that a subclass
        # overriding the mapping gets its own sub-config types.
        self.vision_config = _resolve_sub_config(self.sub_configs["vision_config"], vision_config)
        self.predictor_config = _resolve_sub_config(
            self.sub_configs["predictor_config"], predictor_config
        )

        self.hidden_size = hidden_size
        self.masking_ratio = masking_ratio

        # Kept after the attribute assignments, matching the original order.
        super().__init__(**kwargs)


__all__ = ["SMBVisionConfig", "SMBVisionPredictorConfig", "SMBVisionModelConfig"]