# coding=utf-8
# Copyright 2025 The SMB Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers.configuration_utils import PretrainedConfig


class SMBVisionConfig(PretrainedConfig):
    r"""
    Configuration for the SMB vision encoder: transformer geometry (`depth`, `hidden_size`,
    `num_heads`, `intermediate_size`), patchification (`patch_size`, `temporal_patch_size`,
    `spatial_merge_size`), and the encoder layer indexes collected in `deepstack_visual_indexes`.
    Remaining keyword arguments are forwarded to [`PretrainedConfig`].
    """

    model_type = "smb_vision_encoder"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=27,
        hidden_size=1152,
        hidden_act="gelu_pytorch_tanh",
        intermediate_size=4304,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        out_hidden_size=3584,
        num_position_embeddings=2304,
        deepstack_visual_indexes=[8, 16, 24],
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.out_hidden_size = out_hidden_size
        self.num_position_embeddings = num_position_embeddings
        self.initializer_range = initializer_range
        self.deepstack_visual_indexes = deepstack_visual_indexes
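

# A minimal usage sketch (illustrative only, not part of the module): the encoder
# config round-trips through the standard `PretrainedConfig` serialization API.
#
#     config = SMBVisionConfig(depth=12, hidden_size=768)
#     config.save_pretrained("./smb_vision")  # writes ./smb_vision/config.json
#     reloaded = SMBVisionConfig.from_pretrained("./smb_vision")
#     assert reloaded.depth == 12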


class SMBVisionPredictorConfig(PretrainedConfig):
    r"""
    Configuration for the SMB vision predictor. `in_hidden_size` is the width of the features
    fed into the predictor (its default matches the encoder's `hidden_size` of 1152); the
    remaining arguments describe the predictor's own transformer. Extra keyword arguments are
    forwarded to [`PretrainedConfig`].
    """

    model_type = "smb_vision_predictor"
    base_config_key = "predictor_config"

    def __init__(
        self,
        depth=27,
        in_hidden_size=1152,
        hidden_size=512,
        hidden_act="gelu_pytorch_tanh",
        intermediate_size=1536,
        num_heads=16,
        in_channels=1,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.depth = depth
        self.in_hidden_size = in_hidden_size
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.initializer_range = initializer_range
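

# Sketch (illustrative): overriding predictor defaults; as with any
# `PretrainedConfig`, `to_dict()` carries the class-level `model_type`.
#
#     predictor = SMBVisionPredictorConfig(depth=6, num_heads=8)
#     predictor.to_dict()["model_type"]  # -> "smb_vision_predictor"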


class SMBVisionModelConfig(PretrainedConfig):
    r"""
    Composite configuration bundling a [`SMBVisionConfig`] (`vision_config`) and a
    [`SMBVisionPredictorConfig`] (`predictor_config`). Each section may be passed as a dict
    (as when loaded from `config.json`), as a ready config instance, or left as `None` to
    use the defaults.
    """

    model_type = "smb_vision_model"
    sub_configs = {"vision_config": SMBVisionConfig, "predictor_config": SMBVisionPredictorConfig}

    def __init__(
        self,
        vision_config=None,
        predictor_config=None,
        hidden_size=1152,
        masking_ratio=0.1,
        **kwargs,
    ):
        # Normalize each sub-config: build it from a dict, fall back to the class
        # defaults on None, and keep an already-constructed config instance as-is.
        if isinstance(vision_config, dict):
            vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            vision_config = self.sub_configs["vision_config"]()
        self.vision_config = vision_config
        if isinstance(predictor_config, dict):
            predictor_config = self.sub_configs["predictor_config"](**predictor_config)
        elif predictor_config is None:
            predictor_config = self.sub_configs["predictor_config"]()
        self.predictor_config = predictor_config
        self.hidden_size = hidden_size
        self.masking_ratio = masking_ratio
        super().__init__(**kwargs)


__all__ = ["SMBVisionConfig", "SMBVisionPredictorConfig", "SMBVisionModelConfig"]
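

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the public API): build the
    # composite config from plain dicts, the way nested sections from a
    # config.json are passed in, and check the normalized sub-configs.
    config = SMBVisionModelConfig(
        vision_config={"depth": 12, "hidden_size": 768},
        predictor_config={"depth": 6, "hidden_size": 256},
    )
    assert config.vision_config.depth == 12
    assert config.predictor_config.model_type == "smb_vision_predictor"
    # Nested sections serialize through to_dict()/to_json_string() on recent
    # transformers versions that understand `sub_configs`.
    print(config.to_json_string())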