|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from transformers.configuration_utils import PretrainedConfig |
|
|
|
|
|
|
|
|
class SMBVisionConfig(PretrainedConfig):
    """Configuration container for the ``smb_vision_encoder`` model type.

    Each constructor argument is stored unchanged as an attribute of the same
    name; any remaining keyword arguments are forwarded to
    ``PretrainedConfig.__init__``. The per-argument meanings below are inferred
    from the argument names only -- confirm them against the modeling code,
    which is not part of this file.

    Args:
        depth: Presumably the number of encoder layers. Defaults to 27.
        hidden_size: Presumably the encoder embedding width. Defaults to 1152.
        hidden_act: Name of the activation function. Defaults to
            ``"gelu_pytorch_tanh"``.
        intermediate_size: Presumably the feed-forward inner width.
            Defaults to 4304.
        num_heads: Presumably the number of attention heads. Defaults to 16.
        in_channels: Presumably the number of input channels. Defaults to 3.
        patch_size: Presumably the spatial patch edge length. Defaults to 16.
        spatial_merge_size: Defaults to 2.
        temporal_patch_size: Defaults to 2.
        out_hidden_size: Presumably the width of the encoder output.
            Defaults to 3584.
        num_position_embeddings: Defaults to 2304.
        deepstack_visual_indexes: List of indexes (presumably encoder layer
            indexes -- TODO confirm). ``None`` (the default) selects a fresh
            ``[8, 16, 24]`` list for this instance.
        initializer_range: Defaults to 0.02.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "smb_vision_encoder"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=27,
        hidden_size=1152,
        hidden_act="gelu_pytorch_tanh",
        intermediate_size=4304,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        out_hidden_size=3584,
        num_position_embeddings=2304,
        deepstack_visual_indexes=None,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.out_hidden_size = out_hidden_size
        self.num_position_embeddings = num_position_embeddings
        self.initializer_range = initializer_range

        # Build the default list per instance. A list literal in the signature
        # is evaluated once and would be shared (and mutable) across every
        # config that relies on the default.
        if deepstack_visual_indexes is None:
            deepstack_visual_indexes = [8, 16, 24]
        self.deepstack_visual_indexes = deepstack_visual_indexes
|
|
|
|
|
|
|
|
class SMBVisionPredictorConfig(PretrainedConfig):
    """Configuration container for the ``smb_vision_predictor`` model type.

    Each constructor argument is stored unchanged as an attribute of the same
    name; any remaining keyword arguments are forwarded to
    ``PretrainedConfig.__init__``. The per-argument meanings below are inferred
    from the argument names only -- confirm them against the modeling code,
    which is not part of this file.

    Args:
        depth: Presumably the number of predictor layers. Defaults to 27.
        in_hidden_size: Presumably the width of the features fed into the
            predictor. Defaults to 1152.
        hidden_size: Presumably the predictor's internal width.
            Defaults to 512.
        hidden_act: Name of the activation function. Defaults to
            ``"gelu_pytorch_tanh"``.
        intermediate_size: Presumably the feed-forward inner width.
            Defaults to 1536.
        num_heads: Presumably the number of attention heads. Defaults to 16.
        in_channels: Defaults to 1.
        initializer_range: Defaults to 0.02.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "smb_vision_predictor"
    base_config_key = "predictor_config"

    def __init__(
        self,
        depth=27,
        in_hidden_size=1152,
        hidden_size=512,
        hidden_act="gelu_pytorch_tanh",
        intermediate_size=1536,
        num_heads=16,
        in_channels=1,
        initializer_range=0.02,
        **kwargs,
    ):
        # Let the base class consume the generic options first.
        super().__init__(**kwargs)

        # Size-related settings.
        self.depth = depth
        self.in_hidden_size = in_hidden_size
        self.hidden_size = hidden_size

        # Per-layer settings.
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads

        # Input and initialisation settings.
        self.in_channels = in_channels
        self.initializer_range = initializer_range
|
|
|
|
|
|
|
|
class SMBVisionModelConfig(PretrainedConfig):
    """Top-level configuration for the ``smb_vision_model`` model type.

    Holds two nested configurations, ``vision_config`` (``SMBVisionConfig``)
    and ``predictor_config`` (``SMBVisionPredictorConfig``), plus two scalar
    settings. The nested configurations and scalars are assigned before
    ``PretrainedConfig.__init__`` is called with the remaining keyword
    arguments.

    Args:
        vision_config: ``None`` for a default ``SMBVisionConfig``, a dict of
            keyword arguments used to build one, or an already-built
            ``SMBVisionConfig`` instance (stored as-is).
        predictor_config: Same three forms, for ``SMBVisionPredictorConfig``.
        hidden_size: Stored as-is. Defaults to 1152.
        masking_ratio: Stored as-is. Defaults to 0.1. Presumably the fraction
            of the input that is masked -- confirm against the modeling code.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        TypeError: If ``vision_config`` or ``predictor_config`` is neither
            ``None``, a dict, nor an instance of the expected class.
    """

    model_type = "smb_vision_model"
    sub_configs = {"vision_config": SMBVisionConfig, "predictor_config": SMBVisionPredictorConfig}

    def __init__(
        self,
        vision_config=None,
        predictor_config=None,
        hidden_size=1152,
        masking_ratio=0.1,
        **kwargs,
    ):
        self.vision_config = self._resolve_sub_config("vision_config", vision_config)
        self.predictor_config = self._resolve_sub_config("predictor_config", predictor_config)

        self.hidden_size = hidden_size
        self.masking_ratio = masking_ratio

        super().__init__(**kwargs)

    @classmethod
    def _resolve_sub_config(cls, key, value):
        """Return the nested configuration object for ``key`` built from ``value``."""
        sub_config_cls = cls.sub_configs[key]
        if value is None:
            return sub_config_cls()
        if isinstance(value, dict):
            return sub_config_cls(**value)
        if isinstance(value, sub_config_cls):
            # An already-built config was previously dropped silently, leaving
            # the attribute unset; keep the caller's object instead.
            return value
        raise TypeError(
            f"{key} must be None, a dict, or a {sub_config_cls.__name__} instance, "
            f"got {type(value).__name__}"
        )
|
|
|
|
|
|
|
|
# Names exported by a star-import of this module.
__all__ = [
    "SMBVisionConfig",
    "SMBVisionPredictorConfig",
    "SMBVisionModelConfig",
]
|
|
|