WishArdently
/

InternVideo2Stage2-VisionEncoder

@@ -1,23 +1,149 @@
 from transformers import PretrainedConfig, PreTrainedModel, AutoModel, AutoConfig
class DotDict(dict):
    """Dictionary whose entries can also be read, written and deleted as attributes."""

    def __getattr__(self, key):
        # Only invoked after normal attribute lookup fails, so fall back to
        # the mapping and report a missing key as a missing attribute.
        if key not in self:
            raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{key}'")
        return self[key]

    def __setattr__(self, key, value):
        # Attribute assignment is stored as a dictionary item.
        self[key] = value

    def __delattr__(self, key):
        # Attribute deletion removes the matching dictionary item.
        if key not in self:
            raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{key}'")
        del self[key]
 class InternVideo2Config(PretrainedConfig):
     model_type = "internvideo2"
@@ -72,7 +198,7 @@ class InternVideo2Config(PretrainedConfig):
         # Data configuration
         self.train_file = train_file or "available_corpus[\"pretrain_example_data_1B\"]"
-        self.test_file = DotDict(test_file or {
             "msrvtt_1k_test": "available_corpus[\"msrvtt_1k_test\"]",
             "didemo_ret_test": "available_corpus[\"didemo_ret_test\"]"
         })
@@ -86,25 +212,25 @@ class InternVideo2Config(PretrainedConfig):
         self.batch_size = batch_size
         self.batch_size_test = batch_size_test
         self.max_txt_l = max_txt_l
-        self.inputs = DotDict(inputs or {
             "image_res": 224,
-            "video_input": DotDict({
                 "num_frames": num_frames,
                 "sample_type": "rand",
                 "num_frames_test": num_frames_test,
                 "sample_type_test": "middle",
                 "random_aug": False
             }),
-            "max_txt_l": DotDict({"image": max_txt_l, "video": max_txt_l}),
-            "batch_size": DotDict({"image": batch_size, "video": batch_size}),
-            "batch_size_test": DotDict({"image": batch_size_test, "video": batch_size_test})
         })
         # Model configuration
         self.text_enc = text_enc
-        self.model = DotDict(model or {
             "model_cls": "InternVideo2_Stage2",
-            "vision_encoder": DotDict({
                 "name": "pretrain_internvideo2_1b_patch14_224",
                 "img_size": 224,
                 "num_frames": num_frames,
@@ -135,15 +261,15 @@ class InternVideo2Config(PretrainedConfig):
                 "only_mask": True
             }),
             "text_encoder": text_enc,
-            "multimodal": DotDict({"enable": True}),
             "embed_dim": 512,
             "temp": 0.07,
             "find_unused_parameters": False
         })
         # Criterion configuration
-        self.criterion = DotDict(criterion or {
-            "loss_weight": DotDict({
                 "vtc": 1.0,
                 "mlm": 1.0,
                 "vtm": 1.0,
@@ -157,17 +283,17 @@ class InternVideo2Config(PretrainedConfig):
         })
         # Optimizer configuration
-        self.optimizer = DotDict(optimizer or {
             "opt": "adamW",
             "lr": 5e-5,
             "opt_betas": [0.9, 0.98],
             "weight_decay": 0.05,
             "max_grad_norm": 3.0,
-            "different_lr": DotDict({"enable": False, "module_names": [], "lr": 1e-3})
         })
         # Scheduler configuration
-        self.scheduler = DotDict(scheduler or {
             "sched": "cosine",
             "epochs": 10,
             "min_lr_multi": 0.01,
@@ -177,7 +303,7 @@ class InternVideo2Config(PretrainedConfig):
         # Evaluation configuration
         self.evaluate = evaluate
         self.deep_fusion = deep_fusion
-        self.evaluation = DotDict(evaluation or {
             "eval_frame_ensemble": "concat",
             "eval_x_only": False,
             "k_test": 128,
@@ -192,7 +318,7 @@ class InternVideo2Config(PretrainedConfig):
         self.use_mem_efficient_sdp = use_mem_efficient_sdp
         self.compile_model = compile_model
-        self.wandb = DotDict(wandb or {
             "enable": False,
             "entity": "opengvlab",
             "project": "InternVideo2-Stage2"
@@ -214,7 +340,7 @@ class InternVideo2Config(PretrainedConfig):
         self.save_ckpt_iter = save_ckpt_iter
         self.delete_ds_optim_states = delete_ds_optim_states
-        self.deepspeed = DotDict(deepspeed or {
             "enable": True,
             "stage": 1
         })

 from transformers import PretrainedConfig, PreTrainedModel, AutoModel, AutoConfig
class EasyDict(dict):
    """Dictionary whose string keys are mirrored as instance attributes.

    Every assignment (by attribute or by item) stores the value both as an
    instance attribute and as a dictionary item.  Nested ``dict`` values are
    converted to the same class recursively; ``list`` and ``tuple`` values are
    stored as lists whose ``dict`` elements are converted as well.

    Get attributes

    >>> d = EasyDict({'foo':3})
    >>> d['foo']
    3
    >>> d.foo
    3
    >>> d.bar
    Traceback (most recent call last):
    ...
    AttributeError: 'EasyDict' object has no attribute 'bar'

    Works recursively

    >>> d = EasyDict({'foo':3, 'bar':{'x':1, 'y':2}})
    >>> isinstance(d.bar, dict)
    True
    >>> d.bar.x
    1

    Bullet-proof

    >>> EasyDict({})
    {}
    >>> EasyDict(d={})
    {}
    >>> EasyDict(None)
    {}
    >>> d = {'a': 1}
    >>> EasyDict(**d)
    {'a': 1}

    Inputs are never modified

    >>> src = {'a': 1}
    >>> d = EasyDict(src, b=2)
    >>> src
    {'a': 1}

    Set attributes

    >>> d = EasyDict()
    >>> d.foo = 3
    >>> d.foo
    3
    >>> d.bar = {'prop': 'value'}
    >>> d.bar.prop
    'value'
    >>> d
    {'foo': 3, 'bar': {'prop': 'value'}}
    >>> d.bar.prop = 'newer'
    >>> d.bar.prop
    'newer'

    Values extraction

    >>> d = EasyDict({'foo':0, 'bar':[{'x':1, 'y':2}, {'x':3, 'y':4}]})
    >>> isinstance(d.bar, list)
    True
    >>> from operator import attrgetter
    >>> list(map(attrgetter('x'), d.bar))
    [1, 3]
    >>> list(map(attrgetter('y'), d.bar))
    [2, 4]
    >>> d = EasyDict()
    >>> list(d.keys())
    []
    >>> d = EasyDict(foo=3, bar=dict(x=1, y=2))
    >>> d.foo
    3
    >>> d.bar.x
    1

    Still like a dict though

    >>> o = EasyDict({'clean':True})
    >>> list(o.items())
    [('clean', True)]

    And like a class

    >>> class Flower(EasyDict):
    ...     power = 1
    ...
    >>> f = Flower()
    >>> f.power
    1
    >>> f = Flower({'height': 12})
    >>> f.height
    12
    >>> f['power']
    1
    >>> sorted(f.keys())
    ['height', 'power']

    update and pop items

    >>> d = EasyDict(a=1, b='2')
    >>> e = EasyDict(c=3.0, a=9.0)
    >>> d.update(e)
    >>> d.c
    3.0
    >>> d['c']
    3.0
    >>> d.get('c')
    3.0
    >>> d.update(a=4, b=4)
    >>> d.b
    4
    >>> d.pop('a')
    4
    >>> d.a
    Traceback (most recent call last):
    ...
    AttributeError: 'EasyDict' object has no attribute 'a'
    """

    def __init__(self, d=None, **kwargs):
        """Populate the instance from ``d`` (optional) and keyword arguments.

        Keyword arguments override entries of ``d``.  ``d`` itself is copied
        first, so the caller's object is left untouched.
        """
        # Work on a private copy: merging kwargs must not leak into the
        # caller's dictionary.
        merged = {} if d is None else dict(d)
        merged.update(kwargs)
        for k, v in merged.items():
            setattr(self, k, v)
        # Class attributes: copy non-dunder names declared directly on the
        # concrete class into the instance so they also appear as items.
        # NOTE: any new non-dunder name added to this class body would be
        # picked up here, which is why "update" and "pop" are excluded.
        for k in self.__class__.__dict__:
            if not (k.startswith("__") and k.endswith("__")) and k not in ("update", "pop"):
                setattr(self, k, getattr(self, k))

    def __setattr__(self, name, value):
        """Store ``value`` under ``name`` as both an attribute and an item."""
        if isinstance(value, (list, tuple)):
            # Sequences are normalised to lists; dict elements are wrapped.
            value = [self.__class__(x) if isinstance(x, dict) else x for x in value]
        elif isinstance(value, dict) and not isinstance(value, self.__class__):
            value = self.__class__(value)
        super().__setattr__(name, value)
        super().__setitem__(name, value)

    # Item assignment goes through the same conversion and mirroring logic.
    __setitem__ = __setattr__

    def update(self, e=None, **f):
        """Merge ``e`` (optional) and keyword arguments into this instance.

        Keyword arguments override entries of ``e``.  ``e`` is copied before
        merging, so the caller's object is not modified.
        """
        merged = dict(e) if e else {}
        merged.update(f)
        for k, v in merged.items():
            setattr(self, k, v)

    def pop(self, k, d=None):
        """Remove ``k`` and return its value, or ``d`` when ``k`` is absent."""
        # Drop only the mirrored instance attribute.  Looking at the instance
        # namespace directly avoids failing on names that merely resolve to
        # class-level members (e.g. "keys"), which cannot be deleted from the
        # instance.
        self.__dict__.pop(k, None)
        return super().pop(k, d)
 class InternVideo2Config(PretrainedConfig):
     model_type = "internvideo2"
         # Data configuration
         self.train_file = train_file or "available_corpus[\"pretrain_example_data_1B\"]"
+        self.test_file = EasyDict(test_file or {
             "msrvtt_1k_test": "available_corpus[\"msrvtt_1k_test\"]",
             "didemo_ret_test": "available_corpus[\"didemo_ret_test\"]"
         })
         self.batch_size = batch_size
         self.batch_size_test = batch_size_test
         self.max_txt_l = max_txt_l
+        self.inputs = EasyDict(inputs or {
             "image_res": 224,
+            "video_input": EasyDict({
                 "num_frames": num_frames,
                 "sample_type": "rand",
                 "num_frames_test": num_frames_test,
                 "sample_type_test": "middle",
                 "random_aug": False
             }),
+            "max_txt_l": EasyDict({"image": max_txt_l, "video": max_txt_l}),
+            "batch_size": EasyDict({"image": batch_size, "video": batch_size}),
+            "batch_size_test": EasyDict({"image": batch_size_test, "video": batch_size_test})
         })
         # Model configuration
         self.text_enc = text_enc
+        self.model = EasyDict(model or {
             "model_cls": "InternVideo2_Stage2",
+            "vision_encoder": EasyDict({
                 "name": "pretrain_internvideo2_1b_patch14_224",
                 "img_size": 224,
                 "num_frames": num_frames,
                 "only_mask": True
             }),
             "text_encoder": text_enc,
+            "multimodal": EasyDict({"enable": True}),
             "embed_dim": 512,
             "temp": 0.07,
             "find_unused_parameters": False
         })
         # Criterion configuration
+        self.criterion = EasyDict(criterion or {
+            "loss_weight": EasyDict({
                 "vtc": 1.0,
                 "mlm": 1.0,
                 "vtm": 1.0,
         })
         # Optimizer configuration
+        self.optimizer = EasyDict(optimizer or {
             "opt": "adamW",
             "lr": 5e-5,
             "opt_betas": [0.9, 0.98],
             "weight_decay": 0.05,
             "max_grad_norm": 3.0,
+            "different_lr": EasyDict({"enable": False, "module_names": [], "lr": 1e-3})
         })
         # Scheduler configuration
+        self.scheduler = EasyDict(scheduler or {
             "sched": "cosine",
             "epochs": 10,
             "min_lr_multi": 0.01,
         # Evaluation configuration
         self.evaluate = evaluate
         self.deep_fusion = deep_fusion
+        self.evaluation = EasyDict(evaluation or {
             "eval_frame_ensemble": "concat",
             "eval_x_only": False,
             "k_test": 128,
         self.use_mem_efficient_sdp = use_mem_efficient_sdp
         self.compile_model = compile_model
+        self.wandb = EasyDict(wandb or {
             "enable": False,
             "entity": "opengvlab",
             "project": "InternVideo2-Stage2"
         self.save_ckpt_iter = save_ckpt_iter
         self.delete_ds_optim_states = delete_ds_optim_states
+        self.deepspeed = EasyDict(deepspeed or {
             "enable": True,
             "stage": 1
         })

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d1217b84f65506792a1fe141d636ffb05cfe151005a8bc9ec46006b343af02ee
 size 2104856154

 version https://git-lfs.github.com/spec/v1
+oid sha256:f0e5845f86e194d4043bb2d0cfb78fadaae0481882163350973df077cb22256a
 size 2104856154