Upload InternVideo2Stage2VideoEncoder

Browse files

Files changed (5) hide show

README.md +199 -0
config.json +169 -0
config.py +220 -0
model.py +41 -0
model.safetensors +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+library_name: transformers
+tags: []
+---
+# Model Card for InternVideo2Stage2VideoEncoder
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+This is the model card of a 🤗 transformers model that has been pushed to the Hub. This model card has been automatically generated.
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]

config.json ADDED Viewed

	@@ -0,0 +1,169 @@

+{
+  "architectures": [
+    "InternVideo2Stage2VideoEncoder"
+  ],
+  "auto_map": {
+    "AutoConfig": "config.InternVideo2Config",
+    "AutoModel": "model.InternVideo2Stage2VideoEncoder"
+  },
+  "auto_resume": false,
+  "batch_size": 64,
+  "batch_size_test": 4,
+  "best_key": [
+    "msrvtt_1k_test_match",
+    "t2v_r1"
+  ],
+  "compile_model": false,
+  "criterion": {
+    "clip_loss_ratio": [
+      1.0,
+      1.0
+    ],
+    "distill_final_features": true,
+    "loss_weight": {
+      "mlm": 1.0,
+      "mvm": 0.0,
+      "uta": 0.0,
+      "vtc": 1.0,
+      "vtm": 1.0
+    },
+    "mlm_masking_prob": 0.5,
+    "vtm_hard_neg": true
+  },
+  "debug": false,
+  "deep_fusion": false,
+  "deepspeed": {
+    "enable": true,
+    "stage": 1
+  },
+  "delete_ds_optim_states": true,
+  "device": "cuda",
+  "dist_url": "env://",
+  "evaluate": false,
+  "evaluation": {
+    "eval_frame_ensemble": "concat",
+    "eval_offload": true,
+    "eval_x_only": false,
+    "k_test": 128
+  },
+  "gradient_checkpointing": true,
+  "inputs": {
+    "batch_size": {
+      "image": 64,
+      "video": 64
+    },
+    "batch_size_test": {
+      "image": 4,
+      "video": 4
+    },
+    "image_res": 224,
+    "max_txt_l": {
+      "image": 32,
+      "video": 32
+    },
+    "video_input": {
+      "num_frames": 8,
+      "num_frames_test": 8,
+      "random_aug": false,
+      "sample_type": "rand",
+      "sample_type_test": "middle"
+    }
+  },
+  "jump_evaluate": false,
+  "log_freq": 100,
+  "max_txt_l": 32,
+  "mode": "pt",
+  "model": {
+    "embed_dim": 512,
+    "find_unused_parameters": false,
+    "model_cls": "InternVideo2_Stage2",
+    "multimodal": {
+      "enable": true
+    },
+    "temp": 0.07,
+    "text_encoder": "bert_large",
+    "vision_encoder": {
+      "checkpoint_num": 40,
+      "clip_embed_dim": 768,
+      "clip_input_resolution": 224,
+      "clip_norm_type": "l2",
+      "clip_return_layer": 6,
+      "clip_student_return_interval": 1,
+      "clip_teacher": null,
+      "clip_teacher_embed_dim": 3200,
+      "clip_teacher_final_dim": 768,
+      "clip_teacher_return_interval": 1,
+      "d_model": 1408,
+      "image_mask_ratio": 0.5,
+      "image_mask_type": "random",
+      "img_size": 224,
+      "keep_temporal": false,
+      "name": "pretrain_internvideo2_1b_patch14_224",
+      "num_frames": 8,
+      "only_mask": true,
+      "patch_size": 14,
+      "pretrained": "/home/linanxi/InternVideo/checkpoints/InternVideo2-stage2_1b-224p-f4/InternVideo2-stage2_1b-224p-f4.pt",
+      "sep_image_video_pos_embed": true,
+      "tubelet_size": 1,
+      "use_checkpoint": false,
+      "use_flash_attn": true,
+      "use_fused_mlp": true,
+      "use_fused_rmsnorm": true,
+      "video_mask_ratio": 0.8,
+      "video_mask_type": "random"
+    }
+  },
+  "model_type": "internvideo2",
+  "num_frames": 8,
+  "num_frames_test": 8,
+  "num_workers": 6,
+  "optimizer": {
+    "different_lr": {
+      "enable": false,
+      "lr": 0.001,
+      "module_names": []
+    },
+    "lr": 5e-05,
+    "max_grad_norm": 3.0,
+    "opt": "adamW",
+    "opt_betas": [
+      0.9,
+      0.98
+    ],
+    "weight_decay": 0.05
+  },
+  "output_dir": null,
+  "pretrained_path": "",
+  "resume": false,
+  "save_ckpt_iter": null,
+  "save_latest": true,
+  "scheduler": {
+    "epochs": 10,
+    "min_lr_multi": 0.01,
+    "sched": "cosine",
+    "warmup_epochs": 1
+  },
+  "seed": 42,
+  "test_file": {
+    "didemo_ret_test": "available_corpus[\"didemo_ret_test\"]",
+    "msrvtt_1k_test": "available_corpus[\"msrvtt_1k_test\"]"
+  },
+  "test_types": [
+    "msrvtt_1k_test",
+    "didemo_ret_test"
+  ],
+  "text_enc": "bert_large",
+  "tokenizer": null,
+  "torch_dtype": "float16",
+  "train_file": "available_corpus[\"pretrain_example_data_1B\"]",
+  "transformers_version": "4.47.0",
+  "use_bf16": true,
+  "use_flash_sdp": false,
+  "use_half_precision": true,
+  "use_mem_efficient_sdp": false,
+  "wandb": {
+    "enable": false,
+    "entity": "opengvlab",
+    "project": "InternVideo2-Stage2"
+  }
+}

config.py ADDED Viewed

	@@ -0,0 +1,220 @@

+from transformers import PretrainedConfig, PreTrainedModel, AutoModel, AutoConfig
class DotDict(dict):
    """Dictionary whose items can also be read, set and deleted as attributes.

    Nested plain ``dict`` values are converted to ``DotDict`` the first time
    they are reached through attribute access, so chained lookups such as
    ``cfg.model.vision_encoder`` also work on mappings rebuilt from JSON,
    which only ever contain plain ``dict`` objects.
    """

    def __getattr__(self, key):
        """Return ``self[key]``.

        Raises:
            AttributeError: if ``key`` is not present in the dictionary.
        """
        try:
            value = self[key]
        except KeyError:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{key}'"
            ) from None
        if isinstance(value, dict) and not isinstance(value, DotDict):
            # Wrap once and store the wrapper back, so attribute writes made
            # on the nested mapping land in this dictionary instead of being
            # lost on a temporary copy.
            value = DotDict(value)
            self[key] = value
        return value

    def __setattr__(self, key, value):
        """Store ``value`` under ``key`` as a regular dictionary item."""
        self[key] = value

    def __delattr__(self, key):
        """Delete the item stored under ``key``.

        Raises:
            AttributeError: if ``key`` is not present in the dictionary.
        """
        try:
            del self[key]
        except KeyError:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{key}'"
            ) from None
def _to_dot_dict(value):
    """Return ``value`` with it and every dict nested inside it as ``DotDict``.

    Values that are not dictionaries (including lists) are returned unchanged.
    """
    if isinstance(value, dict):
        return DotDict({key: _to_dot_dict(item) for key, item in value.items()})
    return value


class InternVideo2Config(PretrainedConfig):
    """Configuration container for the InternVideo2 stage-2 model.

    Scalar arguments are stored unchanged as attributes of the same name.
    Dictionary-valued arguments (``test_file``, ``inputs``, ``model``,
    ``criterion``, ``optimizer``, ``scheduler``, ``evaluation``, ``wandb``,
    ``deepspeed``) are converted to ``DotDict`` at every nesting level, so
    attribute-style access works both for the built-in defaults and for
    plain dictionaries passed in by the caller (for example the ones read
    back from ``config.json``).

    For every argument whose default is ``None`` except ``tokenizer``,
    ``output_dir`` and ``save_ckpt_iter``, any falsy value (``None``, ``{}``,
    ``[]``, ``""``) selects the built-in default shown in ``__init__``.
    Remaining keyword arguments are forwarded to ``PretrainedConfig``.
    """

    model_type = "internvideo2"

    def __init__(self,
                 tokenizer=None,
                 train_file=None,
                 test_file=None,
                 test_types=None,
                 num_workers=6,
                 best_key=None,
                 num_frames=8,
                 num_frames_test=8,
                 batch_size=64,
                 batch_size_test=4,
                 max_txt_l=32,
                 inputs=None,
                 text_enc="bert_large",
                 model=None,
                 criterion=None,
                 optimizer=None,
                 scheduler=None,
                 evaluate=False,
                 deep_fusion=False,
                 evaluation=None,
                 use_half_precision=True,
                 use_bf16=True,
                 gradient_checkpointing=True,
                 use_flash_sdp=False,
                 use_mem_efficient_sdp=False,
                 compile_model=False,
                 wandb=None,
                 dist_url="env://",
                 device="cuda",
                 mode="pt",
                 output_dir=None,
                 resume=False,
                 debug=False,
                 log_freq=100,
                 seed=42,
                 save_latest=True,
                 auto_resume=False,
                 jump_evaluate=False,
                 pretrained_path="",
                 save_ckpt_iter=None,
                 delete_ds_optim_states=True,
                 deepspeed=None,
                 **kwargs):
        super().__init__(**kwargs)
        self.tokenizer = tokenizer

        # Data configuration
        self.train_file = train_file or "available_corpus[\"pretrain_example_data_1B\"]"
        self.test_file = _to_dot_dict(test_file or {
            "msrvtt_1k_test": "available_corpus[\"msrvtt_1k_test\"]",
            "didemo_ret_test": "available_corpus[\"didemo_ret_test\"]"
        })
        self.test_types = test_types or ["msrvtt_1k_test", "didemo_ret_test"]
        self.num_workers = num_workers
        self.best_key = best_key or ["msrvtt_1k_test_match", "t2v_r1"]

        # Input configuration
        self.num_frames = num_frames
        self.num_frames_test = num_frames_test
        self.batch_size = batch_size
        self.batch_size_test = batch_size_test
        self.max_txt_l = max_txt_l
        # The default mirrors the top-level frame/batch/text-length arguments;
        # an explicitly passed ``inputs`` is used as given.
        self.inputs = _to_dot_dict(inputs or {
            "image_res": 224,
            "video_input": {
                "num_frames": num_frames,
                "sample_type": "rand",
                "num_frames_test": num_frames_test,
                "sample_type_test": "middle",
                "random_aug": False
            },
            "max_txt_l": {"image": max_txt_l, "video": max_txt_l},
            "batch_size": {"image": batch_size, "video": batch_size},
            "batch_size_test": {"image": batch_size_test, "video": batch_size_test}
        })

        # Model configuration
        self.text_enc = text_enc
        self.model = _to_dot_dict(model or {
            "model_cls": "InternVideo2_Stage2",
            "vision_encoder": {
                "name": "pretrain_internvideo2_1b_patch14_224",
                "img_size": 224,
                "num_frames": num_frames,
                "tubelet_size": 1,
                "patch_size": 14,
                "d_model": 1408,
                "clip_embed_dim": 768,
                "clip_teacher_embed_dim": 3200,
                "clip_teacher_final_dim": 768,
                "clip_norm_type": "l2",
                "clip_return_layer": 6,
                "clip_student_return_interval": 1,
                # NOTE(review): absolute path inside one user's home directory;
                # it will not exist on other machines. How the encoder uses it
                # is not visible here -- confirm and override where needed.
                "pretrained": "/home/linanxi/InternVideo/checkpoints/InternVideo2-stage2_1b-224p-f4/InternVideo2-stage2_1b-224p-f4.pt",
                "use_checkpoint": False,
                "checkpoint_num": 40,
                "use_flash_attn": True,
                "use_fused_rmsnorm": True,
                "use_fused_mlp": True,
                "clip_teacher": None,
                "clip_input_resolution": 224,
                "clip_teacher_return_interval": 1,
                "video_mask_type": "random",
                "video_mask_ratio": 0.8,
                "image_mask_type": "random",
                "image_mask_ratio": 0.5,
                "sep_image_video_pos_embed": True,
                "keep_temporal": False,
                "only_mask": True
            },
            "text_encoder": text_enc,
            "multimodal": {"enable": True},
            "embed_dim": 512,
            "temp": 0.07,
            "find_unused_parameters": False
        })

        # Criterion configuration
        self.criterion = _to_dot_dict(criterion or {
            "loss_weight": {
                "vtc": 1.0,
                "mlm": 1.0,
                "vtm": 1.0,
                "mvm": 0.0,
                "uta": 0.0
            },
            "vtm_hard_neg": True,
            "mlm_masking_prob": 0.5,
            "distill_final_features": True,
            "clip_loss_ratio": [1.0, 1.0]
        })

        # Optimizer configuration
        self.optimizer = _to_dot_dict(optimizer or {
            "opt": "adamW",
            "lr": 5e-5,
            "opt_betas": [0.9, 0.98],
            "weight_decay": 0.05,
            "max_grad_norm": 3.0,
            "different_lr": {"enable": False, "module_names": [], "lr": 1e-3}
        })

        # Scheduler configuration
        self.scheduler = _to_dot_dict(scheduler or {
            "sched": "cosine",
            "epochs": 10,
            "min_lr_multi": 0.01,
            "warmup_epochs": 1
        })

        # Evaluation configuration
        self.evaluate = evaluate
        self.deep_fusion = deep_fusion
        self.evaluation = _to_dot_dict(evaluation or {
            "eval_frame_ensemble": "concat",
            "eval_x_only": False,
            "k_test": 128,
            "eval_offload": True
        })

        # Miscellaneous
        self.use_half_precision = use_half_precision
        self.use_bf16 = use_bf16
        self.gradient_checkpointing = gradient_checkpointing
        self.use_flash_sdp = use_flash_sdp
        self.use_mem_efficient_sdp = use_mem_efficient_sdp
        self.compile_model = compile_model
        self.wandb = _to_dot_dict(wandb or {
            "enable": False,
            "entity": "opengvlab",
            "project": "InternVideo2-Stage2"
        })
        self.dist_url = dist_url
        self.device = device
        self.mode = mode
        self.output_dir = output_dir
        self.resume = resume
        self.debug = debug
        self.log_freq = log_freq
        self.seed = seed
        self.save_latest = save_latest
        self.auto_resume = auto_resume
        self.jump_evaluate = jump_evaluate
        self.pretrained_path = pretrained_path
        self.save_ckpt_iter = save_ckpt_iter
        self.delete_ds_optim_states = delete_ds_optim_states
        self.deepspeed = _to_dot_dict(deepspeed or {
            "enable": True,
            "stage": 1
        })

model.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from internvideo2_stage2 import InternVideo2_Stage2 as IV2S2
+from transformers import PretrainedConfig, PreTrainedModel, AutoModel, AutoConfig
+from config import InternVideo2Config as config
+import warnings
+import torch
+warnings.filterwarnings("ignore")
+# model_config = config()
+# model = IV2S2(model_config)
+# print(model)
class InternVideo2Stage2VideoEncoder(PreTrainedModel):
    """``PreTrainedModel`` wrapper around the InternVideo2 stage-2 network.

    Only the vision path is exposed: ``forward`` calls the wrapped model's
    ``encode_vision`` and returns the second element of its result.
    """

    config_class = config

    def __init__(self, config):
        """Build the wrapped model from ``config``.

        The wrapped network is cast to half precision and moved to
        ``config.device`` at construction time.
        """
        super().__init__(config)
        self.config = config
        self.model = IV2S2(config).half().to(config.device)

    def forward(self, x: torch.Tensor):
        """Encode a batch of videos, or a single video.

        Args:
            x: Frames-first video tensor of shape (B, N, C, H, W), or
                (N, C, H, W) for a single video, which is treated as a
                batch of one.

        Returns:
            The second element of ``encode_vision``'s output (the pooled
            vision embeddings). Its exact shape is determined by the wrapped
            model and is not visible here -- TODO confirm.

        Raises:
            ValueError: if ``x`` is neither 4- nor 5-dimensional.
        """
        if x.dim() == 4:
            x = x.unsqueeze(0)
        if x.dim() != 5:
            raise ValueError(
                "expected input of shape (B, N, C, H, W) or (N, C, H, W), "
                f"got {tuple(x.shape)}"
            )
        # The wrapped encoder is fed channels-first clips: (B, C, N, H, W).
        x = x.permute(0, 2, 1, 3, 4)
        output = self.model.encode_vision(x)
        pooled_vision_embeds = output[1]
        return pooled_vision_embeds
if __name__ == "__main__":
    # Smoke test: build the encoder from the default config and run one
    # forward pass on random input.
    model_config = config()
    model = InternVideo2Stage2VideoEncoder(model_config)
    # forward() takes frames-first input (B, N, C, H, W) and swaps the frame
    # and channel axes itself, so the dummy batch must put frames before
    # channels; a channels-first batch would be permuted into 8 "channels".
    x = torch.randn(
        2, model_config.num_frames, 3, 224, 224, dtype=torch.float16
    ).to(model_config.device)
    output = model(x)

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5aa6e5518080f7c11b1a55221c8fd72ee0d9dff5ba50c11794b32cf3c6df1c71
+size 2104856154