Upload InternVideo2Stage2VideoEncoder

- config.json  +3 -3
- config.py  +7 -2
- model.py  +25 -9
- model.safetensors  +1 -1
config.json
CHANGED

@@ -102,7 +102,7 @@
     "num_frames": 8,
     "only_mask": true,
     "patch_size": 14,
-    "pretrained": "/home/linanxi/
+    "pretrained": "/home/bingxing2/home/scx7l3k/linanxi/workspace/low_level/Encoders/InternVideo2-stage2_1b-224p-f4.pt",
     "sep_image_video_pos_embed": true,
     "tubelet_size": 1,
     "use_checkpoint": false,
@@ -156,10 +156,10 @@
   "tokenizer": null,
   "torch_dtype": "float16",
   "train_file": "available_corpus[\"pretrain_example_data_1B\"]",
-  "transformers_version": "4.
+  "transformers_version": "4.42.4",
   "use_bf16": true,
   "use_flash_sdp": false,
-  "use_half_precision":
+  "use_half_precision": false,
   "use_mem_efficient_sdp": false,
   "wandb": {
     "enable": false,
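Beyond the path fix, the new config pins transformers_version to 4.42.4 and sets use_half_precision to false (the model still runs in float16 via torch_dtype). A minimal loading sketch, assuming the repo is published on the Hub with the custom classes registered through auto_map; the repo id below is a placeholder:

# Hypothetical repo id; substitute the actual namespace once published.
from transformers import AutoConfig, AutoModel

repo_id = "your-namespace/InternVideo2Stage2VideoEncoder"
# trust_remote_code is needed because config.py and model.py ship with the repo.
cfg = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
print(cfg.num_frames)  # 8, as set in config.json above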
config.py
CHANGED

@@ -58,7 +58,7 @@ class InternVideo2Config(PretrainedConfig):
         evaluate=False,
         deep_fusion=False,
         evaluation=None,
-        use_half_precision=
+        use_half_precision=False,
         use_bf16=True,
         gradient_checkpointing=True,
         use_flash_sdp=False,
@@ -132,7 +132,7 @@ class InternVideo2Config(PretrainedConfig):
             "clip_norm_type": "l2",
             "clip_return_layer": 6,
             "clip_student_return_interval": 1,
-            "pretrained": "/home/linanxi/
+            "pretrained": "/home/bingxing2/home/scx7l3k/linanxi/workspace/low_level/Encoders/InternVideo2-stage2_1b-224p-f4.pt",
             "use_checkpoint": False,
             "checkpoint_num": 40,
             "use_flash_attn": True,
@@ -233,3 +233,8 @@ class InternVideo2Config(PretrainedConfig):
             "enable": True,
             "stage": 1
         })
+    def set_num_frames(self, num_frames):
+        # print('Here ', num_frames)
+        self.num_frames = num_frames
+        self.inputs.video_input.num_frames = num_frames
+        self.model.vision_encoder.num_frames = num_frames
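The new set_num_frames helper exists because the frame count is stored in three places (the top-level attribute, inputs.video_input, and model.vision_encoder) and would silently drift apart if only one were edited. A usage sketch:

from config import InternVideo2Config

cfg = InternVideo2Config()
cfg.set_num_frames(4)  # updates all three copies in one call
assert cfg.num_frames == 4
assert cfg.inputs.video_input.num_frames == 4
assert cfg.model.vision_encoder.num_frames == 4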
model.py
CHANGED

@@ -3,8 +3,11 @@ from transformers import PretrainedConfig, PreTrainedModel, AutoModel, AutoConfig
 from config import InternVideo2Config as config
 import warnings
 import torch
+# from transformers.utils import logging
 warnings.filterwarnings("ignore")
 
+# logging.set_verbosity_error()
+
 # model_config = config()
 # model = IV2S2(model_config)
 # print(model)
@@ -15,24 +18,37 @@ class InternVideo2Stage2VideoEncoder(PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.config = config
-        self.
+        # print(self.config.model.vision_encoder.num_frames)
+        self.model = IV2S2(self.config).to(config.device).to(torch.float16)
 
     def forward(self, x: torch.tensor):
         """forward pass
         Args:
-            x (torch.tensor): Shape (B, N, C, H, W) or (
+            x (torch.tensor): Shape (B, N, C, H, W) or (B, C, H, W)
         Returns:
-            torch.tensor: Shape (B*N, hidden_size)
+            torch.tensor: Shape (B, N, hidden_size) or (B, hidden_size)
         """
-
-
+        if len(x.shape) == 5 and x.shape[1] > 8:
+            ## The pretrained weights cap the input at 8 frames, so longer
+            ## clips are encoded in chunks of 8 and the results concatenated.
+            T = x.shape[1]
+            embs = torch.cat([self.forward(x[:, i:i+8, :, :, :]) for i in range(0, T, 8)], dim=1)
+            return embs
+
+        image = False
         if len(x.shape) == 4:
-            x = x.unsqueeze(
+            x = x.unsqueeze(1)
+            image = True
         B, N, C, H, W = x.shape
-        x = x.permute(0, 2, 1, 3, 4)
+        # x = x.permute(0, 2, 1, 3, 4)  # input already has shape (B, N, C, H, W)
         output = self.model.encode_vision(x)
-        pooled_vision_embeds = output[1]
-
+        pooled_vision_embeds = output[1]             # (B, N*256 + 1, hidden_size)
+        output = pooled_vision_embeds[:, :256*N, :]  # (B, N*256, hidden_size)
+        output = output.reshape(B, N, 256, -1)       # (B, N, 256, hidden_size)
+        output = output.mean(dim=2)                  # (B, N, hidden_size)
+        if image:
+            output = output.squeeze(1)
+        return output
 
 if __name__ == "__main__":
     model_config = config()
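The reworked forward mean-pools the 256 patch tokens per frame and transparently splits clips longer than 8 frames (the limit baked into the pretrained weights) into chunks of 8. A shape-check sketch, assuming a CUDA device as selected by config.device and 224x224 inputs as in the config:

import torch

model = InternVideo2Stage2VideoEncoder(config())

video = torch.randn(2, 16, 3, 224, 224, dtype=torch.float16, device="cuda")
print(model(video).shape)  # (2, 16, hidden_size): two chunks of 8 frames, concatenated

image = torch.randn(2, 3, 224, 224, dtype=torch.float16, device="cuda")
print(model(image).shape)  # (2, hidden_size): the frame axis is squeezed out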
model.safetensors
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:611b74750f429e7d50ee53c0df0d05a524c6b55961a8cff7da57ae8e8cb7fb82
 size 2104856154
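model.safetensors is stored through Git LFS, so the repo itself only tracks this pointer; the oid is the SHA-256 of the real file. A quick way to verify a downloaded copy against the pointer:

import hashlib

h = hashlib.sha256()
with open("model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)
print(h.hexdigest())  # should equal 611b74750f429e7d50ee53c0df0d05a524c6b55961a8cff7da57ae8e8cb7fb82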