morpheushoc committed on
Commit
7a32bc9
·
verified ·
1 Parent(s): dd58375

Upload InternVideo2_Classification

Browse files
config.json CHANGED
@@ -1,52 +1,54 @@
1
  {
2
  "architectures": [
3
- "InternVideo2_Classification_test"
4
  ],
5
  "auto_map": {
6
- "AutoModel": "modeling_videochat2_classification.InternVideo2_Classification_test"
7
  },
8
- "bridge": {
9
- "extra_num_query_token": 64,
10
- "name": "qformer",
11
- "num_query_token": 32,
12
- "qformer_attention_probs_dropout_prob": 0.1,
13
- "qformer_drop_path_rate": 0.2,
14
- "qformer_hidden_dropout_prob": 0.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  },
16
- "freeze_bridge": false,
17
- "freeze_llm": false,
18
- "freeze_vision_encoder": false,
19
- "llm": {
20
- "lora_alpha": 32,
21
- "lora_dropout": 0.1,
22
- "lora_r": 16,
23
- "name": "mistral_7b",
24
- "pretrained_llm_path": "mistralai/Mistral-7B-Instruct-v0.3",
25
- "use_lora": true
26
- },
27
- "loss": {
28
- "use_vision_regression_loss": false
29
- },
30
- "model_type": "InternVideo2_VideoChat2_test",
31
- "pretrained_paths": {},
32
  "torch_dtype": "float32",
33
- "transformers_version": "4.46.1",
34
- "use_flash_attention": true,
35
- "vision_encoder": {
36
- "checkpoint_num": 48,
37
- "d_model": 1408,
38
- "encoder_embed_dim": 1408,
39
- "img_size": 224,
40
- "name": "internvideo2-1B",
41
- "num_frames": 8,
42
- "origin_num_frames": 4,
43
- "patch_size": 14,
44
- "pretrained": null,
45
- "sep_image_video_pos_embed": true,
46
- "tubelet_size": 1,
47
- "use_checkpoint": true,
48
- "vit_add_ln": true,
49
- "x_vis_only": true,
50
- "x_vis_return_idx": -2
51
- }
52
  }
 
1
  {
2
  "architectures": [
3
+ "InternVideo2_Classification"
4
  ],
5
  "auto_map": {
6
+ "AutoModel": "modeling_videochat2_classification.InternVideo2_Classification"
7
  },
8
+ "model_config": {
9
+ "bridge": {
10
+ "extra_num_query_token": 64,
11
+ "name": "qformer",
12
+ "num_query_token": 32,
13
+ "qformer_attention_probs_dropout_prob": 0.1,
14
+ "qformer_drop_path_rate": 0.2,
15
+ "qformer_hidden_dropout_prob": 0.1
16
+ },
17
+ "freeze_bridge": false,
18
+ "freeze_llm": false,
19
+ "freeze_vision_encoder": false,
20
+ "llm": {
21
+ "lora_alpha": 32,
22
+ "lora_dropout": 0.1,
23
+ "lora_r": 16,
24
+ "name": "mistral_7b",
25
+ "pretrained_llm_path": "mistralai/Mistral-7B-Instruct-v0.3",
26
+ "use_lora": true
27
+ },
28
+ "loss": {
29
+ "use_vision_regression_loss": false
30
+ },
31
+ "pretrained_paths": {},
32
+ "use_flash_attention": true,
33
+ "vision_encoder": {
34
+ "checkpoint_num": 48,
35
+ "d_model": 1408,
36
+ "encoder_embed_dim": 1408,
37
+ "img_size": 224,
38
+ "name": "internvideo2-1B",
39
+ "num_frames": 8,
40
+ "origin_num_frames": 4,
41
+ "patch_size": 14,
42
+ "pretrained": null,
43
+ "sep_image_video_pos_embed": true,
44
+ "tubelet_size": 1,
45
+ "use_checkpoint": true,
46
+ "vit_add_ln": true,
47
+ "x_vis_only": true,
48
+ "x_vis_return_idx": -2
49
+ }
50
  },
51
+ "model_type": "InternVideo2_Classification_test",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  "torch_dtype": "float32",
53
+ "transformers_version": "4.46.1"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  }
model-00007-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06bb1178e0e08dd4363637dffed42017c083f97fe6c2023b7eed2b6dd1cf1007
3
  size 4109221232
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79c534fd5231ae42d92fde6d705d517b6721d37d87d304b7f699b6b2c850c276
3
  size 4109221232
modeling_videochat2_classification.py CHANGED
@@ -389,15 +389,48 @@ class InternVideo2_Classification_test(PreTrainedModel):
389
  config_class = VideoChat2Config
390
  def __init__(self, config):
391
  super().__init__(config)
392
- self.w = torch.randn(10,10, requires_grad=True)
 
 
 
393
 
394
 
395
  def forward(self, x):
396
- return x
 
397
 
398
  def test_lol(self, x):
399
  return x
400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
  if __name__ == "__main__":
402
 
403
  tokenizer = AutoTokenizer.from_pretrained('OpenGVLab/InternVideo2-Chat-8B',trust_remote_code=True,use_fast=False)
 
389
  config_class = VideoChat2Config
390
  def __init__(self, config):
391
  super().__init__(config)
392
+ self.conv1 = nn.Conv2d(1, 20, 5)
393
+ self.conv2 = nn.Conv2d(20, 20, 5)
394
+ self.model_config = config.model_config
395
+ self.build_bridge()
396
 
397
 
398
  def forward(self, x):
399
+ x = self.conv1(x)
400
+ return self.conv2(x)
401
 
402
  def test_lol(self, x):
403
  return x
404
 
405
+ def build_bridge(self):
406
+
407
+ if 'qformer' in self.model_config.bridge.name.lower():
408
+ from transformers import BertTokenizer
409
+ self.qformer_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side="left")
410
+ self.qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"})
411
+ self.qformer_tokenizer.padding_side = "left"
412
+ if self.model_config.bridge.name == 'qformer':
413
+ self.qformer, self.query_tokens = build_qformer(
414
+ self.model_config.bridge.num_query_token, self.model_config.vision_encoder.encoder_embed_dim,
415
+ qformer_hidden_dropout_prob=self.model_config.bridge.qformer_hidden_dropout_prob,
416
+ qformer_attention_probs_dropout_prob=self.model_config.bridge.qformer_attention_probs_dropout_prob,
417
+ qformer_drop_path_rate=self.model_config.bridge.qformer_drop_path_rate,
418
+ )
419
+ self.qformer.resize_token_embeddings(len(self.qformer_tokenizer))
420
+ self.qformer.cls = None
421
+ self.extra_num_query_token = self.model_config.bridge.extra_num_query_token
422
+ if self.model_config.bridge.extra_num_query_token > 0:
423
+ logger.info(f"Add extra {self.model_config.bridge.extra_num_query_token} tokens in QFormer")
424
+ self.extra_query_tokens = nn.Parameter(
425
+ torch.zeros(1, self.model_config.bridge.extra_num_query_token, self.query_tokens.shape[-1])
426
+ )
427
+
428
+ self.freeze_bridge = self.model_config.get("freeze_bridge", False)
429
+ if self.freeze_bridge:
430
+ logger.info("freeze bridge")
431
+ freeze_module(self.qformer)
432
+ self.query_tokens.requires_grad = False
433
+
434
  if __name__ == "__main__":
435
 
436
  tokenizer = AutoTokenizer.from_pretrained('OpenGVLab/InternVideo2-Chat-8B',trust_remote_code=True,use_fast=False)