Commit 6549220
Parent(s): aa6806f

add model

Files changed:
- config.json +4 -2
- modeling_tsp.py +76 -49
- pytorch_model.bin +2 -2
config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "architectures": [
-    "TSPModelForPretraining"
+    "TSPModelForPreTraining"
   ],
   "auto_map": {
     "AutoConfig": "configuration_tsp.TSPConfig",
@@ -11,6 +11,7 @@
     "AutoModelForTokenClassification": "modeling_tsp.TSPModelForTokenClassification"
   },
   "dropout_prob": 0.1,
+  "electra_generator_size_divisor": 4,
   "embedding_size": 128,
   "hidden_size": 256,
   "intermediate_size": 1024,
@@ -21,6 +22,7 @@
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
   "torch_dtype": "float32",
-  "transformers_version": "4.
+  "transformers_version": "4.19.0.dev0",
+  "use_electra": true,
   "vocab_size": 30522
 }
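Note: the `auto_map` block means this config (and the model classes it points to) load through the standard custom-code path on the Hub. A minimal sketch, assuming a placeholder repo id rather than the actual one:

from transformers import AutoConfig

repo_id = "some-user/tsp-model"  # placeholder repo id, not taken from this commit
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)

# The two fields added in this commit come back as ordinary attributes.
print(config.use_electra)                     # True
print(config.electra_generator_size_divisor)  # 4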
modeling_tsp.py CHANGED
@@ -9,12 +9,12 @@ import torch
 from torch import nn
 import torch.nn.functional as F
 from transformers import PreTrainedModel
-from
+from configuration_tsp import TSPConfig
 
 
 class TSPPreTrainedModel(PreTrainedModel):
     config_class = TSPConfig
-    base_model_prefix = "
+    base_model_prefix = "backbone"
 
     def _init_weights(self, module):
         """Initialize the weights"""
@@ -32,20 +32,21 @@ class TSPPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
 
+
 # ====================================
 # Pretraining Model
 # ====================================
 
 
-class TSPModelForPretraining(TSPPreTrainedModel):
-    def __init__(self, config
+class TSPModelForPreTraining(TSPPreTrainedModel):
+    def __init__(self, config):
         super().__init__(config)
         self.backbone = TSPModel(config)
-        if use_electra:
+        if config.use_electra:
             mlm_config = deepcopy(config)
-            mlm_config.hidden_size
-            mlm_config.intermediate_size
-            mlm_config.num_attention_heads
+            mlm_config.hidden_size //= config.electra_generator_size_divisor
+            mlm_config.intermediate_size //= config.electra_generator_size_divisor
+            mlm_config.num_attention_heads //= config.electra_generator_size_divisor
             self.mlm_backbone = TSPModel(mlm_config)
             self.mlm_head = MaskedLMHead(
                 mlm_config, word_embeddings=self.mlm_backbone.embeddings.word_embeddings
@@ -55,7 +56,10 @@ class TSPModelForPretraining(TSPPreTrainedModel):
             self.rtd_head = ReplacedTokenDiscriminationHead(config)
         else:
             self.mlm_backbone = self.backbone
-            self.mlm_head = MaskedLMHead(
+            self.mlm_head = MaskedLMHead(
+                config, word_embeddings=self.mlm_backbone.embeddings.word_embeddings
+            )
+        self.tsp_head = TextStructurePredictionHead(config)
         self.apply(self._init_weights)
 
     def forward(self, *args, **kwargs):
@@ -63,40 +67,6 @@ class TSPModelForPretraining(TSPPreTrainedModel):
             "Refer to the implementation of text structrue prediction task for how to use the model."
         )
 
-    def mlm_forward(
-        self,
-        corrupted_ids,  # <int>(B,L), partially masked/replaced input token ids
-        attention_mask,  # <int>(B,L), 1 / 0 for tokens that are not attended/ attended
-        token_type_ids,  # <int>(B,L), 0 / 1 corresponds to a segment A / B token
-        mlm_selected=None,  # <bool>(B,L), True at mlm selected positiosns. Calculate logits at mlm selected positions if not None.
-    ):
-        hidden_states = self.mlm_backbone(
-            input_ids=corrupted_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-        )  # (B,L,D)
-        return self.mlm_head(
-            hidden_states, is_selected=mlm_selected
-        )  # (#mlm selected, vocab size)/ (B,L,vocab size)
-
-    def rtd_forward(
-        self,
-        corrupted_ids,  # <int>(B,L), partially replaced input token ids
-        attention_mask,  # <int>(B,L), 1 / 0 for tokens that are not attended/ attended
-        token_type_ids,  # <int>(B,L), 0 / 1 corresponds to a segment A / B token
-    ):
-        hidden_states = self.rtd_backbone(
-            input_ids=corrupted_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-        )  # (B,L,D)
-        return self.rtd_backbone(hidden_states)  # (B,L)
-
-    def tsp_forward(
-        self, hidden_states,  # (B,L,D)
-    ):
-        raise NotImplementedError
-
 
 class MaskedLMHead(nn.Module):
     def __init__(self, config, word_embeddings=None):
@@ -135,6 +105,22 @@ class ReplacedTokenDiscriminationHead(nn.Module):
         return x.squeeze(-1)  # (B,L)
 
 
+class TextStructurePredictionHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.linear1 = nn.Linear(config.hidden_size * 2, config.hidden_size * 2)
+        self.norm = nn.LayerNorm(config.hidden_size * 2)
+        self.linear2 = nn.Linear(config.hidden_size * 2, 6)
+
+    def forward(
+        self, x,  # (...,2D)
+    ):
+        x = self.linear1(x)  # (...,2D)
+        x = F.gelu(x)  # (...,2D)
+        x = self.norm(x)  # (...,2D)
+        return self.linear2(x)  # (...,C)
+
+
 # ====================================
 # Finetuning Model
 # ====================================
@@ -164,8 +150,8 @@ class TSPModelForTokenClassification(TSPPreTrainedModel):
 class TokenClassificationHead(nn.Module):
     def __init__(self, config, num_classes):
         super().__init__()
-        self.dropout = nn.Dropout(
-        self.classifier = nn.Linear(
+        self.dropout = nn.Dropout(config.dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, num_classes)
 
     def forward(self, x):  # (B,L,D)
         x = self.dropout(x)  # (B,L,D)
@@ -213,6 +199,7 @@ class TSPModelForQuestionAnswering(TSPPreTrainedModel):
         super().__init__()
         self.backbone = TSPModel(config)
         self.head = SequenceClassififcationHead(config, num_classes)
+        self.apply(self._init_weights)
 
     def forward(
         self,
@@ -345,9 +332,6 @@ class SquadHead(nn.Module):
 
 
 class TSPModel(TSPPreTrainedModel):
-    config_class = TSPConfig
-    base_model_prefix = "tsp"
-
     def __init__(self, config):
         super().__init__(config)
         self.embeddings = Embeddings(config)
@@ -405,9 +389,9 @@ class Embeddings(nn.Module):
     ):
         B, L = input_ids.shape
         embeddings = self.word_embeddings(input_ids)  # (B,L,E)
+        embeddings += self.token_type_embeddings(token_type_ids)
         if hasattr(self, "position_embeddings"):
             embeddings += self.position_embeddings.weight[None, :L, :]
-        embeddings += self.token_type_embeddings(token_type_ids)
         embeddings = self.norm(embeddings)  # (B,L,E)
         embeddings = self.dropout(embeddings)  # (B,L,E)
         return embeddings  # (B,L,E)
@@ -453,6 +437,8 @@ class MultiHeadSelfAttention(nn.Module):
         self.o_proj = nn.Linear(config.hidden_size, config.hidden_size)
         self.H = config.num_attention_heads
         self.d = config.hidden_size // self.H
+        if config.position_embedding_type == "rotary":
+            self.rotray_position_embeds = RotaryEmbedding(self.d)
 
     def forward(
         self,
@@ -463,6 +449,8 @@ class MultiHeadSelfAttention(nn.Module):
         query, key, value = (
             self.mix_proj(x).view(B, L, H, 3 * d).transpose(1, 2).split(d, dim=-1)
         )  # (B,H,L,d),(B,H,L,d),(B,H,L,d)
+        if hasattr(self, "rotray_position_embeds"):
+            query, key = self.rotray_position_embeds(query, key)
         output = self.attention(query, key, value, attention_mask)  # (B,H,L,d)
         output = self.o_proj(output.transpose(1, 2).reshape(B, L, D))  # (B,L,D)
         return output  # (B,L,D)
@@ -503,4 +491,43 @@ class FeedForwardNetwork(nn.Module):
         return x  # (B,L,D)
 
 
+class RotaryEmbedding(nn.Module):
+    seq_len_cached = 0
+    cos_cached = None
+    sin_cached = None
+
+    def __init__(self, dim):
+        super().__init__()
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq)
+
+    def _forward(self, x):  # (B,H,L,d)
+        # Get rotary embeddings on the fly
+        ## create
+        seq_len = x.shape[2]
+        if seq_len > RotaryEmbedding.seq_len_cached:
+            RotaryEmbedding.seq_len_cached = seq_len
+            t = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype)
+            freqs = t.view(-1, 1) @ self.inv_freq.view(1, -1)
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)  # (L,d)
+            RotaryEmbedding.cos_cached = emb.cos()[None, None, :, :]
+            RotaryEmbedding.sin_cached = emb.sin()[None, None, :, :]
+        ## take
+        if seq_len == RotaryEmbedding.seq_len_cached:
+            cos, sin = RotaryEmbedding.cos_cached, RotaryEmbedding.sin_cached
+        else:
+            cos, sin = (
+                RotaryEmbedding.cos_cached[:, :, :seq_len, :],  # (1,1,L,d)
+                RotaryEmbedding.sin_cached[:, :, :seq_len, :],  # (1,1,L,d)
+            )
+
+        # Apply rotary embeddings
+        sections = [x.shape[-1] // 2, x.shape[-1] - x.shape[-1] // 2]
+        x1, x2 = x.split(sections, dim=-1)
+        half_rotated_x = torch.cat((-x2, x1), dim=-1)
+        return (x * cos) + (half_rotated_x * sin)
 
+    def forward(
+        self, query, key,  # (B,H,L,d)  # (B,H,L,d)
+    ):
+        return self._forward(query), self._forward(key)
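Note: the added RotaryEmbedding follows the usual rotate-half construction, caching cos/sin tables at class level and rotating query and key before attention. A self-contained sketch of the same rotation on a random tensor, using the file's (B,H,L,d) shape convention (the shapes below are illustrative, not taken from the model):

import torch

B, H, L, d = 2, 4, 8, 16
x = torch.randn(B, H, L, d)  # stands in for a query or key tensor

inv_freq = 1.0 / (10000 ** (torch.arange(0, d, 2).float() / d))  # (d/2,)
t = torch.arange(L, dtype=inv_freq.dtype)
freqs = t.view(-1, 1) @ inv_freq.view(1, -1)                     # (L, d/2)
emb = torch.cat((freqs, freqs), dim=-1)                          # (L, d)
cos, sin = emb.cos()[None, None], emb.sin()[None, None]          # (1,1,L,d)

# rotate-half: swap the two halves of the last dim, negating the second half
x1, x2 = x.split([d // 2, d - d // 2], dim=-1)
half_rotated = torch.cat((-x2, x1), dim=-1)
rotated = x * cos + half_rotated * sin                           # (B,H,L,d)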
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:8c8decb4de84befc5103d4b4b7c9ed0d61fc598ad859c30163e92107f76ea731
+size 57777425
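Note: pytorch_model.bin is stored as a Git LFS pointer, so the oid above is the SHA-256 of the actual weight file. A small sketch for checking a locally downloaded copy against it (weights_path is an assumed local path):

import hashlib

weights_path = "pytorch_model.bin"  # assumed local path to the downloaded weights
h = hashlib.sha256()
with open(weights_path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
print(h.hexdigest() == "8c8decb4de84befc5103d4b4b7c9ed0d61fc598ad859c30163e92107f76ea731")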