File size: 4,850 Bytes
2ab4c76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a09f129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ab4c76
 
 
 
 
 
 
 
a09f129
2ab4c76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python
# coding: utf-8
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script creates a tiny random model
#
# It will be used then as "hf-internal-testing/tiny-deberta"

# ***To build from scratch***
#
# 1. clone sentencepiece into a parent dir
# git clone https://github.com/google/sentencepiece
#
# 2. create a new repo at https://huggingface.co/new
# make sure to choose 'hf-internal-testing' as the Owner
#
# 3. clone
# git clone https://huggingface.co/hf-internal-testing/tiny-deberta
# cd tiny-deberta

# 4. start with some pre-existing script from one of the https://huggingface.co/hf-internal-testing/ tiny model repos, e.g.
# wget https://huggingface.co/hf-internal-testing/tiny-albert/raw/main/make-tiny-albert.py
# chmod a+x ./make-tiny-albert.py
# mv ./make-tiny-albert.py ./make-tiny-deberta.py
#
# 5. automatically rename things from the old names to new ones
# perl -pi -e 's|Albert|Deberta|g' make-*
# perl -pi -e 's|albert|deberta|g' make-*
#
# 6. edit and re-run this script while fixing it up
# ./make-tiny-deberta.py
#
# 7. add/commit/push
# git add *
# git commit -m "new tiny model"
# git push

# ***To update***
#
# 1. clone the existing repo
# git clone https://huggingface.co/hf-internal-testing/tiny-deberta
# cd tiny-deberta
#
# 2. edit and re-run this script after doing whatever changes are needed
# ./make-tiny-deberta.py
#
# 3. commit/push
# git commit -m "new tiny model"
# git push

import sys
import os

# workaround for fast tokenizer protobuf issue, and it's much faster too!
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

from transformers import DebertaTokenizer, DebertaTokenizerFast, DebertaConfig, DebertaForMaskedLM

# name of the full-size checkpoint that the tokenizer and the config are loaded from
mname_orig = "microsoft/deberta-base"
# name of the tiny model; used in the progress messages and in the generated README text
mname_tiny = "tiny-deberta"


### Tokenizer

import json
from transformers import AutoTokenizer
from tokenizers import Tokenizer
# Shrink the original fast tokenizer by keeping only the first `vocab_keep_items`
# vocabulary entries, then rebuild the backend tokenizer from the edited JSON.
vocab_keep_items = 5000
tokenizer = AutoTokenizer.from_pretrained(mname_orig, use_fast=True)
assert tokenizer.is_fast, "This only works for fast tokenizers."
# work on the JSON form of the backend tokenizer so that the vocab can be edited directly
tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
model_type = tokenizer_json["model"]["type"]
vocab = tokenizer_json["model"]["vocab"]
if model_type == "BPE":
    # vocab maps token -> id; keep the entries whose id is below the cut-off
    new_vocab = {token: i for token, i in vocab.items() if i < vocab_keep_items}
    new_merges = []
    for merge in tokenizer_json["model"]["merges"]:
        # A merge is serialized either as the string "a b" (older `tokenizers`
        # releases) or as the pair ["a", "b"] (newer releases); accept both so
        # the script does not crash on `.split()` of a list.
        a, b = merge.split() if isinstance(merge, str) else merge
        # a merge is only usable if both halves and the merged token survived the cut
        if a in new_vocab and b in new_vocab and (a + b) in new_vocab:
            new_merges.append(merge)
    tokenizer_json["model"]["merges"] = new_merges
elif model_type == "Unigram":
    # here the vocab is sliced like a sequence rather than filtered by id
    new_vocab = vocab[:vocab_keep_items]
elif model_type in ("WordPiece", "WordLevel"):
    new_vocab = {token: i for token, i in vocab.items() if i < vocab_keep_items}
else:
    raise ValueError(f"don't know how to handle {model_type}")
tokenizer_json["model"]["vocab"] = new_vocab
# swap the shrunken backend into the existing tokenizer wrapper
tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
tokenizer_fast_tiny = tokenizer


### Config

# Start from the full-size model's config and shrink its dimensions.
config_tiny = DebertaConfig.from_pretrained(mname_orig)
print(config_tiny)
# remember to update this to the actual config as each model is different and then shrink the numbers
config_tiny.update(dict(
    vocab_size=vocab_keep_items,
    # NOTE(review): `embedding_size` and `pooler_size` are kept exactly as before so the
    # saved config stays a superset of the previous output; they look like leftovers
    # from the script this one was derived from - confirm whether DeBERTa reads them.
    embedding_size=32,
    pooler_size=32,
    # DeBERTa's pooler width is read from `pooler_hidden_size`; without overriding it
    # the value inherited from the full-size checkpoint would no longer match the
    # shrunken `hidden_size`.
    pooler_hidden_size=32,
    hidden_size=32,
    intermediate_size=64,
    max_position_embeddings=128,
    num_attention_heads=2,
    num_hidden_layers=2,
))
print("New config", config_tiny)

### Model

# Instantiate the masked-LM model from the shrunken config and report its size.
model_tiny = DebertaForMaskedLM(config_tiny)
num_params = model_tiny.num_parameters()
print(f"{mname_tiny}: num of params {num_params}")
# size the token embeddings to the length reported by the tiny tokenizer
tokenizer_len = len(tokenizer_fast_tiny)
model_tiny.resize_token_embeddings(tokenizer_len)

# Test: run one masked sentence through the model and print the length of the
# first item of the returned logits
sample_text = "The capital of France is [MASK]."
inputs = tokenizer_fast_tiny(sample_text, return_tensors="pt")
#print(inputs)
outputs = model_tiny(**inputs)
first_logits = outputs.logits[0]
print("Test with normal tokenizer:", len(first_logits))

# Save
# convert the model to half precision first - makes it smaller
model_tiny.half()
# write the model, then the tokenizer, into the current directory
for artifact in (model_tiny, tokenizer_fast_tiny):
    artifact.save_pretrained(".")

#print(model_tiny)

# Write a one-line README, but only when there is none yet, so an existing one is
# never overwritten.
readme = "README.md"
readme_exists = os.path.exists(readme)
if not readme_exists:
    readme_text = f"This is a {mname_tiny} random model to be used for basic testing.\n"
    with open(readme, "w") as f:
        f.write(readme_text)

print(f"Generated {mname_tiny}")