Spaces:
Runtime error
Runtime error
Update rpc.py
Browse files
rpc.py
CHANGED
|
@@ -5,6 +5,7 @@ import keras_nlp
|
|
| 5 |
|
| 6 |
import math
|
| 7 |
import json
|
|
|
|
| 8 |
from transformers import AutoTokenizer
|
| 9 |
from tokenizers import AddedToken
|
| 10 |
|
|
@@ -23,6 +24,23 @@ print("vocab_size:", vocab_size)
|
|
| 23 |
print("pad token id:", tokenizer.pad_token)
|
| 24 |
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
# Masked Accuracy Metric
|
| 27 |
def masked_accuracy(y_true, y_pred, padding_token=tokenizer.pad_token_id):
|
| 28 |
y_true = tf.cast(y_true, tf.int32)
|
|
|
|
| 5 |
|
| 6 |
import math
|
| 7 |
import json
|
| 8 |
+
import spacy
|
| 9 |
from transformers import AutoTokenizer
|
| 10 |
from tokenizers import AddedToken
|
| 11 |
|
|
|
|
| 24 |
# The label promises the pad token *id*, so print the id that masked_accuracy
# uses as its default padding_token rather than the pad token string.
print("pad token id:", tokenizer.pad_token_id)
|
| 25 |
|
| 26 |
|
| 27 |
+
# Build carry_toks: the ids of vocabulary entries that spaCy tags as numbers
# or as proper nouns whose text starts with an upper-case letter.
nlp = spacy.load("en_core_web_lg")
# The whole vocabulary is tagged as one document, so the pipeline's length
# limit is raised to 2,000,000 characters.
nlp.max_length = 2000000
selected = {'NUM', 'PROPN'}
# (token_string, token_id) pairs ordered by token id.
alltoks = sorted(tokenizer.get_vocab().items(), key=lambda x: x[1])
# One vocabulary entry per line, with every "▁" character removed.
stripped_toks = [t[0].replace("▁", "") for t in alltoks]
all_toks_text = "\n".join(stripped_toks)
doc = nlp(all_toks_text)

# Character offset at which each vocabulary entry starts in all_toks_text.
# Aligning spaCy tokens to entries by offset (instead of by substring
# matching) cannot drift when an entry is empty, contains whitespace, or
# happens to be a substring of a neighbouring entry, and it can never index
# past the end of alltoks.
tok_starts = []
offset = 0
for piece in stripped_toks:
    tok_starts.append(offset)
    offset += len(piece) + 1  # +1 for the joining "\n"

carry_toks = set()
i = 0
for token in doc:
    # Advance to the vocabulary entry whose line contains this spaCy token.
    while i + 1 < len(tok_starts) and token.idx >= tok_starts[i + 1]:
        i += 1
    # Entries at positions 0..100 are never selected
    # (presumably special/control tokens -- TODO confirm).
    if token.pos_ not in selected or i <= 100:
        continue
    # Proper nouns only count when the entry starts with an upper-case
    # letter; the slice keeps this safe for an empty entry.
    if token.pos_ != "PROPN" or stripped_toks[i][:1].isupper():
        carry_toks.add(alltoks[i][1])
print(len(carry_toks))
|
| 42 |
+
|
| 43 |
+
|
| 44 |
# Masked Accuracy Metric
|
| 45 |
def masked_accuracy(y_true, y_pred, padding_token=tokenizer.pad_token_id):
|
| 46 |
y_true = tf.cast(y_true, tf.int32)
|