Update app.py
app.py CHANGED
@@ -7,7 +7,6 @@ import textwrap
 import numpy as np
 import pandas as pd
 import streamlit as st
-from tqdm.auto import tqdm
 from collections import Counter
 from tokenizers import Tokenizer
 import plotly.graph_objects as go
@@ -35,18 +34,20 @@ MODELS = [
 
 def estimate_pruned_vocabulary(tokenizer: PreTrainedTokenizerFast, language: str):
     """
-    Estimate the most common tokens in the language. You should first download the 1M sentences dataset
-    Source: https://wortschatz.uni-leipzig.de/en/download/English
+    Estimate the most common tokens in the language. You should first download the 1M sentences dataset
+    for the desired language. Source: https://wortschatz.uni-leipzig.de/en/download/English
     """
     sentences_file = f'data.nosync/{language}_news_2020_1M-sentences.txt'
     if os.path.exists(sentences_file):
-        df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
         my_bar = st.progress(0)
-
+        df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
+        counter = Counter(tokenizer.all_special_ids)
         for i, text in enumerate(df.text):
-            counter.update(
+            counter.update(tokid for tokid in tokenizer.encode(text))
             my_bar.progress(i/len(df), text=f"{i/len(df)*100:.0f}%")
-
+        filtered_token_ids = sorted(counter.keys())
+        filtered_tokens = tokenizer.convert_ids_to_tokens(filtered_token_ids)
+        return set(filtered_tokens)
     else:
         raise FileNotFoundError
 
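
For reference, a minimal usage sketch of the updated estimate_pruned_vocabulary (not part of this commit). The model name and the language value are illustrative assumptions, the function itself must already be in scope, and the Leipzig corpus file has to be downloaded and unpacked by hand so that it sits at the path the function builds. Because the function drives a Streamlit progress bar, it would normally run inside the app started with streamlit run app.py.

from transformers import AutoTokenizer

# Hypothetical model choice; any checkpoint with a fast tokenizer should work the same way.
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-base")

# The function expects the corpus at data.nosync/<language>_news_2020_1M-sentences.txt,
# so "english" must match whatever name the downloaded file was given.
kept_tokens = estimate_pruned_vocabulary(tokenizer, "english")
print(f"Keeping {len(kept_tokens)} of {len(tokenizer)} tokens")
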
@@ -119,8 +120,6 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str,
     # Export the tokenizer to a JSON string and access its vocabulary (list of lists: [[token, score], ...])
     tokenizer_json = json.loads(tokenizer.backend_tokenizer.to_str())
     original_vocab = tokenizer_json['model']['vocab']
-
-    # Build a mapping from tokens to their original IDs
     original_token_to_id = {entry[0]: idx for idx, entry in enumerate(original_vocab)}
 
     # Filter out the tokens to remove and reassign new IDs
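
The filtering step announced by the last comment lies outside this diff's context. Below is a rough sketch of what it plausibly does, assuming a Unigram-style vocabulary of [token, score] pairs and a filtered_tokens set produced by estimate_pruned_vocabulary; the names new_vocab, old_to_new_id and pruned_backend are guesses, not taken from app.py.

# Keep only the retained tokens and assign them new, contiguous IDs.
new_vocab = []
old_to_new_id = {}
for token, score in original_vocab:
    if token in filtered_tokens:
        old_to_new_id[original_token_to_id[token]] = len(new_vocab)
        new_vocab.append([token, score])

# Write the pruned vocabulary back and rebuild the fast tokenizer from its JSON form.
tokenizer_json['model']['vocab'] = new_vocab
pruned_backend = Tokenizer.from_str(json.dumps(tokenizer_json))

The old_to_new_id mapping is presumably what the rest of prune_model uses to keep only the matching rows of the model's embedding matrix.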