File size: 579 Bytes
cea4a4b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
from transformers import BertTokenizerFast
import emoji
import pandas as pd
from tqdm import tqdm
def add_emoji_tokens(tokenizer: BertTokenizerFast):
    """Extend the tokenizer's vocabulary with every emoji known to the ``emoji`` package.

    Args:
        tokenizer: a fast BERT tokenizer to mutate in place.

    Returns:
        The same tokenizer instance, for call chaining.
    """
    # EMOJI_DATA maps each emoji character to its metadata; we only need the keys.
    tokenizer.add_tokens([*emoji.EMOJI_DATA])
    return tokenizer
def add_new_line_token(tokenizer: BertTokenizerFast):
    """Register the newline character as an additional special token.

    Args:
        tokenizer: a fast BERT tokenizer to mutate in place.

    Returns:
        The same tokenizer instance, for call chaining.
    """
    newline_specials = {"additional_special_tokens": ["\n"]}
    tokenizer.add_special_tokens(newline_specials)
    return tokenizer
def batch_iterator(df: pd.DataFrame, batch_size: int = 10000, col: str = "text"):
    """Yield successive non-overlapping batches of ``df[col]``.

    Intended for streaming text to ``tokenizer.train_new_from_iterator``-style
    consumers; progress is shown via ``tqdm``.

    Args:
        df: source DataFrame.
        batch_size: maximum number of rows per batch (the last batch may be
            smaller).
        col: name of the column whose values are yielded.

    Yields:
        pd.Series: up to ``batch_size`` values from ``col`` per iteration.
    """
    # Select the column once instead of re-indexing the frame every batch.
    column = df[col]
    for start in tqdm(range(0, len(df), batch_size)):
        # BUG FIX: the original used ``df.loc[start : start + batch_size, col]``.
        # ``.loc`` slicing is label-based and END-INCLUSIVE, so each batch
        # yielded batch_size + 1 rows, overlapping one row with the next batch
        # — and it silently misbehaved on any non-default index. Positional
        # ``iloc`` slicing is end-exclusive and index-agnostic.
        yield column.iloc[start : start + batch_size]
|