ctr-ll4 / src /utils /bert.py
sanjin7's picture
Upload src/ with huggingface_hub
cea4a4b
raw
history blame contribute delete
579 Bytes
from transformers import BertTokenizerFast
import emoji
import pandas as pd
from tqdm import tqdm
def add_emoji_tokens(tokenizer: BertTokenizerFast):
    """Extend the tokenizer's vocabulary with every emoji known to the ``emoji`` package.

    Each emoji becomes its own token so it is no longer split into
    sub-word/byte pieces. Returns the same tokenizer instance, mutated
    in place. NOTE(review): the model's embedding matrix must be resized
    by the caller (``model.resize_token_embeddings``) after this — confirm.
    """
    # Iterating EMOJI_DATA yields its keys (the emoji characters themselves).
    emoji_vocab = list(emoji.EMOJI_DATA)
    tokenizer.add_tokens(emoji_vocab)
    return tokenizer
def add_new_line_token(tokenizer: BertTokenizerFast):
    """Register the newline character as an additional special token.

    Special tokens are never split or lowercased by the tokenizer, so
    "\n" survives tokenization intact. Returns the same tokenizer
    instance, mutated in place.
    """
    extra_specials = {"additional_special_tokens": ["\n"]}
    tokenizer.add_special_tokens(extra_specials)
    return tokenizer
def batch_iterator(df: pd.DataFrame, batch_size: int = 10000, col: str = "text"):
    """Yield successive non-overlapping batches of the *col* column of *df*.

    Intended as the text iterator for ``tokenizer.train_new_from_iterator``
    or similar streaming consumers; a tqdm progress bar tracks batches.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; only column *col* is read.
    batch_size : int
        Maximum number of rows per yielded batch (the last batch may be
        shorter).
    col : str
        Name of the column to yield.

    Yields
    ------
    pd.Series
        Up to *batch_size* values from ``df[col]``, in order.
    """
    for start in tqdm(range(0, len(df), batch_size)):
        # iloc is position-based and end-EXCLUSIVE. The original used
        # df.loc[i : i + batch_size, col], which on the default integer
        # index is inclusive of BOTH endpoints: each batch held
        # batch_size + 1 rows and every boundary row appeared in two
        # consecutive batches. iloc also behaves correctly when the
        # index is not a clean 0..n-1 RangeIndex.
        yield df.iloc[start : start + batch_size][col]