from transformers import BertTokenizerFast
import emoji
import pandas as pd
from tqdm import tqdm


def add_emoji_tokens(tokenizer: BertTokenizerFast) -> BertTokenizerFast:
    """Add every emoji known to the ``emoji`` package as a regular token.

    Mutates *tokenizer* in place and returns it for call chaining.
    NOTE(review): after adding tokens, the downstream model's embedding
    matrix must be resized (``model.resize_token_embeddings``) — confirm
    the caller does this.
    """
    tokenizer.add_tokens(list(emoji.EMOJI_DATA.keys()))
    return tokenizer


def add_new_line_token(tokenizer: BertTokenizerFast) -> BertTokenizerFast:
    """Register the newline character as an additional special token.

    Mutates *tokenizer* in place and returns it for call chaining.
    """
    tokenizer.add_special_tokens({"additional_special_tokens": ["\n"]})
    return tokenizer


def batch_iterator(df: pd.DataFrame, batch_size: int = 10000, col: str = "text"):
    """Yield successive non-overlapping batches of ``df[col]`` as Series.

    A tqdm progress bar tracks batch progress. Uses positional slicing
    (``.iloc``), which is half-open and index-agnostic.

    Bug fixed: the previous ``df.loc[i : i + batch_size, col]`` used
    label-based slicing, which is end-INCLUSIVE in pandas — every batch
    carried one duplicate row from the next batch, and the code silently
    assumed a default RangeIndex.
    """
    series = df[col]
    for start in tqdm(range(0, len(df), batch_size)):
        yield series.iloc[start : start + batch_size]