| from transformers import BertTokenizerFast | |
| import emoji | |
| import pandas as pd | |
| from tqdm import tqdm | |
def add_emoji_tokens(tokenizer: BertTokenizerFast):
    """Extend the tokenizer's vocabulary with every emoji known to the ``emoji`` package.

    Each emoji character becomes its own token instead of being split into
    unknown/byte pieces.  Note: after calling this, the paired model must have
    ``model.resize_token_embeddings(len(tokenizer))`` applied by the caller.

    Args:
        tokenizer: the tokenizer to mutate in place.

    Returns:
        The same tokenizer instance, for call chaining.
    """
    # EMOJI_DATA maps emoji string -> metadata; we only need the keys.
    emoji_symbols = [symbol for symbol in emoji.EMOJI_DATA]
    tokenizer.add_tokens(emoji_symbols)
    return tokenizer
def add_new_line_token(tokenizer: BertTokenizerFast):
    """Register the newline character as an additional special token.

    Registering "\n" as a special token keeps line breaks intact instead of
    having them stripped/normalized away during tokenization.

    Args:
        tokenizer: the tokenizer to mutate in place.

    Returns:
        The same tokenizer instance, for call chaining.
    """
    extra_specials = {"additional_special_tokens": ["\n"]}
    tokenizer.add_special_tokens(extra_specials)
    return tokenizer
def batch_iterator(df: pd.DataFrame, batch_size=10000, col: str = "text"):
    """Yield successive non-overlapping batches of column *col* from *df*.

    Fix: the original used ``df.loc[i : i + batch_size, col]``, which is
    label-based AND end-inclusive — with the default RangeIndex every batch
    contained ``batch_size + 1`` rows and the boundary row was duplicated in
    two consecutive batches; with a non-integer index it would mis-slice.
    Positional, end-exclusive ``iloc`` slicing yields exact batches
    regardless of the DataFrame's index.

    Args:
        df: source DataFrame; must contain column *col*.
        batch_size: maximum number of rows per yielded batch.
        col: name of the column to iterate over.

    Yields:
        pd.Series: consecutive slices of ``df[col]`` with at most
        ``batch_size`` rows each (the last batch may be shorter).
    """
    # Hoist the column selection out of the loop; tqdm shows batch progress.
    column = df[col]
    for start in tqdm(range(0, len(df), batch_size)):
        # iloc is end-exclusive -> exact, non-overlapping batches.
        yield column.iloc[start : start + batch_size]