File size: 579 Bytes
cea4a4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
from transformers import BertTokenizerFast
import emoji
import pandas as pd
from tqdm import tqdm


def add_emoji_tokens(tokenizer: BertTokenizerFast):
    """Extend the tokenizer's vocabulary with every emoji the ``emoji`` package knows.

    Mutates *tokenizer* in place (via ``add_tokens``) and returns it so calls
    can be chained.
    """
    tokenizer.add_tokens([symbol for symbol in emoji.EMOJI_DATA])
    return tokenizer


def add_new_line_token(tokenizer: BertTokenizerFast):
    """Register the newline character as an additional special token.

    Mutates *tokenizer* in place (via ``add_special_tokens``) and returns it
    so calls can be chained.
    """
    special = {"additional_special_tokens": ["\n"]}
    tokenizer.add_special_tokens(special)
    return tokenizer


def batch_iterator(df: pd.DataFrame, batch_size: int = 10000, col: str = "text"):
    """Yield successive ``batch_size``-row slices of ``df[col]``.

    Bug fix: the previous ``df.loc[i : i + batch_size, col]`` slice is
    label-based and END-INCLUSIVE, so with the default RangeIndex every
    batch carried ``batch_size + 1`` rows and duplicated the first row of
    the following batch; with a non-integer index it mis-sliced entirely.
    ``iloc`` is positional and end-exclusive, matching the ``range`` step,
    so batches are now consecutive and non-overlapping regardless of the
    frame's index.

    Args:
        df: Source frame; only column ``col`` is read.
        batch_size: Rows per yielded slice (the final slice may be shorter).
        col: Name of the column to iterate over.

    Yields:
        pd.Series: consecutive, non-overlapping slices of ``df[col]``.
    """
    column = df[col]  # hoist the column lookup out of the loop
    # tqdm only decorates the iteration with a progress bar.
    for start in tqdm(range(0, len(df), batch_size)):
        yield column.iloc[start : start + batch_size]