import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset


class DecoderDatasetTorch(Dataset):
    """Map-style dataset yielding an embedding and its ``ctr`` value per row."""

    def __init__(self, df: pd.DataFrame, embedding_column: str = "my_full_mean_embedding") -> None:
        """Store a float32-converted copy of ``df``.

        The frame passed in is left untouched; the converted columns live on
        the new frame stored as ``self.df``.

        Args:
            df (pd.DataFrame): dataframe with ads. Must contain
                ``embedding_column`` and a ``"ctr"`` column.
            embedding_column (str, optional): Column whose values to output
                in ``__getitem__``. Defaults to 'my_full_mean_embedding'.

        Raises:
            KeyError: if ``embedding_column`` or ``"ctr"`` is not a column
                of ``df``.
        """
        self.embedding_column = embedding_column
        # `assign` returns a new frame, so the caller's DataFrame is not
        # mutated. Embeddings are converted element-wise (each cell is passed
        # to np.float32); "ctr" is cast column-wise so its dtype is
        # guaranteed to be float32.
        self.df = df.assign(
            **{
                embedding_column: df[embedding_column].map(np.float32),
                "ctr": df["ctr"].astype(np.float32),
            }
        )

    def __len__(self) -> int:
        """Return the number of rows in the stored frame."""
        return len(self.df)

    def __getitem__(self, idx) -> dict:
        """Return the sample(s) at position ``idx``.

        Args:
            idx: Row position (not index label). Tensors are converted with
                ``tolist()`` first, so a 0-d tensor selects one row and a
                1-d tensor selects several.

        Returns:
            dict: ``{"embedding": ..., "ctr": ...}`` holding the values of
            ``embedding_column`` and ``"ctr"`` at ``idx``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Positional lookup: samplers hand out 0..len-1, which only matches
        # index labels when the frame happens to carry a default RangeIndex.
        embedding = self.df[self.embedding_column].iloc[idx]
        ctr = self.df["ctr"].iloc[idx]
        return {"embedding": embedding, "ctr": ctr}


# tokenizer = BertTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
# train_dataset = AdDataset(df=dataset.train, tokenizer=tokenizer)