import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset


class DecoderDatasetTorch(Dataset):
    """Train dataset of ad embeddings and their CTR targets."""

    def __init__(self, df: pd.DataFrame, embedding_column: str = "my_full_mean_embedding"):
        """
        Args:
            df (pd.DataFrame): dataframe with ads
            embedding_column (str, optional): Column whose values to output in __getitem__.
                Defaults to 'my_full_mean_embedding'.
        """
        self.df = df
        self.embedding_column = embedding_column
        # Cast both columns to float32 so the default collate yields float32 tensors.
        # np.asarray is used for the embedding column because its cells hold array-like
        # values, which np.float32(...) cannot convert directly.
        df[embedding_column] = df[embedding_column].map(lambda x: np.asarray(x, dtype=np.float32))
        df["ctr"] = df["ctr"].astype(np.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # .loc indexes by label, so this assumes the dataframe has a default RangeIndex
        # (e.g. created via df.reset_index(drop=True)).
        embedding = self.df.loc[idx, self.embedding_column]
        ctr = self.df.loc[idx, "ctr"]
        return {"embedding": embedding, "ctr": ctr}
# tokenizer = BertTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
# train_dataset = AdDataset(df=dataset.train, tokenizer=tokenizer)
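
# Minimal usage sketch (illustrative, not from the original file: assumes an `ads_df`
# dataframe that already contains the embedding column and a "ctr" column):
#
# from torch.utils.data import DataLoader
# train_dataset = DecoderDatasetTorch(df=ads_df.reset_index(drop=True))
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# batch = next(iter(train_loader))
# batch["embedding"]  # float32 tensor of shape (batch_size, embedding_dim)
# batch["ctr"]        # float32 tensor of shape (batch_size,)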