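# Preprocessing script for a volcanic-eruption event-sequence dataset:
# clean_csv() cleans the raw events.csv (valid years, default month/day,
# positive rescaled timestamps, a single event type) and writes volcano.csv;
# make_dataset() groups events by volcano_name into per-volcano sequences of
# {"time_since_last_event", "time_since_start", "type_event"} dicts and
# saves them as train.pkl / dev.pkl / test.pkl.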
import datetime
import pickle
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')  # suppress pandas warnings raised during cleaning
def make_datetime(year, month, day):
    # Convert a (year, month, day) triple to a POSIX timestamp, shifted by a
    # constant offset so the resulting values stay positive.
    try:
        date = datetime.datetime(int(year), int(month), int(day))
    except ValueError as e:
        if e.args[0] == 'day is out of range for month':
            # Fall back to the previous day (e.g. April 31 -> April 30).
            date = datetime.datetime(int(year), int(month), int(day) - 1)
        else:
            raise
    return datetime.datetime.timestamp(date) + 61851630000  # make sure the timestamp is positive
def clean_csv():
    source_dir = 'events.csv'
    df = pd.read_csv(source_dir, header=0)
    # Drop rows without a valid (positive) event year.
    df = df[~df['event_date_year'].isna()]
    df = df[df['event_date_year'] > 0]
    # Missing months/days default to January / the 1st.
    df['event_date_month'] = df['event_date_month'].fillna(1)
    df['event_date_day'] = df['event_date_day'].fillna(1)
    df.drop_duplicates(inplace=True)
    norm_const = 1000000
    # Timestamps are rescaled to units of 1e6 seconds.
    df['event_timestamp'] = df.apply(
        lambda x: make_datetime(x['event_date_year'], x['event_date_month'], x['event_date_day']),
        axis=1) / norm_const
    df.sort_values(by=['event_date_year', 'event_date_month', 'event_date_day'], inplace=True)
    # Single event type for every record.
    df['event_type'] = [0] * len(df)
    df.to_csv('volcano.csv', index=False, header=True)
    return
def make_seq(df):
    seq = []
    # Inter-event gaps within one volcano's chronologically sorted events.
    df['time_diff'] = df['event_timestamp'].diff()
    df.index = np.arange(len(df))
    for index, row in df.iterrows():
        if index == 0:
            event_dict = {"time_since_last_event": 0.0,
                          "time_since_start": 0.0,
                          "type_event": row['event_type']
                          }
            start_event_time = row['event_timestamp']
        else:
            event_dict = {"time_since_last_event": row['time_diff'],
                          "time_since_start": row['event_timestamp'] - start_event_time,
                          "type_event": row['event_type']
                          }
        seq.append(event_dict)
    return seq
def make_pkl(target_dir, dim_process, split, seqs):
    with open(target_dir, "wb") as f_out:
        pickle.dump(
            {
                "dim_process": dim_process,
                split: seqs
            }, f_out
        )
    return
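# Example (illustration only) of reading one of the generated pickles back:
#     with open('train.pkl', 'rb') as f:
#         data = pickle.load(f)
#     data['dim_process']  # number of event types (1 for this dataset)
#     data['train']        # list of sequences, each a list of event dicts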
def make_dataset(source_dir):
    df = pd.read_csv(source_dir, header=0)
    vols = np.unique(df['volcano_name'])
    total_seq = []
    # One sequence per volcano, with its events in chronological order.
    for vol in vols:
        df_ = df[df['volcano_name'] == vol].copy()  # copy so make_seq can add columns safely
        df_.sort_values('event_timestamp', inplace=True)
        total_seq.append(make_seq(df_))
    print(len(total_seq))
    # Fixed split: first 400 volcanoes for train, next 50 for dev, the rest for test.
    make_pkl('train.pkl', 1, 'train', total_seq[:400])
    count_seq(total_seq[:400])
    make_pkl('dev.pkl', 1, 'dev', total_seq[400:450])
    count_seq(total_seq[400:450])
    make_pkl('test.pkl', 1, 'test', total_seq[450:])
    count_seq(total_seq[450:])
    return
def count_seq(seqs):
    total_len = [len(seq) for seq in seqs]
    print(np.mean(total_len))
    print(np.sum(total_len))
    return
if __name__ == '__main__':
    # Run clean_csv() once first to produce volcano.csv from events.csv.
    # clean_csv()
    make_dataset('volcano.csv')