Spaces:

tappyness1
/

one_dash

Sleeping

one_dash / src /preprocess.py

tappyness1

initial commit

cb22296 over 2 years ago

3.55 kB

	import re
	import numpy as np
	from src.arcs import generate_arc
	import warnings
	import pandas as pd
	from configparser import ConfigParser, ExtendedInterpolation

	# Silence every warning category for the whole process, starting at import time.
	# NOTE(review): this blanket filter also hides pandas FutureWarning /
	# deprecation notices raised by the code below — confirm that is intended.
	warnings.filterwarnings("ignore")

	def get_last_known_bounty(row):
	    """Extract the first bounty figure listed in a raw bounty cell.

	    Bracketed citation markers such as ``[1]`` are removed, the first
	    whitespace-separated token of what remains is taken, and its decimal
	    digits are joined into one integer (so thousands separators are
	    ignored). The first figure is treated as the latest known bounty.

	    Args:
	        row: Cell value from the bounty column. Floats are passed through
	            untouched (presumably NaN for a missing value — confirm
	            against the scraped data).

	    Returns:
	        The input itself for float input; for string input the bounty as
	        ``int``, or ``np.nan`` when no digits are found in the first
	        token; ``None`` for any other type.
	    """
	    if isinstance(row, float):
	        return row
	    if not isinstance(row, str):
	        return None

	    # Split on any whitespace run so a leading space or a leading citation
	    # (which turns into a space) does not yield an empty first token.
	    tokens = re.sub(r"\[.*?\]", " ", row).split()
	    if not tokens:
	        return np.nan

	    # isdecimal() only admits characters that int() can actually parse,
	    # unlike isdigit(), which also accepts e.g. superscript digits.
	    digits = "".join(ch for ch in tokens[0] if ch.isdecimal())
	    return int(digits) if digits else np.nan

	def get_latest_age(row):
	    """Return the largest age mentioned in a raw age cell.

	    Bracketed citations (``[...]``) and parenthesised notes (``(...)``)
	    are blanked out, semicolons are removed, and every remaining
	    whitespace-separated token made up solely of decimal digits is read
	    as an age. The maximum of those ages is returned.

	    Args:
	        row: Cell value from the age column.

	    Returns:
	        The largest age as ``int``; ``np.nan`` when the string contains
	        no purely numeric token; ``None`` when ``row`` is not a string.
	    """
	    if not isinstance(row, str):
	        return None

	    text = re.sub(r"\[.*?\]", " ", row)
	    text = re.sub(r"\(.*?\)", " ", text)
	    text = text.replace(";", "")

	    # isdecimal() guarantees int() succeeds, so no try/except is needed and
	    # the list can never mix ints with leftover strings.
	    ages = [int(token) for token in text.split() if token.isdecimal()]

	    # Report "no age found" as a missing value rather than an empty string,
	    # so that dropna()/astype(int) downstream behave as expected.
	    return max(ages) if ages else np.nan

	def get_main_crew(row):
	    """Pick the first affiliation listed in a raw affiliation cell.

	    Bracketed citations and parenthesised notes are each replaced by a
	    single space, then everything before the first ``;`` is returned.
	    Surrounding whitespace is left in place.

	    Args:
	        row: Cell value from the affiliation column.

	    Returns:
	        The text preceding the first semicolon, or ``None`` when ``row``
	        is not exactly of type ``str``.
	    """
	    if type(row) is not str:
	        return None

	    without_notes = row
	    for pattern in (r"\[.*?\]", r"\(.*?\)"):
	        without_notes = re.sub(pattern, " ", without_notes)

	    first_entry, _, _ = without_notes.partition(";")
	    return first_entry

	class cleaner:
	    """Post-process the scraped appearance and character-detail CSV files.

	    File locations and the final chapter number are read from the
	    ``SCRAPER`` section of an INI configuration file.
	    """

	    def __init__(self, config_path = 'cfg/cfg.ini'):
	        """Load file paths and the chapter range from the config file.

	        Args:
	            config_path: Path to an INI file containing a ``SCRAPER``
	                section with ``end_chap``, ``char_link_fp``,
	                ``chap_appearance_fp``, ``char_details_fp`` and
	                ``age_bounty_fp`` entries.
	        """
	        pl_config = ConfigParser(interpolation=ExtendedInterpolation())
	        pl_config.read(config_path)
	        scraper_cfg = pl_config['SCRAPER']

	        # Stored as the configured value plus one before being handed to
	        # generate_arc (presumably an exclusive upper bound — confirm).
	        self.end_chap = scraper_cfg.getint('end_chap') + 1
	        self.char_link_fp = scraper_cfg.get('char_link_fp')
	        self.chap_appearance_fp = scraper_cfg.get('chap_appearance_fp')
	        self.char_details_fp = scraper_cfg.get('char_details_fp')
	        self.age_bounty_fp = scraper_cfg.get('age_bounty_fp')
	        # Mapping of arc name -> container of chapter numbers, as used by arc_col.
	        self.arcs = generate_arc(self.end_chap)

	    def arc_col(self,row):
	        """Return the arc that contains the chapter of an appearance row.

	        Args:
	            row: Mapping-like row exposing a ``'Chapter'`` entry.

	        Returns:
	            The first key of ``self.arcs`` whose value contains the
	            chapter, or the string ``"None"`` when no arc matches.
	        """
	        chapter = row['Chapter']
	        for key in self.arcs:
	            if chapter in self.arcs[key]:
	                return key
	        return "None"

	    @staticmethod
	    def _split_character(characters):
	        """Split raw character strings into a name part and a notes part.

	        Args:
	            characters: Series of strings such as ``"Name (note)"``.

	        Returns:
	            ``(appearance, notes)``: the text before the first ``"("`` and
	            the text between the first and second ``"("`` with every
	            ``")"`` removed. ``notes`` is missing where there is no ``"("``.
	        """
	        # Split once and reuse both pieces.
	        parts = characters.str.split("(", expand=True)
	        appearance = parts[0]
	        if 1 in parts.columns:
	            # ")" on its own is not a valid regular expression, so it must
	            # be replaced as a literal string.
	            notes = parts[1].str.replace(")", "", regex=False)
	        else:
	            # No value contained "(", so the split produced a single column.
	            notes = pd.Series(np.nan, index=characters.index, dtype=object)
	        return appearance, notes

	    def preprocess_data(self):
	        """Derive the cleaned columns and write the CSV files.

	        The appearance file gains ``Appearance``, ``Appearance Notes`` and
	        ``Arc`` columns; the character-details file gains ``last_bounty``,
	        ``latest_age`` and ``main_crew`` columns. Both are written back to
	        the paths they were read from. Rows having both an age and a
	        bounty are additionally written to ``age_bounty_fp`` with
	        ``latest_age`` cast to int.
	        """
	        # Appearance data: split the character text and tag each row with its arc.
	        appearance_df = pd.read_csv(self.chap_appearance_fp)
	        appearance, notes = self._split_character(appearance_df['Character'])
	        appearance_df['Appearance'] = appearance
	        appearance_df['Appearance Notes'] = notes
	        appearance_df['Arc'] = appearance_df.apply(self.arc_col, axis =1)

	        # Character details: parse the free-text bounty / age / affiliation cells.
	        char_details_df = pd.read_csv(self.char_details_fp)
	        char_details_df['last_bounty'] = char_details_df['bounty'].apply(get_last_known_bounty)
	        char_details_df['latest_age'] = char_details_df['age'].apply(get_latest_age)
	        # Normalise None results to NaN so dropna() below sees them as missing.
	        char_details_df['latest_age']= char_details_df['latest_age'].fillna(value=np.nan)
	        char_details_df['main_crew'] = char_details_df['affiliation'].apply(get_main_crew)

	        # astype() returns a new frame, avoiding column assignment on the
	        # filtered result of dropna().
	        df_age_bounty = (
	            char_details_df
	            .dropna(subset=['latest_age', 'last_bounty'])
	            .astype({'latest_age': 'int'})
	        )

	        appearance_df.to_csv(self.chap_appearance_fp, index = False)
	        char_details_df.to_csv(self.char_details_fp, index = False)
	        df_age_bounty.to_csv(self.age_bounty_fp, index = False)

	if __name__ == '__main__':
	    # Run the full preprocessing pass when executed as a script. The
	    # instance is not bound to the name ``cleaner`` so that the module-level
	    # name keeps referring to the class rather than being rebound.
	    cleaner().preprocess_data()