import argparse
import json
import os
|
| |
|
def parse_args():
    """Parse command-line options for the dataset-extraction script.

    Returns:
        argparse.Namespace with string attributes ``review_data_path``,
        ``meta_data_path``, ``save_dir`` and ``dataset``.
    """
    parser = argparse.ArgumentParser()
    # All options are plain strings; declare them table-driven so a new
    # option is a one-line addition.
    for flag, default in (
        ('--review_data_path', './data'),
        ('--meta_data_path', './data'),
        ('--save_dir', './data/dataset'),
        ('--dataset', 'dataset'),
    ):
        parser.add_argument(flag, type=str, default=default)
    return parser.parse_args()
|
| |
|
# Parse CLI options once at import time; every later section reads the
# input paths and dataset name from this namespace.
args = parse_args()

# Ensure the output directory <save_dir>/<dataset> exists.
# os.makedirs(..., exist_ok=True) also creates missing parent
# directories (the original os.mkdir raised FileNotFoundError when
# save_dir itself did not exist) and avoids the check-then-create race
# of the original `if not os.path.exists(...)` guard.
os.makedirs(args.save_dir + '/' + args.dataset, exist_ok=True)
|
| |
|
| |
|
''' Extract interaction sequence '''
# Build reviewerID -> chronologically ordered list of item ASINs,
# keeping only users with at least 5 interactions.
inters = {}
with open(args.review_data_path, 'r', encoding='utf-8') as file:
    for line in file:
        record = json.loads(line)
        # setdefault merges the two branches of the original if/else.
        events = inters.setdefault(record['reviewerID'], [])
        events.append({'time': record['unixReviewTime'], 'item': record['asin']})

final_inters = {}
for user, events in inters.items():
    if len(events) <= 4:
        # Same threshold as the original `len(value) > 4` filter.
        continue
    ordered = sorted(events, key=lambda e: e['time'])
    final_inters[user] = [e['item'] for e in ordered]

out_path = args.save_dir + '/' + args.dataset + '/' + args.dataset + '.inters.json'
with open(out_path, 'w', encoding='utf-8') as f:
    json.dump(final_inters, f, ensure_ascii=False, indent=4)
|
| |
|
''' Extract user review '''
# Build reviewerID -> {asin: reviewText} for every non-empty review.
# BUG FIX: the original assigned a fresh single-entry dict on every
# line (reviews[rid] = {asin: text}), so each reviewer retained only
# the LAST review encountered in the file; setdefault merges all of a
# reviewer's reviews into one mapping instead.
reviews = {}
with open(args.review_data_path, 'r', encoding='utf-8') as file:
    for line in file:
        element = json.loads(line)
        text = element.get('reviewText', '')
        if text:  # skip records with a missing or empty review body
            reviews.setdefault(element['reviewerID'], {})[element['asin']] = text

with open(args.save_dir + '/' + args.dataset + '/' + args.dataset + '.reviews.json', 'w', encoding='utf-8') as f:
    json.dump(reviews, f, ensure_ascii=False, indent=4)
|
| |
|
''' Extract item features '''
# Build asin -> selected metadata fields, keeping only items for which
# every required field is present and non-empty.
features = {}
# Hoisted loop invariant: the fields an item must carry to be kept.
REQUIRED_FIELDS = ('title', 'description', 'imageURL', 'imageURLHighRes')
with open(args.meta_data_path, 'r', encoding='utf-8') as file:
    for line in file:
        item = json.loads(line)
        # Guard clause replaces the original and-chain + else/continue;
        # the per-field test is the same `len(get(field, '')) > 0`.
        if not all(len(item.get(field, '')) > 0 for field in REQUIRED_FIELDS):
            continue
        features[item['asin']] = {
            'title': item['title'],
            'description': item['description'],
            'image': item['imageURL'],
            'imageH': item['imageURLHighRes'],
        }

with open(args.save_dir + '/' + args.dataset + '/' + args.dataset + '.features.json', 'w', encoding='utf-8') as f:
    json.dump(features, f, ensure_ascii=False, indent=4)