# dataset-builder / data3/instruct_generation.py
# SunDou's picture
# Upload data3/instruct_generation.py with huggingface_hub
# f2670ef verified
import csv
import json
# Raise the csv module's per-field size cap so very large cells can be read.
# NOTE(review): on platforms where a C long is 32 bits this value may raise
# OverflowError -- confirm the target platform.
csv.field_size_limit(10 * 1024 * 1024 * 1024) # 10 GiB (10 * 1024**3 bytes), not 10 MB
# Shared module-level state. load_score() stores the parsed JSON list of each
# qualifying row under that row's first column; load_code_file() then replaces
# a matching entry with {'code_file': <csv row>, 'score_json': <that list>}.
score_dict = {}
_JSON_DECODER = json.JSONDecoder()


def _extract_json_list(text):
    """Return the JSON array that starts at the first '[' in *text*, else None."""
    start = text.find('[')
    if start == -1:
        return None
    try:
        # raw_decode parses one complete JSON value and ignores whatever
        # follows it, so nested lists and trailing prose are both handled.
        # Cutting the text at the first ']' would truncate nested lists.
        value, _ = _JSON_DECODER.raw_decode(text[start:])
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; unparsable text is skipped.
        return None
    return value if isinstance(value, list) else None


def _is_relevant(item):
    """Tell whether *item* is a dict whose two score fields are ints above 1."""
    if not isinstance(item, dict):
        return False
    try:
        return all(
            item.get(key) is not None and int(item[key]) > 1
            for key in ('relevance_score', 'function_start_line')
        )
    except (TypeError, ValueError, OverflowError):
        # A value that cannot be converted to int disqualifies the item.
        return False


def load_score(path='res2.csv'):
    """Fill ``score_dict`` from the score CSV and print the match count.

    For every row, column 1 is searched for an embedded JSON array. The row
    is stored as ``score_dict[row[0]] = <parsed array>`` when at least one
    element of that array is a dict whose ``relevance_score`` and
    ``function_start_line`` are both non-null and convert to an int greater
    than 1. Rows with fewer than two columns, without a parsable array, or
    without a qualifying element are skipped silently.

    Args:
        path: CSV file to read. Defaults to ``'res2.csv'``, resolved against
            the current working directory.

    Side effects:
        Mutates the module-level ``score_dict`` and prints the number of
        qualifying elements seen. This is an element count, not a row count:
        a row with two qualifying elements adds 2.
    """
    amount = 0
    # newline='' lets the csv module handle line endings inside quoted
    # fields itself; utf-8 matches how load_code_file opens its input.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if len(row) < 2:
                continue
            items = _extract_json_list(row[1])
            if items is None:
                continue
            hits = sum(1 for item in items if _is_relevant(item))
            if hits:
                amount += hits
                score_dict[row[0]] = items
    print(amount)
def load_code_file(
    path='/home/weifengsun/tangou1/domain_code/src/datasets/data_merged/dataset_all.csv',
):
    """Attach the first matching dataset row to its ``score_dict`` entry.

    Scans the CSV at *path* for the first row whose column 0 is a key of the
    module-level ``score_dict``. That entry is replaced with
    ``{'code_file': <row>, 'score_json': <previous value>}``, the new entry
    is printed, and scanning stops.

    Args:
        path: CSV file to read. Defaults to the merged dataset's absolute
            path.

    Side effects:
        Mutates ``score_dict`` (at most one entry) and prints that entry.

    NOTE(review): stopping after the first match looks like a debugging aid;
    confirm before relying on every entry being enriched.
    """
    # newline='' keeps '\r\n' inside quoted fields intact, as the csv module
    # documentation asks for when reading from a file object.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            # csv.reader yields [] for a blank line; indexing it would raise.
            if not row or row[0] not in score_dict:
                continue
            key = row[0]
            score_dict[key] = {'code_file': row, 'score_json': score_dict[key]}
            print(score_dict[key])
            break
if __name__ == '__main__':
    # Run the two stages in order: load_score() runs first, then
    # load_code_file(), which looks keys up in score_dict.
    for stage in (load_score, load_code_file):
        stage()
    # print(len(score_dict))