|
|
import json |
|
|
# Flatten a nested QA dataset into per-answer records, then emit one
# instruction-style prompt per record to ``prompt_chunk_predict.json``.

# Location of the input dataset (JSON).
file_path = '/root/dev-zen-v1.0.json'

# Not referenced in this section; kept because code outside this section
# may rely on it.
SEP_TOKEN = "<sep>"


def _iter_qa_records(dataset):
    """Yield one flat record per usable answer found in *dataset*.

    The dataset is read as ``dataset['data']`` -> entries holding ``title``
    and ``paragraphs`` -> paragraphs holding ``context`` and ``qas`` -> QA
    pairs holding optional ``question`` and ``answers`` -> answers holding
    an optional ``text``.

    A record is produced only when the question is present (not ``None``)
    and the answer text is non-empty. QA pairs whose ``answers`` entry is
    missing or ``None`` are skipped instead of raising ``TypeError``.

    Yields:
        dict with the keys ``title``, ``context``, ``question``, ``answer``.
    """
    for entry in dataset['data']:
        title = entry['title']
        for paragraph in entry['paragraphs']:
            context = paragraph['context']
            for qa_pair in paragraph['qas']:
                question = qa_pair.get('question')
                if question is None:
                    continue
                # ``or []`` covers both a missing key and an explicit null.
                for answer in qa_pair.get('answers') or []:
                    answer_text = answer.get('text')
                    if answer_text:
                        yield {
                            'title': title,
                            'context': context,
                            'question': question,
                            'answer': answer_text,
                        }


# Explicit UTF-8 so decoding does not depend on the platform's locale.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# One record per (question, non-empty answer) pair.
data_loader = list(_iter_qa_records(data))

# Instruction text placed in every prompt. The identifier's spelling
# ("templatae") is kept as-is so any external reference keeps working.
prompt_templatae = """根据下面input的上下文,生成和上下文有关的问答对,并输出到output中。"""

# NOTE(review): this produces one prompt per record, so a context that has
# several answers appears several times in the output -- confirm whether
# de-duplication by context is wanted.
prompt_chunk = [
    {"instruction": prompt_templatae, "input": record['context'], "output": ''}
    for record in data_loader
]

# ensure_ascii=False writes non-ASCII characters verbatim, so the file must
# be opened with an encoding that can represent them.
with open('prompt_chunk_predict.json', 'w', encoding='utf-8') as f:
    json.dump(prompt_chunk, f, ensure_ascii=False, indent=4)
|
|
|