Spaces:

qgyd2021
/

llm_eval_system

Running

App Files Files Community

llm_eval_system / examples /make_dataset /make_choice_lingoace_v2.py

HoneyTian

update

a984ba9 6 days ago

raw

history blame contribute delete

5.04 kB

	#!/usr/bin/python3
	# -- coding: utf-8 --
	import argparse
	import json
	import os
	from pathlib import Path
	import re
	import sys
	import time

	# Extend the import path with the directory two levels above this script so
	# that the project-local import that follows can be resolved when the file
	# is executed directly.
	pwd = os.path.dirname(os.path.abspath(__file__))
	sys.path.append(os.path.join(pwd, "../../"))

	from project_settings import environment, project_path


	def get_args():
	    """Parse the command-line options of this script.

	    Options:
	        --raw_dataset: directory that holds one sub-directory per raw sample.
	        --dataset: path of the JSON-lines file to write.

	    Both options default to locations below ``project_path``.

	    Returns:
	        The ``argparse.Namespace`` produced by ``parse_args()``.
	    """
	    default_raw_dataset = project_path / "data/raw_dataset/finished/agent-lingoace-zh-375-choice-v2"
	    default_dataset = project_path / "data/dataset/agent-lingoace-zh-375-choice-v2.jsonl"

	    arg_parser = argparse.ArgumentParser()
	    arg_parser.add_argument("--raw_dataset", type=str, default=default_raw_dataset.as_posix())
	    arg_parser.add_argument("--dataset", type=str, default=default_dataset.as_posix())
	    return arg_parser.parse_args()


	# NOTE(review): the section markers are assumed to be "Conversation starts",
	# "Conversation ends" and "Output", optionally wrapped in Markdown bold
	# asterisks; ``\**`` tolerates both spellings. Confirm against a real
	# user_prompt.txt.
	_CONVERSATION_RE = re.compile(
	    r"\**Conversation starts\**(.*)\**Conversation ends\**", flags=re.DOTALL
	)
	_EXAMPLES_RE = re.compile(r"\**Conversation ends\**(.*)\**Output\**", flags=re.DOTALL)

	# The alternation bars must be unescaped: the role tables below are keyed by
	# each speaker label individually.
	_SPEAKER_LINE_RE = re.compile(r"(client:|customer service:)([^\n]*)")

	# One alternative per kind of example line: a quoted utterance, an output
	# line, or an explanation line. Exactly one of the groups is set per match.
	_EXAMPLE_LINE_RE = re.compile(
	    r'(?m)^\[(用户|你)\]:\s*"([^"]*)"\s*$|^输出:\s*(\S+)\s*$|^解释:\s*(.+)\s*$'
	)

	_CHOICE_RE = re.compile(r"If (.*?)output ([A-F])", flags=re.DOTALL)

	_PROMPT_ROLES = {"client:": "user", "customer service:": "assistant"}
	_EXAMPLE_ROLES = {"用户": "user", "你": "assistant"}


	def _parse_conversation(user_prompt):
	    """Return the chat turns found between the conversation markers.

	    Raises:
	        AssertionError: if the conversation markers are not found.
	    """
	    match = _CONVERSATION_RE.search(user_prompt)
	    if match is None:
	        raise AssertionError("conversation markers not found in user prompt")
	    return [
	        {"role": _PROMPT_ROLES[speaker], "content": content}
	        for speaker, content in _SPEAKER_LINE_RE.findall(match.group(1))
	    ]


	def _parse_examples(user_prompt):
	    """Return the few-shot examples found between the conversation and "Output".

	    Returns an empty list when that section is absent.
	    """
	    match = _EXAMPLES_RE.search(user_prompt)
	    examples_text = match.group(0) if match is not None else ""

	    examples = []
	    turns = []
	    outputs = {}
	    for line_match in _EXAMPLE_LINE_RE.finditer(examples_text):
	        speaker, content, out, explanation = line_match.groups()
	        if speaker:
	            turns.append({"role": _EXAMPLE_ROLES[speaker], "content": content})
	        elif out:
	            outputs["output"] = out
	        elif explanation:
	            # The explanation line closes the current example; start a new one.
	            outputs["explanation"] = explanation
	            examples.append({"conversation": turns, "outputs": outputs})
	            turns = []
	            outputs = {}
	    return examples


	def _parse_choices(user_prompt):
	    """Return the "If <condition> output <letter>" rules that follow "Output".

	    Raises:
	        AssertionError: if the prompt contains no "Output" marker.
	    """
	    sections = user_prompt.split("Output")
	    if len(sections) < 2:
	        raise AssertionError('"Output" section not found in user prompt')
	    choice_text = sections[1].strip()
	    return [
	        # The last two characters of the captured condition are dropped.
	        {"condition": f"If {condition[:-2]}", "choice_letter": letter}
	        for condition, letter in _CHOICE_RE.findall(choice_text)
	    ]


	def main():
	    """Convert the raw sample directories into one JSON-lines dataset file.

	    Every sub-directory of ``--raw_dataset`` must contain ``system_prompt.txt``,
	    ``user_prompt.txt`` and ``response.txt``. One JSON object is written per
	    sample with the keys ``idx``, ``system_prompt``, ``conversation``,
	    ``examples``, ``choices`` and ``response``.

	    Raises:
	        AssertionError: if a user prompt lacks the expected section markers.
	    """
	    args = get_args()

	    raw_dataset = Path(args.raw_dataset)
	    dataset = Path(args.dataset)
	    dataset.parent.mkdir(parents=True, exist_ok=True)

	    with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
	        # Sorted so the output order does not depend on directory listing order.
	        for sample_dir in sorted(raw_dataset.glob("*")):
	            if not sample_dir.is_dir():
	                # Stray files next to the sample directories are not samples.
	                continue

	            system_prompt = (sample_dir / "system_prompt.txt").read_text(encoding="utf-8")
	            user_prompt = (sample_dir / "user_prompt.txt").read_text(encoding="utf-8")
	            response = (sample_dir / "response.txt").read_text(encoding="utf-8")

	            row = {
	                "idx": sample_dir.name,
	                "system_prompt": system_prompt,
	                "conversation": _parse_conversation(user_prompt),
	                "examples": _parse_examples(user_prompt),
	                "choices": _parse_choices(user_prompt),
	                "response": response,
	            }
	            fout.write(f"{json.dumps(row, ensure_ascii=False)}\n")
	            fout.flush()


	# Entry point: run the conversion only when this file is executed as a script.
	if __name__ == "__main__":
	main()