# File: llm_eval_system/examples/make_dataset/make_choice_lingoace_v2.py
# Repository page header captured with the source (not part of the script):
# HoneyTian, commit "update" (a984ba9)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
from pathlib import Path
import re
import sys
import time
pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../../"))
from project_settings import environment, project_path
def get_args():
    """Parse the command-line options of this script.

    Options:
        --raw_dataset: directory that holds one sub-directory per sample.
        --dataset: path of the JSON Lines file to write.

    Both defaults are resolved relative to ``project_path``.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    default_raw_dataset = project_path / "data/raw_dataset/finished/agent-lingoace-zh-375-choice-v2"
    default_dataset = project_path / "data/dataset/agent-lingoace-zh-375-choice-v2.jsonl"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--raw_dataset", type=str, default=default_raw_dataset.as_posix())
    arg_parser.add_argument("--dataset", type=str, default=default_dataset.as_posix())
    return arg_parser.parse_args()
# Patterns describing the layout of ``user_prompt.txt``. They are compiled
# once here because every sample directory is parsed with the same ones.
_CONVERSATION_RE = re.compile(r"\*Conversation starts\*(.*)\*Conversation ends\*", flags=re.DOTALL)
_TURN_RE = re.compile(r'(client:|customer service:)([^\n]*)')
_EXAMPLES_RE = re.compile(r"\*Conversation ends\*(.*)\*\*Output\*\*", flags=re.DOTALL)
_EXAMPLE_LINE_RE = re.compile(r'(?m)^\[(用户|你)\]:\s*"([^"]*)"\s*$|^输出:\s*(\S+)\s*$|^解释:\s*(.+)\s*$')
_CHOICE_RE = re.compile(r'If (.*?)output ([A-F])', flags=re.DOTALL)
_OUTPUT_MARKER = "**Output**"

# Speaker labels found in the prompt -> chat roles written to the dataset.
_TURN_ROLES = {"client:": "user", "customer service:": "assistant"}
_EXAMPLE_ROLES = {"用户": "user", "你": "assistant"}

# Separator characters that may sit between a condition and "output X".
_CONDITION_SEPARATORS = ",，;；"


def _read_text(path):
    """Return the whole content of *path*, decoded as UTF-8."""
    return path.read_text(encoding="utf-8")


def _parse_conversation(user_prompt, source):
    """Extract the dialogue between the conversation start/end markers.

    Args:
        user_prompt: full text of a ``user_prompt.txt`` file.
        source: sample directory, used only in the error message.

    Returns:
        A list of ``{"role", "content"}`` dicts, one per ``client:`` /
        ``customer service:`` occurrence. ``content`` is the rest of that
        line exactly as written (it is not stripped).

    Raises:
        AssertionError: if the start/end markers are not present.
    """
    match = _CONVERSATION_RE.search(user_prompt)
    if match is None:
        raise AssertionError(f"conversation markers not found in sample {source}")
    return [
        {"role": _TURN_ROLES[speaker], "content": content}
        for speaker, content in _TURN_RE.findall(match.group(1))
    ]


def _parse_examples(user_prompt):
    """Extract the worked examples located before the ``**Output**`` marker.

    Lines are scanned in order. Speaker lines accumulate into the current
    example, an output line and an explanation line fill its ``outputs``,
    and the example is emitted when its explanation line is reached.
    Anything accumulated after the last explanation line is not emitted.

    Returns:
        A list of ``{"conversation", "outputs"}`` dicts; empty when the
        examples section is absent.
    """
    match = _EXAMPLES_RE.search(user_prompt)
    if match is None:
        return []

    examples = []
    turns = []
    outputs = {}
    # group(0) still contains both markers; neither of them matches any of
    # the line patterns, so they are simply skipped by finditer.
    for line_match in _EXAMPLE_LINE_RE.finditer(match.group(0)):
        speaker, content, out, explanation = line_match.groups()
        if speaker:
            turns.append({"role": _EXAMPLE_ROLES[speaker], "content": content})
        elif out:
            outputs["output"] = out
        elif explanation:
            outputs["explanation"] = explanation
            examples.append({"conversation": turns, "outputs": outputs})
            # Start collecting the next example from a clean slate.
            turns = []
            outputs = {}
    return examples


def _parse_choices(user_prompt, source):
    """Extract the ``If <condition> output <letter>`` rules.

    Only the text between the first and second ``**Output**`` marker (or the
    end of the prompt) is inspected.

    Args:
        user_prompt: full text of a ``user_prompt.txt`` file.
        source: sample directory, used only in the error message.

    Returns:
        A list of ``{"condition", "choice_letter"}`` dicts.

    Raises:
        AssertionError: if the ``**Output**`` marker is not present.
    """
    sections = user_prompt.split(_OUTPUT_MARKER)
    if len(sections) < 2:
        raise AssertionError(f"'{_OUTPUT_MARKER}' marker not found in sample {source}")

    choices = []
    for condition, letter in _CHOICE_RE.findall(sections[1].strip()):
        # Remove only the separator in front of "output X". For the usual
        # ", " this equals dropping two characters, but it never cuts into
        # the condition itself when the separator looks different.
        condition = condition.rstrip().rstrip(_CONDITION_SEPARATORS).rstrip()
        choices.append({
            "condition": f"If {condition}",
            "choice_letter": letter,
        })
    return choices


def _build_row(sample_dir):
    """Read one sample directory and return its dataset row as a dict."""
    system_prompt = _read_text(sample_dir / "system_prompt.txt")
    user_prompt = _read_text(sample_dir / "user_prompt.txt")
    response = _read_text(sample_dir / "response.txt")
    return {
        "idx": sample_dir.name,
        "system_prompt": system_prompt,
        "conversation": _parse_conversation(user_prompt, sample_dir),
        "examples": _parse_examples(user_prompt),
        "choices": _parse_choices(user_prompt, sample_dir),
        "response": response,
    }


def main():
    """Convert the raw sample directories into a JSON Lines dataset.

    Every sub-directory of ``--raw_dataset`` is expected to contain
    ``system_prompt.txt``, ``user_prompt.txt`` and ``response.txt``. One JSON
    object per sample is written to ``--dataset``; the parent directory of
    that file is created when missing.

    Raises:
        AssertionError: if a user prompt lacks the conversation markers or
            the ``**Output**`` marker.
        OSError: if one of the three text files of a sample cannot be read.
    """
    args = get_args()

    raw_dataset = Path(args.raw_dataset)
    dataset = Path(args.dataset)
    dataset.parent.mkdir(parents=True, exist_ok=True)

    # glob("*") makes no ordering promise and also matches plain files, so
    # sort for a reproducible row order and keep directories only.
    sample_dirs = sorted(entry for entry in raw_dataset.glob("*") if entry.is_dir())

    with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
        for sample_dir in sample_dirs:
            row = json.dumps(_build_row(sample_dir), ensure_ascii=False)
            fout.write(f"{row}\n")
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()