Spaces:
Running
Running
| #!/usr/bin/python3 | |
| # -*- coding: utf-8 -*- | |
| import argparse | |
| import json | |
| import os | |
| from pathlib import Path | |
| import re | |
| import sys | |
| import time | |
# Absolute path of the directory that contains this script.
pwd = os.path.abspath(os.path.dirname(__file__))
# Add the directory two levels above this script to the import search path
# before the project import below.
# NOTE(review): presumably that directory is the project root holding
# project_settings.py - confirm against the repository layout.
sys.path.append(os.path.join(pwd, "../../"))
from project_settings import environment, project_path
def get_args():
    """Parse the command-line options of this script.

    Options:
        --raw_dataset: path that ``main`` scans for per-sample entries.
        --dataset: path of the JSON Lines file that ``main`` writes.

    Both options are strings and default to locations below ``project_path``.

    Returns:
        The ``argparse.Namespace`` produced by parsing ``sys.argv``.
    """
    default_raw_dataset = project_path / "data/raw_dataset/finished/agent-lingoace-zh-375-choice-v2"
    default_dataset = project_path / "data/dataset/agent-lingoace-zh-375-choice-v2.jsonl"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--raw_dataset", type=str, default=default_raw_dataset.as_posix())
    arg_parser.add_argument("--dataset", type=str, default=default_dataset.as_posix())
    return arg_parser.parse_args()
# Patterns and role tables are built once at import time instead of once per
# sample inside the conversion loop.
_CONVERSATION_BLOCK = re.compile(r"\*Conversation starts\*(.*)\*Conversation ends\*", flags=re.DOTALL)
_CONVERSATION_TURN = re.compile(r'(client:|customer service:)([^\n]*)')
_EXAMPLES_BLOCK = re.compile(r"\*Conversation ends\*(.*)\*\*Output\*\*", flags=re.DOTALL)
_EXAMPLE_LINE = re.compile(r'(?m)^\[(用户|你)\]:\s*"([^"]*)"\s*$|^输出:\s*(\S+)\s*$|^解释:\s*(.+)\s*$')
_CHOICE = re.compile(r'If (.*?)output ([A-F])', flags=re.DOTALL)

# Speaker labels as they appear in the prompt text -> chat roles in the output.
_CONVERSATION_ROLES = {"customer service:": "assistant", "client:": "user"}
_EXAMPLE_ROLES = {"你": "assistant", "用户": "user"}


def _parse_conversation(user_prompt, idx):
    """Return the turns found between the conversation start/end markers.

    Raises:
        AssertionError: if the markers are not present in ``user_prompt``.
    """
    match = _CONVERSATION_BLOCK.search(user_prompt)
    if match is None:
        raise AssertionError(f"sample {idx}: conversation markers not found in user prompt")
    # The content is everything after the speaker label up to the end of the
    # line; it is stored unstripped.
    return [
        {"role": _CONVERSATION_ROLES[speaker], "content": content}
        for speaker, content in _CONVERSATION_TURN.findall(match.group(1))
    ]


def _parse_examples(user_prompt):
    """Return the few-shot examples found between the conversation end marker and ``**Output**``.

    A missing section yields an empty list.
    """
    match = _EXAMPLES_BLOCK.search(user_prompt)
    examples_text = match.group(0) if match is not None else ""

    examples = []
    turns = []
    outputs = {}
    for m in _EXAMPLE_LINE.finditer(examples_text):
        speaker, content, out, explanation = m.groups()
        if speaker:
            turns.append({"role": _EXAMPLE_ROLES[speaker], "content": content})
        elif out:
            outputs["output"] = out
        elif explanation:
            # An explanation line closes the current example. Turns or
            # outputs that are never followed by an explanation line are
            # not emitted.
            outputs["explanation"] = explanation
            examples.append({"conversation": turns, "outputs": outputs})
            turns = []
            outputs = {}
    return examples


def _parse_choices(user_prompt, idx):
    """Return the ``If ... output <letter>`` rules listed after the first ``**Output**`` marker.

    Raises:
        AssertionError: if ``user_prompt`` contains no ``**Output**`` marker.
    """
    splits = user_prompt.split("**Output**")
    if len(splits) < 2:
        raise AssertionError(f"sample {idx}: '**Output**' marker not found in user prompt")
    choice_text = splits[1].strip()
    # condition[:-2] drops the two characters right before "output"
    # (presumably ", " - TODO confirm against the raw prompts).
    return [
        {"condition": f"If {condition[:-2]}", "choice_letter": letter}
        for condition, letter in _CHOICE.findall(choice_text)
    ]


def _build_row(sample_dir):
    """Read one sample directory and return its dataset row as a dict."""
    idx = sample_dir.name
    system_prompt = (sample_dir / "system_prompt.txt").read_text(encoding="utf-8")
    user_prompt = (sample_dir / "user_prompt.txt").read_text(encoding="utf-8")
    response = (sample_dir / "response.txt").read_text(encoding="utf-8")
    return {
        "idx": idx,
        "system_prompt": system_prompt,
        "conversation": _parse_conversation(user_prompt, idx),
        "examples": _parse_examples(user_prompt),
        "choices": _parse_choices(user_prompt, idx),
        "response": response,
    }


def main():
    """Convert every sample directory under ``--raw_dataset`` into one JSON line of ``--dataset``.

    Each sample directory must contain ``system_prompt.txt``,
    ``user_prompt.txt`` and ``response.txt``. The output file (and its parent
    directories) is created if needed and overwritten otherwise.

    Raises:
        AssertionError: if a user prompt lacks the conversation markers or
            the ``**Output**`` marker.
        OSError: if one of the three sample files cannot be read.
    """
    args = get_args()
    raw_dataset = Path(args.raw_dataset)
    dataset = Path(args.dataset)
    dataset.parent.mkdir(parents=True, exist_ok=True)

    # Stray files in the raw dataset directory are skipped, since only
    # directories can hold the three sample files. Sorting makes the row
    # order independent of the filesystem's listing order.
    sample_dirs = sorted(path for path in raw_dataset.glob("*") if path.is_dir())

    with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
        for sample_dir in sample_dirs:
            row = json.dumps(_build_row(sample_dir), ensure_ascii=False)
            fout.write(f"{row}\n")
    return
# Run the conversion only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()