| import csv | |
# Raise the per-field size cap so huge code files in the CSVs can be read.
# NOTE: the value is 10 GiB (10 * 1024**3), not "10MB" as the old comment
# claimed; kept as-is because the data contains multi-MB fields.
csv.field_size_limit(10 * 1024 * 1024 * 1024)  # 10 GiB

# Collect every id (column 0) appearing in res1.csv.
with open('res1.csv', 'r', encoding='utf-8') as f:
    id_set = {row[0] for row in csv.reader(f)}
print(len(id_set))

# Remove the ids that also appear in res2.csv, leaving ids unique to res1.
# `discard` is a no-op for absent keys, replacing the `in`-check + `remove`.
with open('res2.csv', 'r', encoding='utf-8') as f:
    for row in csv.reader(f):
        id_set.discard(row[0])
print(len(id_set))

# Cap on how many rows load_dataset() may yield.
length_max = len(id_set)
def load_dataset():
    """Yield ``(formatted_prompt, row_id)`` pairs from dataset_all.csv.

    Only rows whose id (column 0) is in the module-level ``id_set`` are
    emitted, the header row is skipped, code files shorter than 2,000 or
    longer than 100,000 characters are filtered out, and at most
    ``length_max`` rows are produced.
    """
    # Read the prompt template inside a `with` block — the original
    # `open(...).read()` leaked the file handle.
    with open('score_prompt.txt', 'r', encoding='utf-8') as pf:
        prompt = pf.read()
    with open('/home/weifengsun/tangou1/domain_code/src/datasets/data_merged/dataset_all.csv',
              'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Header: ['', 'text', 'repo_name', 'path', 'language', 'license',
        #          'size', 'keyword', 'text_hash', 'config', 'split',
        #          'repo_path', 'ds_source']
        next(reader, None)  # skip header row
        amount = 0
        for row in reader:
            # Was `amount > length_max`, which yielded length_max + 1 items;
            # `>=` enforces the cap exactly.
            if amount >= length_max:
                return
            if not 2000 <= len(row[1]) <= 100000:
                continue
            if row[0] in id_set:
                amount += 1
                yield prompt.format(CODE_FILE=add_line_numbers(row[1])), row[0]
def add_line_numbers(text: str) -> str:
    """Return *text* with each line prefixed by its 1-based line number.

    Line endings are preserved (``splitlines(keepends=True)``), so the
    output keeps the original newline structure; an empty string yields
    an empty string. The unused local ``start`` was removed.
    """
    return "".join(
        f"{num} {line}"
        for num, line in enumerate(text.splitlines(keepends=True), start=1)
    )
| if __name__ == "__main__": | |
| # with open('/home/weifengsun/tangou1/domain_code/src/datasets/data_merged/dataset_all.csv', 'r', encoding='utf-8') as f: | |
| # reader = csv.reader(f) | |
| # # ['', 'text', 'repo_name', 'path', 'language', 'license', 'size', 'keyword', 'text_hash', 'config', 'split', 'repo_path', 'ds_source'] | |
| # for i, row in enumerate(reader): | |
| # if i > 100: | |
| # break | |
| # print(row[0]) | |
| # amount = 0 | |
| # for i in load_dataset(): | |
| # amount += 1 | |
| # print(amount) | |
| # print(len(id_set)) | |
| pass | |