"""Build scoring prompts for the dataset rows whose ids are still pending.

On import the module computes ``id_set``: every first-column value found in
``res1.csv`` that does not also occur in the first column of ``res2.csv``.
``load_dataset`` then streams the merged dataset CSV and yields one formatted
prompt per row whose id is in ``id_set``.
"""
import csv
from typing import Iterator, Tuple

# Raise the csv per-field size limit to 10 GiB (10 * 1024**3 bytes).
# NOTE(review): on platforms where a C long is 32 bits this value may raise
# OverflowError -- confirm before running anywhere other than 64-bit Linux.
csv.field_size_limit(10 * 1024 * 1024 * 1024)

PROMPT_PATH = 'score_prompt.txt'
# Alternative prompt template used previously: 'is_sci_prompt.txt'
DATASET_PATH = '/home/weifengsun/tangou1/domain_code/src/datasets/data_merged/dataset_all.csv'

# Only rows whose text length (in characters) lies within these inclusive
# bounds are turned into prompts.
MIN_TEXT_CHARS = 2000
MAX_TEXT_CHARS = 100000

# Earlier hard-coded limits, kept for reference:
# length_max = 300
# length_max = 28906320
# length_max = 8290

# Earlier id source, kept for reference:
# with open('res.csv', 'r', encoding='utf-8') as f:
#     reader = csv.reader(f)
#     id_set = set()
#     for i, row in enumerate(reader):
#         id_set.add(row[0])
# print(len(id_set))


def _iter_first_column(path: str) -> Iterator[str]:
    """Yield the first field of every non-empty row of the CSV file at *path*."""
    # NOTE(review): the csv documentation recommends opening files with
    # newline=''; the default is kept here so parsed text stays identical to
    # what earlier runs of this script saw.
    with open(path, 'r', encoding='utf-8') as f:
        for row in csv.reader(f):
            if row:  # csv.reader returns [] for blank lines; row[0] would fail
                yield row[0]


# Ids present in res1.csv ...
id_set = set(_iter_first_column('res1.csv'))
print(len(id_set))

# ... minus every id that also appears in res2.csv.
id_set.difference_update(_iter_first_column('res2.csv'))
print(len(id_set))

# Upper bound on the number of prompts load_dataset() will yield.
length_max = len(id_set)


def load_dataset(dataset_path: str = DATASET_PATH,
                 prompt_path: str = PROMPT_PATH) -> Iterator[Tuple[str, str]]:
    """Yield ``(prompt, row_id)`` for each dataset row whose id is in ``id_set``.

    The prompt template is read from *prompt_path* and filled in with
    ``str.format(CODE_FILE=...)``, where the value is the row's text with line
    numbers prepended (see ``add_line_numbers``).

    Rows are skipped when they have fewer than two fields, when the text length
    is outside ``[MIN_TEXT_CHARS, MAX_TEXT_CHARS]``, or when the id (first
    column) is not in ``id_set``.  The first row of the file is treated as a
    header and ignored.  Iteration stops as soon as ``length_max`` prompts have
    been produced, so the remainder of the file is not scanned.

    Args:
        dataset_path: CSV file to stream; defaults to the merged dataset.
        prompt_path: Text file holding the prompt template.

    Yields:
        Tuples of the formatted prompt string and the row's first-column value.
    """
    # Read the template with a context manager so the handle is closed.
    with open(prompt_path, 'r', encoding='utf-8') as f:
        prompt = f.read()

    with open(dataset_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Header columns:
        # ['', 'text', 'repo_name', 'path', 'language', 'license', 'size',
        #  'keyword', 'text_hash', 'config', 'split', 'repo_path', 'ds_source']
        next(reader, None)

        amount = 0
        for row in reader:
            # ">=" rather than ">": stop once exactly length_max prompts have
            # been yielded instead of allowing one extra / scanning to EOF.
            if amount >= length_max:
                return
            if len(row) < 2:
                continue  # blank or truncated row: no text column to read
            text = row[1]
            if not MIN_TEXT_CHARS <= len(text) <= MAX_TEXT_CHARS:
                continue
            if row[0] in id_set:
                amount += 1
                yield prompt.format(CODE_FILE=add_line_numbers(text)), row[0]
                # yield prompt.format(CODE_FILE=row[1][:20000]), row[0]


def add_line_numbers(text: str) -> str:
    """Return *text* with ``"<n> "`` prepended to every line, counting from 1.

    Line endings are preserved exactly as they appear in *text*; an empty
    string yields an empty string.
    """
    return "".join(
        f"{line_num} " + line
        for line_num, line in enumerate(text.splitlines(keepends=True), start=1)
    )


if __name__ == "__main__":
    # with open('/home/weifengsun/tangou1/domain_code/src/datasets/data_merged/dataset_all.csv', 'r', encoding='utf-8') as f:
    #     reader = csv.reader(f)
    #     # ['', 'text', 'repo_name', 'path', 'language', 'license', 'size', 'keyword', 'text_hash', 'config', 'split', 'repo_path', 'ds_source']
    #     for i, row in enumerate(reader):
    #         if i > 100:
    #             break
    #         print(row[0])
    # amount = 0
    # for i in load_dataset():
    #     amount += 1
    # print(amount)
    # print(len(id_set))
    pass