# dataset-builder / data3 / load_dataset.py
# Uploaded by SunDou via huggingface_hub (commit 96eab9a, verified)
import csv

# Raise the per-field size limit: the dataset stores whole source files in a
# single CSV cell, which blows past the default (~128 KB) limit.
# NOTE: 10 * 1024**3 is 10 GB, not 10 MB as the old comment claimed; on
# 32-bit builds csv.field_size_limit may reject values above sys.maxsize.
csv.field_size_limit(10 * 1024 * 1024 * 1024)

# Ids present in res1.csv: the candidate pool (column 0 of every row).
with open('res1.csv', 'r', encoding='utf-8') as f:
    id_set = {row[0] for row in csv.reader(f)}
print(len(id_set))

# Drop ids already processed in res2.csv; what remains still needs scoring.
# set.discard is a no-op for missing keys, so no membership pre-check needed.
with open('res2.csv', 'r', encoding='utf-8') as f:
    for row in csv.reader(f):
        id_set.discard(row[0])
print(len(id_set))

# Upper bound on how many rows load_dataset() will yield.
length_max = len(id_set)
def load_dataset():
    """Yield ``(formatted_prompt, row_id)`` pairs for rows still to be scored.

    Reads the merged dataset CSV, skips the header row, keeps only rows whose
    text length is within [2000, 100000] characters and whose id is in the
    module-level ``id_set``, and stops once ``length_max`` rows have been
    yielded.  The prompt template is loaded from ``score_prompt.txt`` and
    formatted with the line-numbered file text.
    """
    # Context manager closes the prompt file; the original
    # open(...).read() leaked the file handle.
    with open('score_prompt.txt', 'r', encoding='utf-8') as pf:
        prompt = pf.read()
    with open('/home/weifengsun/tangou1/domain_code/src/datasets/data_merged/dataset_all.csv', 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        amount = 0
        # Columns: ['', 'text', 'repo_name', 'path', 'language', 'license',
        #  'size', 'keyword', 'text_hash', 'config', 'split', 'repo_path',
        #  'ds_source']
        for i, row in enumerate(reader):
            if i == 0:
                # Skip the header row.
                continue
            # NOTE(review): '>' lets through length_max + 1 rows before
            # stopping; use '>=' if exactly length_max is intended — confirm.
            if amount > length_max:
                return
            # Filter out texts shorter than 2000 or longer than 100000 chars.
            if len(row[1]) > 100000 or len(row[1]) < 2000:
                continue
            if row[0] in id_set:
                amount += 1
                yield prompt.format(CODE_FILE=add_line_numbers(row[1])), row[0]
def add_line_numbers(text: str) -> str:
    """Prefix each line of *text* with its 1-based line number and a space.

    Line endings are preserved (``splitlines(keepends=True)``), so joining
    the numbered lines reproduces the original newline structure exactly.
    An empty input yields an empty string.
    """
    # enumerate(start=1) replaces the hand-rolled counter; the original also
    # carried an unused local ``start = 0`` which is dropped here.
    return "".join(
        f"{num} {line}"
        for num, line in enumerate(text.splitlines(keepends=True), start=1)
    )
if __name__ == "__main__":
    # Nothing to run directly; this module exists for its load_dataset()
    # generator.  (Old exploratory snippets removed.)
    pass