Spaces:
Sleeping
Sleeping
| import json | |
| import csv | |
# Load the raw question table.
# newline="" hands newline handling to the csv module, as its documentation
# requires, so quoted fields containing line breaks are parsed intact.
# The explicit encoding keeps decoding independent of the platform locale.
with open("question_data.csv", "r", newline="", encoding="utf-8") as f:
    # Read everything, then cut off the top two rows (labels, passage #s).
    questions = list(csv.reader(f))[2:]
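# CSV header row (the blank names are the remaining cells of each top-10 group):
# qid,prompt,question,a,b,c,d,answer,gold_passage,top10_colbert,,,,,,,,,,generation_colbert,top10_e5,,,,,,,,,,generation_e5,gold_passage_generation
#
# Column index -> field:
#   0      qid
#   1      prompt
#   2      question
#   3-6    a, b, c, d
#   7      answer
#   8      gold_passage
#   9-18   top10_colbert (10 columns)
#   19     generation_colbert
#   20-29  top10_e5 (10 columns)
#   30     generation_e5
#   31     gold_passage_generation
#
# See example.json for how these files will be ported.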
# Letter of the correct choice -> its position in the "answers" list.
# Built once here instead of on every loop iteration.
ANSWER_INDEX = {"A": 0, "B": 1, "C": 2, "D": 3}


def _row_to_question(row):
    """Convert one CSV data row into the question record written to JSON.

    Columns are read by 0-based position: 1 prompt, 2 question, 3-6 answer
    choices A-D, 7 correct answer letter, 8 gold passage, 9-18 ColBERT
    top-10 retrievals, 19 ColBERT generation, 20-29 E5 top-10 retrievals,
    30 E5 generation, 31 gold-passage generation.

    The row itself is not modified.

    Raises:
        IndexError: if the row has fewer than 32 columns.
        KeyError: if the answer letter (after stripping whitespace and
            upper-casing) is not one of A, B, C, D.
    """
    prompt, question = row[1], row[2]
    if prompt:
        # A non-empty prompt is prepended to the question text.
        question = prompt + " " + question

    # Tolerate stray whitespace / lowercase in the answer column.
    answer_letter = row[7].strip().upper()

    return {
        "question": question,
        "answers": row[3:7],  # columns 3-6 -> A, B, C, D
        "correct_answer_index": ANSWER_INDEX[answer_letter],  # "A" -> 0
        "top10_colbert": row[9:19],  # columns 9-18 -> 10 retrievals
        "generation_colbert": row[19],
        "top10_e5": row[20:30],  # columns 20-29 -> 10 retrievals
        "generation_e5": row[30],
        # Hard-coded to False by this script; does not reflect reality.
        "top10_contains_gold_passage": False,
        "gold_passage": row[8],
        "gold_passage_generation": row[31],
    }


# Stores all "id": q_data pairs, keyed by qid (column 0).
full_question_dict = {}
for entry in questions:
    if not entry:
        # csv.reader yields an empty list for a blank line; nothing to convert.
        continue
    full_question_dict[entry[0]] = _row_to_question(entry)
# Persist the complete qid -> question-record mapping as a single JSON document.
with open("question_data.json", "w") as out_file:
    out_file.write(json.dumps(full_question_dict))