File size: 4,457 Bytes
46861c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import argparse
import json
import os

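"""
Example records for the question.json, answer.json and prediction.json files
written by this script.

question: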
{
    "question_id": 1, 
    "image": "000000104486.jpg", 
    "category": "natural_box", 
    "text": "Please analyze the relationship between all marked regions in the image.", 
    "annotation": {
        "bbox": [[157.23, 341.07, 10.67, 2.08]], 
        "segmentation": []
    }
}
answer:
{
    "question_id": 1, 
    "image": "000000104486.jpg", 
    "category": "natural_box", 
    "text": "<Region 1>: This region includes an individual who is caught in a moment that seems to involve some sort of task or activity. The person is engaged with a luggage cart, which suggests they might be arriving or departing from a location that offers such amenities, possibly a hotel. The cart holds luggage indicating travel or transit. The man's expression and attire provide clues to his role or state at the moment, such as potentially being a guest handling his luggage. The other individual seen partially in the background creates a sense of movement or interaction, but their relationship to the man or the context is unclear.\n"
}
predictions:
{
    "question_id": 1, 
    "image": "000000104486.jpg", 
    "category": "natural_box", 
    "text": "<Region 1>: The marked region does not appear to have any direct relationship with other marked regions, as there are no other marks to compare or contrast with.\n"
}
"""


def _read_json(path):
    """Load and return the JSON document stored at *path* (read as UTF-8)."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _write_json(path, payload):
    """Write *payload* to *path* as indented UTF-8 JSON and report success."""
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 rather than with the platform default encoding.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=4, ensure_ascii=False)
    print(f"{path} saved successfully!")


def _format_results(results, mask_data):
    """Build the answer and prediction record lists from model results.

    Each entry of *results* is paired by position with the entry of
    *mask_data* at the same index. The ground-truth text (``item["gt"]``)
    goes into the answer record and the model output (``item["caption"]``)
    into the prediction record.

    Raises:
        AssertionError: if a result's ``gt`` differs from the ``caption`` of
            the mask entry at the same index.
        IndexError: if *results* has more entries than *mask_data*.
    """
    answers = []
    predictions = []
    for index, item in enumerate(results):
        meta = mask_data[index]
        # Raised explicitly (instead of a bare ``assert``) so the alignment
        # check still runs when Python is started with -O.
        if meta["caption"] != item["gt"]:
            raise AssertionError(
                f"entry {index}: ground truth does not match the caption "
                "in annotations/mdvp_caption_mask.json"
            )

        # Results may carry the image under either key; "image_path" wins.
        try:
            image_path = item["image_path"]
        except KeyError:
            image_path = item["file_name"]

        shared = {
            "question_id": index + 1,
            "image": image_path,
            "category": meta["dataset_name"],
        }
        answers.append({**shared, "text": item["gt"]})
        predictions.append({**shared, "text": item["caption"]})
    return answers, predictions


def main(args):
    """Convert model results into per-phase question/answer/prediction files.

    Reads the results at ``args.output_path`` and the annotations in
    ``annotations/mdvp_caption_mask.json``. For every phase it then writes
    ``answer.json``, ``prediction.json`` and ``question.json`` under
    ``mdvp_for_gpt4v_eval/<phase>/``. Questions are taken from
    ``data/<domain>/<domain>_box.json``, where ``<domain>`` is the phase name
    without its ``_box`` suffix. All paths except ``args.output_path`` are
    relative to the current working directory.

    Args:
        args: namespace with an ``output_path`` attribute naming the JSON
            file that holds the model results.

    Raises:
        AssertionError: if the results are not aligned with the mask
            annotations (see ``_format_results``).
    """
    vp = "bbox"  # visual-prompt key copied from each question into "annotation"

    # Both inputs are independent of the phase, so load and format them once.
    # NOTE(review): the same answers/predictions are therefore written to
    # every phase directory, exactly as before — confirm whether output_path
    # was meant to be per-phase.
    results = _read_json(args.output_path)
    mask_data = _read_json("annotations/mdvp_caption_mask.json")
    answers, predictions = _format_results(results, mask_data)

    for phase in (
        "android_detailed_caption_box",
        "multipanel_detailed_caption_box",
        "natural_detailed_caption_box",
        "ocr_doc_detailed_caption_box",
        "ocr_spotting_detailed_caption_box",
        "web_detailed_caption_box",
    ):
        domain = phase.split("_box")[0]  # e.g. android_detailed_caption
        out_dir = f"mdvp_for_gpt4v_eval/{phase}"

        # makedirs also creates the missing parent directory and tolerates
        # reruns, unlike a guarded os.mkdir.
        os.makedirs(out_dir, exist_ok=True)

        _write_json(f"{out_dir}/answer.json", answers)
        _write_json(f"{out_dir}/prediction.json", predictions)

        questions = [
            {
                "question_id": index + 1,
                "image": item["image_name"],
                "category": phase,
                "text": item["question"],
                "annotation": {vp: item[vp], "segmentation": []},
            }
            for index, item in enumerate(
                _read_json(f"data/{domain}/{domain}_box.json")
            )
        ]
        _write_json(f"{out_dir}/question.json", questions)


if __name__ == "__main__":
    # Command-line entry point: --output_path is mandatory and is passed on
    # to main() through the parsed namespace.
    cli = argparse.ArgumentParser(description="Process args.")
    cli.add_argument(
        "--output_path",
        required=True,
        type=str,
        help="Path to output results",
    )
    main(cli.parse_args())