"""
Reference: https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/eval_gpt_review.py
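
Pairwise GPT-judge evaluation for visual-prompt QA: for each question, the
box/point annotation is painted onto the image, and the judge model scores a
reference answer against a model prediction, appending one JSONL record per
question to the output file.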
"""
import argparse
import json
import os
import time
import cv2
import openai
from paint_util import encode_image, paint_text_box, paint_text_point
from tqdm import tqdm
# Define Azure OpenAI details
model_name = "gpt-4o-2024-11-20"
# Initialize the Azure client
client = openai.AzureOpenAI(
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
api_key=os.getenv("AZURE_OPENAI_KEY"),
api_version="2024-03-01-preview",
)
def get_eval(content: list, max_tokens: int):
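    """Send the judging prompt to the model, retrying until a reply is returned."""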
while True:
try:
messages = [
{
"role": "system",
"content": "You are a helpful and precise assistant for checking the quality of the answer.",
},
{
"role": "user",
"content": content,
},
]
completion = client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=max_tokens,
temperature=0,
)
ret = completion.choices[0].message.content
break
except Exception as e:
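            # Transient API errors (rate limits, timeouts) are retried after a short pause.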
print(e)
time.sleep(1)
return ret
def parse_score(review):
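    """Extract the two scores from the first line of the judge's reply.

    The expected format is two numbers on the first line, e.g. "8 7" or "8, 7".
    Returns [-1, -1] when the reply cannot be parsed.
    """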
try:
        score_pair = review.split("\n")[0].replace(",", " ")
        # str.split() with no argument collapses repeated whitespace, so replies
        # formatted as "8, 7" or "8 7" both parse cleanly.
        sp = score_pair.split()
        return [float(sp[0]), float(sp[1])]
except Exception as e:
print(e)
print("error", review)
return [-1, -1]
def main(args):
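    """Run pairwise GPT judging for one MDVP phase and append results to a JSONL file."""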
    phase = args.phase  # e.g. android_QA_box
    domain = phase.split("_box")[0]  # e.g. android_QA
    if "natural" in phase:
        context_str = "The image is a natural image."
    elif "ocr" in phase:
        context_str = "The image contains text, and the user wishes to know the content of the text."
    elif "screen" in phase:
        context_str = "The image is a screenshot from a mobile phone or webpage."
    elif "panel" in phase:
        context_str = "The image is a multi-panel figure."
    elif "android" in phase:
        context_str = "The image is an Android screenshot."
    elif "web" in phase:
        context_str = "The image is a webpage screenshot."
    else:
        raise ValueError(f"Unknown phase: {phase}")
    args.question = f"mdvp_for_gpt4v_eval/{phase}/question.json"
    # Reference answers come first, model predictions second.
    args.answer_list = [
        f"mdvp_for_gpt4v_eval/{phase}/answer.json",
        f"mdvp_for_gpt4v_eval/{phase}/prediction.json",
    ]
    args.rule = "annotations/rule.json"
f_q = json.load(open(os.path.expanduser(args.question)))
f_ans1 = json.load(open(os.path.expanduser(args.answer_list[0])))
f_ans2 = json.load(open(os.path.expanduser(args.answer_list[1])))
rule_dict = json.load(open(os.path.expanduser(args.rule), "r"))
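    # rule.json is expected to map "box" and "point" to entries of the form
    # {"role": ..., "prompt": ...}, used to build the judging prompt below.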
os.makedirs("./result", exist_ok=True)
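    # Resume support: reviews already written to the output file are reloaded
    # so the corresponding questions can be skipped in the loop below.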
if os.path.isfile(os.path.expanduser(args.output)):
cur_reviews = [
json.loads(line) for line in open(os.path.expanduser(args.output))
]
else:
cur_reviews = []
    review_file = open(args.output, "a")
idx = 0
    for ques, ans1, ans2 in tqdm(zip(f_q, f_ans1, f_ans2), total=len(f_q)):
        # Paint the set-of-mark (SoM) visual prompt onto the image.
image_name = ques["image"]
image_path = f"data/{domain}/images/" + image_name
# print("loading image from {}".format(image_path))
image = cv2.imread(image_path)
height, width, channels = image.shape
(width, height)
if "bbox" in ques["annotation"]:
bbox = ques["annotation"]["bbox"]
paint_image_path = paint_text_box(image_path, bbox)
rule = rule_dict["box"]
elif "points" in ques["annotation"]:
points = ques["annotation"]["points"]
paint_image_path = paint_text_point(image_path, points)
rule = rule_dict["point"]
base64_image = encode_image(paint_image_path)
prompt = rule["prompt"]
role = rule["role"]
content_text = (
f"[Context]\{context_str}\n\n"
f'[Question]\n{ques["text"]}\n\n'
f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
f"[System]\n{prompt}\n\n"
)
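        # Multimodal user message: the rubric text plus the marked image at high detail.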
content = [
{
"type": "text",
"text": content_text,
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "high",
},
},
]
cur_js = {
"id": idx + 1,
"question_id": ques["question_id"],
"answer1_id": ans1.get("answer_id", ans1["question_id"]),
"answer2_id": ans2.get("answer_id", ans2["question_id"]),
"category": phase,
}
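        # Only query the judge for items beyond what the resumed output already covers.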
if idx >= len(cur_reviews):
review = get_eval(content, args.max_tokens)
scores = parse_score(review)
cur_js["content"] = review
cur_js["tuple"] = scores
cur_js["answer1"] = ans1["text"]
cur_js["answer2"] = ans2["text"]
review_file.write(json.dumps(cur_js) + "\n")
review_file.flush()
else:
print(f"Skipping {idx} as we already have it.")
idx += 1
review_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="ChatGPT-based QA evaluation.")
    parser.add_argument(
        "--phase", help="MDVP phase, e.g. android_QA_box", type=str, required=True
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="maximum number of tokens in the judge's reply (API range: [1, 4095])",
    )
    parser.add_argument(
        "--output",
        default="result/gpt_score.jsonl",
        help="path to the output JSONL file of judge reviews and scores",
    )
args = parser.parse_args()
main(args)
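
# Example invocation (assuming this file is saved as eval_gpt_review.py and the
# AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_KEY environment variables are set):
#   python eval_gpt_review.py --phase android_QA_box --output result/gpt_score.jsonl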