"""
Reference: https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/eval_gpt_review.py
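
Pairwise GPT-judge evaluation for visual-prompt QA: for each question, the
box/point annotation is painted onto the image, and the judge model scores a
reference answer against a model prediction, appending one JSONL record per
question to the output file.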
"""
import argparse
import json
import os
import time
import cv2
import openai
from paint_util import encode_image, paint_text_box, paint_text_point
from tqdm import tqdm
# Define Azure OpenAI details
model_name = "gpt-4o-2024-11-20"
# Initialize the Azure client
client = openai.AzureOpenAI(
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
api_key=os.getenv("AZURE_OPENAI_KEY"),
api_version="2024-03-01-preview",
)
def get_eval(content: list, max_tokens: int):
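    """Send the judging prompt to the model, retrying until a reply is returned."""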
while True:
try:
messages = [
{
"role": "system",
"content": "You are a helpful and precise assistant for checking the quality of the answer.",
},
{
"role": "user",
"content": content,
},
]
completion = client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=max_tokens,
temperature=0,
)
ret = completion.choices[0].message.content
break
except Exception as e:
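            # Transient API errors (rate limits, timeouts) are retried after a short pause.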
print(e)
time.sleep(1)
return ret
def parse_score(review):
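    """Extract the two scores from the first line of the judge's reply.

    The expected format is two numbers on the first line, e.g. "8 7" or "8, 7".
    Returns [-1, -1] when the reply cannot be parsed.
    """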
try:
        score_pair = review.split("\n")[0].replace(",", " ")
        # str.split() with no argument collapses repeated whitespace, so replies
        # formatted as "8, 7" or "8 7" both parse cleanly.
        sp = score_pair.split()
        return [float(sp[0]), float(sp[1])]
except Exception as e:
print(e)
print("error", review)
return [-1, -1]
def main(args):
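    """Run pairwise GPT judging for one MDVP phase and append results to a JSONL file."""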
    phase = args.phase  # e.g. android_QA_box
    domain = phase.split("_box")[0]  # e.g. android_QA
    if "natural" in phase:
        context_str = "The image is a natural image."
    elif "ocr" in phase:
        context_str = "The image contains text, and the user wishes to know the content of the text."
    elif "screen" in phase:
        context_str = "The image is a screenshot from a mobile phone or webpage."
    elif "panel" in phase:
        context_str = "The image is a multi-panel figure."
    elif "android" in phase:
        context_str = "The image is an Android screenshot."
    elif "web" in phase:
        context_str = "The image is a webpage screenshot."
    else:
        raise ValueError(f"Unknown phase: {phase}")
    args.question = f"mdvp_for_gpt4v_eval/{phase}/question.json"
    # Reference answers come first, model predictions second.
    args.answer_list = [
        f"mdvp_for_gpt4v_eval/{phase}/answer.json",
        f"mdvp_for_gpt4v_eval/{phase}/prediction.json",
    ]
    args.rule = "annotations/rule.json"
f_q = json.load(open(os.path.expanduser(args.question)))
f_ans1 = json.load(open(os.path.expanduser(args.answer_list[0])))
f_ans2 = json.load(open(os.path.expanduser(args.answer_list[1])))
rule_dict = json.load(open(os.path.expanduser(args.rule), "r"))
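    # rule.json is expected to map "box" and "point" to entries of the form
    # {"role": ..., "prompt": ...}, used to build the judging prompt below.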
os.makedirs("./result", exist_ok=True)
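    # Resume support: reviews already written to the output file are reloaded
    # so the corresponding questions can be skipped in the loop below.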
if os.path.isfile(os.path.expanduser(args.output)):
cur_reviews = [
json.loads(line) for line in open(os.path.expanduser(args.output))
]
else:
cur_reviews = []
    review_file = open(args.output, "a")
idx = 0
    for ques, ans1, ans2 in tqdm(zip(f_q, f_ans1, f_ans2), total=len(f_q)):
        # Paint the set-of-mark (SoM) visual prompt onto the image.
image_name = ques["image"]
image_path = f"data/{domain}/images/" + image_name
# print("loading image from {}".format(image_path))
image = cv2.imread(image_path)
height, width, channels = image.shape
(width, height)
if "bbox" in ques["annotation"]:
bbox = ques["annotation"]["bbox"]
paint_image_path = paint_text_box(image_path, bbox)
rule = rule_dict["box"]
elif "points" in ques["annotation"]:
points = ques["annotation"]["points"]
paint_image_path = paint_text_point(image_path, points)
rule = rule_dict["point"]
base64_image = encode_image(paint_image_path)
prompt = rule["prompt"]
role = rule["role"]
content_text = (
f"[Context]\{context_str}\n\n"
f'[Question]\n{ques["text"]}\n\n'
f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
f"[System]\n{prompt}\n\n"
)
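        # Multimodal user message: the rubric text plus the marked image at high detail.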
content = [
{
"type": "text",
"text": content_text,
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "high",
},
},
]
cur_js = {
"id": idx + 1,
"question_id": ques["question_id"],
"answer1_id": ans1.get("answer_id", ans1["question_id"]),
"answer2_id": ans2.get("answer_id", ans2["question_id"]),
"category": phase,
}
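        # Only query the judge for items beyond what the resumed output already covers.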
if idx >= len(cur_reviews):
review = get_eval(content, args.max_tokens)
scores = parse_score(review)
cur_js["content"] = review
cur_js["tuple"] = scores
cur_js["answer1"] = ans1["text"]
cur_js["answer2"] = ans2["text"]
review_file.write(json.dumps(cur_js) + "\n")
review_file.flush()
else:
print(f"Skipping {idx} as we already have it.")
idx += 1
review_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="ChatGPT-based QA evaluation.")
    parser.add_argument(
        "--phase", help="MDVP phase, e.g. android_QA_box", type=str, required=True
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="maximum number of tokens in the judge's reply (API range: [1, 4095])",
    )
    parser.add_argument(
        "--output",
        default="result/gpt_score.jsonl",
        help="path to the output JSONL file of judge reviews and scores",
    )
args = parser.parse_args()
main(args)
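
# Example invocation (assuming this file is saved as eval_gpt_review.py and the
# AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_KEY environment variables are set):
#   python eval_gpt_review.py --phase android_QA_box --output result/gpt_score.jsonl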