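"""GPT-based pairwise QA evaluation (core evaluation script for paper 2510.18876).

Regenerates the second answer-list file from --source-file, then asks an
Azure-hosted judge model to score two candidate answers per question against a
category-specific rule prompt, writing one JSON review per line to the output.

Example invocation (script and file names are illustrative):

    python eval_gpt_review.py \
        -q questions.jsonl -c context.jsonl \
        -a gpt4_answers.jsonl model_answers.jsonl \
        -r rule.json -o reviews.jsonl \
        --source-file predictions.json
"""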
import argparse
import json
import os
import re
import time
import openai
from tqdm import tqdm
NUM_SECONDS_TO_SLEEP = 0.5
VOCAB_IMAGE_W = 1000
VOCAB_IMAGE_H = 1000
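# Model outputs encode bounding boxes on a 1000x1000 "vocabulary" grid;
# postprocess_answer rescales them to [0, 1] before scoring.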
# Define Azure OpenAI details
model_name = "gpt-4o-2024-11-20"
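# Credentials are read from the environment; placeholder values for illustration:
#   export AZURE_OPENAI_ENDPOINT="https://<your-resource>.openai.azure.com/"
#   export AZURE_OPENAI_KEY="<your-api-key>"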
# Initialize the Azure client
client = openai.AzureOpenAI(
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
api_key=os.getenv("AZURE_OPENAI_KEY"),
api_version="2024-03-01-preview",
)
def get_eval(content: str, max_tokens: int):
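    """Send `content` to the judge model and return the reply text, retrying
    (with a short sleep) on rate limits and other transient errors."""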
while True:
try:
completion = client.chat.completions.create(
model=model_name,
messages=[
{
"role": "system",
"content": "You are a helpful and precise assistant for checking the quality of the answer.",
},
{
"role": "user",
"content": content,
},
],
max_tokens=max_tokens,
temperature=0,
)
ret = completion.choices[0].message.content
break
        except openai.RateLimitError:
            # openai>=1.0 exposes RateLimitError at the top level (openai.error was removed).
            pass
        except Exception as e:
            print(e)
        time.sleep(NUM_SECONDS_TO_SLEEP)
return ret
def postprocess_answer(answer, category):
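    """Strip or normalize bracketed box spans in a model answer.

    For 'refer_desc'/'refer_reason', bracketed spans (and their leading space)
    are removed entirely. For 'ground_conv', pixel boxes on the 1000x1000
    vocabulary grid are rescaled to [0, 1], e.g.
    "[100, 200, 300, 400]" -> "[0.100, 0.200, 0.300, 0.400]".
    """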
if category == "refer_desc" or category == "refer_reason":
pattern = r"\[.*?\]"
matches = re.findall(pattern, answer)
for match in matches:
answer = answer.replace(" " + match, "")
elif category == "ground_conv":
pattern = r"\[.*?\]"
matches = re.findall(pattern, answer)
for match in matches:
coor_cur = match.replace("[", "")
coor_cur = coor_cur.replace("]", "")
coor_cur = coor_cur.split(",")
            try:
                coor_cur = [float(i.strip()) for i in coor_cur]
                assert len(coor_cur) == 4
            except (AssertionError, ValueError):
                # Malformed box: drop the span and skip the rescaling below.
                print("Found an exception when parsing coordinates")
                answer = answer.replace(match, "")
                continue
converted_box_coor = [
coor_cur[0] / VOCAB_IMAGE_W,
coor_cur[1] / VOCAB_IMAGE_H,
coor_cur[2] / VOCAB_IMAGE_W,
coor_cur[3] / VOCAB_IMAGE_H,
]
answer = answer.replace(
match,
f"[{converted_box_coor[0]:.3f}, {converted_box_coor[1]:.3f}, {converted_box_coor[2]:.3f}, {converted_box_coor[3]:.3f}]",
)
return answer
def parse_score(review):
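    """Parse the two scores from the first line of the review, e.g.
    "8 7" or "8, 7" -> [8.0, 7.0]; returns [-1, -1] if parsing fails."""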
try:
score_pair = review.split("\n")[0]
score_pair = score_pair.replace(",", " ")
        sp = score_pair.split()  # plain split() tolerates the double space left after replacing ", "
print("score:", sp)
return [float(sp[0]), float(sp[1])]
except Exception as e:
print(e)
print("error", review)
return [-1, -1]
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="GPT-based pairwise QA evaluation.")
parser.add_argument("-q", "--question")
parser.add_argument("-c", "--context")
parser.add_argument("-a", "--answer-list", nargs="+", default=[])
parser.add_argument("-r", "--rule")
parser.add_argument("-o", "--output")
parser.add_argument(
"--max-tokens",
type=int,
default=1024,
help="maximum number of tokens produced in the output",
)
parser.add_argument("--source-file", type=str, required=True)
args = parser.parse_args()
f_q = open(os.path.expanduser(args.question))
f_ans1 = open(os.path.expanduser(args.answer_list[0]))
    # Convert the source predictions into the answer-list JSONL format first.
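    # Each emitted line is a JSONL record such as (values illustrative):
    #   {"question_id": 0, "image": "000001.jpg", "category": "refer_desc", "text": "a red car parked ..."}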
target_path = os.path.expanduser(args.answer_list[1])
with open(args.source_file, "r") as file:
source_file = json.load(file)
with open(target_path, "w") as file:
for idx, item in enumerate(source_file):
info = {
"question_id": idx,
"image": item["image_path"].split("/")[-1],
"category": "refer_desc",
"text": item["caption"],
}
json.dump(info, file, ensure_ascii=False)
file.write("\n")
f_ans2 = open(os.path.expanduser(args.answer_list[1]))
rule_dict = json.load(open(os.path.expanduser(args.rule), "r"))
if os.path.isfile(os.path.expanduser(args.output)):
cur_reviews = [
json.loads(line) for line in open(os.path.expanduser(args.output))
]
else:
cur_reviews = []
review_file = open(f"{args.output}", "a")
context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
image_to_context = {context["image"]: context for context in context_list}
idx = 0
for ques_js, ans1_js, ans2_js in tqdm(zip(f_q, f_ans1, f_ans2)):
ques = json.loads(ques_js)
ans1 = json.loads(ans1_js)
ans2 = json.loads(ans2_js)
inst = image_to_context[ques["image"]]
# cap_str = '\n'.join(inst['captions'])
# box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']])
        category = ques["category"]
if category in rule_dict:
rule = rule_dict[category]
else:
            raise KeyError(f"Visual QA category not found in rule file: {category}.")
# Assume ans2 is the predicted one.
processed_answer = postprocess_answer(ans2["text"], category)
ans2["text"] = processed_answer
# if category == 'refer_desc':
prompt = rule["prompt"]
role = rule["role"]
content = (
            f'[Context]\n{inst["text"]}\n\n'
f'[Question]\n{ques["text"]}\n\n'
f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
f"[System]\n{prompt}\n\n"
)
# content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n'
# f'[Question]\n{ques["text"]}\n\n'
# f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
# f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
# f'[System]\n{prompt}\n\n')
cur_js = {
"id": idx + 1,
"question_id": ques["question_id"],
"answer1_id": ans1.get("answer_id", ans1["question_id"]),
"answer2_id": ans2.get("answer_id", ans2["question_id"]),
"category": category,
}
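        # Resume support: reviews already present in the output file are skipped,
        # so an interrupted run can be restarted with the same arguments.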
if idx >= len(cur_reviews):
review = get_eval(content, args.max_tokens)
scores = parse_score(review)
cur_js["content"] = review
cur_js["tuple"] = scores
cur_js["answer1"] = ans1["text"]
cur_js["answer2"] = ans2["text"]
review_file.write(json.dumps(cur_js) + "\n")
review_file.flush()
else:
print(f"Skipping {idx} as we already have it.")
idx += 1
print(idx)
review_file.close()