Spaces:

jbilcke-hf
/

SNIPED_grasp-any-region

Running on Zero

App Files Files Community

SNIPED_grasp-any-region / evaluation /MDVP-Bench /transfer.py

jbilcke-hf

Upload core files for paper 2510.18876

46861c5 verified about 1 month ago

raw

history blame contribute delete

4.46 kB

	import argparse
	import json
	import os

	"""
	question:
	{
	"question_id": 1,
	"image": "000000104486.jpg",
	"category": "natural_box",
	"text": "Please analyze the relationship between all marked regions in the image.",
	"annotation": {
	"bbox": [[157.23, 341.07, 10.67, 2.08]],
	"segmentation": []
	}
	}
	answer:
	{
	"question_id": 1,
	"image": "000000104486.jpg",
	"category": "natural_box",
	"text": "<Region 1>: This region includes an individual who is caught in a moment that seems to involve some sort of task or activity. The person is engaged with a luggage cart, which suggests they might be arriving or departing from a location that offers such amenities, possibly a hotel. The cart holds luggage indicating travel or transit. The man's expression and attire provide clues to his role or state at the moment, such as potentially being a guest handling his luggage. The other individual seen partially in the background creates a sense of movement or interaction, but their relationship to the man or the context is unclear.\n"
	}
	predictions:
	{
	"question_id": 1,
	"image": "000000104486.jpg",
	"category": "natural_box",
	"text": "<Region 1>: The marked region does not appear to have any direct relationship with other marked regions, as there are no other marks to compare or contrast with.\n"
	}
	"""


	def _read_json(path):
	    """Return the parsed JSON document stored at ``path`` (decoded as UTF-8)."""
	    with open(path, "r", encoding="utf-8") as f:
	        return json.load(f)


	def _write_json(path, payload):
	    """Write ``payload`` to ``path`` as indented UTF-8 JSON and report it on stdout."""
	    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
	    # must be pinned instead of inherited from the platform locale.
	    with open(path, "w", encoding="utf-8") as f:
	        json.dump(payload, f, indent=4, ensure_ascii=False)
	    print(f"{path} saved successfully!")


	def main(args):
	    """Convert a predictions file into per-phase answer/prediction/question JSON.

	    Reads ``args.output_path`` (a JSON list whose items carry ``gt``,
	    ``caption`` and either ``image_path`` or ``file_name``) together with
	    ``annotations/mdvp_caption_mask.json`` (a JSON list whose items carry
	    ``caption`` and ``dataset_name``).  Both lists are matched by position.

	    For every phase, the following files are written under
	    ``mdvp_for_gpt4v_eval/<phase>/``:

	    * ``answer.json``     -- ground-truth text for every item,
	    * ``prediction.json`` -- predicted caption for every item,
	    * ``question.json``   -- built from ``data/<domain>/<domain>_box.json``,
	      where ``<domain>`` is the phase name without its ``_box`` suffix.

	    All paths other than ``args.output_path`` are relative to the current
	    working directory.

	    Args:
	        args: Object exposing ``output_path``; no other attribute is read.

	    Raises:
	        AssertionError: If an annotation caption differs from the ``gt`` of
	            the prediction item at the same index.
	        KeyError: If an item lacks a required key.
	        IndexError: If the annotation list is shorter than the predictions.
	        OSError: If an input file cannot be read or an output cannot be written.
	    """
	    phases = (
	        "android_detailed_caption_box",
	        "multipanel_detailed_caption_box",
	        "natural_detailed_caption_box",
	        "ocr_doc_detailed_caption_box",
	        "ocr_spotting_detailed_caption_box",
	        "web_detailed_caption_box",
	    )
	    vp = "bbox"  # visual-prompt key copied from each question record
	    out_root = "mdvp_for_gpt4v_eval"

	    # Neither input depends on the phase, so load and convert them once
	    # rather than once per phase.
	    predictions = _read_json(args.output_path)
	    mask_data = _read_json("annotations/mdvp_caption_mask.json")

	    format_answer_list = []
	    format_prediction_list = []
	    for index, item in enumerate(predictions):
	        meta = mask_data[index]
	        # Explicit raise (not ``assert``) so the alignment check survives
	        # ``python -O``; AssertionError is kept as the failure type.
	        if meta["caption"] != item["gt"]:
	            raise AssertionError(
	                f"annotation caption and prediction 'gt' differ at index {index}"
	            )

	        # Items may name the image under either key; only a missing
	        # "image_path" key triggers the fallback.
	        try:
	            image_path = item["image_path"]
	        except KeyError:
	            image_path = item["file_name"]

	        shared = {
	            "question_id": index + 1,
	            "image": image_path,
	            "category": meta["dataset_name"],
	        }
	        format_answer_list.append({**shared, "text": item["gt"]})
	        format_prediction_list.append({**shared, "text": item["caption"]})

	    for phase in phases:
	        domain = phase.split("_box")[0]  # e.g. "android_detailed_caption"
	        phase_dir = f"{out_root}/{phase}"

	        # Also creates the missing parent directory and tolerates reruns.
	        os.makedirs(phase_dir, exist_ok=True)

	        _write_json(f"{phase_dir}/answer.json", format_answer_list)
	        _write_json(f"{phase_dir}/prediction.json", format_prediction_list)

	        questions = _read_json(f"data/{domain}/{domain}_box.json")
	        format_question_list = [
	            {
	                "question_id": index + 1,
	                "image": item["image_name"],
	                "category": phase,
	                "text": item["question"],
	                "annotation": {vp: item[vp], "segmentation": []},
	            }
	            for index, item in enumerate(questions)
	        ]
	        _write_json(f"{phase_dir}/question.json", format_question_list)


	if __name__ == "__main__":
	    # Command-line entry point: the only option is the path of the
	    # predictions JSON file, which is handed to main() unchanged.
	    cli_parser = argparse.ArgumentParser(description="Process args.")
	    cli_parser.add_argument(
	        "--output_path",
	        required=True,
	        type=str,
	        help="Path to output results",
	    )
	    args = cli_parser.parse_args()

	    main(args)