|
|
import random |
|
|
random.seed(42) |
|
|
|
|
|
mcqa_example_pool = [ |
|
|
{ |
|
|
"Q": "What does Jon Snow use to fight with Ramsay Bolton?", |
|
|
"Options": ["A. A shield.", "B. A sword.", "C. An Axe.", "D. A spear."], |
|
|
"Answer": "A. A shield."
|
|
}, |
|
|
{ |
|
|
"Q": "What card does the male judge pick?", |
|
|
"Options": ["A. 2 of spades.", "B. 2 of diamonds.", "C. 2 of hearts.", "D. 2 of clubs."], |
|
|
"Answer": "A" |
|
|
}, |
|
|
{ |
|
|
"Q": "Who finally finds the lost city?",
|
|
"Options": ["A. Terra preta.", "B. Fawcett.", "C. European expeditions.", "D. Dr. Michael Heckenberger."],


"Answer": "D. Dr. Michael Heckenberger."
|
|
}, |
|
|
{ |
|
|
"Q": "What sport are the two teams of athletes playing?", |
|
|
"Options": ["A. Ice hockey.", "B. Soccer.", "C. Rugby.", "D. Basketball."], |
|
|
"Answer": "C" |
|
|
}, |
|
|
{ |
|
|
"Q": "What item is not used to decorate the Christmas tree?", |
|
|
"Options": ["A. Red balls.", "B. Lights.", "C. Green stars.", "D. Icicles."], |
|
|
"Answer": "C. Green stars." |
|
|
}, |
|
|
{ |
|
|
"Q": "What is the main subject matter of the advertisement featured in the video?", |
|
|
"Options": ["A. Audible app.", "B. Music listening app.", "C. Shopping app.", "D. Video online playing app."], |
|
|
"Answer": "A" |
|
|
}, |
|
|
{ |
|
|
"Q": "What country's practice game is this?", |
|
|
"Options": ["A. UK.", "B. USA.", "C. Canada.", "D. Australia."], |
|
|
"Answer": "B. USA." |
|
|
}, |
|
|
{ |
|
|
"Q": "According to the video, which team ultimately won?", |
|
|
"Options": ["A. China.", "B. Italy.", "C. USA.", "D. France."], |
|
|
"Answer": "A" |
|
|
}, |
|
|
{ |
|
|
"Q": "Which cellular structure is responsible for receiving proteins according to the video?", |
|
|
"Options": ["A. Golgi apparatus (Golgi body).", "B. Nucleus.", "C. Ribosome.", "D. Mitochondrion."], |
|
|
"Answer": "A. Golgi apparatus (Golgi body)." |
|
|
}, |
|
|
{ |
|
|
"Q": "At the beginning, what is the player's rank?", |
|
|
"Options": ["A. Third.", "B. First.", "C. Second.", "D. Last."], |
|
|
"Answer": "D" |
|
|
}, |
|
|
{ |
|
|
"Q": "Which team in the video reached the finish line first?", |
|
|
"Options": ["A. USA team.", "B. Canadian team.", "C. Ghana team.", "D. South Africa team."], |
|
|
"Answer": "B" |
|
|
}, |
|
|
{ |
|
|
"Q": "What is the identity of the athlete in the video who committed fouls on all attempts except the first one?", |
|
|
"Options": ["A. He is an athlete of the Chinese team.", "B. He is an athlete of the Jamaican team.", "C. He is a neutral individual athlete.", "D. It is not mentioned in the video."], |
|
|
"Answer": "C. He is a neutral individual athlete." |
|
|
}, |
|
|
{ |
|
|
"Q": "The main character of the video is observing the surface of the moon when he notices a straight line, what is it?", |
|
|
"Options": ["A. Lunar Ridge.", "B. Collapsed lava tubes.", "C. Rift valley systems.", "D. Scratch marks."], |
|
|
"Answer": "B" |
|
|
}, |
|
|
{ |
|
|
"Q": "Which woman works as a chef?", |
|
|
"Options": ["A. Diamante.", "B. Carola Ordenes.", "C. Amina.", "D. Ghizlane."], |
|
|
"Answer": "A" |
|
|
}, |
|
|
{ |
|
|
"Q": "What kind of chess are the old people in the video playing?", |
|
|
"Options": ["A. Mahjong.", "B. Go.", "C. Chinese chess.", "D. Five-in-a-row."], |
|
|
"Answer": "C. Chinese chess." |
|
|
}, |
|
|
{ |
|
|
"Q": "Which ingredient is not used in the video?", |
|
|
"Options": ["A. Hot glue.", "B. Pieces of burlap.", "C. Florals.", "D. Plastic bottles."], |
|
|
"Answer": "D" |
|
|
}, |
|
|
{ |
|
|
"Q": "Who does the video focus on regarding their work with globular clusters?", |
|
|
"Options": ["A. Harlow Shapley.", "B. Walter Baade.", "C. William Herschel.", "D. Henrietta Swan Leavitt."],
|
|
"Answer": "A" |
|
|
} |
|
|
] |
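The pool above mixes bare-letter answers ("A") with full option text ("C. Chinese chess."), which is easy to get wrong when editing entries by hand. A small sanity check can verify both forms; this is a sketch, not part of the original module, and the two-item `sample` below is illustrative:

```python
# Sanity check for an MCQA example pool: every Answer must resolve to one of
# the four Options, whether given as a bare letter ("A") or as full option
# text ("C. Chinese chess.").
def check_mcqa_pool(pool):
    for ex in pool:
        letter = ex["Answer"].strip()[0]
        if letter not in "ABCD":
            raise ValueError(f"bad answer letter: {ex['Answer']!r}")
        option = next(o for o in ex["Options"] if o.startswith(letter + "."))
        # Full-text answers must match their option (ignoring trailing '.').
        if len(ex["Answer"].strip()) > 1 and ex["Answer"].rstrip(".") != option.rstrip("."):
            raise ValueError(f"answer/option mismatch: {ex['Answer']!r} vs {option!r}")
    return len(pool)

sample = [
    {"Q": "What sport are the two teams playing?",
     "Options": ["A. Ice hockey.", "B. Soccer.", "C. Rugby.", "D. Basketball."],
     "Answer": "C"},
    {"Q": "What kind of chess are the old people playing?",
     "Options": ["A. Mahjong.", "B. Go.", "C. Chinese chess.", "D. Five-in-a-row."],
     "Answer": "C. Chinese chess."},
]
print(check_mcqa_pool(sample))  # → 2
```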
|
|
|
|
|
def prompt_miradata_based_text_constraint_mcqa(dense_caption, background_caption, main_object_caption): |
|
|
task_inst_part = ( |
|
|
"You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n" |
|
|
"## TASK:\n" |
|
|
"Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc. **while using an event, action or composite feature to constrain the question, thereby ensuring answer uniqueness** in long videos.\n" |
|
|
"You must also provide **4 answer options (A–D)**, with only one correct answer, which is clearly supported by the visual or narrative content of the video description.\n\n" |
|
|
"## CRITICAL RULES:\n"


"1. **Uniqueness Guarantee**: Each question must include either:\n"


"   - A **specific action** (e.g., 'What does the woman use to cut the ribbon?'), OR\n"


"   - A **specific event** (e.g., 'What falls off the table when the dog bumps into it?'), OR\n"


"   - A **composite feature** (e.g., 'What does the girl in the red dress hold in her hand?').\n"
|
|
"2. **Visual Grounding**: Answers must be verifiable from a single frame or short clip.\n" |
|
|
"3. **Description Grounding**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
|
|
"4. **No Temporal Reasoning**: Avoid questions requiring comparing frames (e.g., 'what happened next?').\n" |
|
|
"5. **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n" |
|
|
"6. **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n" |
|
|
"7. **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n\n" |
|
|
"## OUTPUT FORMAT: Format the output as a list of dictionaries with the following keys:\n" |
|
|
" - `'Q'`: The question.\n" |
|
|
" - `'Options'`: A list of four answer options labeled 'A', 'B', 'C', and 'D'.\n" |
|
|
" - `'Answer'`: The correct answer (e.g., `'A'`, `'B'`, etc.).\n" |
|
|
"\n" |
|
|
) |
|
|
|
|
|
chosen_examples = random.sample(mcqa_example_pool, 3)


example_part_header = "## EXAMPLES:\n"


for idx, example in enumerate(chosen_examples):
|
|
Q = example['Q'] |
|
|
Options = example['Options'] |
|
|
Answer = example['Answer'] |
|
|
body = ( |
|
|
f"{idx+1}. {{'Q': '{Q}',\n" |
|
|
" 'Options': [\n" |
|
|
f" '{Options[0]}',\n" |
|
|
f" '{Options[1]}',\n" |
|
|
f" '{Options[2]}',\n" |
|
|
f" '{Options[3]}'\n" |
|
|
" ],\n" |
|
|
f" 'Answer': '{Answer}'}}\n" |
|
|
"\n" |
|
|
) |
|
|
example_part_header = example_part_header + body |
|
|
|
|
|
example_part = example_part_header |
|
|
system_prompt = task_inst_part + example_part |
|
|
|
|
|
user_prompt = ( |
|
|
"I have provided you with three descriptions of a specific clip from a long video, each covering a different aspect. Below are these descriptions:\n\n"
|
|
"**Dense Description:**\n" |
|
|
f"{dense_caption}\n\n" |
|
|
"**Background Description:**\n" |
|
|
f"{background_caption}\n\n" |
|
|
"**Main Object Description:**\n" |
|
|
f"{main_object_caption}\n\n" |
|
|
"Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
|
|
"## REQUIREMENTS:\n" |
|
|
"- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n" |
|
|
"- You must use an action, event, or composite feature in the question to constrain it, thereby ensuring answer uniqueness.\n"
|
|
"- The answer must be directly observable in the description without any reasoning or inference.\n\n" |
|
|
"## OUTPUT FORMAT:\n" |
|
|
"[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]\n\n" |
|
|
"**Only return the QA pair in the specified JSON list format.**" |
|
|
) |
|
|
|
|
|
return system_prompt, user_prompt |
|
|
|
|
|
def prompt_miradata_based_text_mcqa(dense_caption, background_caption, main_object_caption): |
|
|
|
|
|
task_inst_part = ( |
|
|
"You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n" |
|
|
"## TASK:\n" |
|
|
"Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.\n" |
|
|
"You must also provide **4 answer options (A–D)**, with only one correct answer, which is clearly supported by the visual or narrative content of the video description.\n\n" |
|
|
"## INSTRUCTIONS:\n" |
|
|
"- **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n" |
|
|
"- **Ground in Visuals**: All answers must be verifiable by pausing a single frame or short clip. Avoid actions, motivations, or temporal reasoning.\n" |
|
|
"- **Ground in the Description**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n" |
|
|
"- **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n" |
|
|
"- **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n" |
|
|
"- **Output Format**: Format the output as a list of dictionaries with the following keys:\n" |
|
|
" - `'Q'`: The question.\n" |
|
|
" - `'Options'`: A list of four answer options labeled 'A', 'B', 'C', and 'D'.\n" |
|
|
" - `'Answer'`: The correct answer (e.g., `'A'`, `'B'`, etc.).\n" |
|
|
"\n" |
|
|
) |
|
|
|
|
|
chosen_examples = random.sample(mcqa_example_pool, 3)


example_part_header = "## EXAMPLES:\n"


for idx, example in enumerate(chosen_examples):
|
|
Q = example['Q'] |
|
|
Options = example['Options'] |
|
|
Answer = example['Answer'] |
|
|
body = ( |
|
|
f"{idx+1}. {{'Q': '{Q}',\n" |
|
|
" 'Options': [\n" |
|
|
f" '{Options[0]}',\n" |
|
|
f" '{Options[1]}',\n" |
|
|
f" '{Options[2]}',\n" |
|
|
f" '{Options[3]}'\n" |
|
|
" ],\n" |
|
|
f" 'Answer': '{Answer}'}}\n" |
|
|
"\n" |
|
|
) |
|
|
example_part_header = example_part_header + body |
|
|
|
|
|
example_part = example_part_header |
|
|
|
|
|
guidelines_part = ( |
|
|
"## GUIDELINES FOR CREATING QUESTIONS:\n" |
|
|
"- **Specificity**: Ask about singular, clearly defined objects.\n"
|
|
"- **Visual Certainty**: Ensure the correct answer is unambiguous.\n" |
|
|
"- **Description Grounding**: Base all questions and answers on the video description.\n" |
|
|
"- **Plausible Distractors**: Wrong options should be visually similar (e.g., other kitchen tools if asking about a pan).\n" |
|
|
"- **No Implicit Knowledge**: Avoid questions requiring domain knowledge (e.g., 'What brand is the car?' is invalid unless the logo is visible).\n" |
|
|
"\n" |
|
|
"## OUTPUT FORMAT:\n" |
|
|
"[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]") |
|
|
|
|
|
|
|
|
system_prompt = task_inst_part + example_part + guidelines_part |
|
|
|
|
|
user_prompt = ( |
|
|
"I have provided you with three descriptions of a specific clip in a video, each covering a different aspect. Below are these descriptions:\n\n"
|
|
"**Dense Description:**\n" |
|
|
f"{dense_caption}\n\n" |
|
|
"**Background Description:**\n" |
|
|
f"{background_caption}\n\n" |
|
|
"**Main Object Description:**\n" |
|
|
f"{main_object_caption}\n\n" |
|
|
"Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
|
|
"## REQUIREMENTS:\n" |
|
|
"- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n" |
|
|
"- The answer must be directly observable in the description without any reasoning or inference.\n\n" |
|
|
"## OUTPUT FORMAT:\n" |
|
|
"[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]\n\n" |
|
|
"**Only return the QA pair in the specified JSON list format.**" |
|
|
) |
|
|
|
|
|
return system_prompt, user_prompt |
|
|
|
|
|
openqa_example_pool = [ |
|
|
{ |
|
|
"Q": "What does Jon Snow use to fight with Ramsay Bolton?", |
|
|
"Answer": "Jon Snow uses a shield to fight with Ramsay Bolton." |
|
|
}, |
|
|
{ |
|
|
"Q": "What card does the male judge pick?", |
|
|
"Answer": "The male judge picks the 2 of spades." |
|
|
}, |
|
|
{ |
|
|
"Q": "Who finally finds the lost city?", |
|
|
"Answer": "Dr. Michael Heckenberger is the person who finally finds the lost city." |
|
|
}, |
|
|
{ |
|
|
"Q": "What sport are the two teams of athletes playing?", |
|
|
"Answer": "The two teams of athletes are playing rugby." |
|
|
}, |
|
|
{ |
|
|
"Q": "What item is not used to decorate the Christmas tree?", |
|
|
"Answer": "Green stars are not used to decorate the Christmas tree." |
|
|
}, |
|
|
{ |
|
|
"Q": "What is the main subject matter of the advertisement featured in the video?", |
|
|
"Answer": "The main subject matter of the advertisement featured in the video is the Audible app." |
|
|
}, |
|
|
{ |
|
|
"Q": "What country's practice game is this?", |
|
|
"Answer": "This is a practice game from the USA." |
|
|
}, |
|
|
{ |
|
|
"Q": "According to the video, which team ultimately won?", |
|
|
"Answer": "According to the video, the team that ultimately won is China." |
|
|
}, |
|
|
{ |
|
|
"Q": "Which cellular structure is responsible for receiving proteins according to the video?", |
|
|
"Answer": "According to the video, the Golgi apparatus (Golgi body) is responsible for receiving proteins." |
|
|
}, |
|
|
{ |
|
|
"Q": "At the beginning, what is the player's rank?", |
|
|
"Answer": "At the beginning, the player's rank is last." |
|
|
}, |
|
|
{ |
|
|
"Q": "Which team in the video reached the finish line first?", |
|
|
"Answer": "In the video, the Canadian team reached the finish line first." |
|
|
}, |
|
|
{ |
|
|
"Q": "What is the identity of the athlete in the video who committed fouls on all attempts except the first one?", |
|
|
"Answer": "The athlete in the video who committed fouls on all attempts except the first one is a neutral individual athlete." |
|
|
}, |
|
|
{ |
|
|
"Q": "The main character of the video is observing the surface of the moon when he notices a straight line, what is it?", |
|
|
"Answer": "The straight line that the main character notices on the surface of the moon is collapsed lava tubes." |
|
|
}, |
|
|
{ |
|
|
"Q": "Which woman works as a chef?", |
|
|
"Answer": "The woman who works as a chef is Diamante." |
|
|
}, |
|
|
{ |
|
|
"Q": "What kind of chess are the old people in the video playing?", |
|
|
"Answer": "The old people in the video are playing Chinese chess." |
|
|
}, |
|
|
{ |
|
|
"Q": "Which ingredient is not used in the video?", |
|
|
"Answer": "Plastic bottles are not used in the video." |
|
|
}, |
|
|
{ |
|
|
"Q": "Who does the video focus on regarding their work with globular clusters?", |
|
|
"Answer": "The video focuses on Harlow Shapley regarding his work with globular clusters." |
|
|
} |
|
|
] |
|
|
|
|
|
|
|
|
def prompt_miradata_based_text_constraint_openqa(dense_caption, background_caption, main_object_caption): |
|
|
|
|
|
task_inst_part = ( |
|
|
"You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n" |
|
|
"## TASK:\n" |
|
|
"Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc. **while using an event, action or composite feature to constrain the question, thereby ensuring answer uniqueness** in long videos.\n\n" |
|
|
"## CRITICAL RULES:\n"


"1. **Uniqueness Guarantee**: Each question must include either:\n"


"   - A **specific action** (e.g., 'What does the woman use to cut the ribbon?'), OR\n"


"   - A **specific event** (e.g., 'What falls off the table when the dog bumps into it?'), OR\n"


"   - A **composite feature** (e.g., 'What does the girl in the red dress hold in her hand?').\n"
|
|
"2. **Visual Grounding**: Answers must be verifiable from a single frame or short clip.\n" |
|
|
"3. **Description Grounding**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n" |
|
|
"4. **No Temporal Reasoning**: Avoid questions requiring comparing frames (e.g., 'what happened next?').\n" |
|
|
"5. **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n" |
|
|
"6. **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n" |
|
|
"7. **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n\n" |
|
|
"## OUTPUT FORMAT: Format the output as a list of dictionaries with the following keys:\n" |
|
|
" - `'Q'`: The question.\n" |
|
|
" - `'Answer'`: The correct answer as a complete sentence.\n" |
|
|
"\n" |
|
|
) |
|
|
|
|
|
|
|
|
chosen_examples = random.sample(openqa_example_pool, 3)


example_part_header = "## EXAMPLES:\n"


for idx, example in enumerate(chosen_examples):
|
|
Q = example['Q'] |
|
|
Answer = example['Answer'] |
|
|
body = ( |
|
|
f"{idx+1}. {{'Q': '{Q}',\n" |
|
|
f" 'Answer': '{Answer}'}}\n" |
|
|
"\n" |
|
|
) |
|
|
example_part_header = example_part_header + body |
|
|
|
|
|
example_part = example_part_header |
|
|
system_prompt = task_inst_part + example_part |
|
|
|
|
|
user_prompt = ( |
|
|
"I have provided you with three different aspect descriptions of a specific clip from a long video. Below are these descriptions:\n\n" |
|
|
"**Dense Description:**\n" |
|
|
f"{dense_caption}\n\n" |
|
|
"**Background Description:**\n" |
|
|
f"{background_caption}\n\n" |
|
|
"**Main Object Description:**\n" |
|
|
f"{main_object_caption}\n\n" |
|
|
"Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n" |
|
|
"## REQUIREMENTS:\n" |
|
|
"- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n" |
|
|
"- You must use an action, event, or composite feature in the question to constrain it, thereby ensuring answer uniqueness.\n"
|
|
"- The answer must be directly observable in the description without any reasoning or inference.\n\n" |
|
|
"## OUTPUT FORMAT:\n" |
|
|
"[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]\n\n" |
|
|
"**Only return the QA pair in the specified JSON list format.**" |
|
|
) |
|
|
|
|
|
return system_prompt, user_prompt |
|
|
|
|
|
|
|
|
|
|
def prompt_miradata_based_text_openqa(dense_caption, background_caption, main_object_caption): |
|
|
task_inst_part = ( |
|
|
"You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n" |
|
|
"## TASK:\n" |
|
|
"Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.\n\n" |
|
|
"The answer must be provided as a complete sentence, clearly supported by the visual or narrative content of the video description.\n\n" |
|
|
"## INSTRUCTIONS:\n" |
|
|
"- **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n" |
|
|
"- **Ground in Visuals**: All answers must be verifiable by pausing a single frame or short clip. Avoid actions, motivations, or temporal reasoning.\n" |
|
|
"- **Ground in the Description**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n" |
|
|
"- **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n" |
|
|
"- **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n" |
|
|
"- **Output Format**: Format the output as a list of dictionaries with the following keys:\n" |
|
|
" - `'Q'`: The question.\n" |
|
|
" - `'Answer'`: The correct answer as a complete sentence.\n" |
|
|
"\n" |
|
|
) |
|
|
|
|
|
|
|
|
chosen_examples = random.sample(openqa_example_pool, 3)


example_part_header = "## EXAMPLES:\n"


for idx, example in enumerate(chosen_examples):
|
|
Q = example['Q'] |
|
|
Answer = example['Answer'] |
|
|
body = ( |
|
|
f"{idx+1}. {{'Q': '{Q}',\n" |
|
|
f" 'Answer': '{Answer}'}}\n" |
|
|
"\n" |
|
|
) |
|
|
example_part_header = example_part_header + body |
|
|
|
|
|
example_part = example_part_header |
|
|
|
|
|
guidelines_part = ( |
|
|
"## GUIDELINES FOR CREATING QUESTIONS:\n" |
|
|
"- **Specificity**: Ask about singular, clearly defined objects.\n" |
|
|
"- **Visual Certainty**: Ensure the correct answer is unambiguous and directly observable in the description.\n" |
|
|
"- **Description Grounding**: Base all questions and answers on the video description.\n" |
|
|
"- **No Implicit Knowledge**: Avoid questions requiring domain knowledge (e.g., 'What brand is the car?' is invalid unless the logo is visible).\n" |
|
|
"- **Complete Sentence Answers**: Always provide the answer as a grammatically correct, complete sentence.\n" |
|
|
"\n" |
|
|
"## OUTPUT FORMAT:\n" |
|
|
"[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]" |
|
|
) |
|
|
|
|
|
system_prompt = task_inst_part + example_part + guidelines_part |
|
|
|
|
|
user_prompt = ( |
|
|
"I have provided you with three different aspect descriptions of a specific clip in a video. Below are these descriptions:\n\n" |
|
|
"**Dense Description:**\n" |
|
|
f"{dense_caption}\n\n" |
|
|
"**Background Description:**\n" |
|
|
f"{background_caption}\n\n" |
|
|
"**Main Object Description:**\n" |
|
|
f"{main_object_caption}\n\n" |
|
|
"Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n" |
|
|
"## REQUIREMENTS:\n" |
|
|
"- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n" |
|
|
"- The answer must be directly observable in the description without any reasoning or inference.\n\n" |
|
|
"## OUTPUT FORMAT:\n" |
|
|
"[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]\n\n" |
|
|
"**Only return the QA pair in the specified JSON list format.**" |
|
|
) |
|
|
|
|
|
return system_prompt, user_prompt |
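All four prompt builders ask the model to return a Python-style, single-quoted list of dictionaries, which `json.loads` will reject; `ast.literal_eval` is one safe way to parse such a reply. A self-contained sketch with a hypothetical model reply (parsing is outside this module):

```python
import ast

# Hypothetical model reply in the format the prompts request. It uses single
# quotes, so ast.literal_eval (not json.loads) parses it safely without
# executing arbitrary code.
reply = ("[{'Q': 'What does the man use to chop the onions?', "
         "'Options': ['A. A cleaver.', 'B. A knife.', 'C. Scissors.', 'D. A peeler.'], "
         "'Answer': 'B'}]")
qa_pair = ast.literal_eval(reply)[0]
print(qa_pair["Answer"])  # → B
```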