# CharmingDog's picture
# Upload folder using huggingface_hub
# 55500d6 verified
import random
# Fixed seed so the random.sample() calls below pick the same few-shot
# examples on every run, keeping prompt construction reproducible.
random.seed(42)
# Few-shot example pool for multiple-choice (MCQA) prompt construction.
# Each entry holds:
#   'Q'       - the question text
#   'Options' - four choices labelled 'A.'-'D.'
#   'Answer'  - the correct choice
# NOTE(review): the 'Answer' field format is inconsistent across entries --
# some use a bare letter (e.g. 'A'), others letter plus option text
# (e.g. 'D. Dr.Michael Heckenberger.'), and the first entry drops the
# trailing period ('A. A shield'); confirm which format downstream
# consumers expect before normalizing.
mcqa_example_pool = [
    {
        "Q": "What does Jon Snow use to fight with Ramsay Bolton?",
        "Options": ["A. A shield.", "B. A sword.", "C. An Axe.", "D. A spear."],
        "Answer": "A. A shield"
    },
    {
        "Q": "What card does the male judge pick?",
        "Options": ["A. 2 of spades.", "B. 2 of diamonds.", "C. 2 of hearts.", "D. 2 of clubs."],
        "Answer": "A"
    },
    {
        "Q": "Who finally find the lost city?",
        "Options": ["A. Terra preta.", "B. Fawcett.", "C. European expeditions.", "D. Dr.Michael Heckenberger."],
        "Answer": "D. Dr.Michael Heckenberger."
    },
    {
        "Q": "What sport are the two teams of athletes playing?",
        "Options": ["A. Ice hockey.", "B. Soccer.", "C. Rugby.", "D. Basketball."],
        "Answer": "C"
    },
    {
        "Q": "What item is not used to decorate the Christmas tree?",
        "Options": ["A. Red balls.", "B. Lights.", "C. Green stars.", "D. Icicles."],
        "Answer": "C. Green stars."
    },
    {
        "Q": "What is the main subject matter of the advertisement featured in the video?",
        "Options": ["A. Audible app.", "B. Music listening app.", "C. Shopping app.", "D. Video online playing app."],
        "Answer": "A"
    },
    {
        "Q": "What country's practice game is this?",
        "Options": ["A. UK.", "B. USA.", "C. Canada.", "D. Australia."],
        "Answer": "B. USA."
    },
    {
        "Q": "According to the video, which team ultimately won?",
        "Options": ["A. China.", "B. Italy.", "C. USA.", "D. France."],
        "Answer": "A"
    },
    {
        "Q": "Which cellular structure is responsible for receiving proteins according to the video?",
        "Options": ["A. Golgi apparatus (Golgi body).", "B. Nucleus.", "C. Ribosome.", "D. Mitochondrion."],
        "Answer": "A. Golgi apparatus (Golgi body)."
    },
    {
        "Q": "At the beginning, what is the player's rank?",
        "Options": ["A. Third.", "B. First.", "C. Second.", "D. Last."],
        "Answer": "D"
    },
    {
        "Q": "Which team in the video reached the finish line first?",
        "Options": ["A. USA team.", "B. Canadian team.", "C. Ghana team.", "D. South Africa team."],
        "Answer": "B"
    },
    {
        "Q": "What is the identity of the athlete in the video who committed fouls on all attempts except the first one?",
        "Options": ["A. He is an athlete of the Chinese team.", "B. He is an athlete of the Jamaican team.", "C. He is a neutral individual athlete.", "D. It is not mentioned in the video."],
        "Answer": "C. He is a neutral individual athlete."
    },
    {
        "Q": "The main character of the video is observing the surface of the moon when he notices a straight line, what is it?",
        "Options": ["A. Lunar Ridge.", "B. Collapsed lava tubes.", "C. Rift valley systems.", "D. Scratch marks."],
        "Answer": "B"
    },
    {
        "Q": "Which woman works as a chef?",
        "Options": ["A. Diamante.", "B. Carola Ordenes.", "C. Amina.", "D. Ghizlane."],
        "Answer": "A"
    },
    {
        "Q": "What kind of chess are the old people in the video playing?",
        "Options": ["A. Mahjong.", "B. Go.", "C. Chinese chess.", "D. Five-in-a-row."],
        "Answer": "C. Chinese chess."
    },
    {
        "Q": "Which ingredient is not used in the video?",
        "Options": ["A. Hot glue.", "B. Pieces of burlap.", "C. Florals.", "D. Plastic bottles."],
        "Answer": "D"
    },
    {
        "Q": "Who does the video focus on regarding their work with globular clusters?",
        "Options": ["A. Harlow Shapley.", "B. Walter Baade.", "C. William Herschel.", "D. Henrietta Swan Levitt."],
        "Answer": "A"
    }
]
def prompt_miradata_based_text_constraint_mcqa(dense_caption, background_caption, main_object_caption):
    """Build the (system_prompt, user_prompt) pair for constrained MCQA generation.

    The system prompt instructs the model to produce one multiple-choice object
    recognition question that is anchored by an action, event, or composite
    feature (to guarantee answer uniqueness in long videos), followed by three
    few-shot examples sampled from ``mcqa_example_pool``.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc. **while using an event, action or composite feature to constrain the question, thereby ensuring answer uniqueness** in long videos.\n"
        "You must also provide **4 answer options (A\u2013D)**, with only one correct answer, which is clearly supported by the visual or narrative content of the video description.\n\n"
        # BUG FIX: the six lines below previously had no trailing '\n', so the
        # header, rule 1, and its three bullets were concatenated into a single
        # run-on line in the rendered prompt.
        "## CRITICAL RULES:\n"
        "1. **Uniqueness Guarantee**: Each question must include either:\n"
        " - A **specific action** (e.g., 'What does the woman use to cut the ribbon?'), OR\n"
        " - A **specific event** (e.g., 'What falls off the table when the dog bumps into it?'), OR\n"
        " - A **composite feature** (e.g., 'What does the girl in the red dress hold in her hand?').\n"
        "2. **Visual Grounding**: Answers must be verifiable from a single frame or short clip.\n"
        # TYPO FIX: was the garbled 'Description DescrGroundingiption' (compare
        # the sibling constraint-openqa prompt, which reads 'Description Grounding').
        "3. **Description Grounding**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "4. **No Temporal Reasoning**: Avoid questions requiring comparing frames (e.g., 'what happened next?').\n"
        "5. **Focus on Visual Entities**: The question must test the model\u2019s ability to recognize **objects**.\n"
        "6. **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "7. **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n\n"
        "## OUTPUT FORMAT: Format the output as a list of dictionaries with the following keys:\n"
        " - `'Q'`: The question.\n"
        " - `'Options'`: A list of four answer options labeled 'A', 'B', 'C', and 'D'.\n"
        " - `'Answer'`: The correct answer (e.g., `'A'`, `'B'`, etc.).\n"
        "\n"
    )
    # Render three sampled few-shot examples in the same dict-literal layout
    # the model is asked to output.
    chosen_examples = random.sample(mcqa_example_pool, 3)
    example_chunks = ["## EXAMPLES:\n"]
    for idx, example in enumerate(chosen_examples, start=1):
        options = example['Options']
        example_chunks.append(
            f"{idx}. {{'Q': '{example['Q']}',\n"
            " 'Options': [\n"
            f" '{options[0]}',\n"
            f" '{options[1]}',\n"
            f" '{options[2]}',\n"
            f" '{options[3]}'\n"
            " ],\n"
            f" 'Answer': '{example['Answer']}'}}\n"
            "\n"
        )
    example_part = "".join(example_chunks)
    system_prompt = task_inst_part + example_part
    user_prompt = (
        # GRAMMAR FIX: was 'three different aspect description ... Below is these
        # description' -- now matches the openqa variants' phrasing.
        "I have provided you with three different aspect descriptions of a specific clip from a long video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- You must use an action, event or composite feature in question to constrain the question, thereby ensuring answer uniqueness.\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )
    return system_prompt, user_prompt
def prompt_miradata_based_text_mcqa(dense_caption, background_caption, main_object_caption):
    """Build the (system_prompt, user_prompt) pair for unconstrained MCQA generation.

    Unlike the *constraint* variant, the question does not need an anchoring
    action/event/composite feature. The system prompt carries the task
    instructions, three few-shot examples sampled from ``mcqa_example_pool``,
    and question-writing guidelines.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.\n"
        "You must also provide **4 answer options (A\u2013D)**, with only one correct answer, which is clearly supported by the visual or narrative content of the video description.\n\n"
        "## INSTRUCTIONS:\n"
        "- **Focus on Visual Entities**: The question must test the model\u2019s ability to recognize **objects**.\n"
        "- **Ground in Visuals**: All answers must be verifiable by pausing a single frame or short clip. Avoid actions, motivations, or temporal reasoning.\n"
        "- **Ground in the Description**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "- **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "- **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n"
        "- **Output Format**: Format the output as a list of dictionaries with the following keys:\n"
        " - `'Q'`: The question.\n"
        " - `'Options'`: A list of four answer options labeled 'A', 'B', 'C', and 'D'.\n"
        " - `'Answer'`: The correct answer (e.g., `'A'`, `'B'`, etc.).\n"
        "\n"
    )
    # Render three sampled few-shot examples in the same dict-literal layout
    # the model is asked to output.
    chosen_examples = random.sample(mcqa_example_pool, 3)
    example_chunks = ["## EXAMPLES:\n"]
    for idx, example in enumerate(chosen_examples, start=1):
        options = example['Options']
        example_chunks.append(
            f"{idx}. {{'Q': '{example['Q']}',\n"
            " 'Options': [\n"
            f" '{options[0]}',\n"
            f" '{options[1]}',\n"
            f" '{options[2]}',\n"
            f" '{options[3]}'\n"
            " ],\n"
            f" 'Answer': '{example['Answer']}'}}\n"
            "\n"
        )
    example_part = "".join(example_chunks)
    guidelines_part = (
        "## GUIDELINES FOR CREATING QUESTIONS:\n"
        # GRAMMAR FIX: was 'singular, clearly defined object.' -- now matches
        # the openqa variant's plural phrasing.
        "- **Specificity**: Ask about singular, clearly defined objects.\n"
        "- **Visual Certainty**: Ensure the correct answer is unambiguous.\n"
        "- **Description Grounding**: Base all questions and answers on the video description.\n"
        "- **Plausible Distractors**: Wrong options should be visually similar (e.g., other kitchen tools if asking about a pan).\n"
        "- **No Implicit Knowledge**: Avoid questions requiring domain knowledge (e.g., 'What brand is the car?' is invalid unless the logo is visible).\n"
        "\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]"
    )
    system_prompt = task_inst_part + example_part + guidelines_part
    user_prompt = (
        # GRAMMAR FIX: was 'three different aspect description ... Below is these
        # description' -- now matches the openqa variants' phrasing.
        "I have provided you with three different aspect descriptions of a specific clip in a video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )
    return system_prompt, user_prompt
# Few-shot example pool for open-ended QA (OpenQA) prompt construction.
# Each entry holds:
#   'Q'      - the question text
#   'Answer' - the correct answer as a complete sentence
# The entries mirror the questions in mcqa_example_pool above, rephrased
# with full-sentence answers instead of lettered options.
openqa_example_pool = [
    {
        "Q": "What does Jon Snow use to fight with Ramsay Bolton?",
        "Answer": "Jon Snow uses a shield to fight with Ramsay Bolton."
    },
    {
        "Q": "What card does the male judge pick?",
        "Answer": "The male judge picks the 2 of spades."
    },
    {
        "Q": "Who finally finds the lost city?",
        "Answer": "Dr. Michael Heckenberger is the person who finally finds the lost city."
    },
    {
        "Q": "What sport are the two teams of athletes playing?",
        "Answer": "The two teams of athletes are playing rugby."
    },
    {
        "Q": "What item is not used to decorate the Christmas tree?",
        "Answer": "Green stars are not used to decorate the Christmas tree."
    },
    {
        "Q": "What is the main subject matter of the advertisement featured in the video?",
        "Answer": "The main subject matter of the advertisement featured in the video is the Audible app."
    },
    {
        "Q": "What country's practice game is this?",
        "Answer": "This is a practice game from the USA."
    },
    {
        "Q": "According to the video, which team ultimately won?",
        "Answer": "According to the video, the team that ultimately won is China."
    },
    {
        "Q": "Which cellular structure is responsible for receiving proteins according to the video?",
        "Answer": "According to the video, the Golgi apparatus (Golgi body) is responsible for receiving proteins."
    },
    {
        "Q": "At the beginning, what is the player's rank?",
        "Answer": "At the beginning, the player's rank is last."
    },
    {
        "Q": "Which team in the video reached the finish line first?",
        "Answer": "In the video, the Canadian team reached the finish line first."
    },
    {
        "Q": "What is the identity of the athlete in the video who committed fouls on all attempts except the first one?",
        "Answer": "The athlete in the video who committed fouls on all attempts except the first one is a neutral individual athlete."
    },
    {
        "Q": "The main character of the video is observing the surface of the moon when he notices a straight line, what is it?",
        "Answer": "The straight line that the main character notices on the surface of the moon is collapsed lava tubes."
    },
    {
        "Q": "Which woman works as a chef?",
        "Answer": "The woman who works as a chef is Diamante."
    },
    {
        "Q": "What kind of chess are the old people in the video playing?",
        "Answer": "The old people in the video are playing Chinese chess."
    },
    {
        "Q": "Which ingredient is not used in the video?",
        "Answer": "Plastic bottles are not used in the video."
    },
    {
        "Q": "Who does the video focus on regarding their work with globular clusters?",
        "Answer": "The video focuses on Harlow Shapley regarding his work with globular clusters."
    }
]
def prompt_miradata_based_text_constraint_openqa(dense_caption, background_caption, main_object_caption):
    """Build the (system_prompt, user_prompt) pair for constrained OpenQA generation.

    Same constraint rules as the MCQA variant (question anchored by an action,
    event, or composite feature), but the answer is a complete sentence rather
    than a lettered option. Few-shot examples come from ``openqa_example_pool``.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc. **while using an event, action or composite feature to constrain the question, thereby ensuring answer uniqueness** in long videos.\n\n"
        # BUG FIX: the six lines below previously had no trailing '\n', so the
        # header, rule 1, and its three bullets were concatenated into a single
        # run-on line in the rendered prompt.
        "## CRITICAL RULES:\n"
        "1. **Uniqueness Guarantee**: Each question must include either:\n"
        " - A **specific action** (e.g., 'What does the woman use to cut the ribbon?'), OR\n"
        " - A **specific event** (e.g., 'What falls off the table when the dog bumps into it?'), OR\n"
        " - A **composite feature** (e.g., 'What does the girl in the red dress hold in her hand?').\n"
        "2. **Visual Grounding**: Answers must be verifiable from a single frame or short clip.\n"
        "3. **Description Grounding**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "4. **No Temporal Reasoning**: Avoid questions requiring comparing frames (e.g., 'what happened next?').\n"
        "5. **Focus on Visual Entities**: The question must test the model\u2019s ability to recognize **objects**.\n"
        "6. **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "7. **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n\n"
        "## OUTPUT FORMAT: Format the output as a list of dictionaries with the following keys:\n"
        " - `'Q'`: The question.\n"
        " - `'Answer'`: The correct answer as a complete sentence.\n"
        "\n"
    )
    # Use the OpenQA example pool (complete-sentence answers, no options).
    chosen_examples = random.sample(openqa_example_pool, 3)
    example_chunks = ["## EXAMPLES:\n"]
    for idx, example in enumerate(chosen_examples, start=1):
        example_chunks.append(
            f"{idx}. {{'Q': '{example['Q']}',\n"
            f" 'Answer': '{example['Answer']}'}}\n"
            "\n"
        )
    example_part = "".join(example_chunks)
    system_prompt = task_inst_part + example_part
    user_prompt = (
        "I have provided you with three different aspect descriptions of a specific clip from a long video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- You must use an action, event or composite feature in the question to constrain the question, thereby ensuring answer uniqueness.\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )
    return system_prompt, user_prompt
# NOTE(review): redundant re-import -- `random` is already imported at the top
# of this file. Harmless (Python caches modules), but it could be removed.
import random
def prompt_miradata_based_text_openqa(dense_caption, background_caption, main_object_caption):
    """Build the (system_prompt, user_prompt) pair for unconstrained OpenQA generation.

    The system prompt contains the task instructions, three few-shot examples
    sampled from ``openqa_example_pool``, and question-writing guidelines; the
    answer is expected as a complete sentence.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.\n\n"
        "The answer must be provided as a complete sentence, clearly supported by the visual or narrative content of the video description.\n\n"
        "## INSTRUCTIONS:\n"
        "- **Focus on Visual Entities**: The question must test the model\u2019s ability to recognize **objects**.\n"
        "- **Ground in Visuals**: All answers must be verifiable by pausing a single frame or short clip. Avoid actions, motivations, or temporal reasoning.\n"
        "- **Ground in the Description**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "- **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "- **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n"
        "- **Output Format**: Format the output as a list of dictionaries with the following keys:\n"
        " - `'Q'`: The question.\n"
        " - `'Answer'`: The correct answer as a complete sentence.\n"
        "\n"
    )
    # Use the OpenQA example pool (complete-sentence answers, no options).
    sampled_examples = random.sample(openqa_example_pool, 3)
    example_pieces = ["## EXAMPLES:\n"]
    for position, item in enumerate(sampled_examples, start=1):
        example_pieces.append(
            f"{position}. {{'Q': '{item['Q']}',\n"
            f" 'Answer': '{item['Answer']}'}}\n"
            "\n"
        )
    example_part = "".join(example_pieces)
    guidelines_part = (
        "## GUIDELINES FOR CREATING QUESTIONS:\n"
        "- **Specificity**: Ask about singular, clearly defined objects.\n"
        "- **Visual Certainty**: Ensure the correct answer is unambiguous and directly observable in the description.\n"
        "- **Description Grounding**: Base all questions and answers on the video description.\n"
        "- **No Implicit Knowledge**: Avoid questions requiring domain knowledge (e.g., 'What brand is the car?' is invalid unless the logo is visible).\n"
        "- **Complete Sentence Answers**: Always provide the answer as a grammatically correct, complete sentence.\n"
        "\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]"
    )
    system_prompt = task_inst_part + example_part + guidelines_part
    user_prompt = (
        "I have provided you with three different aspect descriptions of a specific clip in a video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )
    return system_prompt, user_prompt