nielsr (HF Staff) committed · verified
Commit be8df1c · 1 parent: ed62b80

Add pipeline tag, library name, and additional tags


This PR enhances the model card by adding key metadata to improve discoverability and integration on the Hugging Face Hub:

* Adds `pipeline_tag: any-to-any` to reflect the model's versatile multimodal capabilities, including generation and understanding tasks. This will ensure it appears in relevant pipeline filters.
* Adds `library_name: transformers` as the model is compatible with the 🤗 Transformers library, enabling an automated "how to use" widget.
* Adds descriptive `tags` (`multimodal`, `text-to-image`, `image-to-image`, `image-to-text`, `vqa`) to categorize the model's functionality based on its described capabilities.

The existing comprehensive content, including the detailed Python inference example, paper links, and project/code links, remains unchanged.
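For context, here is a minimal sketch of how the added metadata becomes visible to Hub clients. It assumes a recent `huggingface_hub` release in which `list_models` accepts `pipeline_tag` and `tags` filters; the repo id passed to `ModelCard.load` is a placeholder, not the actual repository behind this PR:

```python
from huggingface_hub import HfApi, ModelCard

api = HfApi()

# `pipeline_tag: any-to-any` places the model under the corresponding pipeline
# filter on the Hub; the extra `tags` can narrow the listing further.
for model in api.list_models(pipeline_tag="any-to-any", tags=["multimodal"], limit=5):
    print(model.id, model.pipeline_tag, model.tags)

# The same fields are readable from the model card's YAML front matter.
card = ModelCard.load("your-org/your-model")  # placeholder repo id
print(card.data.pipeline_tag)  # "any-to-any"
print(card.data.library_name)  # "transformers"
print(card.data.tags)          # ["multimodal", "text-to-image", ...]
```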

Files changed (1)
1. README.md (+23 −7)
README.md CHANGED
@@ -1,6 +1,15 @@
 ---
 license: apache-2.0
+pipeline_tag: any-to-any
+library_name: transformers
+tags:
+- multimodal
+- text-to-image
+- image-to-image
+- image-to-text
+- vqa
 ---
+
 <h2 align="center" style="line-height: 25px;">
 Unlocking Aha Moments via Reinforcement Learning: Advancing Collaborative Visual Comprehension and Generation
 </h2>
@@ -128,8 +137,12 @@ def generate_with_refine(
     task_list: List[int] = [1,2,3],
 ):
     prompt = [
-        '<end_of_image>\nLet me think Does this image match the prompt...',
-        '<|end▁of▁sentence|>\nNext, I will draw a new image<begin_of_image>'
+        '
+
+
+Let me think Does this image match the prompt...',
+        '<|end of sentence|>
+Next, I will draw a new image<begin_of_image>'
     ]
     all_imgs_1,embeds_1,attention_mask_1 = [],[],[]
     output_text_ids,selfcheck,attention_mask_txt = [],[],[]
@@ -209,8 +222,8 @@ def generate_with_refine(
     reflect_len = 0
     eos_list = torch.zeros((parallel_size, 1), dtype=torch.int).cuda()
     add_padding = torch.zeros((parallel_size, 1), dtype=torch.int).cuda()
-    eos_token = vl_chat_processor.tokenizer.encode("<|end▁of▁sentence|>")[-1]
-    padding_token = vl_chat_processor.tokenizer.encode("<|▁pad▁|>")[-1]
+    eos_token = vl_chat_processor.tokenizer.encode("<|end of sentence|>")[-1]
+    padding_token = vl_chat_processor.tokenizer.encode("<| pad |>")[-1]
     yes_token = vl_chat_processor.tokenizer.encode("Yes")[-1]
     no_token = vl_chat_processor.tokenizer.encode("No")[-1]
     attn_mask = torch.ones((parallel_size, inputs_embeds.shape[1]), dtype=torch.int).cuda()
@@ -234,7 +247,7 @@ def generate_with_refine(
         dim=-1,
         descending=True)
     probs_sum = torch.cumsum(probs_sort, dim=-1)
-    mask = probs_sum - probs_sort > txt_top_p
+    mask = probs_sum - probs_sort > img_top_p
     probs_sort[mask] = 0.0
     probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
     next_token = torch.multinomial(probs_sort, num_samples=1)
@@ -384,7 +397,9 @@ if __name__ == "__main__":
 
     # You can flexibly modify the code here to perform batched inference.
     allprompts = []
-    # prompt = f'<|User|>: {args.caption}\n\n<|Assistant|>:<begin_of_image>'
+    # prompt = f'<|User|>: {args.caption}
+
+    <|Assistant|>:<begin_of_image>'
     conversation = [
         {
             "role": "<|User|>",
@@ -434,7 +449,8 @@ if __name__ == "__main__":
     reason_data["correct"] = bool(selfcheck[i])
     reason_data["reason"] = vl_chat_processor.tokenizer.decode(output_text_ids[i].cpu().tolist(), skip_special_tokens=True)
     reason_data = json.dumps(reason_data, ensure_ascii=False)
-    f.write(reason_data+'\n')
+    f.write(reason_data+'
+    ')
 
 
     for i in range(args.parallel_size):