nielsr (HF Staff) committed · verified
Commit be8df1c · 1 parent: ed62b80

Add pipeline tag, library name, and additional tags


This PR enhances the model card by adding key metadata to improve discoverability and integration on the Hugging Face Hub:

* Adds `pipeline_tag: any-to-any` to reflect the model's versatile multimodal capabilities, including generation and understanding tasks. This will ensure it appears in relevant pipeline filters.
* Adds `library_name: transformers` as the model is compatible with the 🤗 Transformers library, enabling an automated "how to use" widget.
* Adds descriptive `tags` (`multimodal`, `text-to-image`, `image-to-image`, `image-to-text`, `vqa`) to categorize the model's functionality based on its described capabilities.

The existing comprehensive content, including the detailed Python inference example, paper links, and project/code links, remains unchanged.
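For context, here is a minimal sketch of how the added metadata becomes visible to Hub clients. It assumes a recent `huggingface_hub` release in which `list_models` accepts `pipeline_tag` and `tags` filters; the repo id passed to `ModelCard.load` is a placeholder, not the actual repository behind this PR:

```python
from huggingface_hub import HfApi, ModelCard

api = HfApi()

# `pipeline_tag: any-to-any` places the model under the corresponding pipeline
# filter on the Hub; the extra `tags` can narrow the listing further.
for model in api.list_models(pipeline_tag="any-to-any", tags=["multimodal"], limit=5):
    print(model.id, model.pipeline_tag, model.tags)

# The same fields are readable from the model card's YAML front matter.
card = ModelCard.load("your-org/your-model")  # placeholder repo id
print(card.data.pipeline_tag)  # "any-to-any"
print(card.data.library_name)  # "transformers"
print(card.data.tags)          # ["multimodal", "text-to-image", ...]
```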

Files changed (1)
1. README.md (+23 −7)
README.md CHANGED
@@ -1,6 +1,15 @@
 ---
 license: apache-2.0
+pipeline_tag: any-to-any
+library_name: transformers
+tags:
+- multimodal
+- text-to-image
+- image-to-image
+- image-to-text
+- vqa
 ---
+
 <h2 align="center" style="line-height: 25px;">
 Unlocking Aha Moments via Reinforcement Learning: Advancing Collaborative Visual Comprehension and Generation
 </h2>
@@ -128,8 +137,12 @@ def generate_with_refine(
     task_list: List[int] = [1,2,3],
 ):
     prompt = [
-        '<end_of_image>\nLet me think Does this image match the prompt...',
-        '<|end▁of▁sentence|>\nNext, I will draw a new image<begin_of_image>'
+        '
+
+
+Let me think Does this image match the prompt...',
+        '<|end of sentence|>
+Next, I will draw a new image<begin_of_image>'
     ]
     all_imgs_1,embeds_1,attention_mask_1 = [],[],[]
     output_text_ids,selfcheck,attention_mask_txt = [],[],[]
@@ -209,8 +222,8 @@ def generate_with_refine(
     reflect_len = 0
     eos_list = torch.zeros((parallel_size, 1), dtype=torch.int).cuda()
     add_padding = torch.zeros((parallel_size, 1), dtype=torch.int).cuda()
-    eos_token = vl_chat_processor.tokenizer.encode("<|end▁of▁sentence|>")[-1]
-    padding_token = vl_chat_processor.tokenizer.encode("<|▁pad▁|>")[-1]
+    eos_token = vl_chat_processor.tokenizer.encode("<|end of sentence|>")[-1]
+    padding_token = vl_chat_processor.tokenizer.encode("<| pad |>")[-1]
     yes_token = vl_chat_processor.tokenizer.encode("Yes")[-1]
     no_token = vl_chat_processor.tokenizer.encode("No")[-1]
     attn_mask = torch.ones((parallel_size, inputs_embeds.shape[1]), dtype=torch.int).cuda()
@@ -234,7 +247,7 @@ def generate_with_refine(
         dim=-1,
         descending=True)
     probs_sum = torch.cumsum(probs_sort, dim=-1)
-    mask = probs_sum - probs_sort > txt_top_p
+    mask = probs_sum - probs_sort > img_top_p
     probs_sort[mask] = 0.0
     probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
     next_token = torch.multinomial(probs_sort, num_samples=1)
@@ -384,7 +397,9 @@ if __name__ == "__main__":
 
     # You can flexibly modify the code here to perform batched inference.
     allprompts = []
-    # prompt = f'<|User|>: {args.caption}\n\n<|Assistant|>:<begin_of_image>'
+    # prompt = f'<|User|>: {args.caption}
+
+    <|Assistant|>:<begin_of_image>'
     conversation = [
         {
             "role": "<|User|>",
@@ -434,7 +449,8 @@ if __name__ == "__main__":
     reason_data["correct"] = bool(selfcheck[i])
     reason_data["reason"] = vl_chat_processor.tokenizer.decode(output_text_ids[i].cpu().tolist(), skip_special_tokens=True)
     reason_data = json.dumps(reason_data, ensure_ascii=False)
-    f.write(reason_data+'\n')
+    f.write(reason_data+'
+    ')
 
 
     for i in range(args.parallel_size):