jbilcke-hf committed on
Commit
b7ab178
·
1 Parent(s): add3cec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -242
app.py CHANGED
@@ -466,79 +466,22 @@ def concatenate_images_vertical(image1, image2):
466
 
467
  return new_image
468
 
469
- def relate_anything(input_image, k):
470
- logger.info(f'relate_anything_1_{input_image.size}_')
471
- w, h = input_image.size
472
- max_edge = 1500
473
- if w > max_edge or h > max_edge:
474
- ratio = max(w, h) / max_edge
475
- new_size = (int(w / ratio), int(h / ratio))
476
- input_image.thumbnail(new_size)
477
-
478
- logger.info(f'relate_anything_2_')
479
- # load image
480
- pil_image = input_image.convert('RGBA')
481
- image = np.array(input_image)
482
- sam_masks = sam_mask_generator.generate(image)
483
- filtered_masks = sort_and_deduplicate(sam_masks)
484
-
485
- logger.info(f'relate_anything_3_')
486
- feat_list = []
487
- for fm in filtered_masks:
488
- feat = torch.Tensor(fm['feat']).unsqueeze(0).unsqueeze(0).to(device)
489
- feat_list.append(feat)
490
- feat = torch.cat(feat_list, dim=1).to(device)
491
- matrix_output, rel_triplets = ram_model.predict(feat)
492
-
493
- logger.info(f'relate_anything_4_')
494
- pil_image_list = []
495
- for i, rel in enumerate(rel_triplets[:k]):
496
- s,o,r = int(rel[0]),int(rel[1]),int(rel[2])
497
- relation = relation_classes[r]
498
-
499
- mask_image = Image.new('RGBA', pil_image.size, color=(0, 0, 0, 0))
500
- mask_draw = ImageDraw.Draw(mask_image)
501
-
502
- draw_selected_mask(filtered_masks[s]['segmentation'], mask_draw)
503
- draw_object_mask(filtered_masks[o]['segmentation'], mask_draw)
504
-
505
- current_pil_image = pil_image.copy()
506
- current_pil_image.alpha_composite(mask_image)
507
-
508
- title_image = create_title_image('Red', relation, 'Blue', current_pil_image.size[0])
509
- concate_pil_image = concatenate_images_vertical(current_pil_image, title_image)
510
- pil_image_list.append(concate_pil_image)
511
-
512
- logger.info(f'relate_anything_5_{len(pil_image_list)}')
513
- return pil_image_list
514
 
515
- mask_source_draw = "draw a mask on input image"
516
- mask_source_segment = "type what to detect below"
517
-
518
- def run_anything_task(input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold,
519
- iou_threshold, inpaint_mode, mask_source_radio, remove_mode, remove_mask_extend, num_relation, cleaner_size_limit=1080):
520
- if (task_type == 'relate anything'):
521
- output_images = relate_anything(input_image['image'], num_relation)
522
- return output_images, gr.Gallery.update(label='relate images')
523
 
524
  text_prompt = text_prompt.strip()
525
- if not ((task_type == 'inpainting' or task_type == 'remove') and mask_source_radio == mask_source_draw):
526
- if text_prompt == '':
527
- return [], gr.Gallery.update(label='Detection prompt is not found!😂😂😂😂')
528
 
529
  if input_image is None:
530
  return [], gr.Gallery.update(label='Please upload a image!😂😂😂😂')
531
 
532
  file_temp = int(time.time())
533
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}/{inpaint_mode}/[{mask_source_radio}]/{remove_mode}/{remove_mask_extend}_[{text_prompt}]/[{inpaint_prompt}]___1_')
534
 
535
  output_images = []
536
 
537
  # load image
538
- if mask_source_radio == mask_source_draw:
539
- input_mask_pil = input_image['mask']
540
- input_mask = np.array(input_mask_pil.convert("L"))
541
-
542
  if isinstance(input_image, dict):
543
  image_pil, image = load_image(input_image['image'].convert("RGB"))
544
  input_img = input_image['image']
@@ -550,166 +493,38 @@ def run_anything_task(input_image, text_prompt, task_type, inpaint_prompt, box_t
550
 
551
  size = image_pil.size
552
 
553
- pred_dict = {
554
- }
555
-
556
  # run grounding dino model
557
- if (task_type == 'inpainting' or task_type == 'remove') and mask_source_radio == mask_source_draw:
558
- pass
559
- else:
560
- groundingdino_device = 'cpu'
561
- if device != 'cpu':
562
- try:
563
- from groundingdino import _C
564
- groundingdino_device = 'cuda:0'
565
- except:
566
- warnings.warn("Failed to load custom C++ ops. Running on CPU mode Only in groundingdino!")
567
-
568
- boxes_filt, pred_phrases = get_grounding_output(
569
- groundingdino_model, image, text_prompt, box_threshold, text_threshold, device=groundingdino_device
570
- )
571
- if boxes_filt.size(0) == 0:
572
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_[{text_prompt}]_1_[No objects detected, please try others.]_')
573
- return [], gr.Gallery.update(label='No objects detected, please try others.😂😂😂😂')
574
- boxes_filt_ori = copy.deepcopy(boxes_filt)
575
-
576
- pred_dict = {
577
- "boxes": boxes_filt,
578
- "size": [size[1], size[0]], # H,W
579
- "labels": pred_phrases,
580
- }
581
-
582
- image_with_box = plot_boxes_to_image(copy.deepcopy(image_pil), pred_dict)[0]
583
- output_images.append(image_with_box)
584
-
585
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_2_')
586
- if task_type == 'segment' or ((task_type == 'inpainting' or task_type == 'remove') and mask_source_radio == mask_source_segment):
587
- image = np.array(input_img)
588
- sam_predictor.set_image(image)
589
-
590
- H, W = size[1], size[0]
591
- for i in range(boxes_filt.size(0)):
592
- boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
593
- boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
594
- boxes_filt[i][2:] += boxes_filt[i][:2]
595
-
596
- boxes_filt = boxes_filt.to(sam_device)
597
- transformed_boxes = sam_predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
598
-
599
- masks, _, _, _ = sam_predictor.predict_torch(
600
- point_coords = None,
601
- point_labels = None,
602
- boxes = transformed_boxes,
603
- multimask_output = False,
604
- )
605
- # masks: [9, 1, 512, 512]
606
- assert sam_checkpoint, 'sam_checkpoint is not found!'
607
- # draw output image
608
- plt.figure(figsize=(10, 10))
609
- plt.imshow(image)
610
- for mask in masks:
611
- show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
612
- for box, label in zip(boxes_filt, pred_phrases):
613
- show_box(box.cpu().numpy(), plt.gca(), label)
614
- plt.axis('off')
615
- image_path = os.path.join(output_dir, f"grounding_seg_output_{file_temp}.jpg")
616
- plt.savefig(image_path, bbox_inches="tight")
617
- segment_image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
618
- os.remove(image_path)
619
- output_images.append(segment_image_result)
620
-
621
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_3_')
622
- if task_type == 'detection' or task_type == 'segment':
623
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_9_')
624
- return pred_dict
625
- elif task_type == 'inpainting' or task_type == 'remove':
626
- if inpaint_prompt.strip() == '' and mask_source_radio == mask_source_segment:
627
- task_type = 'remove'
628
-
629
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_4_')
630
- if mask_source_radio == mask_source_draw:
631
- mask_pil = input_mask_pil
632
- mask = input_mask
633
- else:
634
- masks_ori = copy.deepcopy(masks)
635
- if inpaint_mode == 'merge':
636
- masks = torch.sum(masks, dim=0).unsqueeze(0)
637
- masks = torch.where(masks > 0, True, False)
638
- mask = masks[0][0].cpu().numpy()
639
- mask_pil = Image.fromarray(mask)
640
- output_images.append(mask_pil.convert("RGB"))
641
-
642
- if task_type == 'inpainting':
643
- # inpainting pipeline
644
- image_source_for_inpaint = image_pil.resize((512, 512))
645
- image_mask_for_inpaint = mask_pil.resize((512, 512))
646
- image_inpainting = sd_pipe(prompt=inpaint_prompt, image=image_source_for_inpaint, mask_image=image_mask_for_inpaint).images[0]
647
- else:
648
- # remove from mask
649
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_5_')
650
- if mask_source_radio == mask_source_segment:
651
- mask_imgs = []
652
- masks_shape = masks_ori.shape
653
- boxes_filt_ori_array = boxes_filt_ori.numpy()
654
- if inpaint_mode == 'merge':
655
- extend_shape_0 = masks_shape[0]
656
- extend_shape_1 = masks_shape[1]
657
- else:
658
- extend_shape_0 = 1
659
- extend_shape_1 = 1
660
- for i in range(extend_shape_0):
661
- for j in range(extend_shape_1):
662
- mask = masks_ori[i][j].cpu().numpy()
663
- mask_pil = Image.fromarray(mask)
664
-
665
- if remove_mode == 'segment':
666
- useRectangle = False
667
- else:
668
- useRectangle = True
669
-
670
- try:
671
- remove_mask_extend = int(remove_mask_extend)
672
- except:
673
- remove_mask_extend = 10
674
- mask_pil_exp = mask_extend(copy.deepcopy(mask_pil).convert("RGB"),
675
- xywh_to_xyxy(torch.tensor(boxes_filt_ori_array[i]), size[0], size[1]),
676
- extend_pixels=remove_mask_extend, useRectangle=useRectangle)
677
- mask_imgs.append(mask_pil_exp)
678
- mask_pil = mix_masks(mask_imgs)
679
- output_images.append(mask_pil.convert("RGB"))
680
-
681
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_6_')
682
- image_inpainting = lama_cleaner_process(np.array(image_pil), np.array(mask_pil.convert("L")), cleaner_size_limit)
683
- # output_images.append(image_inpainting)
684
-
685
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_7_')
686
- image_inpainting = image_inpainting.resize((image_pil.size[0], image_pil.size[1]))
687
- output_images.append(image_inpainting)
688
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_9_')
689
- return output_images, gr.Gallery.update(label='result images')
690
- else:
691
- logger.info(f"task_type:{task_type} error!")
692
- logger.info(f'run_anything_task_[{file_temp}]_9_9_')
693
- return output_images, gr.Gallery.update(label='result images')
694
-
695
- def change_radio_display(task_type, mask_source_radio):
696
- text_prompt_visible = True
697
- inpaint_prompt_visible = False
698
- mask_source_radio_visible = False
699
- num_relation_visible = False
700
- if task_type == "inpainting":
701
- inpaint_prompt_visible = True
702
- if task_type == "inpainting" or task_type == "remove":
703
- mask_source_radio_visible = True
704
- if mask_source_radio == mask_source_draw:
705
- text_prompt_visible = False
706
- if task_type == "relate anything":
707
- text_prompt_visible = False
708
- num_relation_visible = True
709
- return gr.Textbox.update(visible=text_prompt_visible), gr.Textbox.update(visible=inpaint_prompt_visible), gr.Radio.update(visible=mask_source_radio_visible), gr.Slider.update(visible=num_relation_visible)
710
 
711
  if __name__ == "__main__":
712
- parser = argparse.ArgumentParser("Grounded SAM demo", add_help=True)
713
  parser.add_argument("--debug", action="store_true", help="using debug mode")
714
  parser.add_argument("--share", action="store_true", help="share the app")
715
  args = parser.parse_args()
@@ -730,14 +545,9 @@ if __name__ == "__main__":
730
  with gr.Row():
731
  with gr.Column():
732
  input_image = gr.Image(source='upload', elem_id="image_upload", tool='sketch', type='pil', label="Upload")
733
- task_type = gr.Radio(["detection", "segment", "inpainting", "remove", "relate anything"], value="detection",
734
- label='Task type', visible=True)
735
- mask_source_radio = gr.Radio([mask_source_draw, mask_source_segment],
736
- value=mask_source_segment, label="Mask from",
737
- visible=False)
738
  text_prompt = gr.Textbox(label="Detection Prompt[To detect multiple objects, seperating each name with '.', like this: cat . dog . chair ]", placeholder="Cannot be empty")
739
- inpaint_prompt = gr.Textbox(label="Inpaint Prompt (if this is empty, then remove)", visible=False)
740
- num_relation = gr.Slider(label="How many relations do you want to see", minimum=1, maximum=20, value=5, step=1, visible=False)
741
  run_button = gr.Button(label="Run", visible=True)
742
  with gr.Accordion("Advanced options", open=False) as advanced_options:
743
  box_threshold = gr.Slider(
@@ -749,25 +559,11 @@ if __name__ == "__main__":
749
  iou_threshold = gr.Slider(
750
  label="IOU Threshold", minimum=0.0, maximum=1.0, value=0.8, step=0.001
751
  )
752
- inpaint_mode = gr.Radio(["merge", "first"], value="merge", label="inpaint_mode")
753
- with gr.Row():
754
- with gr.Column(scale=1):
755
- remove_mode = gr.Radio(["segment", "rectangle"], value="segment", label='remove mode')
756
- with gr.Column(scale=1):
757
- remove_mask_extend = gr.Textbox(label="remove_mask_extend", value='10')
758
 
759
  run_button.click(fn=run_anything_task, inputs=[
760
- input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold, iou_threshold, inpaint_mode, mask_source_radio, remove_mode, remove_mask_extend, num_relation], outputs=[gr.outputs.JSON()], show_progress=True, queue=True)
761
-
762
- mask_source_radio.change(fn=change_radio_display, inputs=[task_type, mask_source_radio], outputs=[text_prompt, inpaint_prompt, mask_source_radio, num_relation])
763
- task_type.change(fn=change_radio_display, inputs=[task_type, mask_source_radio], outputs=[text_prompt, inpaint_prompt, mask_source_radio, num_relation])
764
-
765
- DESCRIPTION = f'### This demo from [Grounded-Segment-Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything). <br>'
766
- DESCRIPTION += f'RAM from [RelateAnything](https://github.com/Luodian/RelateAnything). <br>'
767
- DESCRIPTION += f'Remove(cleaner) from [lama-cleaner](https://github.com/Sanster/lama-cleaner). <br>'
768
- DESCRIPTION += f'Thanks for their excellent work.'
769
- DESCRIPTION += f'<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. \
770
- <a href="https://huggingface.co/spaces/yizhangliu/Grounded-Segment-Anything?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
771
  gr.Markdown(DESCRIPTION)
772
 
773
  computer_info()
 
466
 
467
  return new_image
468
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
 
470
+ def run_anything_task(input_image, text_prompt, box_threshold, text_threshold, iou_threshold, cleaner_size_limit=1080):
 
 
 
 
 
 
 
471
 
472
  text_prompt = text_prompt.strip()
473
+ if text_prompt == '':
474
+ return [], gr.Gallery.update(label='Detection prompt is not found!😂😂😂😂')
 
475
 
476
  if input_image is None:
477
  return [], gr.Gallery.update(label='Please upload a image!😂😂😂😂')
478
 
479
  file_temp = int(time.time())
 
480
 
481
  output_images = []
482
 
483
  # load image
484
+
 
 
 
485
  if isinstance(input_image, dict):
486
  image_pil, image = load_image(input_image['image'].convert("RGB"))
487
  input_img = input_image['image']
 
493
 
494
  size = image_pil.size
495
 
 
 
 
496
  # run grounding dino model
497
+ groundingdino_device = 'cpu'
498
+ if device != 'cpu':
499
+ try:
500
+ from groundingdino import _C
501
+ groundingdino_device = 'cuda:0'
502
+ except:
503
+ warnings.warn("Failed to load custom C++ ops. Running on CPU mode Only in groundingdino!")
504
+
505
+ boxes_filt, pred_phrases = get_grounding_output(
506
+ groundingdino_model, image, text_prompt, box_threshold, text_threshold, device=groundingdino_device
507
+ )
508
+ if boxes_filt.size(0) == 0:
509
+ logger.info(f'run_anything_task_[{file_temp}]_[{text_prompt}]_1_[No objects detected, please try others.]_')
510
+ return [], gr.Gallery.update(label='No objects detected, please try others.😂😂😂😂')
511
+ boxes_filt_ori = copy.deepcopy(boxes_filt)
512
+
513
+ pred_dict = {
514
+ "boxes": boxes_filt,
515
+ "size": [size[1], size[0]], # H,W
516
+ "labels": pred_phrases,
517
+ }
518
+
519
+ image_with_box = plot_boxes_to_image(copy.deepcopy(image_pil), pred_dict)[0]
520
+ output_images.append(image_with_box)
521
+
522
+
523
+ return pred_dict
524
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
 
526
  if __name__ == "__main__":
527
+ parser = argparse.ArgumentParser("VideoQuest segmentation module", add_help=True)
528
  parser.add_argument("--debug", action="store_true", help="using debug mode")
529
  parser.add_argument("--share", action="store_true", help="share the app")
530
  args = parser.parse_args()
 
545
  with gr.Row():
546
  with gr.Column():
547
  input_image = gr.Image(source='upload', elem_id="image_upload", tool='sketch', type='pil', label="Upload")
548
+
 
 
 
 
549
  text_prompt = gr.Textbox(label="Detection Prompt[To detect multiple objects, seperating each name with '.', like this: cat . dog . chair ]", placeholder="Cannot be empty")
550
+
 
551
  run_button = gr.Button(label="Run", visible=True)
552
  with gr.Accordion("Advanced options", open=False) as advanced_options:
553
  box_threshold = gr.Slider(
 
559
  iou_threshold = gr.Slider(
560
  label="IOU Threshold", minimum=0.0, maximum=1.0, value=0.8, step=0.001
561
  )
 
 
 
 
 
 
562
 
563
  run_button.click(fn=run_anything_task, inputs=[
564
+ input_image, text_prompt, box_threshold, text_threshold, iou_threshold], outputs=[gr.outputs.JSON()], show_progress=True, queue=True)
565
+
566
+ DESCRIPTION = f'### This space is used by the experimental VideoQuest game. <br> It is based on <a href="https://huggingface.co/spaces/yizhangliu/Grounded-Segment-Anything?duplicate=true">Grounded-Segment-Anything</a>'
 
 
 
 
 
 
 
 
567
  gr.Markdown(DESCRIPTION)
568
 
569
  computer_info()