jbilcke-hf committed on
Commit
db2cd8e
·
1 Parent(s): af29e00

make it so that it only returns JSON

Browse files
Files changed (1) hide show
  1. app.py +40 -669
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import warnings
3
  warnings.filterwarnings('ignore')
4
 
@@ -27,7 +26,7 @@ import copy
27
 
28
  import numpy as np
29
  import torch
30
- from PIL import Image, ImageDraw, ImageFont, ImageOps
31
 
32
  # Grounding DINO
33
  import GroundingDINO.groundingdino.datasets.transforms as T
@@ -36,117 +35,6 @@ from GroundingDINO.groundingdino.util import box_ops
36
  from GroundingDINO.groundingdino.util.slconfig import SLConfig
37
  from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
38
 
39
- import cv2
40
- import numpy as np
41
- import matplotlib.pyplot as plt
42
- from lama_cleaner.model_manager import ModelManager
43
- from lama_cleaner.schema import Config as lama_Config
44
-
45
- # segment anything
46
- from segment_anything import build_sam, SamPredictor, SamAutomaticMaskGenerator
47
-
48
- # diffusers
49
- import PIL
50
- import requests
51
- import torch
52
- from io import BytesIO
53
- from diffusers import StableDiffusionInpaintPipeline
54
- from huggingface_hub import hf_hub_download
55
-
56
- from utils import computer_info
57
- # relate anything
58
- from ram_utils import iou, sort_and_deduplicate, relation_classes, MLP, show_anns, ram_show_mask
59
- from ram_train_eval import RamModel,RamPredictor
60
- from mmengine.config import Config as mmengine_Config
61
- from lama_cleaner.helper import (
62
- load_img,
63
- numpy_to_bytes,
64
- resize_max_size,
65
- )
66
-
67
- config_file = 'GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
68
- ckpt_repo_id = "ShilongLiu/GroundingDINO"
69
- ckpt_filenmae = "groundingdino_swint_ogc.pth"
70
- sam_checkpoint = './sam_vit_h_4b8939.pth'
71
- output_dir = "outputs"
72
- device = 'cpu'
73
-
74
- os.makedirs(output_dir, exist_ok=True)
75
- groundingdino_model = None
76
- sam_device = None
77
- sam_model = None
78
- sam_predictor = None
79
- sam_mask_generator = None
80
- sd_pipe = None
81
- lama_cleaner_model= None
82
- ram_model = None
83
-
84
- def get_sam_vit_h_4b8939():
85
- if not os.path.exists('./sam_vit_h_4b8939.pth'):
86
- logger.info(f"get sam_vit_h_4b8939.pth...")
87
- result = subprocess.run(['wget', 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth'], check=True)
88
- print(f'wget sam_vit_h_4b8939.pth result = {result}')
89
-
90
- def load_model_hf(model_config_path, repo_id, filename, device='cpu'):
91
- args = SLConfig.fromfile(model_config_path)
92
- model = build_model(args)
93
- args.device = device
94
-
95
- cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
96
- checkpoint = torch.load(cache_file, map_location=device)
97
- log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
98
- print("Model loaded from {} \n => {}".format(cache_file, log))
99
- _ = model.eval()
100
- return model
101
-
102
- def plot_boxes_to_image(image_pil, tgt):
103
- H, W = tgt["size"]
104
- boxes = tgt["boxes"]
105
- labels = tgt["labels"]
106
- assert len(boxes) == len(labels), "boxes and labels must have same length"
107
-
108
- draw = ImageDraw.Draw(image_pil)
109
- mask = Image.new("L", image_pil.size, 0)
110
- mask_draw = ImageDraw.Draw(mask)
111
-
112
- # draw boxes and masks
113
- for box, label in zip(boxes, labels):
114
- # from 0..1 to 0..W, 0..H
115
- box = box * torch.Tensor([W, H, W, H])
116
- # from xywh to xyxy
117
- box[:2] -= box[2:] / 2
118
- box[2:] += box[:2]
119
- # random color
120
- color = tuple(np.random.randint(0, 255, size=3).tolist())
121
- # draw
122
- x0, y0, x1, y1 = box
123
- x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
124
-
125
- draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
126
- # draw.text((x0, y0), str(label), fill=color)
127
-
128
- font = ImageFont.load_default()
129
- if hasattr(font, "getbbox"):
130
- bbox = draw.textbbox((x0, y0), str(label), font)
131
- else:
132
- w, h = draw.textsize(str(label), font)
133
- bbox = (x0, y0, w + x0, y0 + h)
134
- # bbox = draw.textbbox((x0, y0), str(label))
135
- draw.rectangle(bbox, fill=color)
136
-
137
- try:
138
- font = os.path.join(cv2.__path__[0],'qt','fonts','DejaVuSans.ttf')
139
- font_size = 36
140
- new_font = ImageFont.truetype(font, font_size)
141
-
142
- draw.text((x0+2, y0+2), str(label), font=new_font, fill="white")
143
- except Exception as e:
144
- pass
145
-
146
- mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
147
-
148
-
149
- return image_pil, mask
150
 
151
  def load_image(image_path):
152
  # # load image
@@ -165,15 +53,6 @@ def load_image(image_path):
165
  image, _ = transform(image_pil, None) # 3, h, w
166
  return image_pil, image
167
 
168
- def load_model(model_config_path, model_checkpoint_path, device):
169
- args = SLConfig.fromfile(model_config_path)
170
- args.device = device
171
- model = build_model(args)
172
- checkpoint = torch.load(model_checkpoint_path, map_location=device) #"cpu")
173
- load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
174
- print(load_res)
175
- _ = model.eval()
176
- return model
177
 
178
  def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, device="cpu"):
179
  caption = caption.lower()
@@ -210,500 +89,28 @@ def get_grounding_output(model, image, caption, box_threshold, text_threshold, w
210
 
211
  return boxes_filt, pred_phrases
212
 
213
- def show_mask(mask, ax, random_color=False):
214
- if random_color:
215
- color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
216
- else:
217
- color = np.array([30/255, 144/255, 255/255, 0.6])
218
- h, w = mask.shape[-2:]
219
- mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
220
- ax.imshow(mask_image)
221
-
222
- def show_box(box, ax, label):
223
- x0, y0 = box[0], box[1]
224
- w, h = box[2] - box[0], box[3] - box[1]
225
- ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
226
- ax.text(x0, y0, label)
227
-
228
- def xywh_to_xyxy(box, sizeW, sizeH):
229
- if isinstance(box, list):
230
- box = torch.Tensor(box)
231
- box = box * torch.Tensor([sizeW, sizeH, sizeW, sizeH])
232
- box[:2] -= box[2:] / 2
233
- box[2:] += box[:2]
234
- box = box.numpy()
235
- return box
236
-
237
- def mask_extend(img, box, extend_pixels=10, useRectangle=True):
238
- box[0] = int(box[0])
239
- box[1] = int(box[1])
240
- box[2] = int(box[2])
241
- box[3] = int(box[3])
242
- region = img.crop(tuple(box))
243
- new_width = box[2] - box[0] + 2*extend_pixels
244
- new_height = box[3] - box[1] + 2*extend_pixels
245
-
246
- region_BILINEAR = region.resize((int(new_width), int(new_height)))
247
- if useRectangle:
248
- region_draw = ImageDraw.Draw(region_BILINEAR)
249
- region_draw.rectangle((0, 0, new_width, new_height), fill=(255, 255, 255))
250
- img.paste(region_BILINEAR, (int(box[0]-extend_pixels), int(box[1]-extend_pixels)))
251
- return img
252
-
253
- def mix_masks(imgs):
254
- re_img = 1 - np.asarray(imgs[0].convert("1"))
255
- for i in range(len(imgs)-1):
256
- re_img = np.multiply(re_img, 1 - np.asarray(imgs[i+1].convert("1")))
257
- re_img = 1 - re_img
258
- return Image.fromarray(np.uint8(255*re_img))
259
-
260
- def set_device():
261
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
262
- print(f'device={device}')
263
-
264
- def load_groundingdino_model():
265
- # initialize groundingdino model
266
- global groundingdino_model
267
- logger.info(f"initialize groundingdino model...")
268
- groundingdino_model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)
269
-
270
- def load_sam_model():
271
- # initialize SAM
272
- global sam_model, sam_predictor, sam_mask_generator, sam_device
273
- logger.info(f"initialize SAM model...")
274
- sam_device = device
275
- sam_model = build_sam(checkpoint=sam_checkpoint).to(sam_device)
276
- sam_predictor = SamPredictor(sam_model)
277
- sam_mask_generator = SamAutomaticMaskGenerator(sam_model)
278
-
279
- def load_sd_model():
280
- # initialize stable-diffusion-inpainting
281
- global sd_pipe
282
- logger.info(f"initialize stable-diffusion-inpainting...")
283
- sd_pipe = None
284
- if os.environ.get('IS_MY_DEBUG') is None:
285
- sd_pipe = StableDiffusionInpaintPipeline.from_pretrained(
286
- "runwayml/stable-diffusion-inpainting",
287
- # revision="fp16",
288
- # "stabilityai/stable-diffusion-2-inpainting",
289
- torch_dtype=torch.float16,
290
- )
291
- sd_pipe = sd_pipe.to(device)
292
-
293
- def load_lama_cleaner_model():
294
- # initialize lama_cleaner
295
- global lama_cleaner_model
296
- logger.info(f"initialize lama_cleaner...")
297
-
298
- lama_cleaner_model = ModelManager(
299
- name='lama',
300
- device='cpu', # device,
301
- )
302
-
303
- def lama_cleaner_process(image, mask, cleaner_size_limit=1080):
304
- ori_image = image
305
- if mask.shape[0] == image.shape[1] and mask.shape[1] == image.shape[0] and mask.shape[0] != mask.shape[1]:
306
- # rotate image
307
- ori_image = np.transpose(image[::-1, ...][:, ::-1], axes=(1, 0, 2))[::-1, ...]
308
- image = ori_image
309
-
310
- original_shape = ori_image.shape
311
- interpolation = cv2.INTER_CUBIC
312
-
313
- size_limit = cleaner_size_limit
314
- if size_limit == -1:
315
- size_limit = max(image.shape)
316
- else:
317
- size_limit = int(size_limit)
318
-
319
- config = lama_Config(
320
- ldm_steps=25,
321
- ldm_sampler='plms',
322
- zits_wireframe=True,
323
- hd_strategy='Original',
324
- hd_strategy_crop_margin=196,
325
- hd_strategy_crop_trigger_size=1280,
326
- hd_strategy_resize_limit=2048,
327
- prompt='',
328
- use_croper=False,
329
- croper_x=0,
330
- croper_y=0,
331
- croper_height=512,
332
- croper_width=512,
333
- sd_mask_blur=5,
334
- sd_strength=0.75,
335
- sd_steps=50,
336
- sd_guidance_scale=7.5,
337
- sd_sampler='ddim',
338
- sd_seed=42,
339
- cv2_flag='INPAINT_NS',
340
- cv2_radius=5,
341
- )
342
-
343
- if config.sd_seed == -1:
344
- config.sd_seed = random.randint(1, 999999999)
345
-
346
- # logger.info(f"Origin image shape_0_: {original_shape} / {size_limit}")
347
- image = resize_max_size(image, size_limit=size_limit, interpolation=interpolation)
348
- # logger.info(f"Resized image shape_1_: {image.shape}")
349
-
350
- # logger.info(f"mask image shape_0_: {mask.shape} / {type(mask)}")
351
- mask = resize_max_size(mask, size_limit=size_limit, interpolation=interpolation)
352
- # logger.info(f"mask image shape_1_: {mask.shape} / {type(mask)}")
353
-
354
- res_np_img = lama_cleaner_model(image, mask, config)
355
- torch.cuda.empty_cache()
356
-
357
- image = Image.open(io.BytesIO(numpy_to_bytes(res_np_img, 'png')))
358
- return image
359
-
360
- class Ram_Predictor(RamPredictor):
361
- def __init__(self, config, device='cpu'):
362
- self.config = config
363
- self.device = torch.device(device)
364
- self._build_model()
365
-
366
- def _build_model(self):
367
- self.model = RamModel(**self.config.model).to(self.device)
368
- if self.config.load_from is not None:
369
- self.model.load_state_dict(torch.load(self.config.load_from, map_location=self.device))
370
- self.model.train()
371
-
372
- def load_ram_model():
373
- # load ram model
374
- global ram_model
375
- model_path = "./checkpoints/ram_epoch12.pth"
376
- ram_config = dict(
377
- model=dict(
378
- pretrained_model_name_or_path='bert-base-uncased',
379
- load_pretrained_weights=False,
380
- num_transformer_layer=2,
381
- input_feature_size=256,
382
- output_feature_size=768,
383
- cls_feature_size=512,
384
- num_relation_classes=56,
385
- pred_type='attention',
386
- loss_type='multi_label_ce',
387
- ),
388
- load_from=model_path,
389
- )
390
- ram_config = mmengine_Config(ram_config)
391
- ram_model = Ram_Predictor(ram_config, device)
392
-
393
- # visualization
394
- def draw_selected_mask(mask, draw):
395
- color = (255, 0, 0, 153)
396
- nonzero_coords = np.transpose(np.nonzero(mask))
397
- for coord in nonzero_coords:
398
- draw.point(coord[::-1], fill=color)
399
 
400
- def draw_object_mask(mask, draw):
401
- color = (0, 0, 255, 153)
402
- nonzero_coords = np.transpose(np.nonzero(mask))
403
- for coord in nonzero_coords:
404
- draw.point(coord[::-1], fill=color)
405
 
406
- def create_title_image(word1, word2, word3, width, font_path='./assets/OpenSans-Bold.ttf'):
407
- # Define the colors to use for each word
408
- color_red = (255, 0, 0)
409
- color_black = (0, 0, 0)
410
- color_blue = (0, 0, 255)
411
 
412
- # Define the initial font size and spacing between words
413
- font_size = 40
414
 
415
- # Create a new image with the specified width and white background
416
- image = Image.new('RGB', (width, 60), (255, 255, 255))
417
 
418
- try:
419
- # Load the specified font
420
- font = ImageFont.truetype(font_path, font_size)
421
-
422
- # Keep increasing the font size until all words fit within the desired width
423
- while True:
424
- # Create a draw object for the image
425
- draw = ImageDraw.Draw(image)
426
-
427
- word_spacing = font_size / 2
428
- # Draw each word in the appropriate color
429
- x_offset = word_spacing
430
- draw.text((x_offset, 0), word1, color_red, font=font)
431
- x_offset += font.getsize(word1)[0] + word_spacing
432
- draw.text((x_offset, 0), word2, color_black, font=font)
433
- x_offset += font.getsize(word2)[0] + word_spacing
434
- draw.text((x_offset, 0), word3, color_blue, font=font)
435
-
436
- word_sizes = [font.getsize(word) for word in [word1, word2, word3]]
437
- total_width = sum([size[0] for size in word_sizes]) + word_spacing * 3
438
-
439
- # Stop increasing font size if the image is within the desired width
440
- if total_width <= width:
441
- break
442
-
443
- # Increase font size and reset the draw object
444
- font_size -= 1
445
- image = Image.new('RGB', (width, 50), (255, 255, 255))
446
- font = ImageFont.truetype(font_path, font_size)
447
- draw = None
448
- except Exception as e:
449
- pass
450
-
451
- return image
452
-
453
- def concatenate_images_vertical(image1, image2):
454
- # Get the dimensions of the two images
455
- width1, height1 = image1.size
456
- width2, height2 = image2.size
457
-
458
- # Create a new image with the combined height and the maximum width
459
- new_image = Image.new('RGBA', (max(width1, width2), height1 + height2))
460
-
461
- # Paste the first image at the top of the new image
462
- new_image.paste(image1, (0, 0))
463
-
464
- # Paste the second image below the first image
465
- new_image.paste(image2, (0, height1))
466
-
467
- return new_image
468
-
469
- def relate_anything(input_image, k):
470
- logger.info(f'relate_anything_1_{input_image.size}_')
471
- w, h = input_image.size
472
- max_edge = 1500
473
- if w > max_edge or h > max_edge:
474
- ratio = max(w, h) / max_edge
475
- new_size = (int(w / ratio), int(h / ratio))
476
- input_image.thumbnail(new_size)
477
-
478
- logger.info(f'relate_anything_2_')
479
- # load image
480
- pil_image = input_image.convert('RGBA')
481
- image = np.array(input_image)
482
- sam_masks = sam_mask_generator.generate(image)
483
- filtered_masks = sort_and_deduplicate(sam_masks)
484
-
485
- logger.info(f'relate_anything_3_')
486
- feat_list = []
487
- for fm in filtered_masks:
488
- feat = torch.Tensor(fm['feat']).unsqueeze(0).unsqueeze(0).to(device)
489
- feat_list.append(feat)
490
- feat = torch.cat(feat_list, dim=1).to(device)
491
- matrix_output, rel_triplets = ram_model.predict(feat)
492
-
493
- logger.info(f'relate_anything_4_')
494
- pil_image_list = []
495
- for i, rel in enumerate(rel_triplets[:k]):
496
- s,o,r = int(rel[0]),int(rel[1]),int(rel[2])
497
- relation = relation_classes[r]
498
-
499
- mask_image = Image.new('RGBA', pil_image.size, color=(0, 0, 0, 0))
500
- mask_draw = ImageDraw.Draw(mask_image)
501
-
502
- draw_selected_mask(filtered_masks[s]['segmentation'], mask_draw)
503
- draw_object_mask(filtered_masks[o]['segmentation'], mask_draw)
504
-
505
- current_pil_image = pil_image.copy()
506
- current_pil_image.alpha_composite(mask_image)
507
-
508
- title_image = create_title_image('Red', relation, 'Blue', current_pil_image.size[0])
509
- concate_pil_image = concatenate_images_vertical(current_pil_image, title_image)
510
- pil_image_list.append(concate_pil_image)
511
-
512
- logger.info(f'relate_anything_5_{len(pil_image_list)}')
513
- return pil_image_list
514
-
515
- mask_source_draw = "draw a mask on input image"
516
- mask_source_segment = "type what to detect below"
517
-
518
- def run_anything_task(input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold,
519
- iou_threshold, inpaint_mode, mask_source_radio, remove_mode, remove_mask_extend, num_relation, cleaner_size_limit=1080):
520
- if (task_type == 'relate anything'):
521
- output_images = relate_anything(input_image['image'], num_relation)
522
- return output_images, gr.Gallery.update(label='relate images')
523
-
524
- text_prompt = text_prompt.strip()
525
- if not ((task_type == 'inpainting' or task_type == 'remove') and mask_source_radio == mask_source_draw):
526
- if text_prompt == '':
527
- return [], gr.Gallery.update(label='Detection prompt is not found!😂😂😂😂')
528
-
529
- if input_image is None:
530
- return [], gr.Gallery.update(label='Please upload a image!😂😂😂😂')
531
-
532
- file_temp = int(time.time())
533
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}/{inpaint_mode}/[{mask_source_radio}]/{remove_mode}/{remove_mask_extend}_[{text_prompt}]/[{inpaint_prompt}]___1_')
534
-
535
- output_images = []
536
-
537
- # load image
538
- if mask_source_radio == mask_source_draw:
539
- input_mask_pil = input_image['mask']
540
- input_mask = np.array(input_mask_pil.convert("L"))
541
-
542
- if isinstance(input_image, dict):
543
- image_pil, image = load_image(input_image['image'].convert("RGB"))
544
- input_img = input_image['image']
545
- output_images.append(input_image['image'])
546
- else:
547
- image_pil, image = load_image(input_image.convert("RGB"))
548
- input_img = input_image
549
- output_images.append(input_image)
550
-
551
- size = image_pil.size
552
-
553
- # run grounding dino model
554
- if (task_type == 'inpainting' or task_type == 'remove') and mask_source_radio == mask_source_draw:
555
- pass
556
- else:
557
- groundingdino_device = 'cpu'
558
- if device != 'cpu':
559
- try:
560
- from groundingdino import _C
561
- groundingdino_device = 'cuda:0'
562
- except:
563
- warnings.warn("Failed to load custom C++ ops. Running on CPU mode Only in groundingdino!")
564
-
565
- boxes_filt, pred_phrases = get_grounding_output(
566
- groundingdino_model, image, text_prompt, box_threshold, text_threshold, device=groundingdino_device
567
- )
568
- if boxes_filt.size(0) == 0:
569
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_[{text_prompt}]_1_[No objects detected, please try others.]_')
570
- return [], gr.Gallery.update(label='No objects detected, please try others.😂😂😂😂')
571
- boxes_filt_ori = copy.deepcopy(boxes_filt)
572
-
573
- pred_dict = {
574
- "boxes": boxes_filt,
575
- "size": [size[1], size[0]], # H,W
576
- "labels": pred_phrases,
577
- }
578
-
579
- image_with_box = plot_boxes_to_image(copy.deepcopy(image_pil), pred_dict)[0]
580
- output_images.append(image_with_box)
581
-
582
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_2_')
583
- if task_type == 'segment' or ((task_type == 'inpainting' or task_type == 'remove') and mask_source_radio == mask_source_segment):
584
- image = np.array(input_img)
585
- sam_predictor.set_image(image)
586
-
587
- H, W = size[1], size[0]
588
- for i in range(boxes_filt.size(0)):
589
- boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
590
- boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
591
- boxes_filt[i][2:] += boxes_filt[i][:2]
592
-
593
- boxes_filt = boxes_filt.to(sam_device)
594
- transformed_boxes = sam_predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
595
-
596
- masks, _, _, _ = sam_predictor.predict_torch(
597
- point_coords = None,
598
- point_labels = None,
599
- boxes = transformed_boxes,
600
- multimask_output = False,
601
- )
602
- # masks: [9, 1, 512, 512]
603
- assert sam_checkpoint, 'sam_checkpoint is not found!'
604
- # draw output image
605
- plt.figure(figsize=(10, 10))
606
- plt.imshow(image)
607
- for mask in masks:
608
- show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
609
- for box, label in zip(boxes_filt, pred_phrases):
610
- show_box(box.cpu().numpy(), plt.gca(), label)
611
- plt.axis('off')
612
- image_path = os.path.join(output_dir, f"grounding_seg_output_{file_temp}.jpg")
613
- plt.savefig(image_path, bbox_inches="tight")
614
- segment_image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
615
- os.remove(image_path)
616
- output_images.append(segment_image_result)
617
-
618
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_3_')
619
- if task_type == 'detection' or task_type == 'segment':
620
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_9_')
621
- return output_images, gr.Gallery.update(label='result images')
622
- elif task_type == 'inpainting' or task_type == 'remove':
623
- if inpaint_prompt.strip() == '' and mask_source_radio == mask_source_segment:
624
- task_type = 'remove'
625
-
626
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_4_')
627
- if mask_source_radio == mask_source_draw:
628
- mask_pil = input_mask_pil
629
- mask = input_mask
630
- else:
631
- masks_ori = copy.deepcopy(masks)
632
- if inpaint_mode == 'merge':
633
- masks = torch.sum(masks, dim=0).unsqueeze(0)
634
- masks = torch.where(masks > 0, True, False)
635
- mask = masks[0][0].cpu().numpy()
636
- mask_pil = Image.fromarray(mask)
637
- output_images.append(mask_pil.convert("RGB"))
638
-
639
- if task_type == 'inpainting':
640
- # inpainting pipeline
641
- image_source_for_inpaint = image_pil.resize((512, 512))
642
- image_mask_for_inpaint = mask_pil.resize((512, 512))
643
- image_inpainting = sd_pipe(prompt=inpaint_prompt, image=image_source_for_inpaint, mask_image=image_mask_for_inpaint).images[0]
644
- else:
645
- # remove from mask
646
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_5_')
647
- if mask_source_radio == mask_source_segment:
648
- mask_imgs = []
649
- masks_shape = masks_ori.shape
650
- boxes_filt_ori_array = boxes_filt_ori.numpy()
651
- if inpaint_mode == 'merge':
652
- extend_shape_0 = masks_shape[0]
653
- extend_shape_1 = masks_shape[1]
654
- else:
655
- extend_shape_0 = 1
656
- extend_shape_1 = 1
657
- for i in range(extend_shape_0):
658
- for j in range(extend_shape_1):
659
- mask = masks_ori[i][j].cpu().numpy()
660
- mask_pil = Image.fromarray(mask)
661
-
662
- if remove_mode == 'segment':
663
- useRectangle = False
664
- else:
665
- useRectangle = True
666
-
667
- try:
668
- remove_mask_extend = int(remove_mask_extend)
669
- except:
670
- remove_mask_extend = 10
671
- mask_pil_exp = mask_extend(copy.deepcopy(mask_pil).convert("RGB"),
672
- xywh_to_xyxy(torch.tensor(boxes_filt_ori_array[i]), size[0], size[1]),
673
- extend_pixels=remove_mask_extend, useRectangle=useRectangle)
674
- mask_imgs.append(mask_pil_exp)
675
- mask_pil = mix_masks(mask_imgs)
676
- output_images.append(mask_pil.convert("RGB"))
677
-
678
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_6_')
679
- image_inpainting = lama_cleaner_process(np.array(image_pil), np.array(mask_pil.convert("L")), cleaner_size_limit)
680
- # output_images.append(image_inpainting)
681
 
682
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_7_')
683
- image_inpainting = image_inpainting.resize((image_pil.size[0], image_pil.size[1]))
684
- output_images.append(image_inpainting)
685
- logger.info(f'run_anything_task_[{file_temp}]_{task_type}_9_')
686
- return output_images, gr.Gallery.update(label='result images')
687
- else:
688
- logger.info(f"task_type:{task_type} error!")
689
- logger.info(f'run_anything_task_[{file_temp}]_9_9_')
690
- return output_images, gr.Gallery.update(label='result images')
691
 
692
- def change_radio_display(task_type, mask_source_radio):
693
- text_prompt_visible = True
694
- inpaint_prompt_visible = False
695
- mask_source_radio_visible = False
696
- num_relation_visible = False
697
- if task_type == "inpainting":
698
- inpaint_prompt_visible = True
699
- if task_type == "inpainting" or task_type == "remove":
700
- mask_source_radio_visible = True
701
- if mask_source_radio == mask_source_draw:
702
- text_prompt_visible = False
703
- if task_type == "relate anything":
704
- text_prompt_visible = False
705
- num_relation_visible = True
706
- return gr.Textbox.update(visible=text_prompt_visible), gr.Textbox.update(visible=inpaint_prompt_visible), gr.Radio.update(visible=mask_source_radio_visible), gr.Slider.update(visible=num_relation_visible)
707
 
708
  if __name__ == "__main__":
709
  parser = argparse.ArgumentParser("Grounded SAM demo", add_help=True)
@@ -712,64 +119,28 @@ if __name__ == "__main__":
712
  args = parser.parse_args()
713
  print(f'args = {args}')
714
 
715
- set_device()
716
- get_sam_vit_h_4b8939()
717
- load_groundingdino_model()
718
- load_sam_model()
719
- load_sd_model()
720
- load_lama_cleaner_model()
721
- load_ram_model()
722
-
723
- os.system("pip list")
724
-
725
- block = gr.Blocks().queue()
726
- with block:
727
- with gr.Row():
728
- with gr.Column():
729
- input_image = gr.Image(source='upload', elem_id="image_upload", tool='sketch', type='pil', label="Upload")
730
- task_type = gr.Radio(["detection", "segment", "inpainting", "remove", "relate anything"], value="detection",
731
- label='Task type', visible=True)
732
- mask_source_radio = gr.Radio([mask_source_draw, mask_source_segment],
733
- value=mask_source_segment, label="Mask from",
734
- visible=False)
735
- text_prompt = gr.Textbox(label="Detection Prompt[To detect multiple objects, seperating each name with '.', like this: cat . dog . chair ]", placeholder="Cannot be empty")
736
- inpaint_prompt = gr.Textbox(label="Inpaint Prompt (if this is empty, then remove)", visible=False)
737
- num_relation = gr.Slider(label="How many relations do you want to see", minimum=1, maximum=20, value=5, step=1, visible=False)
738
- run_button = gr.Button(label="Run", visible=True)
739
- with gr.Accordion("Advanced options", open=False) as advanced_options:
740
- box_threshold = gr.Slider(
741
- label="Box Threshold", minimum=0.0, maximum=1.0, value=0.3, step=0.001
742
- )
743
- text_threshold = gr.Slider(
744
- label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
745
- )
746
- iou_threshold = gr.Slider(
747
- label="IOU Threshold", minimum=0.0, maximum=1.0, value=0.8, step=0.001
748
- )
749
- inpaint_mode = gr.Radio(["merge", "first"], value="merge", label="inpaint_mode")
750
- with gr.Row():
751
- with gr.Column(scale=1):
752
- remove_mode = gr.Radio(["segment", "rectangle"], value="segment", label='remove mode')
753
- with gr.Column(scale=1):
754
- remove_mask_extend = gr.Textbox(label="remove_mask_extend", value='10')
755
-
756
- with gr.Column():
757
- image_gallery = gr.Gallery(label="result images", show_label=True, elem_id="gallery", visible=True
758
- ).style(preview=True, columns=[5], object_fit="scale-down", height="auto")
759
-
760
- run_button.click(fn=run_anything_task, inputs=[
761
- input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold, iou_threshold, inpaint_mode, mask_source_radio, remove_mode, remove_mask_extend, num_relation], outputs=[image_gallery, image_gallery], show_progress=True, queue=True)
762
-
763
- mask_source_radio.change(fn=change_radio_display, inputs=[task_type, mask_source_radio], outputs=[text_prompt, inpaint_prompt, mask_source_radio, num_relation])
764
- task_type.change(fn=change_radio_display, inputs=[task_type, mask_source_radio], outputs=[text_prompt, inpaint_prompt, mask_source_radio, num_relation])
765
-
766
- DESCRIPTION = f'### This demo from [Grounded-Segment-Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything). <br>'
767
- DESCRIPTION += f'RAM from [RelateAnything](https://github.com/Luodian/RelateAnything). <br>'
768
- DESCRIPTION += f'Remove(cleaner) from [lama-cleaner](https://github.com/Sanster/lama-cleaner). <br>'
769
- DESCRIPTION += f'Thanks for their excellent work.'
770
- DESCRIPTION += f'<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. \
771
- <a href="https://huggingface.co/spaces/yizhangliu/Grounded-Segment-Anything?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
772
- gr.Markdown(DESCRIPTION)
773
 
774
- computer_info()
775
- block.launch(server_name='0.0.0.0', debug=args.debug, share=args.share)
 
 
1
  import warnings
2
  warnings.filterwarnings('ignore')
3
 
 
26
 
27
  import numpy as np
28
  import torch
29
+ from PIL import Image
30
 
31
  # Grounding DINO
32
  import GroundingDINO.groundingdino.datasets.transforms as T
 
35
  from GroundingDINO.groundingdino.util.slconfig import SLConfig
36
  from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  def load_image(image_path):
40
  # # load image
 
53
  image, _ = transform(image_pil, None) # 3, h, w
54
  return image_pil, image
55
 
 
 
 
 
 
 
 
 
 
56
 
57
  def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, device="cpu"):
58
  caption = caption.lower()
 
89
 
90
  return boxes_filt, pred_phrases
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
# Loaded Grounding DINO models keyed by (config, repo, checkpoint, device), so the
# checkpoint is fetched and deserialized once instead of on every request.
_GROUNDING_MODEL_CACHE = {}


def _load_grounding_model(config_file, ckpt_repo_id, ckpt_filename, device="cpu"):
    """Build the Grounding DINO model and load its weights from the Hugging Face Hub.

    Args:
        config_file: Path to the Grounding DINO config file read by ``SLConfig``.
        ckpt_repo_id: Hub repository that hosts the checkpoint.
        ckpt_filename: Name of the checkpoint file inside that repository.
        device: Device used as ``map_location`` when loading the checkpoint.

    Returns:
        The model in eval mode. Repeated calls with the same arguments return
        the same cached instance.
    """
    # Imported locally so this block does not depend on a module-level
    # huggingface_hub import being present in the file.
    from huggingface_hub import hf_hub_download

    cache_key = (config_file, ckpt_repo_id, ckpt_filename, device)
    model = _GROUNDING_MODEL_CACHE.get(cache_key)
    if model is None:
        model_args = SLConfig.fromfile(config_file)
        model_args.device = device
        # NOTE(review): build_model is expected to be imported at module level
        # (GroundingDINO.groundingdino.models); that import line is not visible
        # in this view -- confirm.
        model = build_model(model_args)
        checkpoint_path = hf_hub_download(repo_id=ckpt_repo_id, filename=ckpt_filename)
        checkpoint = torch.load(checkpoint_path, map_location=device)
        # strict=False: the checkpoint may not match the model's keys exactly.
        model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
        model.eval()
        _GROUNDING_MODEL_CACHE[cache_key] = model
    return model


def run_inference(input_image, text_prompt, box_threshold, text_threshold, config_file, ckpt_repo_id, ckpt_filenmae):
    """Run Grounding DINO on an image and return the detections as JSON-ready data.

    Args:
        input_image: Image to analyse. NumPy arrays are converted to PIL images
            and PIL images are converted to RGB; any other value is passed to
            ``load_image`` unchanged.
        text_prompt: Detection prompt handed to ``get_grounding_output``.
        box_threshold: Box threshold handed to ``get_grounding_output``.
        text_threshold: Text threshold handed to ``get_grounding_output``.
        config_file: Path to the Grounding DINO config file.
        ckpt_repo_id: Hub repository that hosts the checkpoint.
        ckpt_filenmae: Checkpoint file name (parameter name kept as-is for
            backward compatibility).

    Returns:
        A list of ``{"box": [...], "label": ...}`` dicts, one per detection.
        Boxes are returned exactly as produced by ``get_grounding_output``
        (presumably normalized cx, cy, w, h -- TODO confirm).

    Raises:
        ValueError: If no image was supplied.
    """
    if input_image is None:
        raise ValueError("input_image is required")

    # Hand load_image an RGB PIL image whenever the input allows it.
    if isinstance(input_image, np.ndarray):
        input_image = Image.fromarray(input_image)
    if isinstance(input_image, Image.Image):
        input_image = input_image.convert("RGB")

    # Load (or reuse) the Grounding DINO model.
    model = _load_grounding_model(config_file, ckpt_repo_id, ckpt_filenmae)

    # Only the transformed tensor is needed; the PIL copy is discarded.
    _, image = load_image(input_image)

    # Run the object detection and grounding model.
    boxes, labels = get_grounding_output(model, image, text_prompt, box_threshold, text_threshold)

    # Convert tensors to plain lists so the result is JSON-serializable.
    return [{"box": box.tolist(), "label": label} for box, label in zip(boxes, labels)]
 
 
 
 
 
 
 
 
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
  if __name__ == "__main__":
116
  parser = argparse.ArgumentParser("Grounded SAM demo", add_help=True)
 
119
  args = parser.parse_args()
120
  print(f'args = {args}')
121
 
122
# These statements continue the body of the `if __name__ == "__main__":` guard;
# they rely on `args` parsed above and on `run_inference` defined in this file.

# Grounding DINO config and the Hub checkpoint it is loaded from.
model_config_file = 'GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
model_ckpt_repo_id = "ShilongLiu/GroundingDINO"
model_ckpt_filename = "groundingdino_swint_ogc.pth"


def inference_func(input_image, text_prompt, box_threshold=0.3, text_threshold=0.25):
    """Gradio callback: detect the prompted objects and return them as JSON data.

    Args:
        input_image: Image supplied by the Gradio image input.
        text_prompt: Detection prompt typed by the user.
        box_threshold: Box threshold forwarded to ``run_inference``.
        text_threshold: Text threshold forwarded to ``run_inference``.

    Returns:
        Whatever ``run_inference`` returns (a list of box/label dicts).
    """
    return run_inference(
        input_image,
        text_prompt,
        box_threshold,
        text_threshold,
        model_config_file,
        model_ckpt_repo_id,
        model_ckpt_filename,
    )


# Create the Gradio interface for the model.
interface = gr.Interface(
    fn=inference_func,
    inputs=[
        # PIL input, as the image loader was previously fed PIL images.
        gr.inputs.Image(type="pil", label="Input Image"),
        gr.inputs.Textbox(label="Detection Prompt"),
    ],
    # The callback returns a list of dicts, so expose it as JSON.
    outputs=gr.outputs.JSON(),
    title="Object Detection and Grounding",
    description="A Gradio app to detect objects in an image and ground them to captions using Grounding DINO.",
)

# Launch the interface; server options belong to launch(), not to Interface().
interface.launch(server_name='0.0.0.0', debug=args.debug, share=args.share)