# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Modified by Xueyan Zou (xueyan@cs.wisc.edu), Ziyi Dou (zdou@cs.ucla.edu)
# --------------------------------------------------------
import copy
import itertools
import logging
from collections import OrderedDict

import torch
from pycocotools.cocoeval import COCOeval

import detectron2.utils.comm as comm
from detectron2.evaluation.evaluator import DatasetEvaluator

try:
    # Prefer detectron2's faster COCOeval implementation; fall back to the
    # official pycocotools implementation when it is unavailable.
    from detectron2.evaluation.fast_eval_api import COCOeval_opt
except ImportError:
    COCOeval_opt = COCOeval


class RetrievalEvaluator(DatasetEvaluator):
    """
    Evaluate image-text retrieval using recall@K (R@1, R@5, R@10) in both
    directions: text retrieval (image -> caption) and image retrieval
    (caption -> image). For datasets whose name contains "p2i" or
    "interactive2i", it instead evaluates image-to-image retrieval.
    The metrics range from 0 to 100 (instead of 0 to 1).
    """

    def __init__(
        self,
        dataset_name=None,
        output_dir=None,
        ensemble=False,
        distributed=True,
    ):
        """
        Args:
            dataset_name (str): name of the dataset to be evaluated. If it
                contains "p2i" or "interactive2i", the evaluator runs in
                image-to-image retrieval mode; otherwise it evaluates
                image-text retrieval.
            output_dir (str): optional, an output directory to dump results into.
            ensemble (bool): if True, a second set of image embeddings is
                collected and its similarity scores are averaged with those of
                the first set.
            distributed (bool): if True, will collect results from all ranks and
                run evaluation in the main process.
                Otherwise, will only evaluate the results in the current process.
        """
        self._logger = logging.getLogger(__name__)
        self._dataset_name = dataset_name
        self._output_dir = output_dir
        self._ensemble = ensemble
        self._distributed = distributed

        # Infer the evaluation mode from the dataset name.
        if dataset_name and 'p2i' in dataset_name:
            self.mode = 'patch2image'
        elif dataset_name and 'interactive2i' in dataset_name:
            self.mode = 'interactive2image'
        else:
            self.mode = 'default'

    def reset(self):
        # Buffers filled by process() and consumed by evaluate().
        self._text_embeds = []
        self._image_embeds = []
        self._image_embeds2 = []
        self._text_ids = []
        self._image_ids = []
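
    # Expected schema of each element of `outputs` consumed by process() below,
    # inferred from the field accesses (shapes are assumptions, not confirmed
    # by the source):
    #   output['caption'] = {
    #       'caption_ids':  ids of the captions paired with this image,
    #       'image_ids':    id(s) of this image,
    #       'text_embeds':  tensor of caption embeddings,
    #       'image_embeds': list of image embedding tensors; index 1 is used
    #                       only when ensemble=True,
    #   }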
    def process(self, inputs, outputs):
        """
        Args:
            inputs: the inputs to the model. It is a list of dict. Each dict
                corresponds to an image and contains keys like "height",
                "width", "file_name", "image_id".
            outputs: the outputs of the model. It is a list of dicts, each
                with a "caption" entry holding the caption/image ids and
                their embeddings.
        """
        for output in outputs:
            self._text_ids.extend(output['caption']['caption_ids'])
            self._image_ids.append(output['caption']['image_ids'])
            self._text_embeds.append(output['caption']['text_embeds'])
            self._image_embeds.append(output['caption']['image_embeds'][0])
            if self._ensemble:
                self._image_embeds2.append(output['caption']['image_embeds'][1])

    def evaluate(self, img_ids=None):
        if self.mode == 'default':
            return self.evaluate_default(img_ids)
        elif self.mode in ['patch2image', 'interactive2image']:
            return self.evaluate_p2i(img_ids)
        else:
            raise ValueError("Unknown mode for retrieval evaluation: {}".format(self.mode))

    def evaluate_default(self, img_ids=None):
        """
        Args:
            img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset.
        """
        if self._distributed:
            comm.synchronize()

            def gather(x, move=False):
                # Collect the per-rank lists, flatten them, and optionally move
                # the tensors onto this rank's device.
                x = comm.gather(x)
                x = list(itertools.chain(*x))
                if move:
                    x = [xx.to(self._text_embeds[0].device) for xx in x]
                return x

            text_embeds = gather(self._text_embeds, move=True)
            image_embeds = gather(self._image_embeds, move=True)
            if self._ensemble:
                image_embeds2 = gather(self._image_embeds2, move=True)
            text_ids = gather(self._text_ids)
            image_ids = gather(self._image_ids)
            if not comm.is_main_process():
                return {}
        else:
            text_embeds = self._text_embeds
            image_embeds = self._image_embeds
            if self._ensemble:
                image_embeds2 = self._image_embeds2
            text_ids = self._text_ids
            image_ids = self._image_ids

        if len(text_embeds) == 0:
            self._logger.warning("[RetrievalEvaluator] Did not receive valid predictions.")
            return {}

        iids = torch.tensor(image_ids).view(-1).cuda()
        tiids = torch.tensor(text_ids).view(-1).cuda()
        image_embeds = torch.cat(image_embeds)
        text_embeds = torch.cat(text_embeds)

        # L2-normalize so the dot products below are cosine similarities.
        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
        # scores[i, j]: similarity between image i and caption j.
        scores = image_embeds @ text_embeds.t()
        if self._ensemble:
            image_embeds2 = torch.cat(image_embeds2)
            image_embeds2 = image_embeds2 / image_embeds2.norm(dim=-1, keepdim=True)
            scores2 = image_embeds2 @ text_embeds.t()
            # Average the two score matrices with equal weight.
            scores = scores2 * 0.5 + scores * 0.5

        # Text retrieval (image -> caption): rank captions for each image.
        topk10 = scores.topk(10, dim=1)
        topk5 = scores.topk(5, dim=1)
        topk1 = scores.topk(1, dim=1)
        topk10_iids = tiids[topk10.indices]
        topk5_iids = tiids[topk5.indices]
        topk1_iids = tiids[topk1.indices]
        tr_r10 = (iids.unsqueeze(1) == topk10_iids).float().max(dim=1)[0].mean()
        tr_r5 = (iids.unsqueeze(1) == topk5_iids).float().max(dim=1)[0].mean()
        tr_r1 = (iids.unsqueeze(1) == topk1_iids).float().max(dim=1)[0].mean()

        # Image retrieval (caption -> image): rank images for each caption.
        topk10 = scores.topk(10, dim=0)
        topk5 = scores.topk(5, dim=0)
        topk1 = scores.topk(1, dim=0)
        topk10_iids = iids[topk10.indices]
        topk5_iids = iids[topk5.indices]
        topk1_iids = iids[topk1.indices]
        ir_r10 = (tiids.unsqueeze(0) == topk10_iids).float().max(dim=0)[0].mean()
        ir_r5 = (tiids.unsqueeze(0) == topk5_iids).float().max(dim=0)[0].mean()
        ir_r1 = (tiids.unsqueeze(0) == topk1_iids).float().max(dim=0)[0].mean()
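
        # Each R@K above is the fraction of queries whose ground-truth id
        # appears among the top-K retrieved ids (a hit), averaged over queries.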

        self._results = OrderedDict()
        self._results['recall'] = {}
        self._results['recall']['irtr'] = float("{:.3f}".format((ir_r1 + tr_r1).item() * 100))
        self._results['recall']['ir1'] = float("{:.3f}".format(ir_r1.item() * 100))
        self._results['recall']['ir5'] = float("{:.3f}".format(ir_r5.item() * 100))
        self._results['recall']['ir10'] = float("{:.3f}".format(ir_r10.item() * 100))
        self._results['recall']['tr1'] = float("{:.3f}".format(tr_r1.item() * 100))
        self._results['recall']['tr5'] = float("{:.3f}".format(tr_r5.item() * 100))
        self._results['recall']['tr10'] = float("{:.3f}".format(tr_r10.item() * 100))
        self._logger.info(self._results)
        # Return a copy so the caller can do whatever it wants with the results.
        return copy.deepcopy(self._results)

    def evaluate_p2i(self, img_ids=None):
        """
        Args:
            img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset.
        """
        if self._distributed:
            comm.synchronize()

            def gather(x, move=False):
                # Collect the per-rank lists, flatten them, and optionally move
                # the tensors onto this rank's device.
                x = comm.gather(x)
                x = list(itertools.chain(*x))
                if move:
                    x = [xx.to(self._text_embeds[0].device) for xx in x]
                return x

            text_embeds = gather(self._text_embeds, move=True)
            image_embeds = gather(self._image_embeds, move=True)
            image_embeds2 = gather(self._image_embeds2, move=True)
            text_ids = gather(self._text_ids)
            image_ids = gather(self._image_ids)
            if not comm.is_main_process():
                return {}
        else:
            text_embeds = self._text_embeds
            image_embeds = self._image_embeds
            image_embeds2 = self._image_embeds2
            text_ids = self._text_ids
            image_ids = self._image_ids

        if len(text_embeds) == 0:
            self._logger.warning("[RetrievalEvaluator] Did not receive valid predictions.")
            return {}

        iids = torch.tensor(image_ids).view(-1).cuda()
        tiids = torch.tensor(text_ids).view(-1).cuda()
        image_embeds = torch.cat(image_embeds)
        text_embeds = torch.cat(text_embeds)

        # L2-normalize so the dot products below are cosine similarities.
        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
        image_embeds2 = torch.cat(image_embeds2)
        image_embeds2 = image_embeds2 / image_embeds2.norm(dim=-1, keepdim=True)

        # Image-to-image retrieval: rank the primary image embeddings for each
        # query embedding.
        self._results = OrderedDict()
        self._results['recall'] = {}
        ii_scores = image_embeds2 @ image_embeds.t()
        topk10 = ii_scores.topk(10, dim=1)
        topk5 = ii_scores.topk(5, dim=1)
        topk1 = ii_scores.topk(1, dim=1)
        topk10_iids = iids[topk10.indices]
        topk5_iids = iids[topk5.indices]
        topk1_iids = iids[topk1.indices]
        iir_r10 = (iids.unsqueeze(1) == topk10_iids).float().max(dim=1)[0].mean()
        iir_r5 = (iids.unsqueeze(1) == topk5_iids).float().max(dim=1)[0].mean()
        iir_r1 = (iids.unsqueeze(1) == topk1_iids).float().max(dim=1)[0].mean()
        self._results['recall']['p2ir1'] = float("{:.3f}".format(iir_r1.item() * 100))
        self._results['recall']['p2ir5'] = float("{:.3f}".format(iir_r5.item() * 100))
        self._results['recall']['p2ir10'] = float("{:.3f}".format(iir_r10.item() * 100))
        self._logger.info(self._results)
        # Return a copy so the caller can do whatever it wants with the results.
        return copy.deepcopy(self._results)
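

# Minimal usage sketch (illustrative only; the dataset name, model, and data
# loader below are hypothetical, not part of this module):
#
#   evaluator = RetrievalEvaluator(dataset_name="coco_retrieval_val", distributed=False)
#   evaluator.reset()
#   for inputs in data_loader:
#       outputs = model(inputs)  # each output dict must carry a 'caption' entry
#       evaluator.process(inputs, outputs)
#   results = evaluator.evaluate()
#   # e.g. {'recall': {'irtr': ..., 'ir1': ..., 'ir5': ..., 'ir10': ...,
#   #                  'tr1': ..., 'tr5': ..., 'tr10': ...}}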