diff --git a/model_api/python/openvino/model_api/tilers/instance_segmentation.py b/model_api/python/openvino/model_api/tilers/instance_segmentation.py
index 1ac7fd7e..a9bd6910 100644
--- a/model_api/python/openvino/model_api/tilers/instance_segmentation.py
+++ b/model_api/python/openvino/model_api/tilers/instance_segmentation.py
@@ -14,11 +14,12 @@
 limitations under the License.
 """

+import cv2 as cv
 import numpy as np

 from openvino.model_api.models.instance_segmentation import _segm_postprocess
-from openvino.model_api.models.utils import SegmentedObject
+from openvino.model_api.models.utils import InstanceSegmentationResult, SegmentedObject

-from .detection import DetectionTiler
+from .detection import DetectionTiler, _multiclass_nms

 class InstanceSegmentationTiler(DetectionTiler):
@@ -73,24 +74,21 @@ def _filter_tiles(self, image, tile_coords, confidence_threshold=0.35):

         return tile_coords

-    def _postprocess_tile(self, predictions, meta):
+    def _postprocess_tile(self, predictions, coord):
         """Converts predictions to a format convinient for further merging.

         Args:
             predictions: predictions from an instance segmentation model: a list of `SegmentedObject` objects
-            meta: a dict containing key "coord", representing tile coordinates
+            coord: a list containing coordinates for the processed tile

         Returns:
             a dict with postprocessed detections in 6-items format: (label id, score, bbox) and masks
         """

-        output_dict = super()._postprocess_tile(predictions, meta)
-        if hasattr(predictions, "mask"):
-            output_dict["masks"] = predictions.mask
-        else:
-            output_dict["masks"] = []
-            for segm_res in predictions:
-                output_dict["masks"].append(segm_res.mask)
+        output_dict = super()._postprocess_tile(predictions, coord)
+        output_dict["masks"] = []
+        for segm_res in predictions.segmentedObjects:
+            output_dict["masks"].append(segm_res.mask)

         return output_dict

@@ -106,32 +104,116 @@ def _merge_results(self, results, shape, meta=None):
             merged prediciton
         """

-        if meta is None:
-            meta = {}
-        detection_result = super()._merge_results(results, shape, meta)
+        detections_array = np.empty((0, 6), dtype=np.float32)
+        feature_vectors = []
+        saliency_maps = []
+        tiles_coords = []
+        for result in results:
+            if len(result["bboxes"]):
+                detections_array = np.concatenate((detections_array, result["bboxes"]))
+            feature_vectors.append(result["features"])
+            saliency_maps.append(result["saliency_map"])
+            tiles_coords.append(result["coords"])
+
+        keep_idxs = []
+        if np.prod(detections_array.shape):
+            detections_array, keep_idxs = _multiclass_nms(
+                detections_array, max_num=self.max_pred_number
+            )
+
+        merged_vector = (
+            np.mean(feature_vectors, axis=0) if feature_vectors else np.ndarray(0)
+        )
+        saliency_map = (
+            self._merge_saliency_maps(saliency_maps, shape, tiles_coords)
+            if saliency_maps
+            else []
+        )

         masks = []
         for result in results:
-            if len(result["bboxes"]):
+            if len(result["masks"]):
                 masks.extend(result["masks"])

         if masks:
-            masks = [masks[keep_idx] for keep_idx in meta["keep_idx"]]
-
-            for i, (det, mask) in enumerate(zip(detection_result.objects, masks)):
-                box = np.array([det.xmin, det.ymin, det.xmax, det.ymax])
-                masks[i] = _segm_postprocess(box, mask, *shape[:-1])
-
-        return [
-            SegmentedObject(
-                detection.xmin,
-                detection.ymin,
-                detection.xmax,
-                detection.ymax,
-                detection.score,
-                detection.id,
-                detection.str_label,
-                mask,
+            masks = [masks[keep_idx] for keep_idx in keep_idxs]
+
+        detected_objects = []
+        for i in range(detections_array.shape[0]):
+            label = int(detections_array[i][0])
+            score = float(detections_array[i][1])
+            bbox = list(detections_array[i][2:])
+            detected_objects.append(
+                SegmentedObject(*bbox, score, label, self.model.labels[label], masks[i])
             )
-            for detection, mask in zip(detection_result.objects, masks)
-        ]
+
+        for i, (det, mask) in enumerate(zip(detected_objects, masks)):
+            box = np.array([det.xmin, det.ymin, det.xmax, det.ymax])
+            masks[i] = _segm_postprocess(box, mask, *shape[:-1])
+
+        return InstanceSegmentationResult(
+            detected_objects,
+            saliency_map,
+            merged_vector,
+        )
+
+    def _merge_saliency_maps(self, saliency_maps, shape, tiles_coords):
+        """Merge saliency maps from each tile.
+
+        Args:
+            saliency_maps: list of saliency maps, shape of each map is (Nc, H, W)
+            shape: shape of the original image
+            tiles_coords: coordinates of tiles
+
+        Returns:
+            Merged saliency map with shape (Nc, H, W)
+        """
+
+        if not saliency_maps:
+            return None
+
+        image_saliency_map = saliency_maps[0]
+
+        if not image_saliency_map:
+            return image_saliency_map
+
+        num_classes = len(image_saliency_map)
+        map_h, map_w = image_saliency_map[0].shape
+        image_h, image_w, _ = shape
+
+        ratio = map_h / self.tile_size, map_w / self.tile_size
+        image_map_h = int(image_h * ratio[0])
+        image_map_w = int(image_w * ratio[1])
+
+        merged_map = [np.zeros((image_map_h, image_map_w)) for _ in range(num_classes)]
+
+        for i, saliency_map in enumerate(saliency_maps[1:], 1):
+            for class_idx in range(num_classes):
+                cls_map = saliency_map[class_idx]
+                if len(cls_map.shape) < 2:
+                    continue
+
+                x_1, y_1, x_2, y_2 = tiles_coords[i]
+                y_1, x_1 = int(y_1 * ratio[0]), int(x_1 * ratio[1])
+                y_2, x_2 = int(y_2 * ratio[0]), int(x_2 * ratio[1])
+
+                map_h, map_w = cls_map.shape
+
+                if (map_h > y_2 - y_1 > 0) and (map_w > x_2 - x_1 > 0):
+                    cls_map = cv.resize(cls_map, (x_2 - x_1, y_2 - y_1))
+
+                map_h, map_w = y_2 - y_1, x_2 - x_1
+
+                tile_map = merged_map[class_idx][y_1 : y_1 + map_h, x_1 : x_1 + map_w]
+                tile_map = np.where(tile_map > 0, 0.5 * (tile_map + cls_map), cls_map)
+                merged_map[class_idx][y_1 : y_1 + map_h, x_1 : x_1 + map_w] = tile_map
+
+        for class_idx in range(num_classes):
+            image_map_cls = image_saliency_map[class_idx]
+            if len(image_map_cls.shape) < 2:
+                continue
+            image_map_cls = cv.resize(image_map_cls, (image_map_w, image_map_h))
+            merged_map[class_idx] += 0.5 * image_map_cls
+            merged_map[class_idx] = merged_map[class_idx].astype(np.uint8)
+
+        return merged_map
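For clarity, the merge rule in `_merge_saliency_maps` averages class maps where tiles overlap and writes them through where they do not. Below is a minimal, self-contained NumPy sketch of that rule; the array sizes, tile spans, and values are invented for illustration and are not part of this PR:

```python
import numpy as np

# Hypothetical single-class merged map for an 8x8 image (toy size).
merged = np.zeros((8, 8), dtype=np.float32)

# First tile covers columns 0..5; nothing was written yet, so it is copied.
tile_a = np.full((8, 6), 100.0, dtype=np.float32)
region = merged[:, 0:6]
merged[:, 0:6] = np.where(region > 0, 0.5 * (region + tile_a), tile_a)

# Second tile covers columns 4..7, overlapping the first tile on columns 4..5.
# Overlapping pixels are averaged; fresh pixels are copied, as in the PR.
tile_b = np.full((8, 4), 50.0, dtype=np.float32)
region = merged[:, 4:8]
merged[:, 4:8] = np.where(region > 0, 0.5 * (region + tile_b), tile_b)

print(merged[0])  # [100, 100, 100, 100, 75, 75, 50, 50]
```

Note that `region > 0` is how overlap is detected, so zero-valued saliency pixels written by an earlier tile are overwritten rather than averaged; that matches the behavior of the new method.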
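A usage sketch of the tiler after this change. This assumes the public model_api entry points (`MaskRCNNModel.create_model` and the `tilers` package export); the model path, image path, and tile parameters are placeholders, not values taken from this PR:

```python
import cv2
from openvino.model_api.models import MaskRCNNModel
from openvino.model_api.tilers import InstanceSegmentationTiler

# Placeholder IR path for an instance segmentation model.
model = MaskRCNNModel.create_model("mask_rcnn.xml")

# Illustrative tiling parameters; adjust to the deployment.
tiler = InstanceSegmentationTiler(
    model, configuration={"tile_size": 512, "tiles_overlap": 0.1}
)

image = cv2.imread("input.jpg")
# Returns an InstanceSegmentationResult: merged objects with masks,
# the merged saliency map, and the averaged feature vector.
result = tiler(image)
for obj in result.segmentedObjects:
    print(obj.str_label, obj.score)
```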