Add xai for MaskRCNNModel #95

Merged
merged 5 commits on Jul 3, 2023
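For context, a minimal usage sketch of the XAI outputs this PR adds (not part of the diff; the Python attribute names are assumed to mirror the C++ InstanceSegmentationResult fields added below, and the model path and create_model/call pattern are illustrative):

    import cv2
    from openvino.model_api.models import MaskRCNNModel

    # Hypothetical usage; attribute names assumed from the C++ InstanceSegmentationResult.
    model = MaskRCNNModel.create_model("maskrcnn.xml")  # assumed model path
    image = cv2.imread("sample.jpg")
    result = model(image)

    for obj in result.segmentedObjects:  # per-instance boxes, labels, masks
        print(obj)
    for class_id, smap in enumerate(result.saliency_map):  # one aggregated uint8 map per class (may be empty)
        if smap.size:
            cv2.imwrite(f"saliency_{class_id}.png", smap)
    print(result.feature_vector.shape)  # "feature_vector" model output, if the model exposes it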
32 changes: 11 additions & 21 deletions model_api/cpp/models/include/models/results.h
@@ -120,8 +120,7 @@ struct DetectedObject : public cv::Rect2f {
std::string label;
float confidence;

friend std::ostream& operator<< (std::ostream& os, const DetectedObject& detection)
{
friend std::ostream& operator<< (std::ostream& os, const DetectedObject& detection) {
return os << int(detection.x) << ", " << int(detection.y) << ", " << int(detection.x + detection.width)
<< ", " << int(detection.y + detection.height) << ", "
<< detection.labelID << " (" << detection.label << "): " << std::fixed << std::setprecision(3) << detection.confidence;
@@ -167,14 +166,8 @@ struct RetinaFaceDetectionResult : public DetectionResult {
struct SegmentedObject : DetectedObject {
cv::Mat mask;

friend std::ostream& operator<< (std::ostream& stream, const SegmentedObject& segmentation)
{
stream << "(" << int(segmentation.x) << ", " << int(segmentation.y) << ", " << int(segmentation.x + segmentation.width)
<< ", " << int(segmentation.y + segmentation.height) << ", ";
stream << std::fixed;
stream << std::setprecision(3) << segmentation.confidence << ", ";
stream << std::setprecision(-1) << segmentation.labelID << ", " << segmentation.label << ", " << cv::countNonZero(segmentation.mask > 0.5) << ")";
return stream;
friend std::ostream& operator<< (std::ostream& os, const SegmentedObject& prediction) {
return os << static_cast<const DetectedObject&>(prediction) << ", " << cv::countNonZero(prediction.mask > 0.5);
}
};

@@ -183,18 +176,12 @@ struct SegmentedObjectWithRects : SegmentedObject {

SegmentedObjectWithRects(const SegmentedObject& segmented_object) : SegmentedObject(segmented_object) {}

friend std::ostream& operator<< (std::ostream& stream, const SegmentedObjectWithRects& segmentation)
{
stream << "(" << int(segmentation.x) << ", " << int(segmentation.y) << ", " << int(segmentation.x + segmentation.width)
<< ", " << int(segmentation.y + segmentation.height) << ", ";
stream << std::fixed;
stream << std::setprecision(3) << segmentation.confidence << ", ";
stream << segmentation.labelID << ", " << segmentation.label << ", " << cv::countNonZero(segmentation.mask > 0.5);
for (const cv::RotatedRect& rect : segmentation.rotated_rects) {
stream << ", RotatedRect: " << rect.center.x << ' ' << rect.center.y << ' ' << rect.size.width << ' ' << rect.size.height << ' ' << rect.angle;
friend std::ostream& operator<< (std::ostream& os, const SegmentedObjectWithRects& prediction) {
os << static_cast<const SegmentedObject&>(prediction) << std::fixed << std::setprecision(3);
for (const cv::RotatedRect& rect : prediction.rotated_rects) {
os << ", RotatedRect: " << rect.center.x << ' ' << rect.center.y << ' ' << rect.size.width << ' ' << rect.size.height << ' ' << rect.angle;
}
stream << ")";
return stream;
return os;
}
};

@@ -228,6 +215,9 @@ struct InstanceSegmentationResult : ResultBase {
InstanceSegmentationResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
: ResultBase(frameId, metaData) {}
std::vector<SegmentedObject> segmentedObjects;
// Contains per-class saliency_maps and the "feature_vector" model output if feature_vector exists
std::vector<cv::Mat_<std::uint8_t>> saliency_map;
ov::Tensor feature_vector;
};

struct ImageResult : public ResultBase {
100 changes: 89 additions & 11 deletions model_api/cpp/models/src/instance_segmentation.cpp
@@ -34,6 +34,19 @@
#include "utils/common.hpp"

namespace {
constexpr char saliency_map_name[]{"saliency_map"};
constexpr char feature_vector_name[]{"feature_vector"};

void append_xai_names(const std::vector<ov::Output<ov::Node>>& outputs, std::vector<std::string>& outputNames) {
for (const ov::Output<ov::Node>& output : outputs) {
if (output.get_names().count(saliency_map_name) > 0) {
outputNames.emplace_back(saliency_map_name);
} else if (output.get_names().count(feature_vector_name) > 0) {
outputNames.push_back(feature_vector_name);
}
}
}

cv::Rect expand_box(const cv::Rect2f& box, float scale) {
float w_half = box.width * 0.5f * scale,
h_half = box.height * 0.5f * scale;
@@ -60,6 +73,42 @@ cv::Mat segm_postprocess(const SegmentedObject& box, const cv::Mat& unpadded, in
im_mask(cv::Rect{x0, y0, x1-x0, y1-y0}).setTo(1, resized({cv::Point(x0-extended_box.x, y0-extended_box.y), cv::Point(x1-extended_box.x, y1-extended_box.y)}) > 0.5f);
return im_mask;
}

std::vector<cv::Mat_<std::uint8_t>> average_and_normalize(const std::vector<std::vector<cv::Mat>>& saliency_maps) {
std::vector<cv::Mat_<std::uint8_t>> aggregated;
aggregated.reserve(saliency_maps.size());
for (const std::vector<cv::Mat>& per_class_maps : saliency_maps) {
if (per_class_maps.empty()) {
aggregated.emplace_back();
} else {
cv::Mat_<double> saliency_map{per_class_maps.front().size()};
for (const cv::Mat& per_class_map : per_class_maps) {
if (saliency_map.size != per_class_map.size) {
throw std::runtime_error("saliency_maps must have same size");
} if (per_class_map.channels() != 1) {
throw std::runtime_error("saliency_maps must have one channel");
} if (per_class_map.type() != CV_8U) {
throw std::runtime_error("saliency_maps must have type CV_8U");
}
}
for (int row = 0; row < saliency_map.rows; ++row) {
for (int col = 0; col < saliency_map.cols; ++col) {
double sum = 0.0;
for (const cv::Mat& per_class_map : per_class_maps) {
sum += per_class_map.at<std::uint8_t>(row, col);
}
saliency_map.at<double>(row, col) = sum / per_class_maps.size();
}
}
double min, max;
cv::minMaxLoc(saliency_map, &min, &max);
cv::Mat_<std::uint8_t> converted;
saliency_map.convertTo(converted, CV_8U, 255.0 / (max + 1e-12));
aggregated.push_back(std::move(converted));
}
}
return aggregated;
}
}
std::string MaskRCNNModel::ModelType = "MaskRCNN";

@@ -184,25 +233,38 @@ void MaskRCNNModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
}

// --------------------------- Prepare output -----------------------------------------------------
if (model->outputs().size() != 3) {
throw std::logic_error("MaskRCNNModel model wrapper supports topologies with only 3 outputs");
struct NameRank {
std::string name;
size_t rank;
};
std::vector<NameRank> filtered;
filtered.reserve(3);
for (ov::Output<ov::Node>& output : model->outputs()) {
const std::unordered_set<std::string>& out_names = output.get_names();
if (out_names.find(saliency_map_name) == out_names.end() && out_names.find(feature_vector_name) == out_names.end()) {
filtered.push_back({output.get_any_name(), output.get_partial_shape().get_max_shape().size()});
}
}
if (filtered.size() != 3) {
throw std::logic_error(std::string{"MaskRCNNModel model wrapper supports topologies with "} + saliency_map_name + ", " + feature_vector_name + " and 3 other outputs");
}
outputNames.resize(3);
for (const auto& output : model->outputs()) {
switch (output.get_partial_shape().get_max_shape().size()) {
for (const NameRank& name_rank : filtered) {
switch (name_rank.rank) {
case 2:
outputNames[0] = output.get_any_name();
outputNames[0] = name_rank.name;
break;
case 3:
outputNames[1] = output.get_any_name();
outputNames[1] = name_rank.name;
break;
case 4:
outputNames[2] = output.get_any_name();
outputNames[2] = name_rank.name;
break;
default:
throw std::runtime_error("Unexpected output: " + output.get_any_name());
throw std::runtime_error("Unexpected output: " + name_rank.name);
}
}
append_xai_names(model->outputs(), outputNames);
}

std::unique_ptr<ResultBase> MaskRCNNModel::postprocess(InferenceResult& infResult) {
@@ -226,9 +288,17 @@ std::unique_ptr<ResultBase> MaskRCNNModel::postprocess(InferenceResult& infResul
const cv::Size& masks_size{int(infResult.outputsData[outputNames[2]].get_shape()[3]), int(infResult.outputsData[outputNames[2]].get_shape()[2])};
InstanceSegmentationResult* result = new InstanceSegmentationResult(infResult.frameId, infResult.metaData);
auto retVal = std::unique_ptr<ResultBase>(result);
std::vector<std::vector<cv::Mat>> saliency_maps;
bool has_feature_vector_name = std::find(outputNames.begin(), outputNames.end(), feature_vector_name) != outputNames.end();
if (has_feature_vector_name) {
if (this->labels.empty()) {
throw std::runtime_error("Can't get number of classes because labels are empty");
}
saliency_maps.resize(this->labels.size());
}
for (size_t i = 0; i < infResult.outputsData[outputNames[0]].get_size(); ++i) {
float confidence = boxes[i * objectSize + 4];
if (confidence <= confidence_threshold) {
if (confidence <= confidence_threshold && !has_feature_vector_name) {
continue;
}
SegmentedObject obj;
@@ -258,8 +328,16 @@ std::unique_ptr<ResultBase> MaskRCNNModel::postprocess(InferenceResult& infResul
} else {
obj.mask = raw_cls_mask;
}
result->segmentedObjects.push_back(obj);

if (confidence > confidence_threshold) {
result->segmentedObjects.push_back(obj);
}
if (has_feature_vector_name) {
saliency_maps[obj.labelID - 1].push_back(obj.mask);
Collaborator:

Is that mask resized, or do we not have a flag similar to postprocess_semantic_masks?

Collaborator Author:

Resized. There's this on top:

        if (postprocess_semantic_masks) {
            obj.mask = segm_postprocess(obj, raw_cls_mask, internalData.inputImgHeight, internalData.inputImgWidth);
        }

}
}
result->saliency_map = average_and_normalize(saliency_maps);
if (has_feature_vector_name) {
result->feature_vector = std::move(infResult.outputsData[feature_vector_name]);
}
return retVal;
}
2 changes: 2 additions & 0 deletions model_api/python/openvino/model_api/models/__init__.py
@@ -48,6 +48,7 @@
DetectionWithLandmarks,
ImageResultWithSoftPrediction,
InputTransform,
InstanceSegmentationResult,
OutputTransform,
SegmentedObject,
SegmentedObjectWithRects,
@@ -92,6 +93,7 @@
"ImageModel",
"ImageResultWithSoftPrediction",
"InputTransform",
"InstanceSegmentationResult",
"MaskRCNNModel",
"Model",
"MonoDepthModel",
88 changes: 67 additions & 21 deletions model_api/python/openvino/model_api/models/instance_segmentation.py
@@ -19,7 +19,7 @@

from .image_model import ImageModel
from .types import BooleanValue, ListValue, NumericalValue, StringValue
from .utils import SegmentedObject, load_labels, nms
from .utils import InstanceSegmentationResult, SegmentedObject, load_labels, nms


class MaskRCNNModel(ImageModel):
@@ -57,8 +57,15 @@ def parameters(cls):
def _get_outputs(self):
if self.is_segmentoly:
return self._get_segmentoly_outputs()
filtered_names = []
for name, output in self.outputs.items():
if (
_saliency_map_name not in output.names
and _feature_vector_name not in output.names
):
filtered_names.append(name)
outputs = {}
for layer_name in self.outputs:
for layer_name in filtered_names:
if layer_name.startswith("TopK"):
continue
layer_shape = self.outputs[layer_name].shape
@@ -70,9 +77,10 @@ def _get_outputs(self):
elif len(layer_shape) == 3:
outputs["masks"] = layer_name
if len(outputs) == 3:
_append_xai_names(self.outputs, outputs)
return outputs
outputs = {}
for layer_name in self.outputs:
for layer_name in filtered_names:
if layer_name.startswith("TopK"):
continue
layer_shape = self.outputs[layer_name].shape
@@ -84,6 +92,7 @@
elif len(layer_shape) == 4:
outputs["masks"] = layer_name
if len(outputs) == 3:
_append_xai_names(self.outputs, outputs)
return outputs
self.raise_error(f"Unexpected outputs: {self.outputs}")

@@ -143,13 +152,8 @@ def postprocess(self, outputs, meta):
if self.is_segmentoly
else outputs[self.output_blob_name["boxes"]][:, 4]
)
detections_filter = scores > self.confidence_threshold
boxes, scores, labels, masks = (
boxes[detections_filter],
scores[detections_filter],
outputs[self.output_blob_name["labels"]][detections_filter],
outputs[self.output_blob_name["masks"]][detections_filter],
)
labels = outputs[self.output_blob_name["labels"]]
masks = outputs[self.output_blob_name["masks"]]
if not self.is_segmentoly:
labels += 1
if self.labels is None:
@@ -187,21 +191,52 @@ def postprocess(self, outputs, meta):
out=boxes,
)

resized_masks = []
for box, cls, raw_mask in zip(boxes, labels, masks):
objects = []
has_feature_vector_name = _feature_vector_name in self.outputs
if has_feature_vector_name:
if not self.labels:
self.raise_error("Can't get number of classes because labels are empty")
saliency_maps = [[] for _ in range(len(self.labels))]
else:
saliency_maps = []
for box, confidence, cls, str_label, raw_mask in zip(
boxes, scores, labels, str_labels, masks
):
if confidence <= self.confidence_threshold and not has_feature_vector_name:
continue
raw_cls_mask = raw_mask[cls, ...] if self.is_segmentoly else raw_mask
if self.postprocess_semantic_masks:
resized_masks.append(
_segm_postprocess(box, raw_cls_mask, *meta["original_shape"][:-1])
resized_mask = _segm_postprocess(
box, raw_cls_mask, *meta["original_shape"][:-1]
)
else:
resized_masks.append(raw_cls_mask)
return [
SegmentedObject(*box, confidence, label, str_label, mask)
for box, confidence, label, str_label, mask in zip(
boxes.astype(int), scores, labels, str_labels, resized_masks
)
]
resized_mask = raw_cls_mask
if confidence > self.confidence_threshold:
objects.append(
SegmentedObject(
*box.astype(int), confidence, cls, str_label, resized_mask
)
)
if has_feature_vector_name:
saliency_maps[cls - 1].append(resized_mask)
return InstanceSegmentationResult(
objects,
_average_and_normalize(saliency_maps),
outputs.get(_feature_vector_name, np.ndarray(0)),
)


def _average_and_normalize(saliency_maps):
aggregated = []
for per_class_maps in saliency_maps:
if per_class_maps:
saliency_map = np.array(per_class_maps).mean(0)
max_values = np.max(saliency_map)
saliency_map = 255 * (saliency_map) / (max_values + 1e-12)
aggregated.append(saliency_map.astype(np.uint8))
else:
aggregated.append(np.ndarray(0))
return aggregated


def _expand_box(box, scale):
Expand Down Expand Up @@ -423,3 +458,14 @@ def _sanitize_coordinates(_x1, _x2, img_size, shift=0, padding=0):
x1 = np.clip(_x1 - padding, 0, img_size)
x2 = np.clip(_x2 + padding, 0, img_size)
return x1, x2


_saliency_map_name = "saliency_map"
_feature_vector_name = "feature_vector"


def _append_xai_names(outputs, output_names):
if _saliency_map_name in outputs:
output_names["saliency_map"] = _saliency_map_name
if _feature_vector_name in outputs:
output_names["feature_vector"] = _feature_vector_name