From 5d966cb2905bed06912d2192de6102f88809df5b Mon Sep 17 00:00:00 2001
From: Galina
Date: Mon, 19 Aug 2024 18:25:05 +0300
Subject: [PATCH] Support multi-class annotations and several bboxes per image

---
 openvino_xai/metrics/pointing_game.py    | 82 ++++++++++++++++--------
 tests/regression/test_regression.py      | 72 ++++++++++++++-------
 tests/unit/metrics/test_pointing_game.py | 47 +++++++++++---
 3 files changed, 143 insertions(+), 58 deletions(-)

diff --git a/openvino_xai/metrics/pointing_game.py b/openvino_xai/metrics/pointing_game.py
index 3b02b60a..3b13ec94 100644
--- a/openvino_xai/metrics/pointing_game.py
+++ b/openvino_xai/metrics/pointing_game.py
@@ -1,47 +1,77 @@
-from typing import List, Tuple
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Tuple
 
 import numpy as np
 
+from openvino_xai.common.utils import logger
+from openvino_xai.explainer.explanation import Explanation
+
 
 class PointingGame:
     @staticmethod
-    def pointing_game(saliency_map: np.ndarray, gt_bbox: Tuple[int, int, int, int]) -> bool:
+    def pointing_game(saliency_map: np.ndarray, image_gt_bboxes: List[Tuple[int, int, int, int]]) -> bool:
         """
-        Implements the Pointing Game metric using bounding boxes. Returns a boolean indicating
-        if any of the most salient point falls within the ground truth bounding box.
+        Implements the Pointing Game metric for a saliency map and the ground truth bounding boxes of the same image and class.
+        Returns a boolean indicating whether any of the most salient points falls within the ground truth bounding boxes.
 
         :param saliency_map: A 2D numpy array representing the saliency map for the image.
         :type saliency_map: np.ndarray
-        :param gt_bbox: A tuple (x, y, w, h) representing the bounding box of the ground truth object.
-        :type gt_bbox: Tuple[int, int, int, int]
-        """
-        # TODO: Support a case with multiple bounding boxes for one imege
-        x, y, w, h = gt_bbox
+        :param image_gt_bboxes: A list of (x, y, w, h) tuples representing the bounding boxes of the ground truth objects.
+        :type image_gt_bboxes: List[Tuple[int, int, int, int]]
+        :return: True if any of the most salient points falls within any of the ground truth bounding boxes, False otherwise.
+        :rtype: bool
+        """
         # Find the most salient points in the saliency map
         max_indices = np.argwhere(saliency_map == np.max(saliency_map))
 
-        for max_point_y, max_point_x in max_indices:
-            # Check if this point is within the ground truth bounding box
-            if x <= max_point_x <= x + w and y <= max_point_y <= y + h:
-                return True
+        # An image may come with several bounding boxes per class
+        for x, y, w, h in image_gt_bboxes:
+            for max_point_y, max_point_x in max_indices:
+                # Check if this point is within the ground truth bounding box
+                if x <= max_point_x <= x + w and y <= max_point_y <= y + h:
+                    return True
         return False
 
-    def evaluate(self, saliency_maps: List[np.ndarray], gt_bboxes: List[Tuple[int, int, int, int]]) -> float:
+    def evaluate(
+        self, explanations: List[Explanation], gt_bboxes: List[Dict[str, List[Tuple[int, int, int, int]]]]
+    ) -> float:
         """
-        Evaluates the Pointing Game metric over a set of images.
+        Evaluates the Pointing Game metric over a set of images, skipping saliency maps whose class has no ground truth bounding boxes.
+
+        :param explanations: A list of explanations, one per image.
+        :type explanations: List[Explanation]
+        :param gt_bboxes: A list of {label_name: list of bounding boxes} dictionaries, one per image.
+        :type gt_bboxes: List[Dict[str, List[Tuple[int, int, int, int]]]]
 
-        :param saliency_maps: A list of 2D numpy arrays representing the saliency maps.
-        :type saliency_maps: List[np.ndarray]
-        :param gt_bboxes: A list of bounding box of the ground truth objects for each image.
-        :type gt_bboxes: List[Tuple[int, int, int, int]]
+        :return: Pointing Game score: the fraction of evaluated saliency maps whose most salient point hits a ground truth box.
+        :rtype: float
         """
-        assert len(saliency_maps) == len(
+
+        assert len(explanations) == len(
             gt_bboxes
-        ), "Number of saliency maps and ground truth bounding boxes must match."
+        ), "Number of explanations and ground truth bounding boxes must match and equal the number of images."
+
+        hits = 0
+        num_sal_maps = 0
+        for explanation, image_gt_bboxes in zip(explanations, gt_bboxes):
+            label_names = explanation.label_names
+            assert label_names is not None, "Label names are required for pointing game evaluation."
+
+            for class_idx, class_sal_map in explanation.saliency_map.items():
+                label_name = label_names[int(class_idx)]
+
+                if label_name not in image_gt_bboxes:
+                    logger.info(
+                        f"No ground-truth bbox for {label_name} saliency map. "
+                        f"Skip pointing game evaluation for this saliency map."
+                    )
+                    continue
+
+                class_gt_bboxes = image_gt_bboxes[label_name]
+                hits += self.pointing_game(class_sal_map, class_gt_bboxes)
+                num_sal_maps += 1
 
-        hits = sum(
-            [self.pointing_game(s_map, image_gt_bboxes) for s_map, image_gt_bboxes in zip(saliency_maps, gt_bboxes)]
-        )
-        score = hits / len(saliency_maps)
-        return score
+        return hits / num_sal_maps if num_sal_maps > 0 else 0.0
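For reference, a minimal sketch of driving the reworked evaluate() API end to end. It assumes Explanation accepts label_names and a per-class saliency_map dict exactly as in the unit test further below; the toy maps and boxes are invented for illustration:

    import numpy as np

    from openvino_xai.explainer.explanation import Explanation
    from openvino_xai.metrics.pointing_game import PointingGame

    # One explanation per image; saliency_map maps class index -> 2D map
    explanation = Explanation(
        label_names=["cat", "dog"],
        saliency_map={0: np.array([[0, 1], [2, 3]]), 1: np.array([[0, 0], [0, 1]])},
        targets=[0, 1],
    )

    # One {label_name: [(x, y, w, h), ...]} dict per image
    gt_bboxes = [{"cat": [(0, 0, 2, 2)], "dog": [(0, 0, 1, 1)]}]

    # Both peaks sit at (y=1, x=1), inside their class boxes -> score 1.0
    score = PointingGame().evaluate([explanation], gt_bboxes)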
diff --git a/tests/regression/test_regression.py b/tests/regression/test_regression.py
index 431836c1..277cf709 100644
--- a/tests/regression/test_regression.py
+++ b/tests/regression/test_regression.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
+from typing import Dict, List, Tuple
 
 import cv2
 import openvino as ov
@@ -15,23 +16,44 @@ from tests.unit.explanation.test_explanation_utils import VOC_NAMES
 
 MODEL_NAME = "mlc_mobilenetv3_large_voc"
+IMAGE_PATH = "tests/assets/cheetah_person.jpg"
+COCO_ANN_PATH = "tests/assets/cheetah_person_coco.json"
 
 
-def load_gt_bboxes(class_name="person"):
-    with open("tests/assets/cheetah_person_coco.json", "r") as f:
+def load_gt_bboxes(json_coco_path: str) -> List[Dict[str, List[Tuple[int, int, int, int]]]]:
+    """
+    Loads ground truth bounding boxes from a COCO format JSON file.
+
+    Returns a list of dictionaries, one per image. Each key is a label name and
+    each value is the list of bounding boxes for that label in that image.
+    """
+
+    with open(json_coco_path, "r") as f:
         coco_anns = json.load(f)
 
-    category_id = [category["id"] for category in coco_anns["categories"] if category["name"] == class_name]
-    category_id = category_id[0]
+    result = {}
+    category_id_to_name = {category["id"]: category["name"] for category in coco_anns["categories"]}
+
+    for annotation in coco_anns["annotations"]:
+        image_id = annotation["image_id"]
+        category_id = annotation["category_id"]
+        bbox = annotation["bbox"]
 
+        category_name = category_id_to_name[category_id]
+        if image_id not in result:
+            result[image_id] = {}
+        if category_name not in result[image_id]:
+            result[image_id][category_name] = []
-    category_gt_bboxes = [
-        annotation["bbox"] for annotation in coco_anns["annotations"] if annotation["category_id"] == category_id
-    ]
-    return category_gt_bboxes
+        result[image_id][category_name].append(bbox)
+
+    return list(result.values())
 
 
 class TestDummyRegression:
-    image = cv2.imread("tests/assets/cheetah_person.jpg")
+    image = cv2.imread(IMAGE_PATH)
+    gt_bboxes = load_gt_bboxes(COCO_ANN_PATH)
+    pointing_game = PointingGame()
 
     preprocess_fn = get_preprocess_fn(
         change_channel_order=True,
@@ -39,10 +61,6 @@ class TestDummyRegression:
         hwc_to_chw=True,
     )
 
-    gt_bboxes = load_gt_bboxes()
-    pointing_game = PointingGame()
-    steps = 10
-
     @pytest.fixture(autouse=True)
     def setup(self, fxt_data_root):
         data_dir = fxt_data_root
@@ -65,17 +83,23 @@ def test_explainer_image(self):
             colormap=False,
         )
         assert len(explanation.saliency_map) == 1
+        score = self.pointing_game.evaluate([explanation], self.gt_bboxes)
+        assert score == 1.0
 
-        # For now, assume that there's only one class
-        # TODO: support multiple classes
-        saliency_maps = list(explanation.saliency_map.values())
-        score = self.pointing_game.evaluate(saliency_maps, self.gt_bboxes)
-        assert score > 0.5
+        explanation = self.explainer(
+            self.image,
+            targets=["cat"],
+            label_names=VOC_NAMES,
+            colormap=False,
+        )
+        assert len(explanation.saliency_map) == 1
+        score = self.pointing_game.evaluate([explanation], self.gt_bboxes)
+        # No ground truth box for the "cat" class
+        assert score == 0.0
 
     def test_explainer_images(self):
-        # TODO support multiple classes
         images = [self.image, self.image]
-        saliency_maps = []
+        explanations = []
         for image in images:
             explanation = self.explainer(
                 image,
@@ -83,8 +107,8 @@ def test_explainer_images(self):
                 label_names=VOC_NAMES,
                 colormap=False,
             )
-            saliency_map = list(explanation.saliency_map.values())[0]
-            saliency_maps.append(saliency_map)
+            explanations.append(explanation)
+        dataset_gt_bboxes = self.gt_bboxes * 2
 
-        score = self.pointing_game.evaluate(saliency_maps, self.gt_bboxes * 2)
-        assert score > 0.5
+        score = self.pointing_game.evaluate(explanations, dataset_gt_bboxes)
+        assert score == 1.0
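To make the new annotation format concrete, here is the grouping performed by load_gt_bboxes applied to a toy COCO dictionary, with the file I/O stripped out; the category names and box values are invented for illustration:

    # Toy COCO-style annotations
    coco_anns = {
        "categories": [{"id": 1, "name": "person"}, {"id": 2, "name": "cheetah"}],
        "annotations": [
            {"image_id": 0, "category_id": 1, "bbox": [10, 20, 30, 40]},
            {"image_id": 0, "category_id": 1, "bbox": [50, 60, 20, 20]},
            {"image_id": 0, "category_id": 2, "bbox": [5, 5, 100, 80]},
        ],
    }

    # Same grouping as load_gt_bboxes: image -> {label_name: [bbox, ...]}
    category_id_to_name = {c["id"]: c["name"] for c in coco_anns["categories"]}
    result = {}
    for ann in coco_anns["annotations"]:
        per_image = result.setdefault(ann["image_id"], {})
        per_image.setdefault(category_id_to_name[ann["category_id"]], []).append(ann["bbox"])

    print(list(result.values()))
    # [{'person': [[10, 20, 30, 40], [50, 60, 20, 20]], 'cheetah': [[5, 5, 100, 80]]}]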
diff --git a/tests/unit/metrics/test_pointing_game.py b/tests/unit/metrics/test_pointing_game.py
index f20f951a..68ca958c 100644
--- a/tests/unit/metrics/test_pointing_game.py
+++ b/tests/unit/metrics/test_pointing_game.py
@@ -1,6 +1,9 @@
+import logging
+
 import numpy as np
 import pytest
 
+from openvino_xai.explainer.explanation import Explanation
 from openvino_xai.metrics.pointing_game import PointingGame
 
 
@@ -13,19 +16,47 @@ def test_pointing_game(self):
         saliency_map = np.zeros((3, 3), dtype=np.float32)
         saliency_map[1, 1] = 1
 
-        ground_truth_bbox = (1, 1, 1, 1)
+        ground_truth_bbox = [(1, 1, 1, 1)]
         score = self.pointing_game.pointing_game(saliency_map, ground_truth_bbox)
         assert score == 1
 
-        ground_truth_bbox = (0, 0, 0, 0)
+        ground_truth_bbox = [(0, 0, 0, 0)]
         score = self.pointing_game.pointing_game(saliency_map, ground_truth_bbox)
         assert score == 0
 
-    def test_pointing_game_evaluate(self):
-        saliency_map = np.zeros((3, 3), dtype=np.float32)
-        saliency_map[1, 1] = 1
+    def test_pointing_game_evaluate(self, caplog):
+        pointing_game = PointingGame()
+        explanation = Explanation(
+            label_names=["cat", "dog"], saliency_map={0: [[0, 1], [2, 3]], 1: [[0, 0], [0, 1]]}, targets=[0, 1]
+        )
+        explanations = [explanation]
 
-        saliency_maps = [saliency_map, saliency_map]
-        ground_truth_bboxes = [(0, 0, 0, 0), (1, 1, 1, 1)]
-        score = self.pointing_game.evaluate(saliency_maps, ground_truth_bboxes)
+        gt_bboxes = [{"cat": [(0, 0, 2, 2)], "dog": [(0, 0, 1, 1)]}]
+        score = pointing_game.evaluate(explanations, gt_bboxes)
+        assert score == 1.0
+
+        # No hit for the dog class saliency map, hit for the cat class saliency map
+        gt_bboxes = [{"cat": [(0, 0, 2, 2), (0, 0, 1, 1)], "dog": [(0, 0, 0, 0)]}]
+        score = pointing_game.evaluate(explanations, gt_bboxes)
         assert score == 0.5
+
+        # No ground truth bboxes for the available saliency map classes
+        gt_bboxes = [{"not-cat": [(0, 0, 2, 2)], "not-dog": [(0, 0, 0, 0)]}]
+        with caplog.at_level(logging.INFO):
+            score = pointing_game.evaluate(explanations, gt_bboxes)
+        assert "Skip pointing game evaluation for this saliency map." in caplog.text
+        assert score == 0.0
+
+        # Ground truth bboxes / saliency maps number mismatch
+        gt_bboxes = []
+        with pytest.raises(AssertionError):
+            score = pointing_game.evaluate(explanations, gt_bboxes)
+
+        # No label names
+        explanation = Explanation(
+            label_names=None, saliency_map={0: [[0, 1], [2, 3]], 1: [[0, 0], [0, 1]]}, targets=[0, 1]
+        )
+        explanations = [explanation]
+        gt_bboxes = [{"cat": [(0, 0, 2, 2)], "dog": [(0, 0, 1, 1)]}]
+        with pytest.raises(AssertionError):
+            score = pointing_game.evaluate(explanations, gt_bboxes)
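Tracing the 0.5 assertion in test_pointing_game_evaluate by hand may help review: both 2x2 toy maps peak at (y=1, x=1); the cat box (0, 0, 2, 2) contains that point while the dog box (0, 0, 0, 0) does not, so hits / num_sal_maps = 1 / 2. A small sketch that re-runs the same arithmetic through the static pointing_game method:

    import numpy as np

    from openvino_xai.metrics.pointing_game import PointingGame

    cat_map = np.array([[0, 1], [2, 3]])  # peak at (y=1, x=1)
    dog_map = np.array([[0, 0], [0, 1]])  # peak at (y=1, x=1)

    hit_cat = PointingGame.pointing_game(cat_map, [(0, 0, 2, 2), (0, 0, 1, 1)])  # True
    hit_dog = PointingGame.pointing_game(dog_map, [(0, 0, 0, 0)])                # False
    print((hit_cat + hit_dog) / 2)  # 0.5, matching the evaluate() assertion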