diff --git a/changes/1820.fix.md b/changes/1820.fix.md new file mode 100644 index 00000000000..eac65a18fec --- /dev/null +++ b/changes/1820.fix.md @@ -0,0 +1 @@ +Exclude unallocated resources from kernel idle utilization checks. diff --git a/src/ai/backend/manager/idle.py b/src/ai/backend/manager/idle.py index 33790f4ae5f..32fa2bca85b 100644 --- a/src/ai/backend/manager/idle.py +++ b/src/ai/backend/manager/idle.py @@ -773,7 +773,7 @@ class UtilizationIdleChecker(BaseIdleChecker): time_window: timedelta initial_grace_period: timedelta _evhandlers: List[EventHandler[None, AbstractEvent]] - slot_resource_map: Mapping[str, Set[str]] = { + slot_prefix_to_utilization_metric_map: Mapping[str, Set[str]] = { "cpu": {"cpu_util"}, "mem": {"mem"}, "cuda": {"cuda_util", "cuda_mem"}, @@ -789,7 +789,7 @@ async def populate_config(self, raw_config: Mapping[str, Any]) -> None: } else: resources: list[str] = [] - for r in self.slot_resource_map.values(): + for r in self.slot_prefix_to_utilization_metric_map.values(): resources = [*resources, *r] self.resource_thresholds = {r: None for r in resources} self.thresholds_check_operator: ThresholdOperator = config.get("thresholds-check-operator") @@ -889,16 +889,16 @@ async def check_idleness( # Merge same type of (exclusive) resources as a unique resource with the values added. # Example: {cuda.device: 0, cuda.shares: 0.5} -> {cuda: 0.5}. - unique_res_map: DefaultDict[str, Any] = defaultdict(Decimal) - for k, v in occupied_slots.items(): - unique_key = k.split(".")[0] - unique_res_map[unique_key] += v + unique_res_map: DefaultDict[str, Decimal] = defaultdict(Decimal) + for slot_name, alloc in occupied_slots.items(): + unique_key = slot_name.split(".")[0] + unique_res_map[unique_key] += alloc # Do not take into account unallocated resources. For example, do not garbage collect # a session without GPU even if cuda_util is configured in resource-thresholds. - for slot in unique_res_map: - if unique_res_map[slot] == 0: - unavailable_resources.update(self.slot_resource_map[slot]) + for slot_prefix, util_metric in self.slot_prefix_to_utilization_metric_map.items(): + if unique_res_map.get(slot_prefix, 0) == 0: + unavailable_resources.update(util_metric) # Get current utilization data from all containers of the session. if kernel["cluster_size"] > 1: