Skip to content

Commit

Permalink
fix: Exclude utilization checks for unallocated resources (#1820)
Browse files: browse the repository at this point in the history
  • Loading branch information
fregataa authored and kyujin-cho committed Feb 21, 2024
1 parent abd937b commit aa0d3dd
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 9 deletions.
1 change: 1 addition & 0 deletions changes/1820.fix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Exclude unallocated resources from kernel idle utilization checks.
18 changes: 9 additions & 9 deletions src/ai/backend/manager/idle.py
Original file line number Diff line number Diff line change
Expand Up @@ -773,7 +773,7 @@ class UtilizationIdleChecker(BaseIdleChecker):
time_window: timedelta
initial_grace_period: timedelta
_evhandlers: List[EventHandler[None, AbstractEvent]]
slot_resource_map: Mapping[str, Set[str]] = {
slot_prefix_to_utilization_metric_map: Mapping[str, Set[str]] = {
"cpu": {"cpu_util"},
"mem": {"mem"},
"cuda": {"cuda_util", "cuda_mem"},
Expand All @@ -789,7 +789,7 @@ async def populate_config(self, raw_config: Mapping[str, Any]) -> None:
}
else:
resources: list[str] = []
for r in self.slot_resource_map.values():
for r in self.slot_prefix_to_utilization_metric_map.values():
resources = [*resources, *r]
self.resource_thresholds = {r: None for r in resources}
self.thresholds_check_operator: ThresholdOperator = config.get("thresholds-check-operator")
Expand Down Expand Up @@ -889,16 +889,16 @@ async def check_idleness(

# Merge slots that share the same prefix (i.e., the same kind of resource) into a
# single entry keyed by that prefix, summing their allocated values.
# Example: {cuda.device: 0, cuda.shares: 0.5} -> {cuda: 0.5}.
unique_res_map: DefaultDict[str, Any] = defaultdict(Decimal)
for k, v in occupied_slots.items():
unique_key = k.split(".")[0]
unique_res_map[unique_key] += v
unique_res_map: DefaultDict[str, Decimal] = defaultdict(Decimal)
for slot_name, alloc in occupied_slots.items():
unique_key = slot_name.split(".")[0]
unique_res_map[unique_key] += alloc

# Ignore resources that are not allocated to the session. For example, do not
# garbage-collect a session without a GPU even if cuda_util is configured in resource-thresholds.
for slot in unique_res_map:
if unique_res_map[slot] == 0:
unavailable_resources.update(self.slot_resource_map[slot])
for slot_prefix, util_metric in self.slot_prefix_to_utilization_metric_map.items():
if unique_res_map.get(slot_prefix, 0) == 0:
unavailable_resources.update(util_metric)

# Get current utilization data from all containers of the session.
if kernel["cluster_size"] > 1:
Expand Down

0 comments on commit aa0d3dd

Please sign in to comment.