Skip to content

Commit

Permalink
fix(BA-510): Utilization idle checker computes kernel stat correctly (#3442)
Browse files Browse the repository at this point in the history
  • Loading branch information
fregataa authored Jan 14, 2025
1 parent d90f801 commit 30b158e
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 31 deletions.
1 change: 1 addition & 0 deletions changes/3442.fix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Utilization idle checker computes kernel resource usages correctly
47 changes: 16 additions & 31 deletions src/ai/backend/manager/idle.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,11 @@ def from_avg_threshold(
exclusions: set[str],
) -> UtilizationResourceReport:
data: dict[str, UtilizationExtraInfo] = {}
for resource_name, val in thresholds.unique_resource_name_map.items():
_resource_name = cast(str, resource_name)
if val.average is None or _resource_name in exclusions:
for metric_key, val in thresholds.items():
if val.average is None or metric_key in exclusions:
continue
avg_util = avg_utils.get(_resource_name, 0)
data[_resource_name] = UtilizationExtraInfo(float(avg_util), float(val.average))
avg_util = avg_utils.get(metric_key, 0)
data[metric_key] = UtilizationExtraInfo(float(avg_util), float(val.average))
return cls(data)

def to_dict(self, apply_unit: bool = True) -> dict[str, UtilizationExtraInfo]:
Expand Down Expand Up @@ -820,7 +819,7 @@ async def get_checker_result(
def _get_resource_name_from_metric_key(name: str) -> str:
    """Return *name* with its metric postfix removed, if it has one.

    The entries of ``_metric_name_postfix`` are tried in order. The first one
    that *name* ends with is cut off and the remainder is returned; a name
    that ends with none of them is returned unchanged.
    """
    for p in _metric_name_postfix:
        if name.endswith(p):
            # Use ``removesuffix`` rather than ``rstrip``: ``rstrip(p)`` treats
            # ``p`` as a *set of characters* and keeps stripping while the last
            # character is in that set, so it can eat into the resource name
            # itself (e.g. ``"tpu_util".rstrip("_util") == "tp"``).
            return name.removesuffix(p)
    return name


Expand Down Expand Up @@ -854,16 +853,6 @@ def default_factory(cls) -> Self:
cuda_mem=ResourceThresholdValue(average=None, name=None),
)

@property
def unique_resource_name_map(self) -> Mapping[str, ResourceThresholdValue]:
ret: dict[str, ResourceThresholdValue] = {}
for resource_name_or_metric_key, val in self.items():
if (name := val.name) is not None:
ret[name] = val
else:
ret[_get_resource_name_from_metric_key(resource_name_or_metric_key)] = val
return ret

@classmethod
def threshold_validator(cls, value: dict[str, Any]) -> Self:
return cls({k: ResourceThresholdValue(**v) for k, v in value.items()})
Expand Down Expand Up @@ -1075,9 +1064,9 @@ async def check_idleness(

# Do not take into account unallocated resources. For example, do not garbage collect
# a session without GPU even if cuda_util is configured in resource-thresholds.
for _resource_name in self.resource_thresholds.unique_resource_name_map.keys():
if _resource_name not in requested_resource_names:
excluded_resources.add(_resource_name)
for resource_key in self.resource_thresholds.keys():
if _get_resource_name_from_metric_key(resource_key) not in requested_resource_names:
excluded_resources.add(resource_key)

# Get current utilization data from all containers of the session.
if kernel["cluster_size"] > 1:
Expand All @@ -1099,15 +1088,13 @@ async def check_idleness(
)

def default_util_series() -> dict[str, list[float]]:
return {resource: [] for resource in requested_resource_names}
return {resource: [] for resource in current_utilizations.keys()}

if raw_util_series is not None:
try:
raw_data: dict[str, list[float]] = msgpack.unpackb(raw_util_series, use_list=True)
util_series: dict[str, list[float]] = {
resource: v
for resource, v in raw_data.items()
if resource in requested_resource_names
metric_key: v for metric_key, v in raw_data.items()
}
except TypeError:
util_series = default_util_series()
Expand All @@ -1116,14 +1103,12 @@ def default_util_series() -> dict[str, list[float]]:

do_idle_check: bool = True

for k in util_series:
try:
current_util = current_utilizations[k]
except KeyError:
continue
util_series[k].append(current_util)
if len(util_series[k]) > window_size:
util_series[k].pop(0)
for metric_key, val in current_utilizations.items():
if metric_key not in util_series:
util_series[metric_key] = []
util_series[metric_key].append(val)
if len(util_series[metric_key]) > window_size:
util_series[metric_key].pop(0)
else:
do_idle_check = False

Expand Down

0 comments on commit 30b158e

Please sign in to comment.