Skip to content

Commit

Permalink
Increase test coverage
Browse files: browse the repository at this point in the history
  • Loading branch information
djperrefort committed Aug 19, 2024
1 parent 3b2bfdb commit 19a4483
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 35 deletions.
60 changes: 28 additions & 32 deletions apps/crc_idle.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
from argparse import Namespace
from collections import defaultdict

from .utils.cli import BaseParser
from .utils import Shell, Slurm
from .utils.cli import BaseParser


class CrcIdle(BaseParser):
Expand Down Expand Up @@ -59,56 +59,52 @@ def get_cluster_list(self, args: Namespace) -> tuple[str]:

@staticmethod
def count_idle_cpu_resources(cluster: str, partition: str) -> dict[int, int]:
    """Return the idle CPU resources on a given cluster partition.

    Args:
        cluster: The cluster to print a summary for.
        partition: The partition in the parent cluster.

    Returns:
        A dictionary mapping the number of idle resources to the number of nodes with that many idle resources.
    """

    # Use `sinfo` command to determine the status of each node in the given partition.
    # The command is executed exactly once; its output holds one whitespace-separated record per node.
    command = f'sinfo -h -M {cluster} -p {partition} -N -o %N,%C'
    slurm_data = Shell.run_command(command).strip().split()

    # Count the number of nodes having a given number of idle cores
    return_dict = dict()
    for node_info in slurm_data:
        # Each record is `<node name>,<allocated>/<idle>/<other>/<total>`; only the idle count is needed
        _, resource_data = node_info.split(',')
        _, idle, _, _ = (int(x) for x in resource_data.split('/'))
        return_dict[idle] = return_dict.get(idle, 0) + 1

    return return_dict

@staticmethod
def count_idle_gpu_resources(cluster: str, partition: str) -> dict[int, int]:
"""Return idle GPU resources on a given cluster partition
"""Return idle GPU resources on a given cluster partition.
If the host node is in a `drain` state, the GPUs are reported as unavailable.
Args:
cluster: The cluster to print a summary for
partition: The partition in the parent cluster
cluster: The cluster to print a summary for.
partition: The partition in the parent cluster.
Returns:
A dictionary mapping idle resources to number of nodes
A dictionary mapping the number of idle resources to the number of nodes with that many idle resources.
"""

# Use `sinfo` command to determine the status of each node in the given partition
command = f"sinfo -h -M {cluster} -p {partition} -N " \
f"--Format=NodeList:'_',gres:5'_',gresUsed:12'_',StateCompact:' '"

stdout = Shell.run_command(command)
slurm_data = stdout.strip().split()
slurm_output_format = "NodeList:'_',gres:5'_',gresUsed:12'_',StateCompact:' '"
command = f"sinfo -h -M {cluster} -p {partition} -N --Format={slurm_output_format}"
slurm_data = Shell.run_command(command).strip().split()

# Count the number of nodes having a given number of idle cores/GPUs
return_dict = dict()
for node_info in slurm_data:
# Returns: node_name, total, allocated, node state
_, total, allocated, state = node_info.split('_')
node_name, total, allocated, state = node_info.split('_')

# If the node is in a downed state, report 0 resource availability.
if re.search("drain", state):
Expand All @@ -124,17 +120,17 @@ def count_idle_gpu_resources(cluster: str, partition: str) -> dict[int, int]:
return return_dict

def count_idle_resources(self, cluster: str, partition: str) -> dict[int, int]:
"""Determine the number of idle resources on a given cluster partition
"""Determine the number of idle resources on a given cluster partition.
The returned dictionary maps the number of idle resources (e.g., cores)
to the number of nodes in the partition having that many resources idle.
Args:
cluster: The cluster to print a summary for
partition: The partition in the parent cluster
cluster: The cluster to print a summary for.
partition: The partition in the parent cluster.
Returns:
A dictionary mapping idle resources to number of nodes
A dictionary mapping idle resources to number of nodes.
"""

cluster_type = self.cluster_types[cluster]
Expand All @@ -146,38 +142,38 @@ def count_idle_resources(self, cluster: str, partition: str) -> dict[int, int]:

raise ValueError(f'Unknown cluster type: {cluster}')

def print_partition_summary(self, cluster: str, partition: str, idle_resources: dict[int, int]) -> None:
    """Print a summary of idle resources in a single partition.

    The idle resource counts are supplied by the caller; this method only
    formats and prints them.

    Args:
        cluster: The cluster to print a summary for.
        partition: The partition in the parent cluster.
        idle_resources: Dictionary mapping idle resources to number of nodes.
    """

    output_width = 30
    header = f'Cluster: {cluster}, Partition: {partition}'
    unit = self.cluster_types[cluster]

    print(header)
    print('=' * output_width)

    # One row per distinct idle count, in ascending order of idle resources
    for idle, nodes in sorted(idle_resources.items()):
        print(f'{nodes:4d} nodes w/ {idle:3d} idle {unit}')

    if not idle_resources:
        print(' No idle resources')

    # Trailing blank line separates consecutive partition summaries
    print('')

def app_logic(self, args: Namespace) -> None:
    """Logic to evaluate when executing the application.

    For every selected cluster, the idle resources of each requested
    partition are counted and then printed as a summary.

    Args:
        args: Parsed command line arguments.
    """

    for cluster in self.get_cluster_list(args):
        # Fall back to every partition Slurm reports when none were requested explicitly
        partitions_to_print = args.partition or Slurm.get_partition_names(cluster)
        for partition in partitions_to_print:
            idle_resources = self.count_idle_resources(cluster, partition)
            self.print_partition_summary(cluster, partition, idle_resources)
83 changes: 80 additions & 3 deletions tests/test_crc_idle.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from argparse import Namespace
from unittest import TestCase
from unittest.mock import patch
from unittest.mock import call, Mock, patch

from apps.crc_idle import CrcIdle
from apps.utils.system_info import Slurm
Expand Down Expand Up @@ -50,7 +50,7 @@ def test_clusters_default_to_false(self) -> None:
class GetClusterList(TestCase):
"""Test the selection of which clusters to print"""

def test_get_cluster_list_no_arguments(self):
def test_get_cluster_list_no_arguments(self) -> None:
"""Test returned values when no clusters are specified."""

app = CrcIdle()
Expand All @@ -60,11 +60,88 @@ def test_get_cluster_list_no_arguments(self):
expected = tuple(app.cluster_types.keys())
self.assertEqual(expected, result)

def test_get_cluster_list_with_cluster_arguments(self):
def test_get_cluster_list_with_cluster_arguments(self) -> None:
"""Test returned values when select clusters are specified."""

app = CrcIdle()
args = Namespace(smp=True, gpu=False, mpi=True, invest=False, htc=False, partition=None)
result = app.get_cluster_list(args)

self.assertEqual(('smp', 'mpi'), result)


class CountIdleResources(TestCase):
    """Test the counting of idle CPU/GPU resources"""

    @patch('apps.utils.Shell.run_command')
    def test_count_idle_cpu_resources(self, mock_run_command: Mock) -> None:
        """Test counting idle CPU resources."""

        cluster = 'smp'
        partition = 'default'

        # One record per node: <name>,<allocated>/<idle>/<other>/<total>
        mock_run_command.return_value = "node1,0/4/0/4\nnode2,1/2/0/3"

        app = CrcIdle()
        result = app.count_idle_resources(cluster, partition)

        # One node with 4 idle cores and one node with 2 idle cores
        expected = {4: 1, 2: 1}
        self.assertEqual(expected, result)

    @patch('apps.utils.Shell.run_command')
    def test_count_idle_gpu_resources(self, mock_run_command: Mock) -> None:
        """Test counting idle GPU resources."""

        cluster = 'gpu'
        partition = 'default'

        # One record per node: <name>_<total>_<allocated>_<state>
        mock_run_command.return_value = "node1_4_2_idle\nnode2_4_4_drain"

        app = CrcIdle()
        result = app.count_idle_resources(cluster, partition)

        # The draining node is expected to be counted as having zero idle GPUs
        expected = {2: 1, 0: 1}
        self.assertEqual(expected, result)


class PrintPartitionSummary(TestCase):
    """Test the printing of a partition summary"""

    @patch('builtins.print')
    def test_print_partition_summary_with_idle_resources(self, mock_print: Mock) -> None:
        """Test printing a summary with idle resources."""

        cluster = 'smp'
        partition = 'default'
        idle_resources = {2: 3, 4: 1}  # 3 nodes with 2 idle resources, 1 node with 4 idle resources

        app = CrcIdle()
        app.print_partition_summary(cluster, partition, idle_resources)

        # Rows are fixed width: node counts are right-aligned to 4 columns, idle counts to 3 columns
        mock_print.assert_has_calls([
            call(f'Cluster: {cluster}, Partition: {partition}'),
            call('=' * 30),
            call('   3 nodes w/   2 idle cores'),
            call('   1 nodes w/   4 idle cores'),
            call('')
        ], any_order=False)

    @patch('builtins.print')
    def test_print_partition_summary_no_idle_resources(self, mock_print: Mock) -> None:
        """Test printing a summary when no idle resources are available."""

        cluster = 'smp'
        partition = 'default'
        idle_resources = dict()  # No idle resources

        app = CrcIdle()
        app.print_partition_summary(cluster, partition, idle_resources)

        # A single ordered check covers both the content and the sequence of the printed lines
        mock_print.assert_has_calls([
            call(f'Cluster: {cluster}, Partition: {partition}'),
            call('=' * 30),
            call(' No idle resources'),
            call('')
        ], any_order=False)

0 comments on commit 19a4483

Please sign in to comment.