diff --git a/apps/crc_idle.py b/apps/crc_idle.py index 8e9cbfb..d9bb384 100755 --- a/apps/crc_idle.py +++ b/apps/crc_idle.py @@ -9,8 +9,8 @@ from argparse import Namespace from collections import defaultdict -from .utils.cli import BaseParser from .utils import Shell, Slurm +from .utils.cli import BaseParser class CrcIdle(BaseParser): @@ -59,56 +59,52 @@ def get_cluster_list(self, args: Namespace) -> tuple[str]: @staticmethod def count_idle_cpu_resources(cluster: str, partition: str) -> dict[int, int]: - """Return the idle CPU resources on a given cluster partition + """Return the idle CPU resources on a given cluster partition. Args: - cluster: The cluster to print a summary for - partition: The partition in the parent cluster + cluster: The cluster to print a summary for. + partition: The partition in the parent cluster. Returns: - A dictionary mapping idle resources to number of nodes + A dictionary mapping the number of idle resources to the number of nodes with that many idle resources. """ # Use `sinfo` command to determine the status of each node in the given partition command = f'sinfo -h -M {cluster} -p {partition} -N -o %N,%C' - stdout = Shell.run_command(command) - slurm_data = stdout.strip().split() + slurm_data = Shell.run_command(command).strip().split() # Count the number of nodes having a given number of idle cores/GPUs return_dict = dict() for node_info in slurm_data: - _, resource_data = node_info.split(',') # Returns: node_name, resource_data - _, idle, _, _ = [int(x) for x in resource_data.split('/')] # Returns: allocated, idle, other, total + node_name, resource_data = node_info.split(',') + allocated, idle, other, total = [int(x) for x in resource_data.split('/')] return_dict[idle] = return_dict.setdefault(idle, 0) + 1 return return_dict @staticmethod def count_idle_gpu_resources(cluster: str, partition: str) -> dict[int, int]: - """Return idle GPU resources on a given cluster partition + """Return idle GPU resources on a given cluster partition. If the host node is in a `drain` state, the GPUs are reported as unavailable. Args: - cluster: The cluster to print a summary for - partition: The partition in the parent cluster + cluster: The cluster to print a summary for. + partition: The partition in the parent cluster. Returns: - A dictionary mapping idle resources to number of nodes + A dictionary mapping the number of idle resources to the number of nodes with that many idle resources. """ # Use `sinfo` command to determine the status of each node in the given partition - command = f"sinfo -h -M {cluster} -p {partition} -N " \ - f"--Format=NodeList:'_',gres:5'_',gresUsed:12'_',StateCompact:' '" - - stdout = Shell.run_command(command) - slurm_data = stdout.strip().split() + slurm_output_format = "NodeList:'_',gres:5'_',gresUsed:12'_',StateCompact:' '" + command = f"sinfo -h -M {cluster} -p {partition} -N --Format={slurm_output_format}" + slurm_data = Shell.run_command(command).strip().split() # Count the number of nodes having a given number of idle cores/GPUs return_dict = dict() for node_info in slurm_data: - # Returns: node_name, total, allocated, node state - _, total, allocated, state = node_info.split('_') + node_name, total, allocated, state = node_info.split('_') # If the node is in a downed state, report 0 resource availability. if re.search("drain", state): @@ -124,17 +120,17 @@ def count_idle_gpu_resources(cluster: str, partition: str) -> dict[int, int]: return return_dict def count_idle_resources(self, cluster: str, partition: str) -> dict[int, int]: - """Determine the number of idle resources on a given cluster partition + """Determine the number of idle resources on a given cluster partition. The returned dictionary maps the number of idle resources (e.g., cores) to the number of nodes in the partition having that many resources idle. Args: - cluster: The cluster to print a summary for - partition: The partition in the parent cluster + cluster: The cluster to print a summary for. + partition: The partition in the parent cluster. Returns: - A dictionary mapping idle resources to number of nodes + A dictionary mapping idle resources to number of nodes. """ cluster_type = self.cluster_types[cluster] @@ -146,38 +142,38 @@ def count_idle_resources(self, cluster: str, partition: str) -> dict[int, int]: raise ValueError(f'Unknown cluster type: {cluster}') - def print_partition_summary(self, cluster: str, partition: str) -> None: + def print_partition_summary(self, cluster: str, partition: str, idle_resources: dict) -> None: """Print a summary of idle resources in a single partition Args: cluster: The cluster to print a summary for partition: The partition in the parent cluster + idle_resources: Dictionary mapping idle resources to number of nodes """ - resource_allocation = self.count_idle_resources(cluster, partition) - output_width = 30 header = f'Cluster: {cluster}, Partition: {partition}' unit = self.cluster_types[cluster] print(header) print('=' * output_width) - for idle, nodes in sorted(resource_allocation.items()): + for idle, nodes in sorted(idle_resources.items()): print(f'{nodes:4d} nodes w/ {idle:3d} idle {unit}') - if not resource_allocation: + if not idle_resources: print(' No idle resources') print('') def app_logic(self, args: Namespace) -> None: - """Logic to evaluate when executing the application + """Logic to evaluate when executing the application. Args: - args: Parsed command line arguments + args: Parsed command line arguments. """ for cluster in self.get_cluster_list(args): partitions_to_print = args.partition or Slurm.get_partition_names(cluster) for partition in partitions_to_print: - self.print_partition_summary(cluster, partition) + idle_resources = self.count_idle_resources(cluster, partition) + self.print_partition_summary(cluster, partition, idle_resources) diff --git a/tests/test_crc_idle.py b/tests/test_crc_idle.py index cb3cbd9..e64b3c7 100644 --- a/tests/test_crc_idle.py +++ b/tests/test_crc_idle.py @@ -2,7 +2,7 @@ from argparse import Namespace from unittest import TestCase -from unittest.mock import patch +from unittest.mock import call, Mock, patch from apps.crc_idle import CrcIdle from apps.utils.system_info import Slurm @@ -50,7 +50,7 @@ def test_clusters_default_to_false(self) -> None: class GetClusterList(TestCase): """Test the selection of which clusters to print""" - def test_get_cluster_list_no_arguments(self): + def test_get_cluster_list_no_arguments(self) -> None: """Test returned values when no clusters are specified.""" app = CrcIdle() @@ -60,7 +60,7 @@ def test_get_cluster_list_no_arguments(self): expected = tuple(app.cluster_types.keys()) self.assertEqual(expected, result) - def test_get_cluster_list_with_cluster_arguments(self): + def test_get_cluster_list_with_cluster_arguments(self) -> None: """Test returned values when select clusters are specified.""" app = CrcIdle() @@ -68,3 +68,80 @@ def test_get_cluster_list_with_cluster_arguments(self): result = app.get_cluster_list(args) self.assertEqual(('smp', 'mpi'), result) + + +class CountIdleResources(TestCase): + """Test the counting of idle CPU/DPU resources""" + + @patch('apps.utils.Shell.run_command') + def test_count_idle_cpu_resources(self, mock_run_command: Mock) -> None: + """Test counting idle CPU resources.""" + + cluster = 'smp' + partition = 'default' + mock_run_command.return_value = "node1,2/4/0/4\nnode2,3/2/0/3" + + app = CrcIdle() + result = app.count_idle_resources(cluster, partition) + + expected = {4: 1, 2: 1} + self.assertEqual(expected, result) + + @patch('apps.utils.Shell.run_command') + def test_count_idle_gpu_resources(self, mock_run_command: Mock) -> None: + """Test counting idle GPU resources.""" + + cluster = 'gpu' + partition = 'default' + mock_run_command.return_value = "node1_4_2_idle\nnode2_4_4_drain" + + app = CrcIdle() + result = app.count_idle_resources(cluster, partition) + expected = {2: 1, 0: 1} + self.assertEqual(expected, result) + + +class PrintPartitionSummary(TestCase): + """Test the printing of a partition summary""" + + @patch('builtins.print') + def test_print_partition_summary_with_idle_resources(self, mock_print: Mock) -> None: + """Test printing a summary with idle resources.""" + + cluster = 'smp' + partition = 'default' + idle_resources = {2: 3, 4: 1} # 3 nodes with 2 idle resources, 1 node with 4 idle resources + + app = CrcIdle() + app.print_partition_summary(cluster, partition, idle_resources) + + mock_print.assert_has_calls([ + call(f'Cluster: {cluster}, Partition: {partition}'), + call('=' * 30), + call(' 3 nodes w/ 2 idle cores'), + call(' 1 nodes w/ 4 idle cores'), + call('') + ], any_order=False) + + @patch('builtins.print') + def test_print_partition_summary_no_idle_resources(self, mock_print: Mock) -> None: + """Test printing a summary when no idle resources are available.""" + + cluster = 'smp' + partition = 'default' + idle_resources = dict() # No idle resources + + app = CrcIdle() + app.print_partition_summary(cluster, partition, idle_resources) + + mock_print.assert_any_call(f'Cluster: {cluster}, Partition: {partition}') + mock_print.assert_any_call('=' * 30) + mock_print.assert_any_call(' No idle resources') + mock_print.assert_any_call('') + + mock_print.assert_has_calls([ + call(f'Cluster: {cluster}, Partition: {partition}'), + call('=====' * 6), + call(' No idle resources'), + call('') + ], any_order=False)