From f8351447c6fd3de6a10cccf2019b210ab5af4286 Mon Sep 17 00:00:00 2001
From: Andrzej-Zukowski <150017073+Andrzej-Zukowski@users.noreply.github.com>
Date: Thu, 25 Jul 2024 13:47:11 +0200
Subject: [PATCH] Feature issue 1219 (#1221)

* [ISSUE-1219] Add E2E test for auto drive replacement with Storage Group (HDD SC) - One volume per pod

Signed-off-by: Andrzej Zukowski

* [ISSUE-1219] Check CR removal before event generation

Signed-off-by: Andrzej Zukowski

* [ISSUE-1219] Conflict resolution

Signed-off-by: Andrzej Zukowski

* [ISSUE-1219] Function declaration adjustment

Signed-off-by: Andrzej Zukowski

---------

Signed-off-by: Andrzej Zukowski
---
 .../test_drive_replacement_one_volume.py      | 180 ++++++++++++++++++
 1 file changed, 180 insertions(+)
 create mode 100644 tests/e2e-test-framework/tests/test_drive_replacement_one_volume.py

diff --git a/tests/e2e-test-framework/tests/test_drive_replacement_one_volume.py b/tests/e2e-test-framework/tests/test_drive_replacement_one_volume.py
new file mode 100644
index 000000000..63ee48986
--- /dev/null
+++ b/tests/e2e-test-framework/tests/test_drive_replacement_one_volume.py
@@ -0,0 +1,180 @@
+import logging
+from typing import Dict
+import pytest
+
+import framework.const as const
+
+from framework.sts import STS
+from framework.utils import Utils
+from framework.drive import DriveUtils
+
+
+class TestAutoDriveReplacementWithOneVolumePerPod:
+    @classmethod
+    @pytest.fixture(autouse=True)
+    def setup_class(
+        cls,
+        namespace: str,
+        drive_utils_executors: Dict[str, DriveUtils],
+        utils: Utils,
+    ):
+        cls.namespace = namespace
+        cls.name = "test-auto-drive-replacement-one-volume"
+        cls.timeout = 120
+        cls.replicas = 1
+
+        cls.utils = utils
+        cls.drive_utils = drive_utils_executors
+        cls.sts = STS(cls.namespace, cls.name, cls.replicas)
+        cls.sts.delete()
+        cls.sts.create(storage_classes=[const.HDD_SC])
+
+        yield
+
+        cls.sts.delete()
+
+    @pytest.mark.hal
+    def test_5771_auto_drive_replacement_with_one_volume_per_pod(self):
+        # 1. get volume for deployed pod
+        assert (
+            self.sts.verify(self.timeout) is True
+        ), f"STS: {self.name} failed to reach desired number of replicas: {self.replicas}"
+        pod = self.utils.list_pods(name_prefix=self.name)[0]
+        node_ip = self.utils.get_pod_node_ip(
+            pod_name=pod.metadata.name, namespace=self.namespace
+        )
+        volumes = self.utils.list_volumes(pod_name=pod.metadata.name)
+        assert (
+            len(volumes) == 1
+        ), f"Unexpected number of volumes: {len(volumes)}"
+        volume = volumes[0]
+
+        # get the drive CR backing the volume
+        drive = self.utils.get_drive_cr(
+            volume_name=volume["metadata"]["name"],
+            namespace=volume["metadata"]["namespace"])
+
+        # 2.1 simulate drive failure: annotate the drive used by the pod with health=BAD
+        drive_name = drive["metadata"]["name"]
+        self.utils.annotate_custom_resource(
+            resource_name=drive_name,
+            resource_type="drives",
+            annotation_key=const.DRIVE_HEALTH_ANNOTATION,
+            annotation_value=const.DRIVE_HEALTH_BAD_ANNOTATION,
+        )
+        logging.info(f"drive: {drive_name} was annotated with health=BAD")
+
+        # 2.2 wait until drive health is BAD, status=ONLINE, usage=RELEASING
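+        # NOTE: wait_drive/wait_volume presumably poll the corresponding CR
+        # until every expected field matches or a framework-level timeout
+        # expires, returning False on timeout (assumption, not verified here).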
+        drive_name = drive["metadata"]["name"]
+        logging.info(f"Waiting for drive: {drive_name}")
+        assert self.utils.wait_drive(
+            name=drive_name,
+            expected_status=const.STATUS_ONLINE,
+            expected_health=const.HEALTH_BAD,
+            expected_usage=const.USAGE_RELEASING
+        ), f"Drive {drive_name} failed to reach expected Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}"
+        logging.info(f"drive {drive_name} reached Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}")
+
+        # 2.3 wait until volume health is BAD, status=OPERATIVE, usage=RELEASING
+        volume_name = volume["metadata"]["name"]
+        logging.info(f"Waiting for volume: {volume_name}")
+        assert self.utils.wait_volume(
+            name=volume_name,
+            expected_health=const.HEALTH_BAD,
+            expected_usage=const.USAGE_RELEASING,
+            expected_operational_status=const.STATUS_OPERATIVE
+        ), f"Volume {volume_name} failed to reach OperationalStatus: {const.STATUS_OPERATIVE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}"
+        logging.info(f"volume {volume_name} reached OperationalStatus: {const.STATUS_OPERATIVE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_RELEASING}")
+
+        # 3. check events and locate the event related to DriveHealthFailure
+        drive_name = drive["metadata"]["name"]
+        assert self.utils.event_in(
+            resource_name=drive_name,
+            reason=const.DRIVE_HEALTH_FAILURE,
+        ), f"event {const.DRIVE_HEALTH_FAILURE} for drive {drive_name} not found"
+
+        # 6.1 annotate volume with release=done
+        volume_name = volume["metadata"]["name"]
+        self.utils.annotate_custom_resource(
+            resource_name=volume_name,
+            resource_type="volumes",
+            annotation_key="release",
+            annotation_value="done",
+            namespace=volume["metadata"]["namespace"]
+        )
+        logging.info(f"volume: {volume_name} was annotated with release=done")
+
+        # 6.2 check drive usage is RELEASED
+        assert self.utils.wait_drive(
+            name=drive["metadata"]["name"],
+            expected_usage=const.USAGE_RELEASED
+        ), f"Drive {drive_name} failed to reach expected Usage: {const.USAGE_RELEASED}"
+        logging.info(f"drive {drive_name} reached Usage: {const.USAGE_RELEASED}")
+
+        # 6.3 check volume usage is RELEASED
+        assert self.utils.wait_volume(
+            name=volume["metadata"]["name"],
+            expected_usage=const.USAGE_RELEASED
+        ), f"Volume {volume_name} failed to reach expected Usage: {const.USAGE_RELEASED}"
+        logging.info(f"volume {volume_name} reached Usage: {const.USAGE_RELEASED}")
+
+        # 7. check event DriveReadyForRemoval is generated; also locate the event related to VolumeBadHealth
+        drive_name = drive["metadata"]["name"]
+        assert self.utils.event_in(
+            resource_name=drive_name,
+            reason=const.DRIVE_READY_FOR_REMOVAL,
+        ), f"event {const.DRIVE_READY_FOR_REMOVAL} for drive {drive_name} not found"
+
+        volume_name = volume["metadata"]["name"]
+        assert self.utils.event_in(
+            resource_name=volume_name,
+            reason=const.VOLUME_BAD_HEALTH,
+        ), f"event {const.VOLUME_BAD_HEALTH} for volume {volume_name} not found"
+
+        # 8. delete pod and pvc
+        self.utils.clear_pvc_and_pod(
+            pod_name=pod.metadata.name, namespace=self.namespace
+        )
+
+        # 9. check drive status to be REMOVING or REMOVED
+        # 10. check LED state to be 1 (if drive supports LED) or 2 (if drive does not support LED)
+        # 11. check drive status to be ONLINE
+        assert self.utils.wait_drive(
+            name=drive["metadata"]["name"],
+            expected_status=const.STATUS_ONLINE,
+            expected_usage=const.USAGE_REMOVED,
+            expected_health=const.HEALTH_BAD,
+            expected_led_state=const.LED_STATE
+        ), f"Drive {drive_name} failed to reach expected Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_REMOVED}, LEDState: {drive['spec']['LEDState']}"
+        logging.info(f"drive {drive_name} reached Status: {const.STATUS_ONLINE}, Health: {const.HEALTH_BAD}, Usage: {const.USAGE_REMOVED}, LEDState: {drive['spec']['LEDState']}")
+
+        # 12. check for events: DriveReadyForPhysicalRemoval
+        drive_name = drive["metadata"]["name"]
+        assert self.utils.event_in(
+            resource_name=drive_name,
+            reason=const.DRIVE_READY_FOR_PHYSICAL_REMOVAL,
+        ), f"event {const.DRIVE_READY_FOR_PHYSICAL_REMOVAL} for drive {drive_name} not found"
+
+        # 13. get the node ID on which the drive resides, obtain the path of the affected drive, and identify the node name for the corresponding node ID
+        drive_name = drive["metadata"]["name"]
+        drive_path = drive["spec"]["Path"]
+        assert drive_path, f"Drive path for drive {drive_name} not found"
+        logging.info(f"drive_path: {drive_path}")
+
+        scsi_id = self.drive_utils[node_ip].get_scsi_id(drive_path)
+        assert scsi_id, f"scsi_id for drive {drive_name} not found"
+        logging.info(f"scsi_id: {scsi_id}")
+
+        # 14. remove drive
+        self.drive_utils[node_ip].remove(scsi_id)
+        logging.info(f"drive {drive_path}, {scsi_id} removed")
+
+        # 15. check drive CR successfully removed
+        drive_name = drive["metadata"]["name"]
+        assert self.utils.check_drive_cr_exist_or_not(
+            drive_name=drive_name, cr_existence=False
+        ), f"Drive CR {drive_name} still exists"
+
+        # 16. check for the DriveSuccessfullyRemoved reason in kubernetes events
+        drive_name = drive["metadata"]["name"]
+        assert self.utils.event_in(
+            resource_name=drive_name,
+            reason=const.DRIVE_SUCCESSFULLY_REMOVED,
+        ), f"event {const.DRIVE_SUCCESSFULLY_REMOVED} for drive {drive_name} not found"
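
The test runs the same annotate-then-wait sequence twice (health=BAD on the drive in steps 2.1-2.2, release=done on the volume in steps 6.1-6.3). A helper along the following lines could fold that pattern into one call. This is only a sketch, not part of the patch: it reuses annotate_custom_resource and wait_drive exactly as the test invokes them, while the helper itself and its **expected keyword forwarding are hypothetical.

def annotate_drive_and_wait(utils, drive_name, key, value, **expected):
    # Hypothetical helper (sketch only): annotate a Drive CR, then block
    # until wait_drive reports the expected Status/Health/Usage fields.
    utils.annotate_custom_resource(
        resource_name=drive_name,
        resource_type="drives",
        annotation_key=key,
        annotation_value=value,
    )
    assert utils.wait_drive(
        name=drive_name, **expected
    ), f"Drive {drive_name} failed to reach expected state: {expected}"

Step 2 would then reduce to a single call: annotate_drive_and_wait(self.utils, drive_name, const.DRIVE_HEALTH_ANNOTATION, const.DRIVE_HEALTH_BAD_ANNOTATION, expected_status=const.STATUS_ONLINE, expected_health=const.HEALTH_BAD, expected_usage=const.USAGE_RELEASING).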