From c3fcc3dffa55934583ec6627283844f4277c5710 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yuan-Ting=20Hsieh=20=28=E8=AC=9D=E6=B2=85=E5=BB=B7=29?= Date: Wed, 22 Jan 2025 10:41:07 -0800 Subject: [PATCH] Disable HA CI tests (#3097) ### Description - Disable HA tests as the current impl. is obsolete - Make master_template consistent (in nvflare/lighter/templates/master_template.yml and nvflare/lighter/impl/master_template.yml) ### Types of changes - [x] Non-breaking change (fix or new feature that would not break existing functionality). - [ ] Breaking change (fix or new feature that would cause existing functionality to change). - [ ] New tests added to cover the changes. - [ ] Quick tests passed locally by running `./runtest.sh`. - [ ] In-line docstrings updated. - [ ] Documentation updated. Co-authored-by: Sean Yang --- .../data/projects/ha_2_servers_2_clients.yml | 47 --------------- .../data/test_configs/ha/fladminapi.yml | 30 ---------- ...rver_during_training_after_first_round.yml | 60 ------------------- ...ver_during_training_before_first_round.yml | 60 ------------------- .../kill_server_after_training_complete.yml | 33 ---------- ...rver_during_training_after_first_round.yml | 56 ----------------- ...ver_during_training_before_first_round.yml | 56 ----------------- ...l_server_during_training_sending_model.yml | 48 --------------- .../data/test_configs/ha/two_servers.yml | 26 -------- .../integration_test/run_integration_tests.sh | 2 +- tests/integration_test/test_configs.yml | 9 --- 11 files changed, 1 insertion(+), 426 deletions(-) delete mode 100644 tests/integration_test/data/projects/ha_2_servers_2_clients.yml delete mode 100644 tests/integration_test/data/test_configs/ha/fladminapi.yml delete mode 100644 tests/integration_test/data/test_configs/ha/kill_one_server_during_training_after_first_round.yml delete mode 100644 tests/integration_test/data/test_configs/ha/kill_one_server_during_training_before_first_round.yml delete mode 100644 tests/integration_test/data/test_configs/ha/kill_server_after_training_complete.yml delete mode 100644 tests/integration_test/data/test_configs/ha/kill_server_during_training_after_first_round.yml delete mode 100644 tests/integration_test/data/test_configs/ha/kill_server_during_training_before_first_round.yml delete mode 100644 tests/integration_test/data/test_configs/ha/kill_server_during_training_sending_model.yml delete mode 100644 tests/integration_test/data/test_configs/ha/two_servers.yml diff --git a/tests/integration_test/data/projects/ha_2_servers_2_clients.yml b/tests/integration_test/data/projects/ha_2_servers_2_clients.yml deleted file mode 100644 index 211e48178d..0000000000 --- a/tests/integration_test/data/projects/ha_2_servers_2_clients.yml +++ /dev/null @@ -1,47 +0,0 @@ -api_version: 3 -name: integration_test -description: NVIDIA FLARE integration_test project yaml file - -participants: - - name: localhost - type: overseer - org: nvidia - protocol: https - api_root: /api/v1 - port: 8443 - - name: localhost0 - type: server - org: nvidia - fed_learn_port: 8002 - admin_port: 8003 - - name: localhost1 - type: server - org: nvidia - fed_learn_port: 8102 - admin_port: 8103 - - name: site-1 - type: client - org: nvidia - - name: site-2 - type: client - org: nvidia - - name: super@test.org - type: admin - org: nvidia - role: project_admin - -# The same methods in all builders are called in their order defined in builders section -builders: - - path: nvflare.lighter.impl.workspace.WorkspaceBuilder - args: - template_file: master_template.yml - - path: nvflare.lighter.impl.template.TemplateBuilder - - path: nvflare.lighter.impl.static_file.StaticFileBuilder - args: - config_folder: config - overseer_agent: - path: nvflare.ha.overseer_agent.HttpOverseerAgent - overseer_exists: true - - - path: nvflare.lighter.impl.cert.CertBuilder - - path: nvflare.lighter.impl.signature.SignatureBuilder diff --git a/tests/integration_test/data/test_configs/ha/fladminapi.yml b/tests/integration_test/data/test_configs/ha/fladminapi.yml deleted file mode 100644 index 60272014a0..0000000000 --- a/tests/integration_test/data/test_configs/ha/fladminapi.yml +++ /dev/null @@ -1,30 +0,0 @@ -ha: True -jobs_root_dir: ./data/apps -cleanup: True -project_yaml: ./data/projects/ha_1_servers_2_clients.yml - - -tests: - - test_name: "run fl admin api" - validators: - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job slow_job" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "server_job_log" - "data": "sent task assignment to client" - "actions": [ "run_admin_commands" ] - "result": - "type": "run_state" - "data": { } - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "mark_test_done" ] - "result": - "type": "run_state" - "data": { } diff --git a/tests/integration_test/data/test_configs/ha/kill_one_server_during_training_after_first_round.yml b/tests/integration_test/data/test_configs/ha/kill_one_server_during_training_after_first_round.yml deleted file mode 100644 index e15d731ca3..0000000000 --- a/tests/integration_test/data/test_configs/ha/kill_one_server_during_training_after_first_round.yml +++ /dev/null @@ -1,60 +0,0 @@ -ha: True -jobs_root_dir: ./data/apps -cleanup: True -project_yaml: ./data/projects/ha_2_servers_2_clients.yml -poll_period: 0.1 - -tests: - - test_name: "upload slow_job, kill one server after we start training and the first round is completed in SAG, - second server should pick up the work and run to completion" - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job slow_job" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 1 - } - } - } - "actions": [ - "kill server localhost0", - "sleep 5", - ] - "result": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 1 - } - } - } - - "trigger": - "type": "server_job_log" - "data": "Round 1 started" - "actions": [ "no_op" ] - "result": - "type": "run_state" - "data": { } - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ "ensure_current_job_done 60" ] - "result": - "type": "run_state" - "data": { "run_finished": True } - - validators: - - path: tests.integration_test.src.validators.NumpySAGResultValidator - args: { expected_result: [ [ 3, 4, 5 ], [ 6, 7, 8 ], [ 9, 10, 11 ] ] } diff --git a/tests/integration_test/data/test_configs/ha/kill_one_server_during_training_before_first_round.yml b/tests/integration_test/data/test_configs/ha/kill_one_server_during_training_before_first_round.yml deleted file mode 100644 index 599dd0c936..0000000000 --- a/tests/integration_test/data/test_configs/ha/kill_one_server_during_training_before_first_round.yml +++ /dev/null @@ -1,60 +0,0 @@ -ha: True -jobs_root_dir: ./data/apps -cleanup: True -project_yaml: ./data/projects/ha_2_servers_2_clients.yml -poll_period: 0.1 - -tests: - - test_name: "upload slow_job, kill one server after we start training but no round is completed in SAG, - second server should pick up the work and run to completion" - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job slow_job" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 0 - } - } - } - "actions": [ - "kill server localhost0", - "sleep 5", - ] - "result": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 0 - } - } - } - - "trigger": - "type": "server_job_log" - "data": "Round 0 started" - "actions": [ "no_op" ] - "result": - "type": "run_state" - "data": { } - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ "ensure_current_job_done 60" ] - "result": - "type": "run_state" - "data": { "run_finished": True } - - validators: - - path: tests.integration_test.src.validators.NumpySAGResultValidator - args: { expected_result: [ [ 3, 4, 5 ], [ 6, 7, 8 ], [ 9, 10, 11 ] ] } diff --git a/tests/integration_test/data/test_configs/ha/kill_server_after_training_complete.yml b/tests/integration_test/data/test_configs/ha/kill_server_after_training_complete.yml deleted file mode 100644 index 01961760d6..0000000000 --- a/tests/integration_test/data/test_configs/ha/kill_server_after_training_complete.yml +++ /dev/null @@ -1,33 +0,0 @@ -ha: True -jobs_root_dir: ./data/apps -cleanup: True -project_yaml: ./data/projects/ha_1_servers_2_clients.yml -poll_period: 0.1 - -tests: - - test_name: "upload np_sag and let it finish" - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job np_sag" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ - "kill server", - "sleep 10", - "start server", - ] - "result": - "type": "run_state" - "data": { "run_finished": True } - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ "ensure_current_job_done 60" ] - "result": - "type": "run_state" - "data": { "run_finished": True } diff --git a/tests/integration_test/data/test_configs/ha/kill_server_during_training_after_first_round.yml b/tests/integration_test/data/test_configs/ha/kill_server_during_training_after_first_round.yml deleted file mode 100644 index 1677ba762c..0000000000 --- a/tests/integration_test/data/test_configs/ha/kill_server_during_training_after_first_round.yml +++ /dev/null @@ -1,56 +0,0 @@ -ha: True -jobs_root_dir: ./data/apps -cleanup: True -project_yaml: ./data/projects/ha_1_servers_2_clients.yml -poll_period: 0.1 - -tests: - - test_name: "upload slow_job, kill the server during training after SAG first round, - restart it should pick up the work" - - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job slow_job" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 1 - } - } - } - "actions": [ - "kill server", - "sleep 10", - "start server", - "sleep 1" - ] - "result": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 1 - } - } - } - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ "ensure_current_job_done 60" ] - "result": - "type": "run_state" - "data": { "run_finished": True } - - validators: - - path: tests.integration_test.src.validators.NumpySAGResultValidator - args: { expected_result: [ [ 3, 4, 5 ], [ 6, 7, 8 ], [ 9, 10, 11 ] ] } diff --git a/tests/integration_test/data/test_configs/ha/kill_server_during_training_before_first_round.yml b/tests/integration_test/data/test_configs/ha/kill_server_during_training_before_first_round.yml deleted file mode 100644 index 302c6324d8..0000000000 --- a/tests/integration_test/data/test_configs/ha/kill_server_during_training_before_first_round.yml +++ /dev/null @@ -1,56 +0,0 @@ -ha: True -jobs_root_dir: ./data/apps -cleanup: True -project_yaml: ./data/projects/ha_1_servers_2_clients.yml -poll_period: 0.1 - -tests: - - test_name: "upload slow_job, kill the server after we start training but no round is completed in SAG, - restart it should pick up the work" - - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job slow_job" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 0 - } - } - } - "actions": [ - "kill server", - "sleep 10", - "start server", - "sleep 1" - ] - "result": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 0 - } - } - } - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ "ensure_current_job_done 60" ] - "result": - "type": "run_state" - "data": { "run_finished": True } - - validators: - - path: tests.integration_test.src.validators.NumpySAGResultValidator - args: { expected_result: [ [ 3, 4, 5 ], [ 6, 7, 8 ], [ 9, 10, 11 ] ] } diff --git a/tests/integration_test/data/test_configs/ha/kill_server_during_training_sending_model.yml b/tests/integration_test/data/test_configs/ha/kill_server_during_training_sending_model.yml deleted file mode 100644 index 5fea19adf0..0000000000 --- a/tests/integration_test/data/test_configs/ha/kill_server_during_training_sending_model.yml +++ /dev/null @@ -1,48 +0,0 @@ -ha: True -jobs_root_dir: ./data/apps -cleanup: True -project_yaml: ./data/projects/ha_1_servers_2_clients.yml -poll_period: 0.1 - -tests: - - test_name: "upload slow_job, kill the server during sending models to clients, - restart it should pick up the work" - - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job slow_job" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "server_job_log" - "data": "sent task assignment to client" - "actions": [ - "kill server", - "sleep 10", - "start server", - "sleep 1" - ] - "result": - "type": "run_state" - "data": - { - "workflows": { - "ScatterAndGather": { - "phase": "train", - "current_round": 0 - } - } - } - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ "ensure_current_job_done 60" ] - "result": - "type": "run_state" - "data": { "run_finished": True } - - validators: - - path: tests.integration_test.src.validators.NumpySAGResultValidator - args: { expected_result: [ [ 3, 4, 5 ], [ 6, 7, 8 ], [ 9, 10, 11 ] ] } diff --git a/tests/integration_test/data/test_configs/ha/two_servers.yml b/tests/integration_test/data/test_configs/ha/two_servers.yml deleted file mode 100644 index feb0b43dde..0000000000 --- a/tests/integration_test/data/test_configs/ha/two_servers.yml +++ /dev/null @@ -1,26 +0,0 @@ -ha: True -jobs_root_dir: ./data/jobs -cleanup: True -project_yaml: ./data/projects/ha_2_servers_2_clients.yml - - -tests: - - test_name: "upload a job, wait for it to finish" - event_sequence: - - "trigger": - "type": "server_log" - "data": "Server started" - "actions": [ "submit_job hello-numpy-sag" ] - "result": - "type": "job_submit_success" - - "trigger": - "type": "run_state" - "data": { "run_finished": True } - "actions": [ "ensure_current_job_done" ] - "result": - "type": "run_state" - "data": { "run_finished": True } - - validators: - - path: tests.integration_test.src.validators.NumpySAGResultValidator - args: { expected_result: [ [ 4, 5, 6 ], [ 7, 8, 9 ], [ 10, 11, 12 ] ] } diff --git a/tests/integration_test/run_integration_tests.sh b/tests/integration_test/run_integration_tests.sh index 50d34e6c9d..299308b49f 100755 --- a/tests/integration_test/run_integration_tests.sh +++ b/tests/integration_test/run_integration_tests.sh @@ -3,7 +3,7 @@ set -e PYTHONPATH="${PWD}/../.." -backends=(numpy tensorflow pytorch overseer ha auth preflight cifar auto stats xgboost client_api client_api_qa model_controller_api) +backends=(numpy tensorflow pytorch overseer auth preflight cifar auto stats xgboost client_api client_api_qa model_controller_api) usage() { diff --git a/tests/integration_test/test_configs.yml b/tests/integration_test/test_configs.yml index 52aa9455de..867bddd327 100644 --- a/tests/integration_test/test_configs.yml +++ b/tests/integration_test/test_configs.yml @@ -6,15 +6,6 @@ test_configs: - ./data/test_configs/authorization/abort_job.yml - ./data/test_configs/authorization/list_job.yml - ./data/test_configs/authorization/shell_commands.yml - ha: - - ./data/test_configs/ha/kill_one_server_during_training_after_first_round.yml - - ./data/test_configs/ha/kill_one_server_during_training_before_first_round.yml - - ./data/test_configs/ha/kill_server_after_training_complete.yml - - ./data/test_configs/ha/kill_server_during_training_after_first_round.yml - - ./data/test_configs/ha/kill_server_during_training_before_first_round.yml - - ./data/test_configs/ha/kill_server_during_training_sending_model.yml - - ./data/test_configs/ha/two_servers.yml - - ./data/test_configs/ha/fladminapi.yml numpy: - ./data/test_configs/standalone_job/np_job.yml - ./data/test_configs/standalone_job/np_app.yml