From cc9e97c004d36ece1da777ae6672a7613d18ac3f Mon Sep 17 00:00:00 2001 From: yuval-neureality <73497616+yuval-neureality@users.noreply.github.com> Date: Tue, 10 May 2022 19:13:31 +0300 Subject: [PATCH] submission checker updates for network mode (#1142) * submission checker updates for network mode * [submission checker] (1) added 'network' division for 2.1 submission (2) verify remote system for network division, local for closed division (3) allow network submission for open division * removed comment Co-authored-by: rameshchukka --- tools/submission/submission-checker.py | 71 ++++++++++++++++++++------ 1 file changed, 56 insertions(+), 15 deletions(-) diff --git a/tools/submission/submission-checker.py b/tools/submission/submission-checker.py index 308262f97..15b699899 100755 --- a/tools/submission/submission-checker.py +++ b/tools/submission/submission-checker.py @@ -541,7 +541,7 @@ }, } -VALID_DIVISIONS = ["open", "closed"] +VALID_DIVISIONS = ["open", "closed", "network"] VALID_AVAILABILITIES = ["available", "preview", "rdi"] REQUIRED_PERF_FILES = ["mlperf_log_summary.txt", "mlperf_log_detail.txt"] OPTIONAL_PERF_FILES = ["mlperf_log_accuracy.json"] @@ -650,6 +650,13 @@ "disk_controllers" ] +SYSTEM_DESC_IS_NETWORK_MODE = "is_network" +SYSTEM_DESC_REQUIRED_FIELDS_NETWORK_MODE = [ + SYSTEM_DESC_IS_NETWORK_MODE, "network_type", "network_rate", "nic_loadgen", "number_nic_loadgen", + "net_software_stack_loadgen", "network_protocol", "number_connections", "nic_sut", "number_nic_sut", + "net_software_stack_sut", "network_setup" +] +NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME = "Network SUT" SYSTEM_IMP_REQUIRED_FILES = [ "input_data_types", "retraining", "starting_weights_filename", "weight_data_types", @@ -890,7 +897,7 @@ def check_accuracy_dir(config, model, path, verbose): return is_valid, acc -def check_performance_dir(config, model, path, scenario_fixed): +def check_performance_dir(config, model, path, scenario_fixed, division, system_json): is_valid = False rt = {} @@ -919,6 +926,7 @@ def check_performance_dir(config, model, path, scenario_fixed): if scenario == "SingleStream": # qps_wo_loadgen_overhead is only used for inferring Offline from SingleStream; only for old submissions qps_wo_loadgen_overhead = mlperf_log["result_qps_without_loadgen_overhead"] + sut_name = mlperf_log["sut_name"] else: fname = os.path.join(path, "mlperf_log_summary.txt") with open(fname, "r") as f: @@ -942,6 +950,7 @@ def check_performance_dir(config, model, path, scenario_fixed): min_duration = int(rt["min_duration (ms)"]) if scenario == "SingleStream": qps_wo_loadgen_overhead = float(rt["QPS w/o loadgen overhead"]) + sut_name = str(rt['System Under Test (SUT) name: ']) # check if there are any errors in the detailed log fname = os.path.join(path, "mlperf_log_detail.txt") @@ -1042,6 +1051,15 @@ def check_performance_dir(config, model, path, scenario_fixed): else: res = (latency_99_percentile * samples_per_query) / MS_TO_NS + is_network_system, is_network_mode_valid = is_system_over_network(division, system_json, path) + is_valid &= is_network_mode_valid + if is_network_system: + # for network mode verify the SUT name is valid, accodring to the rules (must include "Network SUT" in name) + if NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME not in sut_name: + log.error( + f"{fname} invalid sut name for network mode. expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'") + is_valid = False + return is_valid, res, inferred @@ -1174,13 +1192,31 @@ def files_diff(list1, list2, optional=None): return list(set(list2) - set(list1)) return [] +def is_system_over_network(division, system_json, path): + """ + Verify whether the submitted system is over network and whether it is valid for the division + + for 'network' division, it is mandatory that the system is over-network + for 'closed' division, the system must not be over-network + for 'open' division, the system may be either local or over-network + """ + is_network_mode_sys_spec_str = system_json.get(SYSTEM_DESC_IS_NETWORK_MODE) + is_network_system = is_network_mode_sys_spec_str.lower()=="true" if is_network_mode_sys_spec_str is not None else False + # verify that the system corresponds the division + is_valid = True + expected_state_by_division = {"network": True, "closed": False} + if division in expected_state_by_division: + is_valid = expected_state_by_division[division] is is_network_system + if not is_valid: + log.error(f"{path} incorrect network mode (={is_network_system}) for division '{division}'") + return is_network_system, is_valid def check_results_dir(config, filter_submitter, skip_compliance, csv, debug=False): """ Walk the results directory and do the checking. We are called with the cdw at the root of the submission directory. - level1 division - closed|open + level1 division - closed|open|network level2 submitter - for example mlperf_org level3 - results, systems, measurements, code @@ -1212,7 +1248,6 @@ def check_results_dir(config, filter_submitter, skip_compliance, csv, debug=Fal csv.write(",".join(head) + "\n") results = {} - def log_result(submitter, available, division, @@ -1290,7 +1325,7 @@ def log_result(submitter, if division not in [".git", ".github", "assets"]: log.error("invalid division in input dir %s", division) continue - is_closed = division == "closed" + is_closed_or_network = division in ["closed", "network"] for submitter in list_dir(division): # we are looking at ./$division/$submitter, ie ./closed/mlperf_org @@ -1344,10 +1379,10 @@ def log_result(submitter, name = os.path.join(results_path, system_desc, model_name) mlperf_model = config.get_mlperf_model(model_name) - if is_closed and mlperf_model not in config.models: - # for closed division we want the model name to match. + if is_closed_or_network and mlperf_model not in config.models: + # for closed/network divisions we want the model name to match. # for open division the model_name might be different than the task - log.error("%s has an invalid model %s for closed division", name, + log.error("%s has an invalid model %s for closed/network division", name, model_name) results[name] = None continue @@ -1372,7 +1407,7 @@ def log_result(submitter, # ie ./closed/mlperf_org/results/t4-ort/bert/Offline name = os.path.join(results_path, system_desc, model_name, scenario) results[name] = None - if is_closed and scenario_fixed not in all_scenarios: + if is_closed_or_network and scenario_fixed not in all_scenarios: log.warning("%s ignoring scenario %s (neither required nor optional)", name, scenario) continue @@ -1403,8 +1438,8 @@ def log_result(submitter, diff = files_diff(list_files(acc_path), REQUIRED_ACC_FILES) if diff: log.error("%s has file list mismatch (%s)", acc_path, diff) - accuracy_is_valid, acc = check_accuracy_dir(config, mlperf_model, acc_path, debug or is_closed) - if not accuracy_is_valid and not is_closed: + accuracy_is_valid, acc = check_accuracy_dir(config, mlperf_model, acc_path, debug or is_closed_or_network) + if not accuracy_is_valid and not is_closed_or_network: if debug: log.warning("%s, accuracy not valid but taken for open", acc_path) accuracy_is_valid = True @@ -1439,7 +1474,7 @@ def log_result(submitter, log.error("%s has file list mismatch (%s)", perf_path, diff) try: - is_valid, r, is_inferred = check_performance_dir(config, mlperf_model, perf_path, scenario_fixed) + is_valid, r, is_inferred = check_performance_dir(config, mlperf_model, perf_path, scenario_fixed, division, system_json) if is_inferred: inferred = 1 log.info("%s has inferred results, qps=%s", perf_path, r) @@ -1468,8 +1503,8 @@ def log_result(submitter, errors += 1 # check if compliance dir is good for CLOSED division - compliance = 0 if is_closed else 1 - if is_closed and not skip_compliance: + compliance = 0 if is_closed_or_network else 1 + if is_closed_or_network and not skip_compliance: compliance_dir = os.path.join(division, submitter, "compliance", system_desc, model_name, scenario) if not os.path.exists(compliance_dir): @@ -1492,7 +1527,7 @@ def log_result(submitter, if required_scenarios: name = os.path.join(results_path, system_desc, model_name) - if is_closed: + if is_closed_or_network: results[name] = None log.error("%s does not have all required scenarios, missing %s", name, required_scenarios) elif debug: @@ -1508,6 +1543,12 @@ def check_system_desc_id(fname, systems_json, submitter, division, version): required_fields = SYSTEM_DESC_REQUIRED_FIELDS else: required_fields = SYSTEM_DESC_REQUIRED_FIELDS + SYSTEM_DESC_REQUIED_FIELDS_SINCE_V1 + + is_network_system, is_network_mode_valid = is_system_over_network(division, systems_json, fname) + is_valid &= is_network_mode_valid + if is_network_system: + required_fields += SYSTEM_DESC_REQUIRED_FIELDS_NETWORK_MODE + for k in required_fields: if k not in systems_json: is_valid = False