Skip to content

Commit

Permalink
submission checker updates for network mode (mlcommons#1142)
Browse files Browse the repository at this point in the history
* submission checker updates for network mode

* [submission checker]
(1) added 'network' division for 2.1 submission
(2) verify remote system for network division, local for closed division
(3) allow network submission for open division

* removed comment

Co-authored-by: rameshchukka <[email protected]>
  • Loading branch information
yuval-neureality and rnaidu02 authored May 10, 2022
1 parent f8500d6 commit cc9e97c
Showing 1 changed file with 56 additions and 15 deletions.
71 changes: 56 additions & 15 deletions tools/submission/submission-checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,7 @@
},
}

VALID_DIVISIONS = ["open", "closed"]
VALID_DIVISIONS = ["open", "closed", "network"]
VALID_AVAILABILITIES = ["available", "preview", "rdi"]
REQUIRED_PERF_FILES = ["mlperf_log_summary.txt", "mlperf_log_detail.txt"]
OPTIONAL_PERF_FILES = ["mlperf_log_accuracy.json"]
Expand Down Expand Up @@ -650,6 +650,13 @@
"disk_controllers"
]

# Key in the system description JSON that is_system_over_network() reads to
# decide whether the submitted system runs over the network.
SYSTEM_DESC_IS_NETWORK_MODE = "is_network"
# Additional system description fields that check_system_desc_id() adds to the
# required set when the system is reported as over-network.
SYSTEM_DESC_REQUIRED_FIELDS_NETWORK_MODE = [
SYSTEM_DESC_IS_NETWORK_MODE, "network_type", "network_rate", "nic_loadgen", "number_nic_loadgen",
"net_software_stack_loadgen", "network_protocol", "number_connections", "nic_sut", "number_nic_sut",
"net_software_stack_sut", "network_setup"
]
# Substring that check_performance_dir() requires in the logged SUT name of an
# over-network system.
NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME = "Network SUT"

SYSTEM_IMP_REQUIRED_FILES = [
"input_data_types", "retraining", "starting_weights_filename", "weight_data_types",
Expand Down Expand Up @@ -890,7 +897,7 @@ def check_accuracy_dir(config, model, path, verbose):
return is_valid, acc


def check_performance_dir(config, model, path, scenario_fixed):
def check_performance_dir(config, model, path, scenario_fixed, division, system_json):
is_valid = False
rt = {}

Expand Down Expand Up @@ -919,6 +926,7 @@ def check_performance_dir(config, model, path, scenario_fixed):
if scenario == "SingleStream":
# qps_wo_loadgen_overhead is only used for inferring Offline from SingleStream; only for old submissions
qps_wo_loadgen_overhead = mlperf_log["result_qps_without_loadgen_overhead"]
sut_name = mlperf_log["sut_name"]
else:
fname = os.path.join(path, "mlperf_log_summary.txt")
with open(fname, "r") as f:
Expand All @@ -942,6 +950,7 @@ def check_performance_dir(config, model, path, scenario_fixed):
min_duration = int(rt["min_duration (ms)"])
if scenario == "SingleStream":
qps_wo_loadgen_overhead = float(rt["QPS w/o loadgen overhead"])
sut_name = str(rt['System Under Test (SUT) name: '])

# check if there are any errors in the detailed log
fname = os.path.join(path, "mlperf_log_detail.txt")
Expand Down Expand Up @@ -1042,6 +1051,15 @@ def check_performance_dir(config, model, path, scenario_fixed):
else:
res = (latency_99_percentile * samples_per_query) / MS_TO_NS

is_network_system, is_network_mode_valid = is_system_over_network(division, system_json, path)
is_valid &= is_network_mode_valid
if is_network_system:
# for network mode verify the SUT name is valid, according to the rules (must include "Network SUT" in name)
if NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME not in sut_name:
log.error(
f"{fname} invalid sut name for network mode. expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'")
is_valid = False

return is_valid, res, inferred


Expand Down Expand Up @@ -1174,13 +1192,31 @@ def files_diff(list1, list2, optional=None):
return list(set(list2) - set(list1))
return []

def is_system_over_network(division, system_json, path):
    """
    Determine whether the submitted system is over network and whether that
    is allowed for the division.

    for 'network' division, it is mandatory that the system is over-network
    for 'closed' division, the system must not be over-network
    for 'open' division, the system may be either local or over-network

    Args:
        division: name of the division being checked (e.g. "open", "closed", "network").
        system_json: parsed system description; the SYSTEM_DESC_IS_NETWORK_MODE
            key is looked up with .get(), so it may be absent.
        path: path used only as a prefix in log messages.

    Returns:
        Tuple (is_network_system, is_valid): whether the system description
        marks the system as over-network, and whether that matches what the
        division requires.
    """
    raw_flag = system_json.get(SYSTEM_DESC_IS_NETWORK_MODE)
    if isinstance(raw_flag, bool):
        # A JSON boolean (true/false) is parsed into a Python bool, which has
        # no .lower(); accept it directly instead of raising AttributeError.
        is_network_system = raw_flag
    elif isinstance(raw_flag, str):
        is_network_system = raw_flag.lower() == "true"
    else:
        # Missing key (None) means "not over network", as before. Any other
        # type cannot be interpreted; report it rather than crash.
        if raw_flag is not None:
            log.warning(
                "%s unexpected value %r for '%s', treating the system as not over network",
                path, raw_flag, SYSTEM_DESC_IS_NETWORK_MODE)
        is_network_system = False

    # verify that the system corresponds to the division; divisions that are
    # not listed here (e.g. 'open') accept either mode
    is_valid = True
    expected_state_by_division = {"network": True, "closed": False}
    if division in expected_state_by_division:
        is_valid = expected_state_by_division[division] == is_network_system
    if not is_valid:
        log.error(f"{path} incorrect network mode (={is_network_system}) for division '{division}'")
    return is_network_system, is_valid

def check_results_dir(config, filter_submitter, skip_compliance, csv, debug=False):
"""
Walk the results directory and do the checking.
We are called with the cwd at the root of the submission directory.
level1 division - closed|open
level1 division - closed|open|network
level2 submitter - for example mlperf_org
level3 - results, systems, measurements, code
Expand Down Expand Up @@ -1212,7 +1248,6 @@ def check_results_dir(config, filter_submitter, skip_compliance, csv, debug=Fal
csv.write(",".join(head) + "\n")
results = {}


def log_result(submitter,
available,
division,
Expand Down Expand Up @@ -1290,7 +1325,7 @@ def log_result(submitter,
if division not in [".git", ".github", "assets"]:
log.error("invalid division in input dir %s", division)
continue
is_closed = division == "closed"
is_closed_or_network = division in ["closed", "network"]

for submitter in list_dir(division):
# we are looking at ./$division/$submitter, ie ./closed/mlperf_org
Expand Down Expand Up @@ -1344,10 +1379,10 @@ def log_result(submitter,
name = os.path.join(results_path, system_desc, model_name)
mlperf_model = config.get_mlperf_model(model_name)

if is_closed and mlperf_model not in config.models:
# for closed division we want the model name to match.
if is_closed_or_network and mlperf_model not in config.models:
# for closed/network divisions we want the model name to match.
# for open division the model_name might be different than the task
log.error("%s has an invalid model %s for closed division", name,
log.error("%s has an invalid model %s for closed/network division", name,
model_name)
results[name] = None
continue
Expand All @@ -1372,7 +1407,7 @@ def log_result(submitter,
# ie ./closed/mlperf_org/results/t4-ort/bert/Offline
name = os.path.join(results_path, system_desc, model_name, scenario)
results[name] = None
if is_closed and scenario_fixed not in all_scenarios:
if is_closed_or_network and scenario_fixed not in all_scenarios:
log.warning("%s ignoring scenario %s (neither required nor optional)", name, scenario)
continue

Expand Down Expand Up @@ -1403,8 +1438,8 @@ def log_result(submitter,
diff = files_diff(list_files(acc_path), REQUIRED_ACC_FILES)
if diff:
log.error("%s has file list mismatch (%s)", acc_path, diff)
accuracy_is_valid, acc = check_accuracy_dir(config, mlperf_model, acc_path, debug or is_closed)
if not accuracy_is_valid and not is_closed:
accuracy_is_valid, acc = check_accuracy_dir(config, mlperf_model, acc_path, debug or is_closed_or_network)
if not accuracy_is_valid and not is_closed_or_network:
if debug:
log.warning("%s, accuracy not valid but taken for open", acc_path)
accuracy_is_valid = True
Expand Down Expand Up @@ -1439,7 +1474,7 @@ def log_result(submitter,
log.error("%s has file list mismatch (%s)", perf_path, diff)

try:
is_valid, r, is_inferred = check_performance_dir(config, mlperf_model, perf_path, scenario_fixed)
is_valid, r, is_inferred = check_performance_dir(config, mlperf_model, perf_path, scenario_fixed, division, system_json)
if is_inferred:
inferred = 1
log.info("%s has inferred results, qps=%s", perf_path, r)
Expand Down Expand Up @@ -1468,8 +1503,8 @@ def log_result(submitter,
errors += 1

# check if compliance dir is good for CLOSED and NETWORK divisions
compliance = 0 if is_closed else 1
if is_closed and not skip_compliance:
compliance = 0 if is_closed_or_network else 1
if is_closed_or_network and not skip_compliance:
compliance_dir = os.path.join(division, submitter, "compliance",
system_desc, model_name, scenario)
if not os.path.exists(compliance_dir):
Expand All @@ -1492,7 +1527,7 @@ def log_result(submitter,

if required_scenarios:
name = os.path.join(results_path, system_desc, model_name)
if is_closed:
if is_closed_or_network:
results[name] = None
log.error("%s does not have all required scenarios, missing %s", name, required_scenarios)
elif debug:
Expand All @@ -1508,6 +1543,12 @@ def check_system_desc_id(fname, systems_json, submitter, division, version):
required_fields = SYSTEM_DESC_REQUIRED_FIELDS
else:
required_fields = SYSTEM_DESC_REQUIRED_FIELDS + SYSTEM_DESC_REQUIED_FIELDS_SINCE_V1

is_network_system, is_network_mode_valid = is_system_over_network(division, systems_json, fname)
is_valid &= is_network_mode_valid
if is_network_system:
required_fields += SYSTEM_DESC_REQUIRED_FIELDS_NETWORK_MODE

for k in required_fields:
if k not in systems_json:
is_valid = False
Expand Down

0 comments on commit cc9e97c

Please sign in to comment.