From 1f7f7df996ac2fd8018ecc33158f2b4bf301162a Mon Sep 17 00:00:00 2001 From: Dina Suehiro Jones Date: Thu, 21 Oct 2021 13:47:33 -0700 Subject: [PATCH] Updates to only use docker `--privileged` when required and check cpuset (#150) * Update numactl usage * Update error handling * Check to make sure it's in cpuset * Update forming cpu list * Style fixes and add unit tests * Test updates * Updates to figure out which cores are on which node * Update to print debug message * Update to organize the cpuset list by node: * Check args to see if docker should run with privileged * Add unit tests * Update numa_cores_per_instance for 'socket' setting * update for cores per instance 'socket' * Add doc update * Remove unused import * Style fixes * update to cpuset list * make str * limit length * update cores_per_node * Move num_physical_cores calculation * update num inter/intra threads * Fix intra threads * Updates to platform util to explain the core lists * Update unit tests * Unit test updates * Add tests with limited cpusets * Remove debug print * Add additional error handling and info * Update base benchmark util * Fix conditionals * Update messages * Update for numa_cores_per_instance 'socket' when sockets have different number of cores * Fix conditionals for checking numa cores per instance socket * Add another unit test to check the case when the --socket-id specified does not have any cores in the cpuset * Removing conditional in the validate function since it's being done in the init function * Add stderr=PIPE so that the terminal doesn't show an error when PlatformUtils is used before numactl is installed --- benchmarks/common/base_benchmark_util.py | 51 +++++--- benchmarks/common/base_model_init.py | 89 +++++++++----- benchmarks/common/platform_util.py | 110 +++++++++++++++++- .../common/tensorflow/container_init.sh | 14 ++- benchmarks/common/tensorflow/start.sh | 10 +- benchmarks/launch_benchmark.py | 6 +- .../ModelPackagesAdvancedOptions.md | 40 +++++++ 
tests/test_utils/io.py | 6 +- tests/test_utils/platform_config.py | 6 +- .../tensorflow/test_run_tf_benchmarks.py | 78 ++++++++++++- .../tf_model_args/tf_3d_unet_args.json | 3 +- .../tf_model_args/tf_3d_unet_mlperf_args.json | 18 ++- .../tf_model_args/tf_bert_args.json | 42 +++++-- .../tf_model_args/tf_densenet169_args.json | 9 +- .../tf_model_args/tf_dien_args.json | 21 ++-- .../tf_model_args/tf_faster_rcnn_args.json | 19 +-- .../tf_model_args/tf_gnmt_args.json | 9 +- .../tf_model_args/tf_inceptionv3_args.json | 46 ++++++-- .../tf_model_args/tf_inceptionv4_args.json | 15 ++- .../tf_model_args/tf_mask_rcnn_args.json | 16 ++- .../tf_model_args/tf_minigo_args.json | 9 +- .../tf_model_args/tf_mobilenet_v1_args.json | 35 ++++-- .../tensorflow/tf_model_args/tf_ncf_args.json | 17 ++- .../tf_model_args/tf_resnet101_args.json | 15 ++- .../tf_model_args/tf_resnet50_args.json | 28 +++-- .../tf_model_args/tf_resnet50v1_5_args.json | 59 +++++++--- .../tf_model_args/tf_rfcn_args.json | 18 ++- .../tf_model_args/tf_ssd_mobilenet_args.json | 28 ++++- .../tf_model_args/tf_ssd_resnet34_args.json | 40 +++++-- .../tf_transformer_lt_official_args.json | 6 +- .../tf_transformer_mlperf_args.json | 12 +- .../tf_model_args/tf_unet_args.json | 3 +- .../tf_model_args/tf_wavenet_args.json | 3 +- .../tf_model_args/tf_wide_deep_args.json | 6 +- .../tf_wide_deep_large_ds_args.json | 27 +++-- tests/unit/common/test_base_model_init.py | 43 +++++++ tests/unit/common/test_platform_util.py | 91 ++++++++++++++- tests/unit/test_launch_benchmark.py | 28 +++++ 38 files changed, 860 insertions(+), 216 deletions(-) diff --git a/benchmarks/common/base_benchmark_util.py b/benchmarks/common/base_benchmark_util.py index cbfd7e55b..ca86b2931 100644 --- a/benchmarks/common/base_benchmark_util.py +++ b/benchmarks/common/base_benchmark_util.py @@ -315,21 +315,34 @@ def _validate_args(self): if args.mpi: raise ValueError("--mpi_num_processes cannot be used together with --numa-cores-per-instance.") - if 
args.numa_cores_per_instance == "socket": - args.numa_cores_per_instance = self._platform_util.num_cores_per_socket - - if args.socket_id != -1: - if int(args.numa_cores_per_instance) > self._platform_util.num_cores_per_socket: - raise ValueError("The number of --numa-cores-per-instance ({}) cannot exceed the " - "number of cores per socket {} when a single socket (--socket-id {}) " - "is being used.".format(args.numa_cores_per_instance, - self._platform_util.num_cores_per_socket, - args.socket_id)) - else: - if int(args.numa_cores_per_instance) > system_num_cores: - raise ValueError("The number of --numa-cores-per-instance ({}) cannot exceed the " - "number of system cores ({}).".format(args.numa_cores_per_instance, - system_num_cores)) + if args.numa_cores_per_instance != "socket": + if args.socket_id != -1: + if int(args.numa_cores_per_instance) > self._platform_util.num_cores_per_socket: + raise ValueError("The number of --numa-cores-per-instance ({}) cannot exceed the " + "number of cores per socket {} when a single socket (--socket-id {}) " + "is being used.".format(args.numa_cores_per_instance, + self._platform_util.num_cores_per_socket, + args.socket_id)) + else: + if int(args.numa_cores_per_instance) > system_num_cores: + raise ValueError("The number of --numa-cores-per-instance ({}) cannot exceed the " + "number of system cores ({}).".format(args.numa_cores_per_instance, + system_num_cores)) + + # If socket id is specified and we have a cpuset, make sure that there are some cores in the specified socket. + # If cores are limited, then print out a note about that. 
+ if args.socket_id != -1 and self._platform_util.cpuset_cpus: + cpuset_len_for_socket = 0 + + if args.socket_id in self._platform_util.cpuset_cpus.keys(): + cpuset_len_for_socket = len(self._platform_util.cpuset_cpus[args.socket_id]) + + if cpuset_len_for_socket == 0: + sys.exit("ERROR: There are no socket id {} cores in the cpuset.".format(args.socket_id)) + elif cpuset_len_for_socket < self._platform_util.num_cores_per_socket: + print("Note: Socket id {} is specified, but the cpuset has limited this socket to {} cores. " + "This is less than the number of cores per socket on the system ({})". + format(args.socket_id, cpuset_len_for_socket, self._platform_util.num_cores_per_socket)) def initialize_model(self, args, unknown_args): """Create model initializer for the specified model""" @@ -340,7 +353,13 @@ def initialize_model(self, args, unknown_args): os.path.dirname(os.path.realpath(__file__))) if args.numa_cores_per_instance == "socket": - args.numa_cores_per_instance = self._platform_util.num_cores_per_socket + if self._platform_util.cpuset_cpus: + if args.socket_id != -1: + args.numa_cores_per_instance = len(self._platform_util.cpuset_cpus[args.socket_id]) + else: + args.numa_cores_per_instance = "socket" + else: + args.numa_cores_per_instance = self._platform_util.num_cores_per_socket # find the path to the model_init.py file filename = "{}.py".format(self.MODEL_INITIALIZER) diff --git a/benchmarks/common/base_model_init.py b/benchmarks/common/base_model_init.py index 71898c13a..65468f23c 100644 --- a/benchmarks/common/base_model_init.py +++ b/benchmarks/common/base_model_init.py @@ -152,26 +152,38 @@ def run_numactl_multi_instance(self, cmd, replace_unique_output_dir=None): swap out that path for a path with the instance number in the folder name so that each instance uses a unique output folder. 
""" - # Get the cores list and group them according to the number of cores per instance - cores_per_instance = int(self.args.numa_cores_per_instance) - cpu_cores_list = self.platform_util.cpu_core_list - - if self.args.socket_id != -1: - # If it's specified to just use a single socket, then only use the cores from that socket - if len(cpu_cores_list) > self.args.socket_id: - cpu_cores_list = cpu_cores_list[self.args.socket_id] + + if self.args.numa_cores_per_instance != "socket": + # Get the cores list and group them according to the number of cores per instance + cores_per_instance = int(self.args.numa_cores_per_instance) + cpu_cores_list = self.platform_util.cpu_core_list + + if self.args.socket_id != -1: + # If it's specified to just use a single socket, then only use the cores from that socket + if len(cpu_cores_list) > self.args.socket_id: + cpu_cores_list = cpu_cores_list[self.args.socket_id] + else: + raise ValueError("Error while trying to get the core list for socket {0}. " + "The core list does not have cores for socket {0}.\n " + "Core list: {1}\n".format(self.args.socket_id, str(cpu_cores_list))) else: - raise ValueError("Error while trying to get the core list for socket {0}. 
" - "The core list does not have cores for socket {0}.\n " - "Core list: {1}\n".format(self.args.socket_id, str(cpu_cores_list))) - else: - # Using cores from all sockets - combined_core_list = [] - for socket_cores in cpu_cores_list: - combined_core_list += socket_cores - cpu_cores_list = combined_core_list + # Using cores from all sockets + combined_core_list = [] + for socket_cores in cpu_cores_list: + combined_core_list += socket_cores + cpu_cores_list = combined_core_list - instance_cores_list = self.group_cores(cpu_cores_list, cores_per_instance) + instance_cores_list = self.group_cores(cpu_cores_list, cores_per_instance) + else: + instance_cores_list = [] + cores_per_instance = "socket" + # Cores should be grouped based on the cores for each socket + if self.args.socket_id != -1: + # Only using cores from one socket + instance_cores_list[0] = self.platform_util.cpu_core_list[self.args.socket_id] + else: + # Get the cores for each socket + instance_cores_list = self.platform_util.cpu_core_list # Setup the log file name with the model name, precision, mode, batch size (if there is one), # number of cores per instance. 
An extra {} is intentionally left in the log_filename_format @@ -188,11 +200,14 @@ def run_numactl_multi_instance(self, cmd, replace_unique_output_dir=None): # Loop through each instance and add that instance's command to a string multi_instance_command = "" for instance_num, core_list in enumerate(instance_cores_list): - if len(core_list) < int(cores_per_instance): + if cores_per_instance != "socket" and len(core_list) < int(cores_per_instance): print("NOTE: Skipping remainder of {} cores for instance {}" .format(len(core_list), instance_num)) continue + if len(core_list) == 0: + continue + prefix = ("OMP_NUM_THREADS={0} " "numactl --localalloc --physcpubind={1}").format( len(core_list), ",".join(core_list)) @@ -340,21 +355,36 @@ def set_num_inter_intra_threads(self, num_inter_threads=None, num_intra_threads= if self.args.numa_cores_per_instance: # Set default num inter/intra threads if the user didn't provide specific values + if self.args.numa_cores_per_instance == "socket": + if self.args.socket_id != -1: + inter_threads = len(self.platform_util.cpu_core_list[self.args.socket_id]) + else: + # since we can only have one value for inter threads and the number of cores + # per socket can vary, if the cpuset is limited, get the lowest core count + # per socket and use that as the num inter threads + inter_threads = min([len(i) for i in self.platform_util.cpu_core_list if len(i) > 0]) + else: + inter_threads = self.args.numa_cores_per_instance + if not self.args.num_inter_threads: self.args.num_inter_threads = 1 if not self.args.num_intra_threads: - self.args.num_intra_threads = self.args.numa_cores_per_instance + self.args.num_intra_threads = inter_threads if not self.args.data_num_inter_threads: self.args.data_num_inter_threads = 1 if not self.args.data_num_intra_threads: - self.args.data_num_intra_threads = self.args.numa_cores_per_instance + self.args.data_num_intra_threads = inter_threads elif self.args.socket_id != -1: if not self.args.num_inter_threads: 
self.args.num_inter_threads = 1 if not self.args.num_intra_threads: - self.args.num_intra_threads = \ - self.platform_util.num_cores_per_socket \ - if self.args.num_cores == -1 else self.args.num_cores + if self.args.num_cores != -1: + self.args.num_intra_threads = self.args.num_cores + elif self.platform_util.cpuset_cpus and \ + self.args.socket_id in self.platform_util.cpuset_cpus.keys(): + self.args.num_intra_threads = len(self.platform_util.cpuset_cpus[self.args.socket_id]) + else: + self.args.num_intra_threads = self.platform_util.num_cores_per_socket else: if not self.args.num_inter_threads: self.args.num_inter_threads = self.platform_util.num_cpu_sockets @@ -362,9 +392,14 @@ def set_num_inter_intra_threads(self, num_inter_threads=None, num_intra_threads= self.args.num_inter_threads = 1 if not self.args.num_intra_threads: if self.args.num_cores == -1: - self.args.num_intra_threads = \ - int(self.platform_util.num_cores_per_socket * - self.platform_util.num_cpu_sockets) + if self.platform_util.cpuset_cpus and len(self.platform_util.cpuset_cpus.keys()) > 0: + # Total up the number of cores in the cpuset + self.args.num_intra_threads = sum([len(self.platform_util.cpuset_cpus[socket_id]) + for socket_id in self.platform_util.cpuset_cpus.keys()]) + else: + self.args.num_intra_threads = \ + int(self.platform_util.num_cores_per_socket * + self.platform_util.num_cpu_sockets) if os.environ["MPI_NUM_PROCESSES"] != "None": self.args.num_intra_threads = self.platform_util.num_cores_per_socket - 2 else: diff --git a/benchmarks/common/platform_util.py b/benchmarks/common/platform_util.py index a831b1d70..6f6cfb07c 100644 --- a/benchmarks/common/platform_util.py +++ b/benchmarks/common/platform_util.py @@ -33,6 +33,8 @@ CORES_PER_SOCKET_STR_ = "Core(s) per socket" THREADS_PER_CORE_STR_ = "Thread(s) per core" LOGICAL_CPUS_STR_ = "CPU(s)" +NUMA_NODE_CPU_RANGE_STR_ = "NUMA node{} CPU(s):" +ONLINE_CPUS_LIST = "On-line CPU(s) list:" class CPUInfo(): @@ -192,8 +194,16 @@ def 
__init__(self, args): self.num_threads_per_core = 0 self.num_logical_cpus = 0 self.num_numa_nodes = 0 + + # Core list generated by numactl -H in the case where --numa-cores-per-instance is + # being used. It then gets pruned based on the cpuset_cpus, in case docker is + # limiting the cores that the container has access to self.cpu_core_list = [] + # Dictionary generated from the cpuset.cpus file (in linux_init) for the case where + # docker is limiting the number of cores that the container has access to + self.cpuset_cpus = None + os_type = system_platform.system() if "Windows" == os_type: self.windows_init() @@ -204,6 +214,45 @@ def __init__(self, args): else: raise ValueError("Unable to determine Operating system type.") + def _get_list_from_string_ranges(self, str_ranges): + """ + Converts a string of numbered ranges (comma separated numbers or ranges) to an + integer list. Duplicates should be removed and the integer list should be + ordered. + For example an input of "3-6,10,0-5" should return [0, 1, 2, 3, 4, 5, 6, 10] + """ + result_list = [] + + for section in str_ranges.split(","): + if "-" in section: + # Section is a range, so get the start and end values + start, end = section.split("-") + section_list = range(int(start), int(end) + 1) + result_list += section_list + else: + # This section is just a single number, not a range + result_list.append(int(section)) + + # Remove duplicates + result_list = list(set(result_list)) + + return result_list + + def _get_cpuset(self): + """ + Try to get the cpuset.cpus info, since lscpu does not know if docker has limited + the cpuset accessible to the container + """ + cpuset = "" + cpuset_cpus_file = "/sys/fs/cgroup/cpuset/cpuset.cpus" + if os.path.exists(cpuset_cpus_file): + with open(cpuset_cpus_file, "r") as f: + cpuset = f.read() + + if self.args.verbose: + print("cpuset.cpus: {}".format(cpuset)) + return cpuset + def linux_init(self): lscpu_cmd = "lscpu" try: @@ -219,6 +268,9 @@ def linux_init(self): 
print("Problem getting CPU info: {}".format(e)) sys.exit(1) + core_list_per_node = {} + online_cpus_list = "" + # parse it for line in cpu_info: # NUMA_NODES_STR_ = "NUMA node(s)" @@ -236,28 +288,76 @@ def linux_init(self): # LOGICAL_CPUS_STR_ = "CPU(s)" elif line.find(LOGICAL_CPUS_STR_) == 0: self.num_logical_cpus = int(line.split(":")[1].strip()) + # ONLINE_CPUS_LIST = "On-line CPU(s) list" + elif line.find(ONLINE_CPUS_LIST) == 0: + online_cpus_list = line.split(":")[1].strip() + else: + # Get the ranges of cores per node from NUMA node* CPU(s) + for node in range(0, self.num_numa_nodes): + if line.find(NUMA_NODE_CPU_RANGE_STR_.format(str(node))) == 0: + range_for_node = line.split(":")[1].strip() + range_list_for_node = self._get_list_from_string_ranges(range_for_node) + core_list_per_node[node] = range_list_for_node + + # Try to get the cpuset.cpus info, since lscpu does not know if the cpuset is limited + cpuset = self._get_cpuset() + if cpuset: + # If the cpuset is the same as the online_cpus_list, then we are using the whole + # machine, so let's avoid unnecessary complexity and don't bother with the cpuset_cpu list + if (online_cpus_list != "" and online_cpus_list != cpuset) or online_cpus_list == "": + self.cpuset_cpus = self._get_list_from_string_ranges(cpuset) # Uses numactl get the core number for each numa node and adds the cores for each - # node to the cpu_cores_list array - if self.num_numa_nodes > 0: + # node to the cpu_cores_list array. Only do this if the command is trying to use + # numa_cores_per_instance we can't count on numactl being installed otherwise and + # this list is only used for the numactl multi-instance runs. 
+ num_physical_cores = self.num_cpu_sockets * self.num_cores_per_socket + cores_per_node = int(num_physical_cores / self.num_numa_nodes) + if self.num_numa_nodes > 0 and self.args.numa_cores_per_instance is not None: try: # Get the list of cores - num_physical_cores = self.num_cpu_sockets * self.num_cores_per_socket - cores_per_node = int(num_physical_cores / self.num_numa_nodes) cpu_array_command = \ "numactl -H | grep 'node [0-9]* cpus:' |" \ "sed 's/.*node [0-9]* cpus: *//' | head -{0} |cut -f1-{1} -d' '".format( self.num_numa_nodes, int(cores_per_node)) cpu_array = subprocess.Popen( - cpu_array_command, shell=True, stdout=subprocess.PIPE).stdout.readlines() + cpu_array_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).stdout.readlines() for node_cpus in cpu_array: node_cpus = str(node_cpus).lstrip("b'").replace("\\n'", " ") self.cpu_core_list.append([x for x in node_cpus.split(" ") if x != '']) + + # If we have the cpuset list, cross check that list with our core list and + # remove cores that are not part of the cpuset list + if self.cpuset_cpus is not None: + for socket, core_list in enumerate(self.cpu_core_list): + self.cpu_core_list[socket] = [x for x in core_list if int(x) in self.cpuset_cpus] + + if (self.args.verbose): + print("Core list: {}".format(self.cpu_core_list), flush=True) + except Exception as e: print("Warning: An error occured when getting the list of cores using '{}':\n {}". 
format(cpu_array_command, e)) + if self.cpuset_cpus is not None: + # Reformat the cpuset_cpus list so that it's split up by node + for node in core_list_per_node.keys(): + core_list_per_node[node] = [x for x in core_list_per_node[node] if x in self.cpuset_cpus] + self.cpuset_cpus = core_list_per_node + + # Remove cores that aren't part of the cpu_core_list + for socket in self.cpuset_cpus.keys(): + if len(self.cpuset_cpus[socket]) > cores_per_node: + del self.cpuset_cpus[socket][cores_per_node:] + + # Remove keys with empty lists (sockets where there are no cores enabled in the cpuset) + self.cpuset_cpus = {k: v for k, v in self.cpuset_cpus.items() if v} + + # Update the number of sockets based on the cpuset + if len(self.cpuset_cpus.keys()) > 0: + self.num_cpu_sockets = len(self.cpuset_cpus.keys()) + def windows_init(self): NUM_SOCKETS_STR_ = "DeviceID" CORES_PER_SOCKET_STR_ = "NumberOfCores" diff --git a/benchmarks/common/tensorflow/container_init.sh b/benchmarks/common/tensorflow/container_init.sh index ff0bce322..b2ac40524 100755 --- a/benchmarks/common/tensorflow/container_init.sh +++ b/benchmarks/common/tensorflow/container_init.sh @@ -16,10 +16,12 @@ # # This file includes runtime installs for model containers - -if (( $(id -u) == 0 )); then - apt-get install numactl -y -else - echo "Please run as root" - exit 1 +if [[ $NUMA_CORES_PER_INSTANCE != "None" || $SOCKET_ID != "-1" || $NUM_CORES != "-1" ]]; then + if (( $(id -u) == 0 )); then + apt-get install numactl -y + else + echo "Please run as root" + exit 1 + fi fi + diff --git a/benchmarks/common/tensorflow/start.sh b/benchmarks/common/tensorflow/start.sh index 8095b8247..83aafafa7 100644 --- a/benchmarks/common/tensorflow/start.sh +++ b/benchmarks/common/tensorflow/start.sh @@ -154,10 +154,12 @@ if _running-in-container ; then # Call the framework's container_init.sh, if it exists if [ -f ${MOUNT_BENCHMARK}/common/${FRAMEWORK}/container_init.sh ]; then if [[ ${CENTOS_PLATFORM} == "True" ]] && [[ 
${NOINSTALL} != "True" ]]; then - yum update -y - yum install -y numactl - else - ${MOUNT_BENCHMARK}/common/${FRAMEWORK}/container_init.sh + if [[ $NUMA_CORES_PER_INSTANCE != "None" || $SOCKET_ID != "-1" || $NUM_CORES != "-1" ]]; then + yum update -y + yum install -y numactl + fi + else + ${MOUNT_BENCHMARK}/common/${FRAMEWORK}/container_init.sh fi fi # Call the model specific container_init.sh, if it exists diff --git a/benchmarks/launch_benchmark.py b/benchmarks/launch_benchmark.py index fb2318db6..2be4cf494 100644 --- a/benchmarks/launch_benchmark.py +++ b/benchmarks/launch_benchmark.py @@ -442,9 +442,13 @@ def run_docker_container(self, benchmark_scripts, intelai_models, if args.debug: docker_run_cmd.append("-it") + if args.numa_cores_per_instance is not None or args.socket_id != -1 or \ + args.num_cores != -1 or args.mpi is not None or args.num_mpi > 1: + docker_run_cmd.append("--privileged") + docker_shm_size = "--shm-size={}".format(args.shm_size) docker_run_cmd = docker_run_cmd + env_vars + volume_mounts + [ - docker_shm_size, "--privileged", "-u", "root:root", "-w", + docker_shm_size, "-u", "root:root", "-w", workspace, args.docker_image, "/bin/bash"] if not args.debug: diff --git a/quickstart/common/tensorflow/ModelPackagesAdvancedOptions.md b/quickstart/common/tensorflow/ModelPackagesAdvancedOptions.md index 2b19c8442..c53c5b6db 100644 --- a/quickstart/common/tensorflow/ModelPackagesAdvancedOptions.md +++ b/quickstart/common/tensorflow/ModelPackagesAdvancedOptions.md @@ -116,6 +116,46 @@ docker run \ --data-location ${DATASET_DIR} +If a cpuset is specified along with `--numa-cores-per-instance`, the cores +used for each instance will be limited to those specified as part of the cpuset. +Also, note that since `--numa-cores-per-instance` uses `numactl`, it needs to +be run with `--privilege`. + +
+MODEL_ZOO_DIR=<path to the model zoo directory>
+PRETRAINED_MODEL=<path to the pretrained model .pb file>
+OUTPUT_DIR=<directory where log files will be written>
+
+docker run --rm --privileged --init \
+    --volume $PRETRAINED_MODEL:$PRETRAINED_MODEL \
+    --volume $MODEL_ZOO_DIR:$MODEL_ZOO_DIR \
+    --volume $OUTPUT_DIR:$OUTPUT_DIR \
+    --env http_proxy=$http_proxy \
+    --env https_proxy=$https_proxy \
+    --env PRETRAINED_MODEL=$PRETRAINED_MODEL \
+    --env OUTPUT_DIR=$OUTPUT_DIR \
+    -w $MODEL_ZOO_DIR \
+    --cpuset-cpus "0-7,28-35" \
+    -it intel/intel-optimized-tensorflow:latest \
+    python benchmarks/launch_benchmark.py \
+    --in-graph ${PRETRAINED_MODEL} \
+    --model-name resnet50v1_5 \
+    --framework tensorflow \
+    --precision bfloat16 \
+    --mode inference \
+    --batch-size=1 \
+    --output-dir ${OUTPUT_DIR} \
+    --benchmark-only \
+    --numa-cores-per-instance 4
+
+# The command above ends up running the following instances:
+# OMP_NUM_THREADS=4 numactl --localalloc --physcpubind=0,1,2,3 python eval_image_classifier_inference.py --input-graph=resnet50_v1_5_bfloat16.pb --num-inter-threads=1 --num-intra-threads=4 --num-cores=28 --batch-size=1 --warmup-steps=10 --steps=50 --data-num-inter-threads=1 --data-num-intra-threads=4 >> resnet50v1_5_bfloat16_inference_bs1_cores4_instance0.log 2>&1 & \
+# OMP_NUM_THREADS=4 numactl --localalloc --physcpubind=4,5,6,7 python eval_image_classifier_inference.py --input-graph=resnet50_v1_5_bfloat16.pb --num-inter-threads=1 --num-intra-threads=4 --num-cores=28 --batch-size=1 --warmup-steps=10 --steps=50 --data-num-inter-threads=1 --data-num-intra-threads=4 >> resnet50v1_5_bfloat16_inference_bs1_cores4_instance1.log 2>&1 & \
+# OMP_NUM_THREADS=4 numactl --localalloc --physcpubind=28,29,30,31 python eval_image_classifier_inference.py --input-graph=resnet50_v1_5_bfloat16.pb --num-inter-threads=1 --num-intra-threads=4 --num-cores=28 --batch-size=1 --warmup-steps=10 --steps=50 --data-num-inter-threads=1 --data-num-intra-threads=4 >> resnet50v1_5_bfloat16_inference_bs1_cores4_instance2.log 2>&1 & \
+# OMP_NUM_THREADS=4 numactl --localalloc --physcpubind=32,33,34,35 python eval_image_classifier_inference.py --input-graph=resnet50_v1_5_bfloat16.pb --num-inter-threads=1 --num-intra-threads=4 --num-cores=28 --batch-size=1 --warmup-steps=10 --steps=50 --data-num-inter-threads=1 --data-num-intra-threads=4 >> resnet50v1_5_bfloat16_inference_bs1_cores4_instance3.log 2>&1 & \
+# wait
+
+ ## Mounting local model packages in docker A download of the model package can be run in a docker container by mounting the diff --git a/tests/test_utils/io.py b/tests/test_utils/io.py index 21ee943f2..a9809b549 100644 --- a/tests/test_utils/io.py +++ b/tests/test_utils/io.py @@ -33,6 +33,10 @@ def parse_json_files(json_dir_path): with open(file_path) as f: data = json.load(f) for x in data: + # Use 0-111 as the default cpuset, if it's not specified in the json + cpuset = "0-111" + if 'cpuset' in x.keys(): + cpuset = x['cpuset'] values.append( - tuple((x['input'], x['output'], model_file + " :: " + x['_comment']))) + tuple((x['input'], x['output'], model_file + " :: " + x['_comment'], cpuset))) return values diff --git a/tests/test_utils/platform_config.py b/tests/test_utils/platform_config.py index 2ae2384af..6d070434f 100644 --- a/tests/test_utils/platform_config.py +++ b/tests/test_utils/platform_config.py @@ -29,7 +29,11 @@ "Thread(s) per core: 2\n" "Core(s) per socket: 28\n" "Socket(s): 2\n" - "NUMA node(s): 2\n") + "NUMA node(s): 2\n" + "On-line CPU(s) list: 0-111\n" + "NUMA node0 CPU(s): 0-27,56-83\n" + "NUMA node1 CPU(s): 28-55,84-111\n") + NUMA_CORES_OUTPUT = ['0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27', '28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55'] diff --git a/tests/unit/common/tensorflow/test_run_tf_benchmarks.py b/tests/unit/common/tensorflow/test_run_tf_benchmarks.py index f4a850815..6f86ca95a 100644 --- a/tests/unit/common/tensorflow/test_run_tf_benchmarks.py +++ b/tests/unit/common/tensorflow/test_run_tf_benchmarks.py @@ -63,7 +63,7 @@ def clear_kmp_env_vars(): test_arg_values = parse_model_args_file() -@pytest.mark.parametrize("test_args,expected_cmd,comment", test_arg_values) +@pytest.mark.parametrize("test_args,expected_cmd,comment,cpuset", test_arg_values) @patch("os.mkdir") @patch("shutil.rmtree") @patch("os.listdir") @@ -74,14 +74,15 @@ def clear_kmp_env_vars(): 
@patch("os.chdir") @patch("os.remove") @patch("glob.glob") +@patch("common.platform_util.PlatformUtil._get_cpuset") @patch("common.platform_util.os") @patch("common.platform_util.system_platform") @patch("common.platform_util.subprocess") @patch("common.base_model_init.BaseModelInitializer.run_command") -def test_run_benchmark(mock_run_command, mock_subprocess, mock_platform, mock_os, +def test_run_benchmark(mock_run_command, mock_subprocess, mock_platform, mock_os, mock_get_cpuset, mock_glob, mock_remove, mock_chdir, mock_stat, mock_path_exists, mock_is_file, mock_is_dir, mock_listdir, mock_rmtree, mock_mkdir, - test_args, expected_cmd, comment): + test_args, expected_cmd, comment, cpuset): """ Runs through executing the specified run_tf_benchmarks.py command from the test_args and verifying that the model_init file calls run_command with @@ -103,7 +104,8 @@ def test_run_benchmark(mock_run_command, mock_subprocess, mock_platform, mock_os if match_per_socket and match_per_socket.lastindex >= 1: os.environ["MPI_NUM_PROCESSES_PER_SOCKET"] = match_per_socket.group(1) - mock_path_exists.return_value = True + mock_os.path.exists.return_value = True + mock_get_cpuset.return_value = cpuset mock_is_dir.return_value = True mock_is_file.return_value = True mock_stat.return_value = MagicMock(st_nlink=0) @@ -128,3 +130,71 @@ def test_run_benchmark(mock_run_command, mock_subprocess, mock_platform, mock_os # use fnmatch in case we have file names with wildcards (like timestamps in output files) assert fnmatch.fnmatch(actual_arg, expected_arg), \ "Expected: {}\nActual: {}".format(expected_cmd, call_args) + + +@pytest.mark.parametrize("test_args,socket_id,cpuset", + [["run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 " + "--mode inference --model-name inceptionv3 --batch-size 128 " + "--in-graph /final_int8_inceptionv3.pb --intelai-models . 
--socket-id 1 " + "--benchmark-only", "1", "0-2"], + ["run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 " + "--mode inference --model-name inceptionv3 --batch-size 128 " + "--in-graph /final_int8_inceptionv3.pb --intelai-models . --socket-id 0 " + "--benchmark-only", "0", "50-55"]]) +@patch("os.mkdir") +@patch("shutil.rmtree") +@patch("os.listdir") +@patch("os.path.isdir") +@patch("os.path.isfile") +@patch("os.path.exists") +@patch("os.stat") +@patch("os.chdir") +@patch("os.remove") +@patch("glob.glob") +@patch("common.platform_util.PlatformUtil._get_cpuset") +@patch("common.platform_util.os") +@patch("common.platform_util.system_platform") +@patch("common.platform_util.subprocess") +@patch("common.base_model_init.BaseModelInitializer.run_command") +def test_run_benchmark_bad_socket(mock_run_command, mock_subprocess, mock_platform, mock_os, mock_get_cpuset, + mock_glob, mock_remove, mock_chdir, mock_stat, mock_path_exists, + mock_is_file, mock_is_dir, mock_listdir, mock_rmtree, mock_mkdir, + test_args, socket_id, cpuset): + """ + Checks to ensure that the proper error handling is done when the cpuset does not include any cores + for the specified socket_id + """ + + os.environ["PYTHON_EXE"] = "python" + if "mpi" not in test_args: + os.environ["MPI_NUM_PROCESSES"] = "None" + os.environ["MPI_HOSTNAMES"] = "None" + else: + if "--mpi_num_processes=" in test_args: + match_mpi_procs = re.search('--mpi_num_processes=([0-9]+)', test_args) + if match_mpi_procs and match_mpi_procs.lastindex >= 1: + os.environ["MPI_NUM_PROCESSES"] = match_mpi_procs.group(1) + if "--mpi_num_processes_per_socket=" in test_args: + match_per_socket = re.search('--mpi_num_processes_per_socket=([0-9]+)', test_args) + if match_per_socket and match_per_socket.lastindex >= 1: + os.environ["MPI_NUM_PROCESSES_PER_SOCKET"] = match_per_socket.group(1) + + mock_os.path.exists.return_value = True + mock_get_cpuset.return_value = cpuset + mock_is_dir.return_value = True 
+ mock_is_file.return_value = True + mock_stat.return_value = MagicMock(st_nlink=0) + parse_model_args_file() + mock_listdir.return_value = ["data.record"] + mock_glob.return_value = ["/usr/lib/libtcmalloc.so.4.2.6"] + clear_kmp_env_vars() + platform_config.set_mock_system_type(mock_platform) + platform_config.set_mock_os_access(mock_os) + platform_config.set_mock_lscpu_subprocess_values(mock_subprocess) + test_args = re.sub(" +", " ", test_args) # get rid of extra spaces in the test_args string + test_arg_list = test_args.split(" ") + with pytest.raises(SystemExit, + match="ERROR: There are no socket id {} cores in the cpuset.".format(socket_id)): + with patch.object(sys, "argv", test_arg_list): + model_benchmark = ModelBenchmarkUtil() + model_benchmark.main() diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_3d_unet_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_3d_unet_args.json index b5cd928b5..9172efa48 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_3d_unet_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_3d_unet_args.json @@ -2,6 +2,7 @@ { "_comment": "3d_unet_inference", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=3d_unet --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --in-graph=/in_graph/tumor_segmentation_model.h5 --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/brats/predict.py --inter 1 --intra 28 --nw 1 --nb 5 --bs 1" + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/brats/predict.py --inter 1 --intra 28 --nw 1 --nb 5 --bs 1", + "cpuset": "0-111" } ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_3d_unet_mlperf_args.json 
b/tests/unit/common/tensorflow/tf_model_args/tf_3d_unet_mlperf_args.json index cd1b686c2..c2bab43be 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_3d_unet_mlperf_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_3d_unet_mlperf_args.json @@ -2,31 +2,37 @@ { "_comment": "3d_unet_mlperf_fp32_inference", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=3d_unet_mlperf --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --benchmark-only --in-graph=/in_graph/3dunet_dynamic_ndhwc.pb --warmup-steps=20 --steps=100", - "output": "python /workspace/intelai_models/inference/fp32/brats/run_performance.py --input-graph=/in_graph/3dunet_dynamic_ndhwc.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=1 --model-name=3d_unet_mlperf --steps=100 --warmup-steps=20" + "output": "python /workspace/intelai_models/inference/fp32/brats/run_performance.py --input-graph=/in_graph/3dunet_dynamic_ndhwc.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=1 --model-name=3d_unet_mlperf --steps=100 --warmup-steps=20", + "cpuset": "0-111" }, { "_comment": "3d_unet_mlperf_fp32_inference_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=3d_unet_mlperf --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --accuracy-only --in-graph=/in_graph/3dunet_dynamic_ndhwc.pb --data-location=/dataset/MICCAI_BraTS_2019_Data_Training", - "output": "python /workspace/intelai_models/inference/fp32/brats/run_accuracy.py --input-graph=/in_graph/3dunet_dynamic_ndhwc.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=1 --model-name=3d_unet_mlperf --data-location=/dataset/MICCAI_BraTS_2019_Data_Training --accuracy-only" + "output": "python 
/workspace/intelai_models/inference/fp32/brats/run_accuracy.py --input-graph=/in_graph/3dunet_dynamic_ndhwc.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=1 --model-name=3d_unet_mlperf --data-location=/dataset/MICCAI_BraTS_2019_Data_Training --accuracy-only", + "cpuset": "0-111" }, { "_comment": "3d_unet_mlperf_int8_inference", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=3d_unet_mlperf --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --benchmark-only --in-graph=/in_graph/3dunet_int8_fully_quantized_perchannel.pb --warmup-steps=20 --steps=100", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/int8/brats/run_performance.py --input-graph=/in_graph/3dunet_int8_fully_quantized_perchannel.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=1 --model-name=3d_unet_mlperf --steps=100 --warmup-steps=20" + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/int8/brats/run_performance.py --input-graph=/in_graph/3dunet_int8_fully_quantized_perchannel.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=1 --model-name=3d_unet_mlperf --steps=100 --warmup-steps=20", + "cpuset": "0-111" }, { "_comment": "3d_unet_mlperf_int8_inference_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=3d_unet_mlperf --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --accuracy-only --in-graph=/in_graph/3dunet_int8_fully_quantized_perchannel.pb --data-location=/dataset/MICCAI_BraTS_2019_Data_Training", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/int8/brats/run_accuracy.py --input-graph=/in_graph/3dunet_int8_fully_quantized_perchannel.pb 
--num-inter-threads=2 --num-intra-threads=56 --batch-size=1 --model-name=3d_unet_mlperf --data-location=/dataset/MICCAI_BraTS_2019_Data_Training --accuracy-only" + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/int8/brats/run_accuracy.py --input-graph=/in_graph/3dunet_int8_fully_quantized_perchannel.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=1 --model-name=3d_unet_mlperf --data-location=/dataset/MICCAI_BraTS_2019_Data_Training --accuracy-only", + "cpuset": "0-111" }, { "_comment": "3d_unet_mlperf_bfloat16_inference", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=3d_unet_mlperf --precision=bfloat16 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --benchmark-only --in-graph=/in_graph/3dunet_dynamic_ndhwc.pb --warmup-steps=20 --steps=100", - "output": "python /workspace/intelai_models/inference/bfloat16/brats/run_performance.py --input-graph=/in_graph/3dunet_dynamic_ndhwc.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=1 --model-name=3d_unet_mlperf --steps=100 --warmup-steps=20" + "output": "python /workspace/intelai_models/inference/bfloat16/brats/run_performance.py --input-graph=/in_graph/3dunet_dynamic_ndhwc.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=1 --model-name=3d_unet_mlperf --steps=100 --warmup-steps=20", + "cpuset": "0-111" }, { "_comment": "3d_unet_mlperf_bfloat16_inference_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=3d_unet_mlperf --precision=bfloat16 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --accuracy-only --in-graph=/in_graph/3dunet_dynamic_ndhwc.pb --data-location=/dataset/MICCAI_BraTS_2019_Data_Training", - "output": "python 
/workspace/intelai_models/inference/bfloat16/brats/run_accuracy.py --input-graph=/in_graph/3dunet_dynamic_ndhwc.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=1 --model-name=3d_unet_mlperf --data-location=/dataset/MICCAI_BraTS_2019_Data_Training --accuracy-only" + "output": "python /workspace/intelai_models/inference/bfloat16/brats/run_accuracy.py --input-graph=/in_graph/3dunet_dynamic_ndhwc.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=1 --model-name=3d_unet_mlperf --data-location=/dataset/MICCAI_BraTS_2019_Data_Training --accuracy-only", + "cpuset": "0-111" } ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_bert_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_bert_args.json index 582fbc647..1585de31f 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_bert_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_bert_args.json @@ -2,51 +2,73 @@ { "_comment": "bert_fp32_inference", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=bert --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --checkpoint=/checkpoints --intelai-models=/workspace/intelai_models --num-cores=28 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --data-location=/dataset --num-inter-threads=1 --num-intra-threads=28 --disable-tcmalloc=True --task_name=XNLI --max_seq_length=128 --batch-size=8 --learning_rate=5e-5", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/run_classifier.py --data_dir=/dataset --output_dir=/workspace/benchmarks/common/tensorflow/logs --vocab_file=/checkpoints/vocab.txt --bert_config_file=/checkpoints/bert_config.json --init_checkpoint=/checkpoints/bert_model.ckpt --task_name=XNLI --max_seq_length=128 --eval_batch_size=8 --learning_rate=5e-05 --num_inter_threads=1 --num_intra_threads=28 
--do_train=false --do_eval=true" + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/run_classifier.py --data_dir=/dataset --output_dir=/workspace/benchmarks/common/tensorflow/logs --vocab_file=/checkpoints/vocab.txt --bert_config_file=/checkpoints/bert_config.json --init_checkpoint=/checkpoints/bert_model.ckpt --task_name=XNLI --max_seq_length=128 --eval_batch_size=8 --learning_rate=5e-05 --num_inter_threads=1 --num_intra_threads=28 --do_train=false --do_eval=true", + "cpuset": "0-111" }, { "_comment": "bert_large_fp32_squad_inference", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_modeling --model-name=bert_large --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --checkpoint=/checkpoints --intelai-models=/workspace/intelai_models --in-graph=/in_graph/bert.pb --output-dir=/workspace/logs --batch-size=128 --data-location=/dataset --infer-option=SQuAD", - "output": "python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=fp32 --output_dir=/workspace/logs --predict_batch_size=128 --experimental_gelu=False --optimized_softmax=True --input_graph=/in_graph/bert.pb --do_predict=True --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30" + "output": "python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=fp32 --output_dir=/workspace/logs --predict_batch_size=128 --experimental_gelu=False --optimized_softmax=True --input_graph=/in_graph/bert.pb --do_predict=True --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30", + "cpuset": "0-111" }, { "_comment": 
"bert_large_fp32_squad_profile", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_modeling --model-name=bert_large --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --checkpoint=/checkpoints --intelai-models=/workspace/intelai_models --in-graph=/in_graph/bert.pb --output-dir=/workspace/logs --batch-size=128 --data-location=/dataset --infer-option=SQuAD --profile=True", - "output": "python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=fp32 --output_dir=/workspace/logs --predict_batch_size=128 --experimental_gelu=False --optimized_softmax=True --input_graph=/in_graph/bert.pb --do_predict=True --mode=profile --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30" + "output": "python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=fp32 --output_dir=/workspace/logs --predict_batch_size=128 --experimental_gelu=False --optimized_softmax=True --input_graph=/in_graph/bert.pb --do_predict=True --mode=profile --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30", + "cpuset": "0-111" }, { "_comment": "bert_large_fp32_squad_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_modeling --model-name=bert_large --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --checkpoint=/checkpoints --intelai-models=/workspace/intelai_models --in-graph=/in_graph/bert.pb --output-dir=/workspace/logs --batch-size=128 --data-location=/dataset --infer-option=SQuAD --accuracy-only", - "output": "python /workspace/intelai_models/inference/run_squad.py 
--init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=fp32 --output_dir=/workspace/logs --predict_batch_size=128 --experimental_gelu=False --optimized_softmax=True --input_graph=/in_graph/bert.pb --do_predict=True --mode=accuracy --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30" + "output": "python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=fp32 --output_dir=/workspace/logs --predict_batch_size=128 --experimental_gelu=False --optimized_softmax=True --input_graph=/in_graph/bert.pb --do_predict=True --mode=accuracy --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30", + "cpuset": "0-111" }, { "_comment": "bert_large_int8_squad_inference", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_modeling --model-name=bert_large --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --checkpoint=/checkpoints --intelai-models=/workspace/intelai_models --in-graph=/in_graph/bert.pb --output-dir=/workspace/logs --batch-size=32 --data-location=/dataset --infer-option=SQuAD", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=int8 --output_dir=/workspace/logs --predict_batch_size=32 --experimental_gelu=False --optimized_softmax=True --input_graph=/in_graph/bert.pb --do_predict=True --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30" + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 
python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=int8 --output_dir=/workspace/logs --predict_batch_size=32 --experimental_gelu=False --optimized_softmax=True --input_graph=/in_graph/bert.pb --do_predict=True --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30", + "cpuset": "0-111" }, { "_comment": "bert_large_int8_inference_optional_args", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_modeling --model-name=bert_large --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --checkpoint=/checkpoints --intelai-models=/workspace/intelai_models --in-graph=/in_graph/bert.pb --output-dir=/workspace/logs --data-location=/dataset --infer-option=SQuAD --num-intra-threads=28 --num-inter-threads=1 --benchmark-only --doc-stride=128 --max-seq-length=384", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=int8 --output_dir=/workspace/logs --predict_batch_size=32 --experimental_gelu=False --optimized_softmax=True --input_graph=/in_graph/bert.pb --do_predict=True --mode=benchmark --doc_stride=128 --max_seq_length=384 --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=28 --warmup_steps=10 --steps=30" + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=int8 --output_dir=/workspace/logs --predict_batch_size=32 --experimental_gelu=False 
--optimized_softmax=True --input_graph=/in_graph/bert.pb --do_predict=True --mode=benchmark --doc_stride=128 --max_seq_length=384 --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=28 --warmup_steps=10 --steps=30", + "cpuset": "0-111" }, { "_comment": "bert_large_int8_squad_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_modeling --model-name=bert_large --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --checkpoint=/checkpoints --intelai-models=/workspace/intelai_models --in-graph=/in_graph/bert.pb --output-dir=/workspace/logs --batch-size=32 --data-location=/dataset --infer-option=SQuAD --accuracy-only", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=int8 --output_dir=/workspace/logs --predict_batch_size=32 --experimental_gelu=False --optimized_softmax=True --input_graph=/in_graph/bert.pb --do_predict=True --mode=accuracy --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30" + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=int8 --output_dir=/workspace/logs --predict_batch_size=32 --experimental_gelu=False --optimized_softmax=True --input_graph=/in_graph/bert.pb --do_predict=True --mode=accuracy --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30", + "cpuset": "0-111" }, { "_comment": "bert_large_bfloat16_squad_inference", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_modeling --model-name=bert_large --precision=bfloat16 
--mode=inference --benchmark-dir=/workspace/benchmarks --checkpoint=/checkpoints --intelai-models=/workspace/intelai_models --batch-size=32 --output-dir=/workspace/logs --infer-option=SQuAD --data-location=/dataset --benchmark-only", - "output": "python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=bfloat16 --output_dir=/workspace/logs --predict_batch_size=32 --experimental_gelu=False --optimized_softmax=True --do_predict=True --mode=benchmark --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30" + "output": "python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=bfloat16 --output_dir=/workspace/logs --predict_batch_size=32 --experimental_gelu=False --optimized_softmax=True --do_predict=True --mode=benchmark --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30", + "cpuset": "0-111" }, { "_comment": "bert_large_bfloat16_squad_profile", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_modeling --model-name=bert_large --precision=bfloat16 --mode=inference --benchmark-dir=/workspace/benchmarks --checkpoint=/checkpoints --intelai-models=/workspace/intelai_models --batch-size=32 --output-dir=/workspace/logs --infer-option=SQuAD --data-location=/dataset --profile=True", - "output": "python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=bfloat16 --output_dir=/workspace/logs --predict_batch_size=32 --experimental_gelu=False --optimized_softmax=True 
--do_predict=True --mode=profile --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30" + "output": "python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=bfloat16 --output_dir=/workspace/logs --predict_batch_size=32 --experimental_gelu=False --optimized_softmax=True --do_predict=True --mode=profile --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30", + "cpuset": "0-111" }, { "_comment": "bert_large_bfloat16_squad_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_modeling --model-name=bert_large --precision=bfloat16 --mode=inference --benchmark-dir=/workspace/benchmarks --checkpoint=/checkpoints --intelai-models=/workspace/intelai_models --batch-size=32 --output-dir=/workspace/logs --infer-option=SQuAD --data-location=/dataset --accuracy-only", - "output": "python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=bfloat16 --output_dir=/workspace/logs --predict_batch_size=32 --experimental_gelu=False --optimized_softmax=True --do_predict=True --mode=accuracy --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 --warmup_steps=10 --steps=30" + "output": "python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=bfloat16 --output_dir=/workspace/logs --predict_batch_size=32 --experimental_gelu=False --optimized_softmax=True --do_predict=True --mode=accuracy --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=56 
--warmup_steps=10 --steps=30", + "cpuset": "0-111" + }, + { + "_comment": "bert_large_bfloat16_squad_inference_cpuset", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_modeling --model-name=bert_large --precision=bfloat16 --mode=inference --benchmark-dir=/workspace/benchmarks --checkpoint=/checkpoints --intelai-models=/workspace/intelai_models --batch-size=32 --output-dir=/workspace/logs --infer-option=SQuAD --data-location=/dataset --benchmark-only", + "output": "python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=bfloat16 --output_dir=/workspace/logs --predict_batch_size=32 --experimental_gelu=False --optimized_softmax=True --do_predict=True --mode=benchmark --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=8 --warmup_steps=10 --steps=30", + "cpuset": "0-7" + }, + { + "_comment": "bert_large_fp32_squad_inference_cpuset", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_modeling --model-name=bert_large --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --checkpoint=/checkpoints --intelai-models=/workspace/intelai_models --in-graph=/in_graph/bert.pb --output-dir=/workspace/logs --batch-size=128 --data-location=/dataset --infer-option=SQuAD", + "output": "python /workspace/intelai_models/inference/run_squad.py --init_checkpoint=/checkpoints/model.ckpt-3649 --vocab_file=/dataset/vocab.txt --bert_config_file=/dataset/bert_config.json --predict_file=/dataset/dev-v1.1.json --precision=fp32 --output_dir=/workspace/logs --predict_batch_size=128 --experimental_gelu=False --optimized_softmax=True --input_graph=/in_graph/bert.pb --do_predict=True --inter_op_parallelism_threads=2 --intra_op_parallelism_threads=16 --warmup_steps=10 --steps=30", + "cpuset": "0-7,28-35" } ] diff --git 
a/tests/unit/common/tensorflow/tf_model_args/tf_densenet169_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_densenet169_args.json index 2a6e1d877..75ef1158c 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_densenet169_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_densenet169_args.json @@ -1,15 +1,18 @@ [ { "_comment": "densenet169_fp32_latency", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=densenet169 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/densenet169_fp32_pretrained_model.pb", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/benchmark.py --num_intra_threads=28 --num_inter_threads=1 --batch_size=1 --input_graph=/in_graph/densenet169_fp32_pretrained_model.pb"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/benchmark.py --num_intra_threads=28 --num_inter_threads=1 --batch_size=1 --input_graph=/in_graph/densenet169_fp32_pretrained_model.pb", + "cpuset": "0-111" }, { "_comment": "densenet169_fp32_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=densenet169 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/densenet169_fp32_pretrained_model.pb --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/accuracy.py --num_intra_threads=28 --num_inter_threads=1 
--batch_size=100 --input_graph=/in_graph/densenet169_fp32_pretrained_model.pb --data_location=/dataset"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/accuracy.py --num_intra_threads=28 --num_inter_threads=1 --batch_size=100 --input_graph=/in_graph/densenet169_fp32_pretrained_model.pb --data_location=/dataset", + "cpuset": "0-111" }, { "_comment": "densenet169_fp32_throughput", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=densenet169 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/densenet169_fp32_pretrained_model.pb", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/benchmark.py --num_intra_threads=28 --num_inter_threads=1 --batch_size=100 --input_graph=/in_graph/densenet169_fp32_pretrained_model.pb"} + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/benchmark.py --num_intra_threads=28 --num_inter_threads=1 --batch_size=100 --input_graph=/in_graph/densenet169_fp32_pretrained_model.pb", + "cpuset": "0-111" } ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_dien_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_dien_args.json index a9ad5d01b..f0fa78c7f 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_dien_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_dien_args.json @@ -2,36 +2,43 @@ { "_comment": "dien_fp32_training", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=dien --precision=fp32 --mode=training --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --data-location=/dataset/dien-dataset-folder 
--socket-id=0 --batch-size 128", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/training/fp32/train.py --batch_size 128 --num_inter_threads 1 --num_intra_threads 28 --data_location /dataset/dien-dataset-folder --mode train" + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/training/fp32/train.py --batch_size 128 --num_inter_threads 1 --num_intra_threads 28 --data_location /dataset/dien-dataset-folder --mode train", + "cpuset": "0-111" }, { "_comment": "dien_fp32_inference_throughput", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=dien --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --data-location=/dataset/dien-dataset-folder --in-graph=/workspace/dien_fp32_static_rnn_graph.pb --socket-id 0 --batch-size 128 --num-intra-threads 26 --num-inter-threads 1 --graph_type=static", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference_pb.py --batch_size 128 --num_inter_threads 1 --num_intra_threads 26 --data_location /dataset/dien-dataset-folder --data_type fp32 --input_graph /workspace/dien_fp32_static_rnn_graph.pb --graph_type static" + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference_pb.py --batch_size 128 --num_inter_threads 1 --num_intra_threads 26 --data_location /dataset/dien-dataset-folder --data_type fp32 --input_graph /workspace/dien_fp32_static_rnn_graph.pb --graph_type static", + "cpuset": "0-111" }, { "_comment": "dien_fp32_inference_latency", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=dien --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --data-location=/dataset/dien-dataset-folder --in-graph=/workspace/dien_fp32_static_rnn_graph.pb --socket-id 0 --batch-size 1 
--num-intra-threads 26 --num-inter-threads 1 --graph_type=dynamic --exact-max-length=100 --num-iterations=10", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference_pb.py --batch_size 1 --num_inter_threads 1 --num_intra_threads 26 --data_location /dataset/dien-dataset-folder --data_type fp32 --input_graph /workspace/dien_fp32_static_rnn_graph.pb --graph_type dynamic --exact_max_length 100 --num_iterations 10" + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference_pb.py --batch_size 1 --num_inter_threads 1 --num_intra_threads 26 --data_location /dataset/dien-dataset-folder --data_type fp32 --input_graph /workspace/dien_fp32_static_rnn_graph.pb --graph_type dynamic --exact_max_length 100 --num_iterations 10", + "cpuset": "0-111" }, { "_comment": "dien_fp32_inference_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=dien --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --data-location=/dataset/dien-dataset-folder --in-graph=/workspace/dien_fp32_static_rnn_graph.pb --socket-id 0 --batch-size 128 --num-intra-threads 26 --num-inter-threads 1 --accuracy-only --graph_type=static", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference_pb.py --batch_size 128 --num_inter_threads 1 --num_intra_threads 26 --data_location /dataset/dien-dataset-folder --data_type fp32 --input_graph /workspace/dien_fp32_static_rnn_graph.pb --accuracy_only --graph_type static" + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference_pb.py --batch_size 128 --num_inter_threads 1 --num_intra_threads 26 --data_location /dataset/dien-dataset-folder --data_type fp32 --input_graph /workspace/dien_fp32_static_rnn_graph.pb --accuracy_only --graph_type static", + "cpuset": "0-111" }, { "_comment": 
"dien_bfloat16_inference_throughput", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=dien --precision=bfloat16 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --data-location=/dataset/dien-dataset-folder --in-graph=/workspace/dien_bfloat16_pretrained_model.pb --socket-id 0 --batch-size 128 --num-intra-threads 26 --num-inter-threads 1 --graph_type=static", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference_pb.py --batch_size 128 --num_inter_threads 1 --num_intra_threads 26 --data_location /dataset/dien-dataset-folder --data_type bfloat16 --input_graph /workspace/dien_bfloat16_pretrained_model.pb --graph_type static" + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference_pb.py --batch_size 128 --num_inter_threads 1 --num_intra_threads 26 --data_location /dataset/dien-dataset-folder --data_type bfloat16 --input_graph /workspace/dien_bfloat16_pretrained_model.pb --graph_type static", + "cpuset": "0-111" }, { "_comment": "dien_bfloat16_inference_latency", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=dien --precision=bfloat16 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --data-location=/dataset/dien-dataset-folder --in-graph=/workspace/dien_bfloat16_pretrained_model.pb --socket-id 0 --batch-size 1 --num-intra-threads 26 --num-inter-threads 1 --graph_type=dynamic --exact-max-length=100 --num-iterations=10", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference_pb.py --batch_size 1 --num_inter_threads 1 --num_intra_threads 26 --data_location /dataset/dien-dataset-folder --data_type bfloat16 --input_graph /workspace/dien_bfloat16_pretrained_model.pb --graph_type dynamic --exact_max_length 100 --num_iterations 10" + "output": "numactl 
--cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference_pb.py --batch_size 1 --num_inter_threads 1 --num_intra_threads 26 --data_location /dataset/dien-dataset-folder --data_type bfloat16 --input_graph /workspace/dien_bfloat16_pretrained_model.pb --graph_type dynamic --exact_max_length 100 --num_iterations 10", + "cpuset": "0-111" }, { "_comment": "dien_bfloat16_inference_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=dien --precision=bfloat16 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --data-location=/dataset/dien-dataset-folder --in-graph=/workspace/dien_bfloat16_pretrained_model.pb --socket-id 0 --batch-size 128 --num-intra-threads 26 --num-inter-threads 1 --accuracy-only --graph_type=static", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference_pb.py --batch_size 128 --num_inter_threads 1 --num_intra_threads 26 --data_location /dataset/dien-dataset-folder --data_type bfloat16 --input_graph /workspace/dien_bfloat16_pretrained_model.pb --accuracy_only --graph_type static" + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/inference_pb.py --batch_size 128 --num_inter_threads 1 --num_intra_threads 26 --data_location /dataset/dien-dataset-folder --data_type bfloat16 --input_graph /workspace/dien_bfloat16_pretrained_model.pb --accuracy_only --graph_type static", + "cpuset": "0-111" } ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_faster_rcnn_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_faster_rcnn_args.json index 01ee8dad0..a50701953 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_faster_rcnn_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_faster_rcnn_args.json @@ -1,27 +1,32 @@ [ { "_comment": "FP32 accuracy command", "input": "run_tf_benchmark.py --framework=tensorflow 
--use-case=object_detection --model-name=faster_rcnn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=-1 --accuracy-only --verbose --checkpoint=/checkpoints --data-location=/dataset --in-graph=/in_graph/frozen_inference_graph.pb", - "output": "bash /workspace/intelai_models/inference/fp32/coco_accuracy.sh /in_graph/frozen_inference_graph.pb /dataset/coco_val.record /workspace/models"}, + "output": "bash /workspace/intelai_models/inference/fp32/coco_accuracy.sh /in_graph/frozen_inference_graph.pb /dataset/coco_val.record /workspace/models", + "cpuset": "0-111"}, { "_comment": "FP32 benchmark command", "input": "run_tf_benchmark.py --framework tensorflow --use-case object_detection --precision fp32 --mode inference --model-name faster_rcnn --checkpoint /checkpoints --intelai-models . --model-source-dir . --socket-id 0 --verbose --config_file=pipeline.config", - "output": "numactl --cpunodebind=0 --membind=0 python ./inference/fp32/eval.py --num_inter_threads 1 --num_intra_threads 28 --pipeline_config_path /checkpoints/pipeline.config --checkpoint_dir /checkpoints --eval_dir ./research/object_detection/log/eval"}, + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/fp32/eval.py --num_inter_threads 1 --num_intra_threads 28 --pipeline_config_path /checkpoints/pipeline.config --checkpoint_dir /checkpoints --eval_dir ./research/object_detection/log/eval", + "cpuset": "0-111"}, { "_comment": "FP32 benchmark command with custom --num_inter_threads 4 --num_intra_threads 16", "input": "run_tf_benchmark.py --framework tensorflow --use-case object_detection --precision fp32 --mode inference --model-name faster_rcnn --checkpoint /checkpoints --intelai-models . --model-source-dir . 
--socket-id 0 --verbose --config_file=pipeline.config --num-inter-threads 4 --num-intra-threads 16", - "output": "numactl --cpunodebind=0 --membind=0 python ./inference/fp32/eval.py --num_inter_threads 4 --num_intra_threads 16 --pipeline_config_path /checkpoints/pipeline.config --checkpoint_dir /checkpoints --eval_dir ./research/object_detection/log/eval"}, + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/fp32/eval.py --num_inter_threads 4 --num_intra_threads 16 --pipeline_config_path /checkpoints/pipeline.config --checkpoint_dir /checkpoints --eval_dir ./research/object_detection/log/eval", + "cpuset": "0-111"}, { "_comment": "Int8 command for throughput benchmark with --number-of-steps enabled.", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=faster_rcnn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=-1 --benchmark-only --verbose --in-graph=/in_graph/pretrained_int8_faster_rcnn_model.pb --data-location=/dataset --number-of-steps=500", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/int8/run_frozen_graph_rcnn.py -g /in_graph/pretrained_int8_faster_rcnn_model.pb -n 500 -d /dataset --num-inter-threads 2 --num-intra-threads 56"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/int8/run_frozen_graph_rcnn.py -g /in_graph/pretrained_int8_faster_rcnn_model.pb -n 500 -d /dataset --num-inter-threads 2 --num-intra-threads 56", + "cpuset": "0-111"}, { "_comment": "Int8 accuracy command", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=faster_rcnn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=-1 --accuracy-only --verbose 
--in-graph=/in_graph/pretrained_int8_faster_rcnn_model.pb --data-location=/dataset", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 /workspace/intelai_models/inference/int8/coco_int8.sh /in_graph/pretrained_int8_faster_rcnn_model.pb /dataset /workspace/models"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 /workspace/intelai_models/inference/int8/coco_int8.sh /in_graph/pretrained_int8_faster_rcnn_model.pb /dataset /workspace/models", + "cpuset": "0-111"}, { "_comment": "FP32 benchmark command", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=faster_rcnn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=-1 --benchmark-only --verbose --in-graph=/in_graph/pretrained_int8_faster_rcnn_model.pb --data-location=/dataset", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/int8/run_frozen_graph_rcnn.py -g /in_graph/pretrained_int8_faster_rcnn_model.pb -d /dataset --num-inter-threads 2 --num-intra-threads 56" - } + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/int8/run_frozen_graph_rcnn.py -g /in_graph/pretrained_int8_faster_rcnn_model.pb -d /dataset --num-inter-threads 2 --num-intra-threads 56", + "cpuset": "0-111"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_gnmt_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_gnmt_args.json index 6c48cf5ac..a054ab315 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_gnmt_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_gnmt_args.json @@ -1,13 +1,16 @@ [ { "_comment": "gnmt_fp32_latency", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=mlperf_gnmt --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks 
--intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --data-location=/dataset --in-graph=workspace/mlperf_gnmt_fp32_pretrained_model.pb", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/fp32/run_inference.py --in_graph=workspace/mlperf_gnmt_fp32_pretrained_model.pb --batch_size=1 --num_inter_threads=1 --num_intra_threads=28 --src_vocab_file=/dataset/vocab.bpe.32000.en --tgt_vocab_file=/dataset/vocab.bpe.32000.de --inference_input_file=/dataset/newstest2014.tok.bpe.32000.en --inference_ref_file=/dataset/newstest2014.tok.bpe.32000.de"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/fp32/run_inference.py --in_graph=workspace/mlperf_gnmt_fp32_pretrained_model.pb --batch_size=1 --num_inter_threads=1 --num_intra_threads=28 --src_vocab_file=/dataset/vocab.bpe.32000.en --tgt_vocab_file=/dataset/vocab.bpe.32000.de --inference_input_file=/dataset/newstest2014.tok.bpe.32000.en --inference_ref_file=/dataset/newstest2014.tok.bpe.32000.de", + "cpuset": "0-111"}, { "_comment": "gnmt_fp32_throughput", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=mlperf_gnmt --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=32 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --data-location=/dataset --in-graph=workspace/mlperf_gnmt_fp32_pretrained_model.pb", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/fp32/run_inference.py --in_graph=workspace/mlperf_gnmt_fp32_pretrained_model.pb --batch_size=32 --num_inter_threads=1 --num_intra_threads=28 --src_vocab_file=/dataset/vocab.bpe.32000.en --tgt_vocab_file=/dataset/vocab.bpe.32000.de 
--inference_input_file=/dataset/newstest2014.tok.bpe.32000.en --inference_ref_file=/dataset/newstest2014.tok.bpe.32000.de"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/fp32/run_inference.py --in_graph=workspace/mlperf_gnmt_fp32_pretrained_model.pb --batch_size=32 --num_inter_threads=1 --num_intra_threads=28 --src_vocab_file=/dataset/vocab.bpe.32000.en --tgt_vocab_file=/dataset/vocab.bpe.32000.de --inference_input_file=/dataset/newstest2014.tok.bpe.32000.en --inference_ref_file=/dataset/newstest2014.tok.bpe.32000.de", + "cpuset": "0-111"}, { "_comment": "gnmt_fp32_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=mlperf_gnmt --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=32 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --data-location=/dataset --in-graph=workspace/mlperf_gnmt_fp32_pretrained_model.pb", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/fp32/run_inference.py --in_graph=workspace/mlperf_gnmt_fp32_pretrained_model.pb --batch_size=32 --num_inter_threads=1 --num_intra_threads=28 --src_vocab_file=/dataset/vocab.bpe.32000.en --tgt_vocab_file=/dataset/vocab.bpe.32000.de --inference_input_file=/dataset/newstest2014.tok.bpe.32000.en --inference_ref_file=/dataset/newstest2014.tok.bpe.32000.de"} + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/fp32/run_inference.py --in_graph=workspace/mlperf_gnmt_fp32_pretrained_model.pb --batch_size=32 --num_inter_threads=1 --num_intra_threads=28 --src_vocab_file=/dataset/vocab.bpe.32000.en --tgt_vocab_file=/dataset/vocab.bpe.32000.de --inference_input_file=/dataset/newstest2014.tok.bpe.32000.en --inference_ref_file=/dataset/newstest2014.tok.bpe.32000.de", + "cpuset": "0-111"} ] diff --git 
a/tests/unit/common/tensorflow/tf_model_args/tf_inceptionv3_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_inceptionv3_args.json index fa3764b76..191be23e6 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_inceptionv3_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_inceptionv3_args.json @@ -1,44 +1,68 @@ [ { "_comment": "inceptionv3_int8_accuracy", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name inceptionv3 --batch-size 100 --in-graph /final_int8_inceptionv3.pb --intelai-models . --accuracy-only --verbose", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python ./int8/accuracy.py --num_intra_threads=56 --num_inter_threads=2 --batch_size=100 --input_graph=/final_int8_inceptionv3.pb"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python ./int8/accuracy.py --num_intra_threads=56 --num_inter_threads=2 --batch_size=100 --input_graph=/final_int8_inceptionv3.pb", + "cpuset": "0-111"}, { "_comment": "inception_v3_int8_latency_default_inter_intra", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name inceptionv3 --batch-size 1 --in-graph /final_int8_inceptionv3.pb --intelai-models . 
--benchmark-only --socket-id 0 --verbose", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python ./int8/benchmark.py --warmup_steps=10 --num_intra_threads=28 --num_inter_threads=1 --batch_size=1 --input_graph=/final_int8_inceptionv3.pb --steps=50 --num_cores=28"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python ./int8/benchmark.py --warmup_steps=10 --num_intra_threads=28 --num_inter_threads=1 --batch_size=1 --input_graph=/final_int8_inceptionv3.pb --steps=50 --num_cores=28", + "cpuset": "0-111"}, { "_comment": "inceptionv3_int8_throughput", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name inceptionv3 --batch-size 128 --in-graph /final_int8_inceptionv3.pb --intelai-models . --benchmark-only --socket-id 0 --verbose", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python ./int8/benchmark.py --warmup_steps=10 --num_intra_threads=28 --num_inter_threads=1 --batch_size=128 --input_graph=/final_int8_inceptionv3.pb --steps=50 --num_cores=28"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python ./int8/benchmark.py --warmup_steps=10 --num_intra_threads=28 --num_inter_threads=1 --batch_size=128 --input_graph=/final_int8_inceptionv3.pb --steps=50 --num_cores=28", + "cpuset": "0-111"}, { "_comment": "inceptionv3_int8_throughput_steps_warmup-steps", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inceptionv3 --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/inception_frozen_max_min.pb --steps=200 --warmup-steps=20", - "output": 
"LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/int8/benchmark.py --warmup_steps=20 --num_intra_threads=28 --num_inter_threads=1 --batch_size=128 --input_graph=/in_graph/inception_frozen_max_min.pb --steps=200 --num_cores=28"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/int8/benchmark.py --warmup_steps=20 --num_intra_threads=28 --num_inter_threads=1 --batch_size=128 --input_graph=/in_graph/inception_frozen_max_min.pb --steps=200 --num_cores=28", + "cpuset": "0-111"}, { "_comment": "inceptionv3_int8_latency_steps_warmup-steps", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=inceptionv3 --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/inception_frozen_max_min.pb --steps=200 --warmup-steps=20", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/int8/benchmark.py --warmup_steps=20 --num_intra_threads=28 --num_inter_threads=1 --batch_size=1 --input_graph=/in_graph/inception_frozen_max_min.pb --steps=200 --num_cores=28"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/int8/benchmark.py --warmup_steps=20 --num_intra_threads=28 --num_inter_threads=1 --batch_size=1 --input_graph=/in_graph/inception_frozen_max_min.pb --steps=200 --num_cores=28", + "cpuset": "0-111"}, { "_comment": "inceptionv3_int8_throughput_disable-tcmalloc", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name inceptionv3 --batch-size 128 --in-graph 
/final_int8_inceptionv3.pb --intelai-models . --benchmark-only --socket-id 0 --disable-tcmalloc=True", - "output": "numactl --cpunodebind=0 --membind=0 python ./int8/benchmark.py --warmup_steps=10 --num_intra_threads=28 --num_inter_threads=1 --batch_size=128 --input_graph=/final_int8_inceptionv3.pb --steps=50 --num_cores=28"}, + "output": "numactl --cpunodebind=0 --membind=0 python ./int8/benchmark.py --warmup_steps=10 --num_intra_threads=28 --num_inter_threads=1 --batch_size=128 --input_graph=/final_int8_inceptionv3.pb --steps=50 --num_cores=28", + "cpuset": "0-111"}, { "_comment": "inceptionv3_fp32_accuracy", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name inceptionv3 --batch-size 100 --accuracy-only --data-location /dataset --in-graph /inceptionv3_fp32_pretrained_model.pb --intelai-models . --verbose", - "output": "python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=2 --num-intra-threads=56 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only"}, + "output": "python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=2 --num-intra-threads=56 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only", + "cpuset": "0-111"}, { "_comment": "inceptionv3_fp32_latency", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name inceptionv3 --batch-size 1 --in-graph /inceptionv3_fp32_pretrained_model.pb --intelai-models . 
--socket-id 0 --verbose", - "output": "numactl --cpunodebind=0 --membind=0 python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=1 --warmup-steps=10 --steps=50 --num-cores=28" - }, + "output": "numactl --cpunodebind=0 --membind=0 python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=1 --warmup-steps=10 --steps=50 --num-cores=28", + "cpuset": "0-111"}, { "_comment": "inceptionv3_fp32_throughput", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name inceptionv3 --batch-size 128 --in-graph /inceptionv3_fp32_pretrained_model.pb --intelai-models . --socket-id 0 --verbose", - "output": "numactl --cpunodebind=0 --membind=0 python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28"}, + "output": "numactl --cpunodebind=0 --membind=0 python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28", + "cpuset": "0-111"}, { "_comment": "inceptionv3_fp32_throughput_inter_intra", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name inceptionv3 --batch-size 128 --in-graph /inceptionv3_fp32_pretrained_model.pb --intelai-models . 
--socket-id 0 --verbose --num-inter-threads 4 --num-intra-threads 16", - "output": "numactl --cpunodebind=0 --membind=0 python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=4 --num-intra-threads=16 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28"} + "output": "numactl --cpunodebind=0 --membind=0 python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=4 --num-intra-threads=16 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28", + "cpuset": "0-111"}, + + { "_comment": "inceptionv3_fp32_throughput_inter_intra_cpuset", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name inceptionv3 --batch-size 128 --in-graph /inceptionv3_fp32_pretrained_model.pb --intelai-models . --socket-id 0 --verbose --num-inter-threads 4 --num-intra-threads 16", + "output": "numactl --cpunodebind=0 --membind=0 python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=4 --num-intra-threads=16 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28", + "cpuset": "0-7"}, + + { "_comment": "inceptionv3_fp32_throughput_cpuset", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name inceptionv3 --batch-size 128 --in-graph /inceptionv3_fp32_pretrained_model.pb --intelai-models . 
--verbose", + "output": "python ./fp32/eval_image_classifier_inference.py --input-graph=/inceptionv3_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=11 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28", + "cpuset": "0-7,58-60"}, + + { "_comment": "inceptionv3_int8_throughput_cpuset", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name inceptionv3 --batch-size 128 --in-graph /final_int8_inceptionv3.pb --intelai-models . --benchmark-only --verbose", + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python ./int8/benchmark.py --warmup_steps=10 --num_intra_threads=9 --num_inter_threads=1 --batch_size=128 --input_graph=/final_int8_inceptionv3.pb --steps=50 --num_cores=28", + "cpuset": "28-34,50,55"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_inceptionv4_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_inceptionv4_args.json index ff8dbb6ba..b8a009dcb 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_inceptionv4_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_inceptionv4_args.json @@ -1,21 +1,26 @@ [ { "_comment": "inceptionv4_int8_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --precision=int8 --mode=inference --model-name=inceptionv4 --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=240 --socket-id 0 --accuracy-only --verbose --in-graph=/in_graph/inceptionv4_int8_pretrained_model.pb --data-location=/dataset", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/accuracy.py --batch_size=240 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_int8_pretrained_model.pb --num_intra_threads=28 --data_location=/dataset"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python 
/workspace/intelai_models/inference/accuracy.py --batch_size=240 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_int8_pretrained_model.pb --num_intra_threads=28 --data_location=/dataset", + "cpuset": "0-111"}, { "_comment": "inceptionv4_int8_latency_default_inter_intra", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --precision=int8 --mode=inference --model-name=inceptionv4 --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --in-graph=/in_graph/inceptionv4_int8_pretrained_model.pb --data-location=/dataset", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/benchmark.py --batch_size=1 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_int8_pretrained_model.pb --num_intra_threads=28"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/benchmark.py --batch_size=1 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_int8_pretrained_model.pb --num_intra_threads=28", + "cpuset": "0-111"}, { "_comment": "inceptionv4_int8_throughput", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --precision=int8 --mode=inference --model-name=inceptionv4 --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=240 --socket-id 0 --benchmark-only --verbose --in-graph=/in_graph/inceptionv4_int8_pretrained_model.pb --data-location=/dataset", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/benchmark.py --batch_size=240 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_int8_pretrained_model.pb --num_intra_threads=28"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 
--membind=0 python /workspace/intelai_models/inference/benchmark.py --batch_size=240 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_int8_pretrained_model.pb --num_intra_threads=28", + "cpuset": "0-111"}, { "_comment": "inceptionv4_fp32_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --precision=fp32 --mode=inference --model-name=inceptionv4 --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=240 --socket-id 0 --accuracy-only --verbose --in-graph=/in_graph/inceptionv4_fp32_pretrained_model.pb --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/accuracy.py --batch_size=240 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_fp32_pretrained_model.pb --num_intra_threads=28 --data_location=/dataset"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/accuracy.py --batch_size=240 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_fp32_pretrained_model.pb --num_intra_threads=28 --data_location=/dataset", + "cpuset": "0-111"}, { "_comment": "inceptionv4_fp32_batch_inf", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --precision=fp32 --mode=inference --model-name=inceptionv4 --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=240 --socket-id 0 --benchmark-only --in-graph=/in_graph/inceptionv4_fp32_pretrained_model.pb --data-location=/dataset --steps=200", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/benchmark.py --batch_size=240 --num_intra_threads=28 --num_inter_threads=2 --input_graph=/in_graph/inceptionv4_fp32_pretrained_model.pb --warmup_steps=10 --steps=200"} + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/benchmark.py --batch_size=240 --num_intra_threads=28 
--num_inter_threads=2 --input_graph=/in_graph/inceptionv4_fp32_pretrained_model.pb --warmup_steps=10 --steps=200", + "cpuset": "0-111"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_mask_rcnn_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_mask_rcnn_args.json index 7eb68e8a6..8b78b7fb4 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_mask_rcnn_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_mask_rcnn_args.json @@ -1,9 +1,21 @@ [ { "_comment": "FP32 benchmark", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=maskrcnn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/coco.py evaluate --dataset=/dataset --num_inter_threads 1 --num_intra_threads 28 --nw 5 --nb 50 --model=coco --infbs 1"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/coco.py evaluate --dataset=/dataset --num_inter_threads 1 --num_intra_threads 28 --nw 5 --nb 50 --model=coco --infbs 1", + "cpuset": "0-111"}, { "_comment": "FP32 benchmark with --num-inter-threads 4 --num-intra-threads 16", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=maskrcnn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --data-location=/dataset --num-inter-threads 4 --num-intra-threads 16", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/coco.py evaluate --dataset=/dataset --num_inter_threads 4 --num_intra_threads 16 --nw 5 --nb 50 --model=coco --infbs 1"} + "output": "numactl --cpunodebind=0 
--membind=0 python /workspace/intelai_models/inference/fp32/coco.py evaluate --dataset=/dataset --num_inter_threads 4 --num_intra_threads 16 --nw 5 --nb 50 --model=coco --infbs 1", + "cpuset": "0-111"}, + + { "_comment": "FP32 benchmark with cpuset 1 socket", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=maskrcnn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --batch-size=1 --benchmark-only --verbose --data-location=/dataset", + "output": "python /workspace/intelai_models/inference/fp32/coco.py evaluate --dataset=/dataset --num_inter_threads 1 --num_intra_threads 16 --nw 5 --nb 50 --model=coco --infbs 1", + "cpuset": "0-15"}, + + { "_comment": "FP32 benchmark with cpuset 2 socket", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=maskrcnn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --batch-size=1 --benchmark-only --verbose --data-location=/dataset", + "output": "python /workspace/intelai_models/inference/fp32/coco.py evaluate --dataset=/dataset --num_inter_threads 2 --num_intra_threads 16 --nw 5 --nb 50 --model=coco --infbs 1", + "cpuset": "0-7,28-35"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_minigo_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_minigo_args.json index 23aec10ec..194726bcd 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_minigo_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_minigo_args.json @@ -2,16 +2,19 @@ { "_comment": "minigo_fp32_training", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=reinforcement --model-name=minigo --precision=fp32 --mode=training --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --steps=30 --quantization=True", - "output": "./run.sh True" + "output": "./run.sh True", + "cpuset": "0-111" 
}, { "_comment": "minigo_fp32_training_multi_node", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=reinforcement --model-name=minigo --precision=fp32 --mode=training --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --steps=30 --quantization=True --num-train-nodes=2 --multi-node=True", - "output": "./run_mn.sh 2 True" + "output": "./run_mn.sh 2 True", + "cpuset": "0-111" }, { "_comment": "minigo_fp32_training_multi_node_large_scale", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=reinforcement --model-name=minigo --precision=fp32 --mode=training --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --steps=30 --quantization=True --num-train-nodes=2 --num-eval-nodes=1 --large-scale=True --multi-node=True", - "output": "./run_mn.sh 2 1 True" + "output": "./run_mn.sh 2 1 True", + "cpuset": "0-111" } ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_mobilenet_v1_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_mobilenet_v1_args.json index 858d92e9a..b8a99e8c4 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_mobilenet_v1_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_mobilenet_v1_args.json @@ -1,43 +1,58 @@ [ { "_comment": "mobilenet_v1_fp32_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=-1 --accuracy-only --verbose --checkpoint=/checkpoints --in-graph=/in_graph/mobilenet_v1_1.0_224_frozen.pb --data-location=/dataset", - "output": "python /workspace/intelai_models/inference/accuracy.py --precision=fp32 --batch_size=100 --data_location=/dataset --num_intra_threads=56 --num_inter_threads=2 --input_graph=/in_graph/mobilenet_v1_1.0_224_frozen.pb --input_height=224 --input_width=224 --input_layer=input 
--output_layer=MobilenetV1/Predictions/Reshape_1"}, + "output": "python /workspace/intelai_models/inference/accuracy.py --precision=fp32 --batch_size=100 --data_location=/dataset --num_intra_threads=56 --num_inter_threads=2 --input_graph=/in_graph/mobilenet_v1_1.0_224_frozen.pb --input_height=224 --input_width=224 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1", + "cpuset": "0-111"}, { "_comment": "mobilenet_v1_fp32_latency", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id 0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset", - "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/benchmark.py --precision=fp32 --batch_size=100 --num_intra_threads=28 --num_inter_threads=2 --input_height=224 --input_width=224 --warmup_steps=10 --steps=50 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1"}, + "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/benchmark.py --precision=fp32 --batch_size=100 --num_intra_threads=28 --num_inter_threads=2 --input_height=224 --input_width=224 --warmup_steps=10 --steps=50 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1", + "cpuset": "0-111"}, { "_comment": "mobilenet_v1_fp32_throughput", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id 0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset", - "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/benchmark.py --precision=fp32 --batch_size=100 --num_intra_threads=28 
--num_inter_threads=2 --input_height=224 --input_width=224 --warmup_steps=10 --steps=50 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1"}, + "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/benchmark.py --precision=fp32 --batch_size=100 --num_intra_threads=28 --num_inter_threads=2 --input_height=224 --input_width=224 --warmup_steps=10 --steps=50 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1", + "cpuset": "0-111"}, { "_comment": "mobilenet_v1_fp32_dummy_data_output-dir", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --checkpoint=/checkpoints", - "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/benchmark.py --precision=fp32 --batch_size=100 --num_intra_threads=28 --num_inter_threads=2 --input_height=224 --input_width=224 --warmup_steps=10 --steps=50 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1"}, + "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/benchmark.py --precision=fp32 --batch_size=100 --num_intra_threads=28 --num_inter_threads=2 --input_height=224 --input_width=224 --warmup_steps=10 --steps=50 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1", + "cpuset": "0-111"}, { "_comment": "mobilenet_v1_int8_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs 
--accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --data-location=/dataset --input_height=224 --input_width=224", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/accuracy.py --input_height=224 --input_width=224 --num_intra_threads=28 --output_layer=MobilenetV1/Predictions/Reshape_1 --num_inter_threads=1 --batch_size=100 --input_graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --data_location=/dataset --input_layer=input"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/accuracy.py --input_height=224 --input_width=224 --num_intra_threads=28 --output_layer=MobilenetV1/Predictions/Reshape_1 --num_inter_threads=1 --batch_size=100 --input_graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --data_location=/dataset --input_layer=input", + "cpuset": "0-111"}, { "_comment": "mobilenet_v1_int8_latency", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --input_height=224 --input_width=224 --warmup_steps=10 --steps=50", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/benchmark.py --input_height=224 --input_width=224 --warmup_steps=10 --num_intra_threads=28 --output_layer=MobilenetV1/Predictions/Reshape_1 --num_inter_threads=1 --batch_size=1 --input_graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb 
--input_layer=input --steps=50"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/benchmark.py --input_height=224 --input_width=224 --warmup_steps=10 --num_intra_threads=28 --output_layer=MobilenetV1/Predictions/Reshape_1 --num_inter_threads=1 --batch_size=1 --input_graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --input_layer=input --steps=50", + "cpuset": "0-111"}, { "_comment": "mobilenet_v1_int8_throughput", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=240 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --input_height=224 --input_width=224 --warmup_steps=10 --steps=50", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/benchmark.py --input_height=224 --input_width=224 --warmup_steps=10 --num_intra_threads=28 --output_layer=MobilenetV1/Predictions/Reshape_1 --num_inter_threads=1 --batch_size=240 --input_graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --input_layer=input --steps=50"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/benchmark.py --input_height=224 --input_width=224 --warmup_steps=10 --num_intra_threads=28 --output_layer=MobilenetV1/Predictions/Reshape_1 --num_inter_threads=1 --batch_size=240 --input_graph=/in_graph/models_mobilenetv1_int8_pretrained_model.pb --input_layer=input --steps=50", + "cpuset": "0-111"}, { "_comment": "mobilenet_v1_bfloat16_latency", "input": "run_tf_benchmark.py 
--framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=bfloat16 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --batch-size=1 --socket-id=0 --data-location=/dataset/ImageNet_Validation --in-graph=/workspace/mobilenetv1.pb --input_height=200 --input_width=300 --steps=500 --warmup_steps=100", - "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/benchmark.py --precision=bfloat16 --batch_size=1 --num_intra_threads=28 --num_inter_threads=2 --input_graph=/workspace/mobilenetv1.pb --input_height=200 --input_width=300 --warmup_steps=100 --steps=500 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1"}, + "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/benchmark.py --precision=bfloat16 --batch_size=1 --num_intra_threads=28 --num_inter_threads=2 --input_graph=/workspace/mobilenetv1.pb --input_height=200 --input_width=300 --warmup_steps=100 --steps=500 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1", + "cpuset": "0-111"}, { "_comment": "mobilenet_v1_bfloat16_batch", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=bfloat16 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --batch-size 100 --socket-id=0 --data-location=/dataset/ImageNet_Validation --in-graph=/workspace/mobilenetv1.pb --input_height=150 --input_width=170 --steps=100 --warmup_steps=10", - "output": "numactl --cpunodebind=0 -l python /workspace/intelai_models/inference/benchmark.py --precision=bfloat16 --batch_size=100 --num_intra_threads=28 --num_inter_threads=2 --input_graph=/workspace/mobilenetv1.pb --input_height=150 --input_width=170 --warmup_steps=10 --steps=100 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1"}, + "output": "numactl --cpunodebind=0 -l python 
/workspace/intelai_models/inference/benchmark.py --precision=bfloat16 --batch_size=100 --num_intra_threads=28 --num_inter_threads=2 --input_graph=/workspace/mobilenetv1.pb --input_height=150 --input_width=170 --warmup_steps=10 --steps=100 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1", + "cpuset": "0-111"}, { "_comment": "mobilenet_v1_bfloat16_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=bfloat16 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --batch-size 150 --data-location=/dataset --accuracy-only --in-graph=/workspace/mobilenetv1.pb", - "output": "python /workspace/intelai_models/inference/accuracy.py --precision=bfloat16 --batch_size=150 --data_location=/dataset --num_intra_threads=56 --num_inter_threads=2 --input_graph=/workspace/mobilenetv1.pb --input_height=224 --input_width=224 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1"} + "output": "python /workspace/intelai_models/inference/accuracy.py --precision=bfloat16 --batch_size=150 --data_location=/dataset --num_intra_threads=56 --num_inter_threads=2 --input_graph=/workspace/mobilenetv1.pb --input_height=224 --input_width=224 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1", + "cpuset": "0-111"}, + + { "_comment": "mobilenet_v1_bfloat16_batch_cpuset", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=mobilenet_v1 --precision=bfloat16 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --batch-size 100 --data-location=/dataset/ImageNet_Validation --in-graph=/workspace/mobilenetv1.pb --input_height=150 --input_width=170 --steps=100 --warmup_steps=10", + "output": "python /workspace/intelai_models/inference/benchmark.py --precision=bfloat16 --batch_size=100 --num_intra_threads=16 --num_inter_threads=2 
--input_graph=/workspace/mobilenetv1.pb --input_height=150 --input_width=170 --warmup_steps=10 --steps=100 --input_layer=input --output_layer=MobilenetV1/Predictions/Reshape_1", + "cpuset": "0-7,28-35"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_ncf_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_ncf_args.json index 4cbeed73a..815ab35ac 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_ncf_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_ncf_args.json @@ -1,21 +1,26 @@ [ { "_comment": "FP32 latency benchmark", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=ncf --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/ncf_main.py --data_dir=/dataset --model_dir=/checkpoints --intra_op_parallelism_threads=28 --inter_op_parallelism_threads=1 --batch_size=1 --inference_only --benchmark_only"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/ncf_main.py --data_dir=/dataset --model_dir=/checkpoints --intra_op_parallelism_threads=28 --inter_op_parallelism_threads=1 --batch_size=1 --inference_only --benchmark_only", + "cpuset": "0-111"}, { "_comment": "Fp32 accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=ncf --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=256 --socket-id 0 --accuracy-only --verbose --checkpoint=/checkpoints --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/ncf_main.py --data_dir=/dataset --model_dir=/checkpoints 
--intra_op_parallelism_threads=28 --inter_op_parallelism_threads=1 --batch_size=256 --inference_only --accuracy_only"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/ncf_main.py --data_dir=/dataset --model_dir=/checkpoints --intra_op_parallelism_threads=28 --inter_op_parallelism_threads=1 --batch_size=256 --inference_only --accuracy_only", + "cpuset": "0-111"}, { "_comment": "FP32 Throughput benchmark", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=ncf --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=256 --socket-id 0 --benchmark-only --verbose --checkpoint=/checkpoints --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/ncf_main.py --data_dir=/dataset --model_dir=/checkpoints --intra_op_parallelism_threads=28 --inter_op_parallelism_threads=1 --batch_size=256 --inference_only --benchmark_only"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/ncf_main.py --data_dir=/dataset --model_dir=/checkpoints --intra_op_parallelism_threads=28 --inter_op_parallelism_threads=1 --batch_size=256 --inference_only --benchmark_only", + "cpuset": "0-111"}, { "_comment": "NCF FP32 Training", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=ncf --precision=fp32 --mode=training --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=98304 --num-inter-thread=2 --dataset=ml-20m --clean=1 --te=12", - "output": "python /workspace/intelai_models/training/ncf_estimator_main.py -dd=None -md=None -bs=98304 -hk=examplespersecondhook --dataset=ml-20m --layers=256,256,128,64 --num_factors=64 --eval_batch_size 160000 --learning_rate 0.003821 --beta1 0.783529 --beta2 0.909003 --epsilon 1.45439e-07 
--hr_threshold 0.635 --ml_perf --clean=1 --te=12"}, + "output": "python /workspace/intelai_models/training/ncf_estimator_main.py -dd=None -md=None -bs=98304 -hk=examplespersecondhook --dataset=ml-20m --layers=256,256,128,64 --num_factors=64 --eval_batch_size 160000 --learning_rate 0.003821 --beta1 0.783529 --beta2 0.909003 --epsilon 1.45439e-07 --hr_threshold 0.635 --ml_perf --clean=1 --te=12", + "cpuset": "0-111"}, { "_comment": "NCF BFloat16 Training", - "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=ncf --precision=bfloat16 --mode=training --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=98304 --num-inter-thread=2 --dataset=ml-20m --clean=1 --te=12", - "output": "python /workspace/intelai_models/training/ncf_estimator_main.py --use_bfloat16 -dd=None -md=None -bs=98304 -hk=examplespersecondhook --dataset=ml-20m --layers=256,256,128,64 --num_factors=64 --eval_batch_size 160000 --learning_rate 0.003821 --beta1 0.783529 --beta2 0.909003 --epsilon 1.45439e-07 --hr_threshold 0.635 --ml_perf --clean=1 --te=12"} + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=ncf --precision=bfloat16 --mode=training --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=98304 --num-inter-thread=2 --dataset=ml-20m --clean=1 --te=12", + "output": "python /workspace/intelai_models/training/ncf_estimator_main.py --use_bfloat16 -dd=None -md=None -bs=98304 -hk=examplespersecondhook --dataset=ml-20m --layers=256,256,128,64 --num_factors=64 --eval_batch_size 160000 --learning_rate 0.003821 --beta1 0.783529 --beta2 0.909003 --epsilon 1.45439e-07 --hr_threshold 0.635 --ml_perf --clean=1 --te=12", + "cpuset": "0-111"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_resnet101_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_resnet101_args.json index 6e327c8d8..4f1eded04 100644 
--- a/tests/unit/common/tensorflow/tf_model_args/tf_resnet101_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_resnet101_args.json @@ -1,21 +1,26 @@ [ { "_comment": "resnet101_fp32_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet101 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --accuracy-only --verbose --in-graph=/in_graph/resnet101_fp32_model.pb --data-location=/dataset", - "output": "python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=128 --num-inter-threads=2 --input-graph=/in_graph/resnet101_fp32_model.pb --num-intra-threads=56 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only"}, + "output": "python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=128 --num-inter-threads=2 --input-graph=/in_graph/resnet101_fp32_model.pb --num-intra-threads=56 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only", + "cpuset": "0-111"}, { "_comment": "resnet101_fp32_throughput", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet101 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id 0 --benchmark-only --verbose --in-graph=/in_graph/resnet101_fp32_model.pb", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=128 --num-inter-threads=1 --input-graph=/in_graph/resnet101_fp32_model.pb --num-intra-threads=28 --warmup-steps=10 --steps=50"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=128 --num-inter-threads=1 --input-graph=/in_graph/resnet101_fp32_model.pb 
--num-intra-threads=28 --warmup-steps=10 --steps=50", + "cpuset": "0-111"}, { "_comment": "resnet101_int8_latency", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet101 --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --in-graph=/in_graph/resnet101_int8_model.pb", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=1 --num-intra-threads=28 --num-inter-threads=1 --input-graph=/in_graph/resnet101_int8_model.pb --warmup-steps=40 --steps=100"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=1 --num-intra-threads=28 --num-inter-threads=1 --input-graph=/in_graph/resnet101_int8_model.pb --warmup-steps=40 --steps=100", + "cpuset": "0-111"}, { "_comment": "resnet101_int8_inference calibration", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet101 --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --accuracy-only --calibration-only --in-graph=/in_graph/resnet101_int8_model.pb", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/calibration.py --batch_size=1 --num_intra_threads=28 --num_inter_threads=1 --input_graph=/in_graph/resnet101_int8_model.pb"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/calibration.py --batch_size=1 --num_intra_threads=28 --num_inter_threads=1 
--input_graph=/in_graph/resnet101_int8_model.pb", + "cpuset": "0-111"}, { "_comment": "resnet101_fp32_latency", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet101 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id 0 --benchmark-only --verbose --in-graph=/in_graph/resnet101_fp32_model.pb", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=1 --num-inter-threads=1 --input-graph=/in_graph/resnet101_fp32_model.pb --num-intra-threads=28 --warmup-steps=10 --steps=50"} + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --batch-size=1 --num-inter-threads=1 --input-graph=/in_graph/resnet101_fp32_model.pb --num-intra-threads=28 --warmup-steps=10 --steps=50", + "cpuset": "0-111"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_resnet50_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_resnet50_args.json index ec0916102..72c734308 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_resnet50_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_resnet50_args.json @@ -1,40 +1,48 @@ [ { "_comment": "resnet50_fp32_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --batch-size 100 --socket-id 0 --accuracy-only --verbose --in-graph=/in_graph/freezed_resnet50.pb --accuracy-only --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/freezed_resnet50.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=100 
--data-location=/dataset --accuracy-only --num-cores=28 --warmup-steps=10 --steps=50"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/freezed_resnet50.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=100 --data-location=/dataset --accuracy-only --num-cores=28 --warmup-steps=10 --steps=50", + "cpuset": "0-111"}, { "_comment": "resnet50_fp32_latency_default_inter_intra", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50 --batch-size 128 --in-graph /freezed_resnet50.pb --intelai-models . --socket-id 0 --verbose", - "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28"}, + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28", + "cpuset": "0-111"}, { "_comment": "resnet50_fp32_latency_inter_intra", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50 --batch-size 1 --in-graph /freezed_resnet50.pb --intelai-models . 
--socket-id 0 --verbose --num-inter-threads 4 --num-intra-threads 16", - "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50.pb --num-inter-threads=4 --num-intra-threads=16 --batch-size=1 --warmup-steps=10 --steps=50 --num-cores=28"}, + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50.pb --num-inter-threads=4 --num-intra-threads=16 --batch-size=1 --warmup-steps=10 --steps=50 --num-cores=28", + "cpuset": "0-111"}, { "_comment": "resnet50_fp32_throughput_inter_intra", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50 --batch-size 128 --in-graph /freezed_resnet50.pb --intelai-models . --socket-id 0 --verbose", - "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28"}, + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28", + "cpuset": "0-111"}, { "_comment": "resnet50_int8_throughput_output-dir", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50 --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/resnet50_int8_pretrained_model.pb --steps=200 --warmup-steps=20", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 
python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=20 --steps=200"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=20 --steps=200", + "cpuset": "0-111"}, { "_comment": "resnet50_int8_data_calibration", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=-1 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/resnet50_int8_pretrained_model.pb --data-location=/dataset --calibration-only", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/int8/generate_calibration_data.py --num_intra_threads=56 --num_inter_threads=2 --batch_size=100 --input_graph=/in_graph/resnet50_int8_pretrained_model.pb --data_location=/dataset"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/int8/generate_calibration_data.py --num_intra_threads=56 --num_inter_threads=2 --batch_size=100 --input_graph=/in_graph/resnet50_int8_pretrained_model.pb --data_location=/dataset", + "cpuset": "0-111"}, { "_comment": "resnet50_fp32_throughput_output-results", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 
--output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --output-results --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/resnet50_fp32_pretrained_model.pb --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --results-file-path /workspace/benchmarks/common/tensorflow/logs/resnet50_fp32_inference_results*.txt"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --results-file-path /workspace/benchmarks/common/tensorflow/logs/resnet50_fp32_inference_results*.txt", + "cpuset": "0-111"}, { "_comment": "resnet50_int8_accuracy", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name resnet50 --batch-size 100 --data-location /dataset --in-graph /final_int8_resnet50.pb --intelai-models . 
--accuracy-only --verbose", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python ./inference/eval_image_classifier_inference.py --input-graph=/final_int8_resnet50.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python ./inference/eval_image_classifier_inference.py --input-graph=/final_int8_resnet50.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only", + "cpuset": "0-111"}, { "_comment": "resnet50_int8_throughput_steps_warmup-steps", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50 --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/resnet50_int8_pretrained_model.pb --steps=200 --warmup-steps=20", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=20 --steps=200" - } + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=20 --steps=200", + "cpuset": "0-111"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_resnet50v1_5_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_resnet50v1_5_args.json index 885eda88f..d0ab8877a 100644 
--- a/tests/unit/common/tensorflow/tf_model_args/tf_resnet50v1_5_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_resnet50v1_5_args.json @@ -1,75 +1,98 @@ [ { "_comment": "resnet50v1_5_fp32_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --batch-size 100 --socket-id 0 --accuracy-only --verbose --in-graph=/in_graph/freezed_resnet50v1_5.pb --accuracy-only --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/freezed_resnet50v1_5.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=100 --data-location=/dataset --accuracy-only --num-cores=28 --warmup-steps=10 --steps=50"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/freezed_resnet50v1_5.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=100 --data-location=/dataset --accuracy-only --num-cores=28 --warmup-steps=10 --steps=50", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_fp32_latency_default_inter_intra", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50v1_5 --batch-size 128 --in-graph /freezed_resnet50v1_5.pb --intelai-models . 
--socket-id 0 --verbose", - "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50v1_5.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28"}, + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50v1_5.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_fp32_latency_inter_intra", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50v1_5 --batch-size 1 --in-graph /freezed_resnet50v1_5.pb --intelai-models . --socket-id 0 --verbose --num-inter-threads 4 --num-intra-threads 16", - "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50v1_5.pb --num-inter-threads=4 --num-intra-threads=16 --batch-size=1 --warmup-steps=10 --steps=50 --num-cores=28"}, + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50v1_5.pb --num-inter-threads=4 --num-intra-threads=16 --batch-size=1 --warmup-steps=10 --steps=50 --num-cores=28", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_fp32_throughput_inter_intra", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision fp32 --mode inference --model-name resnet50v1_5 --batch-size 128 --in-graph /freezed_resnet50v1_5.pb --intelai-models . 
--socket-id 0 --verbose", - "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50v1_5.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28"}, + "output": "numactl --cpunodebind=0 --membind=0 python ./inference/eval_image_classifier_inference.py --input-graph=/freezed_resnet50v1_5.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=10 --steps=50 --num-cores=28", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_int8_throughput_output-dir", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb --steps=200 --warmup-steps=20", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=20 --steps=200"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=20 --steps=200", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_int8_data_calibration", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks 
--intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=-1 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb --data-location=/dataset --calibration-only", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/int8/generate_calibration_data.py --num_intra_threads=56 --num_inter_threads=2 --batch_size=100 --input_graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb --data_location=/dataset"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/int8/generate_calibration_data.py --num_intra_threads=56 --num_inter_threads=2 --batch_size=100 --input_graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb --data_location=/dataset", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_fp32_throughput_output-results", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=100 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --output-results --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/resnet50v1_5_fp32_pretrained_model.pb --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50v1_5_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --results-file-path /workspace/benchmarks/common/tensorflow/logs/resnet50v1_5_fp32_inference_results*.txt"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py 
--input-graph=/in_graph/resnet50v1_5_fp32_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --results-file-path /workspace/benchmarks/common/tensorflow/logs/resnet50v1_5_fp32_inference_results*.txt", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_int8_accuracy", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision int8 --mode inference --model-name resnet50v1_5 --batch-size 100 --data-location /dataset --in-graph /final_int8_resnet50v1_5.pb --intelai-models . --accuracy-only --verbose", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python ./inference/eval_image_classifier_inference.py --input-graph=/final_int8_resnet50v1_5.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python ./inference/eval_image_classifier_inference.py --input-graph=/final_int8_resnet50v1_5.pb --num-inter-threads=2 --num-intra-threads=56 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_int8_throughput_steps_warmup-steps", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=128 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb --steps=200 --warmup-steps=20", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py 
--input-graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=20 --steps=200"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/resnet50v1_5_int8_pretrained_model.pb --num-inter-threads=1 --num-intra-threads=28 --batch-size=128 --warmup-steps=20 --steps=200", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_bfloat16_batch_inference", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision bfloat16 --mode inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --model-name resnet50v1_5 --batch-size=128 --data-location=/dataset --in-graph=resnet50v1_5.pb", - "output": "python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5.pb --num-inter-threads=2 --num-intra-threads=56 --num-cores=28 --batch-size=128 --warmup-steps=10 --steps=50 --data-location=/dataset"}, + "output": "python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5.pb --num-inter-threads=2 --num-intra-threads=56 --num-cores=28 --batch-size=128 --warmup-steps=10 --steps=50 --data-location=/dataset", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_bfloat16_online_inference", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision bfloat16 --mode inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --model-name resnet50v1_5 --batch-size=1 --data-location=/dataset --in-graph=resnet50v1_5.pb", - "output": "python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5.pb --num-inter-threads=2 --num-intra-threads=56 --num-cores=28 
--batch-size=1 --warmup-steps=10 --steps=50 --data-location=/dataset"}, + "output": "python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5.pb --num-inter-threads=2 --num-intra-threads=56 --num-cores=28 --batch-size=1 --warmup-steps=10 --steps=50 --data-location=/dataset", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_bfloat16_inference_accuracy", "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision bfloat16 --mode inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --model-name resnet50v1_5 --batch-size=100 --data-location=/dataset --in-graph=resnet50v1_5.pb --accuracy-only", - "output": "python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5.pb --num-inter-threads=2 --num-intra-threads=56 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only"}, + "output": "python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5.pb --num-inter-threads=2 --num-intra-threads=56 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-location=/dataset --accuracy-only", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_fp32_multi_instance_one_socket", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=fp32 --mode=inference --intelai-models=/workspace/intelai_models --batch-size 100 --socket-id 0 --numa-cores-per-instance socket --in-graph=/in_graph/freezed_resnet50v1_5.pb --benchmark-only", - "output": "python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/freezed_resnet50v1_5.pb --num-inter-threads=1 --num-intra-threads=28 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-num-inter-threads=1 --data-num-intra-threads=28"}, + 
"output": "python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/freezed_resnet50v1_5.pb --num-inter-threads=1 --num-intra-threads=28 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-num-inter-threads=1 --data-num-intra-threads=28", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_fp32_multi_instance_all_sockets", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=fp32 --mode=inference --intelai-models=/workspace/intelai_models --batch-size 100 --numa-cores-per-instance socket --in-graph=/in_graph/freezed_resnet50v1_5.pb --benchmark-only", - "output": "python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/freezed_resnet50v1_5.pb --num-inter-threads=1 --num-intra-threads=28 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-num-inter-threads=1 --data-num-intra-threads=28"}, + "output": "python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=/in_graph/freezed_resnet50v1_5.pb --num-inter-threads=1 --num-intra-threads=28 --num-cores=28 --batch-size=100 --warmup-steps=10 --steps=50 --data-num-inter-threads=1 --data-num-intra-threads=28", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_fp32_training", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=fp32 --mode=training --intelai-models=/workspace/intelai_models --batch-size=256 --checkpoint=/workspace/checkpoints --output-dir=/workspace/logs --data-location=/dataset --steps=100 --train_epochs=6 --epochs_between_evals=2", - "output": "python /workspace/intelai_models/training/mlperf_resnet/imagenet_main.py 2 --batch_size=256 --max_train_steps=100 --train_epochs=6 --epochs_between_evals=2 --inter_op_parallelism_threads 2 --intra_op_parallelism_threads 56 --version 1 --resnet_size 50 --data_format=channels_last 
--data_dir=/dataset --model_dir=/workspace/checkpoints"}, + "output": "python /workspace/intelai_models/training/mlperf_resnet/imagenet_main.py 2 --batch_size=256 --max_train_steps=100 --train_epochs=6 --epochs_between_evals=2 --inter_op_parallelism_threads 2 --intra_op_parallelism_threads 56 --version 1 --resnet_size 50 --data_format=channels_last --data_dir=/dataset --model_dir=/workspace/checkpoints", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_fp32_distributed_training", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=fp32 --mode=training --intelai-models=/workspace/intelai_models --batch-size=256 --checkpoint=/workspace/checkpoints --output-dir=/workspace/logs --mpi_num_processes=2 --mpi_num_processes_per_socket=1 --data-location=/dataset", - "output": "mpirun --allow-run-as-root -n 2 --map-by socket python /workspace/intelai_models/training/mlperf_resnet/imagenet_main.py 2 --batch_size=256 --max_train_steps=112590 --train_epochs=72 --epochs_between_evals=1 --inter_op_parallelism_threads 1 --intra_op_parallelism_threads 26 --version 1 --resnet_size 50 --data_format=channels_last --data_dir=/dataset --model_dir=/workspace/checkpoints"}, + "output": "mpirun --allow-run-as-root -n 2 --map-by socket python /workspace/intelai_models/training/mlperf_resnet/imagenet_main.py 2 --batch_size=256 --max_train_steps=112590 --train_epochs=72 --epochs_between_evals=1 --inter_op_parallelism_threads 1 --intra_op_parallelism_threads 26 --version 1 --resnet_size 50 --data_format=channels_last --data_dir=/dataset --model_dir=/workspace/checkpoints", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_bfloat16_training", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=bfloat16 --mode=training --intelai-models=/workspace/intelai_models --checkpoint=/workspace/checkpoints --output-dir=/workspace/logs --steps=300 --train_epochs=10 
--epochs_between_evals=2", - "output": "python /workspace/intelai_models/training/mlperf_resnet/imagenet_main.py 2 --batch_size=64 --max_train_steps=300 --train_epochs=10 --epochs_between_evals=2 --inter_op_parallelism_threads 2 --intra_op_parallelism_threads 56 --version 1 --resnet_size 50 --data_format=channels_last --model_dir=/workspace/checkpoints --use_bfloat16"}, + "output": "python /workspace/intelai_models/training/mlperf_resnet/imagenet_main.py 2 --batch_size=64 --max_train_steps=300 --train_epochs=10 --epochs_between_evals=2 --inter_op_parallelism_threads 2 --intra_op_parallelism_threads 56 --version 1 --resnet_size 50 --data_format=channels_last --model_dir=/workspace/checkpoints --use_bfloat16", + "cpuset": "0-111"}, { "_comment": "resnet50v1_5_bfloat16_distributed_training", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_recognition --model-name=resnet50v1_5 --precision=bfloat16 --mode=training --intelai-models=/workspace/intelai_models --checkpoint=/workspace/checkpoints --mpi_num_processes=4 --mpi_num_processes_per_socket=2 --output-dir=/workspace/logs", - "output": "mpirun --allow-run-as-root -n 4 --map-by ppr:2:socket:pe=14 --cpus-per-proc 14 python /workspace/intelai_models/training/mlperf_resnet/imagenet_main.py 2 --batch_size=64 --max_train_steps=112590 --train_epochs=72 --epochs_between_evals=1 --inter_op_parallelism_threads 1 --intra_op_parallelism_threads 26 --version 1 --resnet_size 50 --data_format=channels_last --model_dir=/workspace/checkpoints --use_bfloat16"} + "output": "mpirun --allow-run-as-root -n 4 --map-by ppr:2:socket:pe=14 --cpus-per-proc 14 python /workspace/intelai_models/training/mlperf_resnet/imagenet_main.py 2 --batch_size=64 --max_train_steps=112590 --train_epochs=72 --epochs_between_evals=1 --inter_op_parallelism_threads 1 --intra_op_parallelism_threads 26 --version 1 --resnet_size 50 --data_format=channels_last --model_dir=/workspace/checkpoints --use_bfloat16", + "cpuset": "0-111"}, + + { 
"_comment": "resnet50v1_5_bfloat16_online_inference_cpuset", + "input": "run_tf_benchmark.py --framework tensorflow --use-case image_recognition --precision bfloat16 --mode inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --model-name resnet50v1_5 --batch-size=1 --data-location=/dataset --in-graph=resnet50v1_5.pb", + "output": "python /workspace/intelai_models/inference/eval_image_classifier_inference.py --input-graph=resnet50v1_5.pb --num-inter-threads=1 --num-intra-threads=16 --num-cores=28 --batch-size=1 --warmup-steps=10 --steps=50 --data-location=/dataset", + "cpuset": "2-17"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_rfcn_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_rfcn_args.json index 8301a4db9..ec759e926 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_rfcn_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_rfcn_args.json @@ -1,25 +1,31 @@ [ { "_comment": "rfcn_fp32_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=rfcn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --verbose --in-graph=/in_graph/frozen_inference_graph.pb --data-location=/dataset --accuracy-only --split=accuracy_message", - "output": "FROZEN_GRAPH=/in_graph/frozen_inference_graph.pb TF_RECORD_FILES=/dataset/data.record SPLIT=accuracy_message TF_MODELS_ROOT=/workspace/models /workspace/intelai_models/inference/fp32/coco_mAP.sh"}, + "output": "FROZEN_GRAPH=/in_graph/frozen_inference_graph.pb TF_RECORD_FILES=/dataset/data.record SPLIT=accuracy_message TF_MODELS_ROOT=/workspace/models /workspace/intelai_models/inference/fp32/coco_mAP.sh", + "cpuset": "0-111"}, { "_comment": "rfcn_fp32", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=rfcn --precision=fp32 
--mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id 0 --verbose --in-graph=/in_graph/rfcn_resnet101_fp32_coco_pretrained_model.pb --data-location=/dataset --benchmark-only --number_of_steps=500", - "output": "numactl -N 0 -m 0 python /workspace/intelai_models/inference/fp32/run_rfcn_inference.py -m /workspace/models -g /in_graph/rfcn_resnet101_fp32_coco_pretrained_model.pb -x 500 -d /dataset --num-inter-threads 1 --num-intra-threads 28"}, + "output": "numactl -N 0 -m 0 python /workspace/intelai_models/inference/fp32/run_rfcn_inference.py -m /workspace/models -g /in_graph/rfcn_resnet101_fp32_coco_pretrained_model.pb -x 500 -d /dataset --num-inter-threads 1 --num-intra-threads 28", + "cpuset": "0-111"}, { "_comment": "rfcn_fp32_with_optional_args", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=rfcn --precision=fp32 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --batch-size=-1 --socket-id=1 --verbose --in-graph=/in_graph/rfcn_resnet101_fp32_coco_pretrained_model.pb --data-location=/dataset --benchmark-only --num-cores=8 --visualize", - "output": "numactl -C +0,1,2,4 -N 1 -m 1 python /workspace/intelai_models/inference/fp32/run_rfcn_inference.py -m /workspace/models -g /in_graph/rfcn_resnet101_fp32_coco_pretrained_model.pb --num-intra-threads 8 --num-inter-threads 1 - v -d /dataset"}, + "output": "numactl -C +0,1,2,4 -N 1 -m 1 python /workspace/intelai_models/inference/fp32/run_rfcn_inference.py -m /workspace/models -g /in_graph/rfcn_resnet101_fp32_coco_pretrained_model.pb --num-intra-threads 8 --num-inter-threads 1 - v -d /dataset", + "cpuset": "0-111"}, { "_comment": "rfcn_int8_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=rfcn --precision=int8 --mode=inference --model-source-dir=/workspace/models 
--intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --verbose --in-graph=/in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb --data-location=/dataset --accuracy-only --split=accuracy_message", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 FROZEN_GRAPH=/in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb TF_RECORD_FILE=/dataset SPLIT=accuracy_message TF_MODELS_ROOT=/workspace/models /workspace/intelai_models/inference/int8/coco_mAP.sh"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 FROZEN_GRAPH=/in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb TF_RECORD_FILE=/dataset SPLIT=accuracy_message TF_MODELS_ROOT=/workspace/models /workspace/intelai_models/inference/int8/coco_mAP.sh", + "cpuset": "0-111"}, { "_comment": "rfcn_int8", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=rfcn --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id 0 --verbose --in-graph=/in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb --data-location=/dataset --benchmark-only --number_of_steps=500", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl -N 0 -m 0 python /workspace/intelai_models/inference/int8/run_rfcn_inference.py -m /workspace/models -g /in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb -x 500 -d /dataset --num-inter-threads 1 --num-intra-threads 28"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl -N 0 -m 0 python /workspace/intelai_models/inference/int8/run_rfcn_inference.py -m /workspace/models -g /in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb -x 500 -d /dataset --num-inter-threads 1 --num-intra-threads 28", + "cpuset": "0-111"}, { "_comment": "rfcn_int8_with_optional_args", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=rfcn --precision=int8 --mode=inference --model-source-dir=/workspace/models 
--intelai-models=/workspace/intelai_models --batch-size=-1 --socket-id 1 --verbose --in-graph=/in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb --data-location=/dataset --benchmark-only --num-cores=8 --print_accuracy --evaluate_tensor=Tensor --visualize", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl -N 0 -m 0 python /workspace/intelai_models/inference/int8/run_rfcn_inference.py -m /workspace/models -g /in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb --num-intra-threads 28 --num-inter-threads 1 -x 500 -d /dataset LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl -C +0,1,2,4 -N 1 -m 1 python /workspace/intelai_models/inference/int8/run_rfcn_inference.py -m /workspace/models -g /in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb --num-intra-threads 8 --num-inter-threads 1 - v -d /dataset -e Tensor - p"} + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl -N 0 -m 0 python /workspace/intelai_models/inference/int8/run_rfcn_inference.py -m /workspace/models -g /in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb --num-intra-threads 28 --num-inter-threads 1 -x 500 -d /dataset LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl -C +0,1,2,4 -N 1 -m 1 python /workspace/intelai_models/inference/int8/run_rfcn_inference.py -m /workspace/models -g /in_graph/rfcn_resnet101_int8_coco_pretrained_model.pb --num-intra-threads 8 --num-inter-threads 1 - v -d /dataset -e Tensor - p", + "cpuset": "0-111"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_ssd_mobilenet_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_ssd_mobilenet_args.json index c4a2be048..b3659c5b3 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_ssd_mobilenet_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_ssd_mobilenet_args.json @@ -1,25 +1,41 @@ [ { "_comment": "ssd_mobilenet_fp32_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=fp32 --mode=inference 
--model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=0 --accuracy-only --verbose --in-graph=/in_graph/frozen_inference_graph.pb --benchmark-dir=/workspace/benchmarks --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/infer_detections.py -g /in_graph/frozen_inference_graph.pb -i 1000 -w 200 -a 28 -e 1 -d /dataset -r"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/infer_detections.py -g /in_graph/frozen_inference_graph.pb -i 1000 -w 200 -a 28 -e 1 -d /dataset -r", + "cpuset": "0-111"}, { "_comment": "ssd_mobilenet_fp32", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=-1 --socket-id=0 --benchmark-only --verbose --in-graph=/in_graph/frozen_inference_graph.pb --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/infer_detections.py -g /in_graph/frozen_inference_graph.pb -i 1000 -w 200 -a 28 -e 1 -d /dataset -b -1"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/infer_detections.py -g /in_graph/frozen_inference_graph.pb -i 1000 -w 200 -a 28 -e 1 -d /dataset -b -1", + "cpuset": "0-111"}, { "_comment": "ssd_mobilenet_int8_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models 
--in-graph=/in_graph/ssdmobilenet_int8_pretrained_model.pb --data-location=/dataset", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/infer_detections.py -g /in_graph/ssdmobilenet_int8_pretrained_model.pb -i 1000 -w 200 -a 28 -e 1 -d /dataset -r"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/infer_detections.py -g /in_graph/ssdmobilenet_int8_pretrained_model.pb -i 1000 -w 200 -a 28 -e 1 -d /dataset -r", + "cpuset": "0-111"}, { "_comment": "ssd_mobilenet_int8", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=int8 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --batch-size=1 --socket-id 0 --data-location=/dataset --verbose --in-graph=/in_graph/ssdmobilenet_int8_pretrained_model.pb --benchmark-only --in-graph=/in_graph/ssdmobilenet_int8_pretrained_model.pb", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/infer_detections.py -g /in_graph/ssdmobilenet_int8_pretrained_model.pb -i 1000 -w 200 -a 28 -e 1 -d /dataset -b 1"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/infer_detections.py -g /in_graph/ssdmobilenet_int8_pretrained_model.pb -i 1000 -w 200 -a 28 -e 1 -d /dataset -b 1", + "cpuset": "0-111"}, { "_comment": "ssd_mobilenet_bfloat16_inference", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=bfloat16 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --socket-id=0 --data-location=/dataset/coco_val.record --benchmark-only 
--in-graph=/in_graph/ssdmobilenet_fp32_pretrained_model_combinedNMS.pb", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/infer_detections.py -g /in_graph/ssdmobilenet_fp32_pretrained_model_combinedNMS.pb -i 1000 -w 200 -a 28 -e 1 -p bfloat16 -d /dataset/coco_val.record -b -1"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/infer_detections.py -g /in_graph/ssdmobilenet_fp32_pretrained_model_combinedNMS.pb -i 1000 -w 200 -a 28 -e 1 -p bfloat16 -d /dataset/coco_val.record -b -1", + "cpuset": "0-111"}, { "_comment": "ssd_mobilenet_bfloat16_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=bfloat16 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --socket-id=0 --data-location=/dataset/coco_val.record --accuracy-only --in-graph=/in_graph/ssdmobilenet_fp32_pretrained_model_combinedNMS.pb", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/infer_detections.py -g /in_graph/ssdmobilenet_fp32_pretrained_model_combinedNMS.pb -i 1000 -w 200 -a 28 -e 1 -p bfloat16 -d /dataset/coco_val.record -r"} + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/infer_detections.py -g /in_graph/ssdmobilenet_fp32_pretrained_model_combinedNMS.pb -i 1000 -w 200 -a 28 -e 1 -p bfloat16 -d /dataset/coco_val.record -r", + "cpuset": "0-111"}, + + { "_comment": "ssd_mobilenet_bfloat16_inference", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=bfloat16 --mode=inference --model-source-dir=/workspace/models --intelai-models=/workspace/intelai_models --data-location=/dataset/coco_val.record --benchmark-only --in-graph=/in_graph/ssdmobilenet_fp32_pretrained_model_combinedNMS.pb", + "output": "python 
/workspace/intelai_models/inference/infer_detections.py -g /in_graph/ssdmobilenet_fp32_pretrained_model_combinedNMS.pb -i 1000 -w 200 -a 16 -e 1 -p bfloat16 -d /dataset/coco_val.record -b -1", + "cpuset": "0-7,8-15"}, + + { "_comment": "ssd_mobilenet_fp32_cpuset", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-mobilenet --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --batch-size=-1 --benchmark-only --verbose --in-graph=/in_graph/frozen_inference_graph.pb --data-location=/dataset", + "output": "python /workspace/intelai_models/inference/infer_detections.py -g /in_graph/frozen_inference_graph.pb -i 1000 -w 200 -a 8 -e 2 -d /dataset -b -1", + "cpuset": "25-28,0-3"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_ssd_resnet34_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_ssd_resnet34_args.json index 7619c3650..aca0d5496 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_ssd_resnet34_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_ssd_resnet34_args.json @@ -1,41 +1,61 @@ [ { "_comment": "ssd_resnet34_bfloat16_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=bfloat16 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssd_resnet34_bs1.pb --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/bfloat16/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 28 --input-size 300 --warmup-steps 200 --steps 800 
--accuracy-only --data-location /dataset"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/bfloat16/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 28 --input-size 300 --warmup-steps 200 --steps 800 --accuracy-only --data-location /dataset", + "cpuset": "0-111"}, { "_comment": "ssd_resnet34_fp32_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssd_resnet34_bs1.pb --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 28 --input-size 300 --warmup-steps 200 --steps 800 --accuracy-only --data-location /dataset"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 28 --input-size 300 --warmup-steps 200 --steps 800 --accuracy-only --data-location /dataset", + "cpuset": "0-111"}, { "_comment": "ssd_resnet34_fp32", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose 
--model-source-dir=/workspace/models --in-graph=/in_graph/ssd_resnet34_bs1.pb --data-location=/dataset", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 28 --input-size 300 --warmup-steps 200 --steps 800"}, + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 28 --input-size 300 --warmup-steps 200 --steps 800", + "cpuset": "0-111"}, { "_comment": "ssd_resnet34_fp32_training", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=fp32 --mode=training --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=32 --socket-id=-1 --output-dir=/workspace/benchmarks/common/tensorflow/logs --num-train-steps=500 --benchmark-only --model-source-dir=/workspace/models --data-location=/dataset --num-inter-threads=1 --num-intra-threads=27 --disable-tcmalloc=True --mpi_num_processes=2 --mpi_num_processes_per_socket=1", - "output": "mpirun --allow-run-as-root -n 2 --map-by socket python /tmp/benchmark_ssd_resnet34/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_dir /dataset --batch_size 32 --num_inter_threads 1 --num_intra_threads 27 --model=ssd300 --data_name coco --mkl=True --device=cpu --data_format=NHWC --variable_update=horovod --horovod_device=cpu --kmp_affinity=granularity=fine,verbose,compact,1,0 --kmp_settings=1 --kmp_blocktime=1 --weight_decay 0.0005 --num_warmup_batches 0 --num_batches 500"}, + "output": "mpirun --allow-run-as-root -n 2 --map-by socket python /tmp/benchmark_ssd_resnet34/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_dir /dataset --batch_size 32 
--num_inter_threads 1 --num_intra_threads 27 --model=ssd300 --data_name coco --mkl=True --device=cpu --data_format=NHWC --variable_update=horovod --horovod_device=cpu --kmp_affinity=granularity=fine,verbose,compact,1,0 --kmp_settings=1 --kmp_blocktime=1 --weight_decay 0.0005 --num_warmup_batches 0 --num_batches 500", + "cpuset": "0-111"}, { "_comment": "ssd_resnet34_fp32_training_calc_threads", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=fp32 --mode=training --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=32 --socket-id=-1 --output-dir=/workspace/benchmarks/common/tensorflow/logs --num-train-steps=500 --benchmark-only --model-source-dir=/workspace/models --data-location=/dataset --disable-tcmalloc=True --mpi_num_processes=2 --mpi_num_processes_per_socket=1 --timeline=file", - "output": "mpirun --allow-run-as-root -n 2 --map-by socket python /tmp/benchmark_ssd_resnet34/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_dir /dataset --batch_size 32 --num_inter_threads 1 --num_intra_threads 54 --model=ssd300 --data_name coco --mkl=True --device=cpu --data_format=NHWC --variable_update=horovod --horovod_device=cpu --kmp_affinity=granularity=fine,verbose,compact,1,0 --kmp_settings=1 --kmp_blocktime=1 --use_chrome_trace_format=True --trace_file=file --weight_decay 0.0005 --num_warmup_batches 0 --num_batches 500"}, + "output": "mpirun --allow-run-as-root -n 2 --map-by socket python /tmp/benchmark_ssd_resnet34/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_dir /dataset --batch_size 32 --num_inter_threads 1 --num_intra_threads 54 --model=ssd300 --data_name coco --mkl=True --device=cpu --data_format=NHWC --variable_update=horovod --horovod_device=cpu --kmp_affinity=granularity=fine,verbose,compact,1,0 --kmp_settings=1 --kmp_blocktime=1 --use_chrome_trace_format=True --trace_file=file --weight_decay 0.0005 --num_warmup_batches 0 
--num_batches 500", + "cpuset": "0-111"}, { "_comment": "ssd_resnet34_int8_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssd_resnet34_bs1.pb --data-location=/dataset", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 28 --input-size 300 --warmup-steps 200 --steps 800 --accuracy-only --data-location /dataset"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 28 --input-size 300 --warmup-steps 200 --steps 800 --accuracy-only --data-location /dataset", + "cpuset": "0-111"}, { "_comment": "ssd_resnet34_int8", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=int8 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssd_resnet34_bs1.pb --data-location=/dataset", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/infer_detections.py --input-graph 
/in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 28 --input-size 300 --warmup-steps 200 --steps 800"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/int8/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 28 --input-size 300 --warmup-steps 200 --steps 800", + "cpuset": "0-111"}, { "_comment": "ssd_resnet34_bfloat16_training", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=bfloat16 --mode=training --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --data-location=/dataset --num-cores=52 --num-inter-threads=1 --num-intra-threads=52 --batch-size=100 --weight_decay=1e-4 --num-train-steps=100 --num_warmup_batches=20 --mpi_num_processes=1 --output-dir=/workspace/logs", - "output": "mpirun --allow-run-as-root -n 1 --map-by socket python /tmp/benchmark_ssd_resnet34/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_dir /dataset --batch_size 100 --num_inter_threads 1 --num_intra_threads 52 --model=ssd300 --data_name coco --mkl=True --device=cpu --data_format=NHWC --variable_update=horovod --horovod_device=cpu --kmp_affinity=granularity=fine,verbose,compact,1,0 --kmp_settings=1 --kmp_blocktime=1 --weight_decay 0.0001 --num_warmup_batches 20 --num_batches 100"}, + "output": "mpirun --allow-run-as-root -n 1 --map-by socket python /tmp/benchmark_ssd_resnet34/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_dir /dataset --batch_size 100 --num_inter_threads 1 --num_intra_threads 52 --model=ssd300 --data_name coco --mkl=True --device=cpu --data_format=NHWC --variable_update=horovod --horovod_device=cpu --kmp_affinity=granularity=fine,verbose,compact,1,0 --kmp_settings=1 --kmp_blocktime=1 --weight_decay 0.0001 --num_warmup_batches 20 
--num_batches 100", + "cpuset": "0-111"}, { "_comment": "ssd_resnet34_bfloat16_training_accuracy", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=bfloat16 --mode=training --accuracy-only --num-cores=52 --num-inter-threads=1 --num-intra-threads=52 --batch-size=100 --data-location=/dataset --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models", - "output": "python /tmp/benchmark_ssd_resnet34/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_dir /dataset --batch_size 100 --num_inter_threads 1 --num_intra_threads 52 --model=ssd300 --data_name coco --mkl=True --device=cpu --data_format=NHWC --variable_update=horovod --horovod_device=cpu --kmp_affinity=granularity=fine,verbose,compact,1,0 --kmp_settings=1 --kmp_blocktime=1 --train_dir=None --eval=true --num_eval_epochs=1 --print_training_accuracy=True"}, + "output": "python /tmp/benchmark_ssd_resnet34/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_dir /dataset --batch_size 100 --num_inter_threads 1 --num_intra_threads 52 --model=ssd300 --data_name coco --mkl=True --device=cpu --data_format=NHWC --variable_update=horovod --horovod_device=cpu --kmp_affinity=granularity=fine,verbose,compact,1,0 --kmp_settings=1 --kmp_blocktime=1 --train_dir=None --eval=true --num_eval_epochs=1 --print_training_accuracy=True", + "cpuset": "0-111"}, { "_comment": "ssd_resnet34_bfloat16_training_with_backbone_model", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=bfloat16 --mode=training --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --data-location=/dataset --num-cores=52 --num-inter-threads=1 --num-intra-threads=50 --batch-size=100 --mpi_num_processes=4 --mpi_num_processes_per_socket=1 --epochs=60 --checkpoint /checkpoints --backbone-model=/workspace/ssd-backbone --output-dir=/workspace/logs", - "output": "mpirun 
--allow-run-as-root -n 4 --map-by socket python /tmp/benchmark_ssd_resnet34/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_dir /dataset --batch_size 100 --num_inter_threads 1 --num_intra_threads 50 --model=ssd300 --data_name coco --mkl=True --device=cpu --data_format=NHWC --variable_update=horovod --horovod_device=cpu --kmp_affinity=granularity=fine,verbose,compact,1,0 --kmp_settings=1 --kmp_blocktime=1 --backbone_model_path=/workspace/ssd-backbone/model.ckpt-28152 --optimizer=momentum --weight_decay=0.0005 --momentum=0.9 --num_epochs=60 --num_warmup_batches=0 --train_dir=/checkpoints --save_model_steps=10000"} + "output": "mpirun --allow-run-as-root -n 4 --map-by socket python /tmp/benchmark_ssd_resnet34/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_dir /dataset --batch_size 100 --num_inter_threads 1 --num_intra_threads 50 --model=ssd300 --data_name coco --mkl=True --device=cpu --data_format=NHWC --variable_update=horovod --horovod_device=cpu --kmp_affinity=granularity=fine,verbose,compact,1,0 --kmp_settings=1 --kmp_blocktime=1 --backbone_model_path=/workspace/ssd-backbone/model.ckpt-28152 --optimizer=momentum --weight_decay=0.0005 --momentum=0.9 --num_epochs=60 --num_warmup_batches=0 --train_dir=/checkpoints --save_model_steps=10000", + "cpuset": "0-111"}, + + { "_comment": "ssd_resnet34_fp32_accuracy_cpuset", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=fp32 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --batch-size=1 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssd_resnet34_bs1.pb --data-location=/dataset", + "output": "python /workspace/intelai_models/inference/fp32/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 14 --input-size 
300 --warmup-steps 200 --steps 800 --accuracy-only --data-location /dataset", + "cpuset": "5-15,20-22"}, + + { "_comment": "ssd_resnet34_bfloat16_accuracy_cpuset", + "input": "run_tf_benchmark.py --framework=tensorflow --use-case=object_detection --model-name=ssd-resnet34 --precision=bfloat16 --mode=inference --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --batch-size=1 --output-dir=/workspace/benchmarks/common/tensorflow/logs --accuracy-only --verbose --model-source-dir=/workspace/models --in-graph=/in_graph/ssd_resnet34_bs1.pb --data-location=/dataset", + "output": "python /workspace/intelai_models/inference/bfloat16/infer_detections.py --input-graph /in_graph/ssd_resnet34_bs1.pb --batch-size 1 --inter-op-parallelism-threads 1 --intra-op-parallelism-threads 8 --input-size 300 --warmup-steps 200 --steps 800 --accuracy-only --data-location /dataset", + "cpuset": "28-35"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_transformer_lt_official_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_transformer_lt_official_args.json index f2e73edba..1cce64c70 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_transformer_lt_official_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_transformer_lt_official_args.json @@ -2,11 +2,13 @@ { "_comment": "Transformer LT official FP32 online inference", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_lt_official --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --batch-size=1 --socket-id=0 --benchmark-only --in-graph=fp32_graphdef.pb --data-location=/dataset --output-dir=/workspace/logs --file=newstest2014.en --file_out=out_translate.txt --reference=newstest2014.de --vocab_file=vocab.txt", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/infer_ab.py 
--param_set=big --in_graph=fp32_graphdef.pb --batch_size=1 --file=newstest2014.en --file_out=/workspace/logs/out_translate.txt --vocab_file=vocab.txt --num_inter=1 --num_intra=28" + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/infer_ab.py --param_set=big --in_graph=fp32_graphdef.pb --batch_size=1 --file=newstest2014.en --file_out=/workspace/logs/out_translate.txt --vocab_file=vocab.txt --num_inter=1 --num_intra=28", + "cpuset": "0-111" }, { "_comment": "Transformer LT official FP32 batch inference", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_lt_official --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --batch-size=64 --socket-id=0 --benchmark-only --in-graph=fp32_graphdef.pb --data-location=/dataset --output-dir=/workspace/logs --file=newstest2014.en --file_out=out_translate.txt --reference=newstest2014.de --vocab_file=vocab.txt", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/infer_ab.py --param_set=big --in_graph=fp32_graphdef.pb --batch_size=64 --file=newstest2014.en --file_out=/workspace/logs/out_translate.txt --vocab_file=vocab.txt --num_inter=1 --num_intra=28" + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/infer_ab.py --param_set=big --in_graph=fp32_graphdef.pb --batch_size=64 --file=newstest2014.en --file_out=/workspace/logs/out_translate.txt --vocab_file=vocab.txt --num_inter=1 --num_intra=28", + "cpuset": "0-111" } ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_transformer_mlperf_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_transformer_mlperf_args.json index 879f52eec..2ae9eb560 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_transformer_mlperf_args.json +++ 
b/tests/unit/common/tensorflow/tf_model_args/tf_transformer_mlperf_args.json @@ -2,21 +2,25 @@ { "_comment": "Transformer MLPerf FP32 inference", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_mlperf --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --batch-size=64 -i=0 --in-graph=graph.pb --data-location=/dataset --file=newstest2014.en --file_out=translate.txt --reference=newstest2014.de", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/transformer/translate.py --params=big --input_graph=graph.pb --batch_size=64 --test_mode=inference --warmup_steps=3 --steps=100 --vocab_file= --file=newstest2014.en --file_out=/models/benchmarks/common/tensorflow/logs/translate.txt --data_dir=/dataset --num_inter=None --num_intra=None" + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/fp32/transformer/translate.py --params=big --input_graph=graph.pb --batch_size=64 --test_mode=inference --warmup_steps=3 --steps=100 --vocab_file= --file=newstest2014.en --file_out=/models/benchmarks/common/tensorflow/logs/translate.txt --data_dir=/dataset --num_inter=None --num_intra=None", + "cpuset": "0-111" }, { "_comment": "Transformer MLPerf BFloat16 inference", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_mlperf --precision=bfloat16 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --batch-size=64 -i=0 --in-graph=graph.pb --data-location=/dataset --file=newstest2014.en --file_out=translate.txt --reference=newstest2014.de", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/bfloat16/transformer/translate.py --params=big --input_graph=graph.pb 
--batch_size=64 --test_mode=inference --warmup_steps=3 --steps=100 --vocab_file= --file=newstest2014.en --file_out=/models/benchmarks/common/tensorflow/logs/translate.txt --data_dir=/dataset --num_inter=None --num_intra=None" + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/inference/bfloat16/transformer/translate.py --params=big --input_graph=graph.pb --batch_size=64 --test_mode=inference --warmup_steps=3 --steps=100 --vocab_file= --file=newstest2014.en --file_out=/models/benchmarks/common/tensorflow/logs/translate.txt --data_dir=/dataset --num_inter=None --num_intra=None", + "cpuset": "0-111" }, { "_comment": "Transformer MLPerf FP32 training", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_mlperf --precision=fp32 --mode=training --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --socket-id 0 --data-location /dataset --output-dir=/workspace/logs --batch-size=5120 --random_seed=11 --train_steps=2 --steps_between_eval=1 --params=big --save_checkpoints=Yes --do_eval=Yes --print_iter=10", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/training/fp32/transformer/transformer_main.py --data_dir=/dataset --model_dir=/workspace/logs --batch_size=5120 --random_seed=11 --params=big --train_steps=2 --steps_between_eval=1 --do_eval=Yes --save_checkpoints=Yes --save_profile=No --print_iter=10 --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=28 --learning_rate=2 --static_batch=No" + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/training/fp32/transformer/transformer_main.py --data_dir=/dataset --model_dir=/workspace/logs --batch_size=5120 --random_seed=11 --params=big --train_steps=2 --steps_between_eval=1 --do_eval=Yes --save_checkpoints=Yes --save_profile=No --print_iter=10 --inter_op_parallelism_threads=1 
--intra_op_parallelism_threads=28 --learning_rate=2 --static_batch=No", + "cpuset": "0-111" }, { "_comment": "Transformer MLPerf BFloat16 training", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=language_translation --model-name=transformer_mlperf --precision=bfloat16 --mode=training --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --socket-id 0 --data-location /dataset --output-dir=/workspace/logs --batch-size=5120 --random_seed=11 --train_steps=2 --steps_between_eval=1 --params=big --save_checkpoints=Yes --do_eval=Yes --print_iter=10", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/training/bfloat16/transformer/transformer_main.py --data_dir=/dataset --model_dir=/workspace/logs --batch_size=5120 --random_seed=11 --params=big --train_steps=2 --steps_between_eval=1 --do_eval=Yes --save_checkpoints=Yes --save_profile=No --print_iter=10 --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=28 --learning_rate=2 --static_batch=No" + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/intelai_models/training/bfloat16/transformer/transformer_main.py --data_dir=/dataset --model_dir=/workspace/logs --batch_size=5120 --random_seed=11 --params=big --train_steps=2 --steps_between_eval=1 --do_eval=Yes --save_checkpoints=Yes --save_profile=No --print_iter=10 --inter_op_parallelism_threads=1 --intra_op_parallelism_threads=28 --learning_rate=2 --static_batch=No", + "cpuset": "0-111" } ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_unet_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_unet_args.json index 37a5f21d7..651e6c368 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_unet_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_unet_args.json @@ -1,5 +1,6 @@ [ { "_comment": "FP32 benchmark", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=image_segmentation --model-name=unet 
--precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --benchmark-only --verbose --checkpoint=/checkpoints --checkpoint_name=model.ckpt", - "output": "numactl --cpunodebind=0 --membind=0 python /workspace/benchmarks/image_segmentation/tensorflow/unet/inference/fp32/unet_infer.py -bs 1 -cp /checkpoints/model.ckpt --num_inter_threads 1 --num_intra_threads 28 -nw 80 -nb 400"} + "output": "numactl --cpunodebind=0 --membind=0 python /workspace/benchmarks/image_segmentation/tensorflow/unet/inference/fp32/unet_infer.py -bs 1 -cp /checkpoints/model.ckpt --num_inter_threads 1 --num_intra_threads 28 -nw 80 -nb 400", + "cpuset": "0-111"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_wavenet_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_wavenet_args.json index 99ef0e147..577b25562 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_wavenet_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_wavenet_args.json @@ -1,5 +1,6 @@ [ { "_comment": "FP32 benchmark command", "input": "run_tf_benchmark.py --framework tensorflow --use-case text_to_speech --precision fp32 --mode inference --model-name wavenet --num-cores 1 --checkpoint /checkpoints --intelai-models . --model-source-dir . 
--socket-id 0 --verbose --checkpoint_name=model.ckpt-99 --sample=8510", - "output": "numactl --physcpubind=0-0 --membind=0 python generate.py /checkpoints/model.ckpt-99 --num_inter_threads=1 --num_intra_threads=1 --sample=8510"} + "output": "numactl --physcpubind=0-0 --membind=0 python generate.py /checkpoints/model.ckpt-99 --num_inter_threads=1 --num_intra_threads=1 --sample=8510", + "cpuset": "0-111"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_args.json index bff19d1e6..3ba041ec4 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_args.json @@ -1,9 +1,11 @@ [ { "_comment": "wide_deep_small_fp32_batch_inference", "input": "run_tf_benchmark.py --framework tensorflow --use-case recommendation --precision fp32 --mode inference --model-name wide_deep --batch-size 1024 --data-location /dataset --checkpoint /checkpoints --intelai-models . 
--verbose", - "output": "OMP_NUM_THREADS=1 numactl --cpunodebind=0 --membind=0 python inference/fp32/wide_deep_inference.py --data_dir=/dataset --model_dir=/checkpoints --batch_size=1024"}, + "output": "OMP_NUM_THREADS=1 numactl --cpunodebind=0 --membind=0 python inference/fp32/wide_deep_inference.py --data_dir=/dataset --model_dir=/checkpoints --batch_size=1024", + "cpuset": "0-111"}, { "_comment": "wide_deep_small_fp32_online_inference", "input": "run_tf_benchmark.py --framework tensorflow --use-case recommendation --precision fp32 --mode inference --model-name wide_deep --data-location /dataset --checkpoint /checkpoints --intelai-models /workspace/models --verbose", - "output": "OMP_NUM_THREADS=1 numactl --cpunodebind=0 --membind=0 python inference/fp32/wide_deep_inference.py --data_dir=/dataset --model_dir=/checkpoints --batch_size=1"} + "output": "OMP_NUM_THREADS=1 numactl --cpunodebind=0 --membind=0 python inference/fp32/wide_deep_inference.py --data_dir=/dataset --model_dir=/checkpoints --batch_size=1", + "cpuset": "0-111"} ] diff --git a/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_large_ds_args.json b/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_large_ds_args.json index 29d5b8813..717dda7c1 100644 --- a/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_large_ds_args.json +++ b/tests/unit/common/tensorflow/tf_model_args/tf_wide_deep_large_ds_args.json @@ -1,38 +1,47 @@ [ { "_comment": "wide_deep_large_int8", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_int8_pretrained_model.pb --data-location=/dataset", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python 
/workspace/intelai_models/inference/inference.py --data_location=/dataset --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/inference.py --data_location=/dataset --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb", + "cpuset": "0-111"}, { "_comment": "wide_deep_large_int8_28_cores", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=28 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_int8_pretrained_model.pb --data-location=/dataset", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/inference.py --batch_size=1 --data_location=/dataset --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/inference.py --batch_size=1 --data_location=/dataset --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb", + "cpuset": "0-111"}, { "_comment": "wide_deep_large_int8_latency", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_int8_pretrained_model.pb --data-location=/dataset", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/inference.py --batch_size=1 --data_location=/dataset 
--input_graph=/in_graph/wide_deep_int8_pretrained_model.pb"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/inference.py --batch_size=1 --data_location=/dataset --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb", + "cpuset": "0-111"}, { "_comment": "wide_deep_large_int8_throughput", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=int8 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=512 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_int8_pretrained_model.pb --data-location=/dataset", - "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/inference.py --batch_size=512 --data_location=/dataset --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb"}, + "output": "LD_PRELOAD=/usr/lib/libtcmalloc.so.4.2.6 python /workspace/intelai_models/inference/inference.py --batch_size=512 --data_location=/dataset --input_graph=/in_graph/wide_deep_int8_pretrained_model.pb", + "cpuset": "0-111"}, { "_comment": "wide_deep_large_fp32", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data-location=/dataset", - "output": "python /workspace/intelai_models/inference/inference.py --data_location=/dataset --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb"}, + "output": "python /workspace/intelai_models/inference/inference.py 
--data_location=/dataset --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb", + "cpuset": "0-111"}, { "_comment": "wide_deep_large_fp32_28_cores", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=28 --batch-size=512 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data-location=/dataset", - "output": "python /workspace/intelai_models/inference/inference.py --batch_size=512 --data_location=/dataset --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb"}, + "output": "python /workspace/intelai_models/inference/inference.py --batch_size=512 --data_location=/dataset --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb", + "cpuset": "0-111"}, { "_comment": "wide_deep_large_fp32_throughput", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=512 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data-location=/dataset", - "output": "python /workspace/intelai_models/inference/inference.py --batch_size=512 --data_location=/dataset --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb"}, + "output": "python /workspace/intelai_models/inference/inference.py --batch_size=512 --data_location=/dataset --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb", + "cpuset": "0-111"}, { "_comment": "wide_deep_large_fp32_latency", "input": "run_tf_benchmark.py --framework=tensorflow 
--use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 --mode=inference --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=1 --socket-id=0 --output-dir=/workspace/benchmarks/common/tensorflow/logs --benchmark-only --verbose --in-graph=/in_graph/wide_deep_fp32_pretrained_model.pb --data-location=/dataset", - "output": "python /workspace/intelai_models/inference/inference.py --batch_size=1 --data_location=/dataset --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb"}, + "output": "python /workspace/intelai_models/inference/inference.py --batch_size=1 --data_location=/dataset --input_graph=/in_graph/wide_deep_fp32_pretrained_model.pb", + "cpuset": "0-111"}, { "_comment": "wide_deep_large_ds_fp32training", "input": "run_tf_benchmark.py --framework=tensorflow --use-case=recommendation --model-name=wide_deep_large_ds --precision=fp32 --mode=training --model-source-dir=/workspace/models --benchmark-dir=/workspace/benchmarks --intelai-models=/workspace/intelai_models --num-cores=-1 --batch-size=512 --output-dir=/workspace/logs --mode training --checkpoint /checkpoint_dir --data-location=/dataset", - "output": "python /workspace/intelai_models/training/train.py --batch_size=512 --data_location=/dataset --checkpoint=/checkpoint_dir --output_dir=/workspace/logs"} + "output": "python /workspace/intelai_models/training/train.py --batch_size=512 --data_location=/dataset --checkpoint=/checkpoint_dir --output_dir=/workspace/logs", + "cpuset": "0-111"} ] diff --git a/tests/unit/common/test_base_model_init.py b/tests/unit/common/test_base_model_init.py index 65bb0ba78..4ce28a6bd 100644 --- a/tests/unit/common/test_base_model_init.py +++ b/tests/unit/common/test_base_model_init.py @@ -328,3 +328,46 @@ def test_numa_multi_instance_run_command( for cpu_bind in expected_cpu_bind: assert "numactl --localalloc --physcpubind={} {} >> {}".\ format(cpu_bind, 
test_run_command, test_output_dir) in system_call_args + + +@pytest.mark.parametrize('test_num_instances,test_socket_id,test_num_cores,test_cpu_list,test_cpuset,' + 'expected_inter_threads,expected_intra_threads', + [[2, -1, -1, [['0', '1'], ['2', '3']], {0: ['0', '1'], 1: ['2', '3']}, 1, 2], + [None, 0, -1, [['1', '2', '3'], ['10', '11']], {0: ['1', '2', '3'], 1: ['10', '11']}, 1, 3], + [None, 1, -1, [['1', '2', '3'], ['10', '11']], {0: ['1', '2', '3'], 1: ['10', '11']}, 1, 2], + [None, 1, -1, [['1', '2', '3'], ['10', '11']], None, 1, 3], + [None, 1, 8, [['1', '2', '3'], ['10', '11']], {0: ['1', '2', '3'], 1: ['10', '11']}, 1, 8]]) +@patch("os.path.exists") +@patch("benchmarks.common.base_model_init.open") +@patch("common.platform_util.os") +@patch("common.platform_util.system_platform") +@patch("common.platform_util.subprocess") +@patch("os.system") +def test_num_inter_intra_threads_settings( + mock_system, mock_subprocess, mock_platform, mock_os, mock_open, + mock_path_exists, test_num_instances, test_socket_id, test_num_cores, + test_cpu_list, test_cpuset, expected_inter_threads, expected_intra_threads): + """ + Tests the base model init function that determines the num_inter_threads and + num_intra_threads values. 
+ """ + platform_util = MagicMock(cpu_core_list=test_cpu_list, cpuset_cpus=test_cpuset, + num_cores_per_socket=len(test_cpu_list[0])) + test_output_dir = "/tmp/output" + args = MagicMock(verbose=True, model_name=test_model_name, batch_size=100, + numa_cores_per_instance=test_num_instances, precision="fp32", + output_dir=test_output_dir, socket_id=test_socket_id, num_cores=test_num_cores, + num_inter_threads=None, num_intra_threads=None) + os.environ["PYTHON_EXE"] = "python" + os.environ["MPI_HOSTNAMES"] = "None" + os.environ["MPI_NUM_PROCESSES"] = "None" + base_model_init = BaseModelInitializer(args, [], platform_util) + + mock_path_exists.return_value = True + + # Get the number of inter/intra threads and compared to the expected values + base_model_init.set_num_inter_intra_threads() + print(base_model_init.args.num_inter_threads) + print(base_model_init.args.num_intra_threads) + assert base_model_init.args.num_inter_threads == expected_inter_threads + assert base_model_init.args.num_intra_threads == expected_intra_threads diff --git a/tests/unit/common/test_platform_util.py b/tests/unit/common/test_platform_util.py index 8796c5887..b05b71442 100644 --- a/tests/unit/common/test_platform_util.py +++ b/tests/unit/common/test_platform_util.py @@ -21,7 +21,7 @@ import json import pytest import os -from mock import MagicMock +from mock import MagicMock, mock_open, patch from benchmarks.common.platform_util import PlatformUtil, CPUInfo from test_utils import platform_config @@ -53,13 +53,20 @@ def platform_mock(patch): return patch("system_platform.system") -def test_platform_util_lscpu_parsing(platform_mock, subprocess_mock, os_mock): +@pytest.fixture +def read_mock(patch): + return patch("read") + + +@patch("benchmarks.common.platform_util.PlatformUtil._get_cpuset") +def test_platform_util_lscpu_parsing(get_cpuset_mock, platform_mock, subprocess_mock, os_mock): """ Verifies that platform_utils gives us the proper values that we expect based on the lscpu_output string 
provided. """ platform_mock.return_value = platform_config.SYSTEM_TYPE os_mock.return_value = True + get_cpuset_mock.return_value = "0-111" subprocess_mock.return_value = platform_config.LSCPU_OUTPUT platform_util = PlatformUtil(MagicMock(verbose=True)) platform_util.linux_init() @@ -137,11 +144,13 @@ def test_cpu_info_binding_information_no_numa(subprocess_mock): assert generated_value == expected_value -def test_numa_cpu_core_list(subprocess_mock, subprocess_popen_mock, platform_mock, os_mock): +@patch("benchmarks.common.platform_util.PlatformUtil._get_cpuset") +def test_numa_cpu_core_list(get_cpuset_mock, subprocess_mock, subprocess_popen_mock, platform_mock, os_mock): """ Test the platform utils to ensure that we are getting the proper core lists """ subprocess_mock.return_value = platform_config.LSCPU_OUTPUT subprocess_popen_mock.return_value.stdout.readlines.return_value = platform_config.NUMA_CORES_OUTPUT platform_mock.return_value = platform_config.SYSTEM_TYPE + get_cpuset_mock.return_value = "0-111" os_mock.return_value = True subprocess_mock.return_value = platform_config.LSCPU_OUTPUT platform_util = PlatformUtil(MagicMock(verbose=True)) @@ -169,3 +178,79 @@ def test_platform_util_wmic_parsing(platform_mock, subprocess_mock, os_mock): assert platform_util.num_threads_per_core == 28 assert platform_util.num_logical_cpus == 56 assert platform_util.num_numa_nodes == 0 + + +@patch("benchmarks.common.platform_util.PlatformUtil._get_cpuset") +@pytest.mark.parametrize('cpuset_range,expected_list', + [['0-5', [0, 1, 2, 3, 4, 5]], + ['0-3,7,6', [0, 1, 2, 3, 6, 7]], + ['2-3,7,9-11,20', [2, 3, 7, 9, 10, 11, 20]], + ['0-3,7-6,11,11', [0, 1, 2, 3, 11]], + ['7-9,5-10,6,4', [4, 5, 6, 7, 8, 9, 10]]]) +def test_get_list_from_string_ranges(get_cpuset_mock, platform_mock, subprocess_mock, os_mock, + cpuset_range, expected_list,): + """ + Tests the PlatformUtils _get_list_from_string_ranges function that converts string + number ranges to an integer list. 
+ """ + platform_mock.return_value = platform_config.SYSTEM_TYPE + subprocess_mock.return_value = platform_config.LSCPU_OUTPUT + get_cpuset_mock.return_value = cpuset_range + os_mock.return_value = True + platform_util = PlatformUtil(MagicMock()) + result = platform_util._get_list_from_string_ranges(cpuset_range) + assert result == expected_list + + +@pytest.mark.parametrize('cpuset_range,expected_core_list', + [["0-7,28-35", + [["0", "1", "2", "3", "4", "5", "6", "7"], + ["28", "29", "30", "31", "32", "33", "34", "35"]]], + ["0,2-5,20,29-32,1", + [["0", "1", "2", "3", "4", "5", "20"], + ["29", "30", "31", "32"]]]]) +@patch("os.path.exists") +def test_numa_cpu_core_list_cpuset(path_exists_mock, subprocess_mock, subprocess_popen_mock, + platform_mock, os_mock, cpuset_range, expected_core_list): + """ Test the platform utils to ensure that we are getting the proper core lists """ + subprocess_mock.return_value = platform_config.LSCPU_OUTPUT + subprocess_popen_mock.return_value.stdout.readlines.return_value = platform_config.NUMA_CORES_OUTPUT + platform_mock.return_value = platform_config.SYSTEM_TYPE + os_mock.return_value = True + subprocess_mock.return_value = platform_config.LSCPU_OUTPUT + path_exists_mock.return_value = True + cpuset_mock = mock_open(read_data=cpuset_range) + with patch("builtins.open", cpuset_mock): + platform_util = PlatformUtil(MagicMock(verbose=True, numa_cores_per_instance=4)) + + # ensure there are 2 items in the list since there are 2 sockets + assert len(platform_util.cpu_core_list) == 2 + + # Check that the core list matches the ranges defined for the cpuset file read + assert platform_util.cpu_core_list == expected_core_list + + +@patch("benchmarks.common.platform_util.PlatformUtil._get_cpuset") +@pytest.mark.parametrize('cpuset_range,expected_num_sockets', + [['0-5', 1], + ['0-3,7,6', 1], + ['2-3,7,9-11,20', 1], + ['0-3,7-6,11,11', 1], + ['7-9,5-10,6,4', 1], + ['0-111', 2], + ['28-32,84-90', 1]]) +def 
test_platform_utils_num_sockets_with_cpuset(get_cpuset_mock, platform_mock, subprocess_mock, + os_mock, cpuset_range, expected_num_sockets): + """ + Checks that the number of sockets in platform_utils reflects the proper value based on + the cpuset. If the cores being used by the container in the cpuset are all on one socket, + then the num_cpu_sockets should be 1, even if the system itself has 2 sockets (since the + container only has access to 1). + """ + platform_mock.return_value = platform_config.SYSTEM_TYPE + os_mock.return_value = True + get_cpuset_mock.return_value = cpuset_range + subprocess_mock.return_value = platform_config.LSCPU_OUTPUT + platform_util = PlatformUtil(MagicMock(verbose=True)) + platform_util.linux_init() + assert platform_util.num_cpu_sockets == expected_num_sockets diff --git a/tests/unit/test_launch_benchmark.py b/tests/unit/test_launch_benchmark.py index 6bc73e892..cab62775a 100644 --- a/tests/unit/test_launch_benchmark.py +++ b/tests/unit/test_launch_benchmark.py @@ -343,3 +343,31 @@ def test_disable_tcmalloc(launch_benchmark, mock_popen, # convert the run command args to a string and then check for the custom volume mounts docker_run_cmd = " ".join(args[0]) assert "--env DISABLE_TCMALLOC=".format(expected_disable_tcmalloc) in docker_run_cmd + + +@pytest.mark.parametrize("numa_cores_per_instance_arg,socket_id_args,num_cores_arg,mpi_num_proc_arg,run_privileged", + [["4", -1, -1, None, True], + [None, -1, -1, None, False], + ["socket", -1, -1, None, True], + [None, 0, -1, None, True], + [None, 1, -1, None, True], + [None, -1, 8, None, True], + [None, -1, -1, 2, True]]) +def test_launch_benchmark_docker_privileged(launch_benchmark, mock_popen, platform_mock, + numa_cores_per_instance_arg, socket_id_args, + num_cores_arg, mpi_num_proc_arg, run_privileged): + """ + Verifies that docker only runs with --privileged when it needs to (if args that + run multi-instance or numactl are used). 
+ """ + launch_benchmark.args.numa_cores_per_instance = numa_cores_per_instance_arg + launch_benchmark.args.socket_id = socket_id_args + launch_benchmark.args.num_cores = num_cores_arg + launch_benchmark.args.mpi = mpi_num_proc_arg + platform_mock.return_value = platform_config.OS_TYPE + launch_benchmark.main() + assert mock_popen.called + args, _ = mock_popen.call_args + # convert the run command args to a string and then check for the docker args + docker_run_cmd = " ".join(args[0]) + assert ("--privileged" in docker_run_cmd) == run_privileged