Merge branch 'release-v0.96'
============================== Release Notes: v0.96 ==============================
Support for new layers:
 - Log softmax
 - Basic math functions
 - Weights layer, which outputs a weights tensor
 - L2 norm squared
 - Binary cross entropy loss and sigmoid binary cross entropy loss
 - Boolean accuracy, Boolean false negative rate, Boolean false positive rate
 - Bilinear resize
 - Variance and covariance
 - Dilated and grouped convolution (GPU only)
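
For two of these layers, the underlying math is standard; below is a minimal NumPy sketch of the usual numerically stable formulations (illustrative only, not LBANN's actual kernels, and the function names are ours):

import numpy as np

def log_softmax(x):
    # log_softmax(x)_i = x_i - log(sum_j exp(x_j)); subtracting the max
    # first keeps exp() from overflowing without changing the result.
    shifted = x - np.max(x)
    return shifted - np.log(np.sum(np.exp(shifted)))

def sigmoid_bce_with_logits(z, y):
    # Fused sigmoid + binary cross entropy on logits z with labels y:
    # -[y*log(sigmoid(z)) + (1-y)*log(1-sigmoid(z))]
    # rearranges to the stable form max(z, 0) - z*y + log(1 + exp(-|z|)).
    return np.maximum(z, 0) - z * y + np.log1p(np.exp(-np.abs(z)))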

Performance optimizations:
 - Optimized GPU model-parallel softmax layer

Model portability & usability:
 - Option for weight initialization with a user-provided list of values
 - Callback to save any layer output as an image

Internal features:
 - Provide a compile-time option to selectively disable OpenMP for the data-fetching loop
 - Thrust calls no longer involve the default CUDA stream
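
(Background, not a description of this exact patch: Thrust algorithms accept an explicit-stream execution policy, e.g. thrust::cuda::par.on(stream), which avoids implicit synchronization on the default stream 0.)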

I/O & data readers:
 - Reworked jag_conduit data reader:
   - Support the updated JAG simulation data output format
   - Use direct HDF5 I/O for on-demand data loading with Conduit
   - Ingest a unique set of data files per instance
   - Allow exclusive data partitioning among multiple trainers
   - Multi-channel images
   - Normalization of JAG data
   - Interface to select images of specific views and time indices
   - Interface to describe how to slice JAG data
   - Avoid redundant fetching and incoherent random number pulls in the group of local data readers
 - Improved threading performance by preallocating scratch space for loading samples

Build system:
 - Support cross-compilation configurations in superbuild and SetupProtobuf
bvanessen committed Nov 14, 2018
2 parents ffecbef + 07d6f36 commit cd7350e
Showing 296 changed files with 20,185 additions and 10,130 deletions.
118 changes: 76 additions & 42 deletions CMakeLists.txt
@@ -37,26 +37,45 @@ endif ()
 # Version setup
 #
 
 set(LBANN_VERSION_MAJOR 0)
 set(LBANN_VERSION_MINOR 96)
 
 set(LBANN_VERSION "${LBANN_VERSION_MAJOR}.${LBANN_VERSION_MINOR}")
 
 # Check to see if we are in a git repo
-execute_process(
-  COMMAND git rev-parse --is-inside-work-tree
-  WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
-  OUTPUT_VARIABLE GIT_REPO
-  OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-if (GIT_REPO)
-  # Get the git version so that we can embed it into the executable
+find_program(__GIT_EXECUTABLE git)
+mark_as_advanced(__GIT_EXECUTABLE)
+if (__GIT_EXECUTABLE)
+
   execute_process(
-    COMMAND git --git-dir .git describe --abbrev=7 --dirty --always --tags
-    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
-    OUTPUT_VARIABLE GIT_VERSION
+    COMMAND ${__GIT_EXECUTABLE} rev-parse --is-inside-work-tree
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE __BUILDING_FROM_GIT_SOURCES
     OUTPUT_STRIP_TRAILING_WHITESPACE)
-  set(${UPPER_PROJECT_NAME}_VERSION ${GIT_VERSION}
-    CACHE STRING "LBANN's version string")
-else ()
-  set(${UPPER_PROJECT_NAME}_VERSION v0.95
-    CACHE STRING "LBANN's version string")
-endif (GIT_REPO)
+
+  if (__BUILDING_FROM_GIT_SOURCES)
+    # Get the git version so that we can embed it into the executable
+    execute_process(
+      COMMAND ${__GIT_EXECUTABLE} rev-parse --show-toplevel
+      WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+      OUTPUT_VARIABLE __GIT_TOPLEVEL_DIR
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    execute_process(
+      COMMAND ${__GIT_EXECUTABLE} rev-parse --git-dir
+      WORKING_DIRECTORY "${__GIT_TOPLEVEL_DIR}"
+      OUTPUT_VARIABLE __GIT_GIT_DIR
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    execute_process(
+      COMMAND ${__GIT_EXECUTABLE} --git-dir "${__GIT_GIT_DIR}" describe
+        --abbrev=7 --always --dirty --tags
+      WORKING_DIRECTORY "${__GIT_TOPLEVEL_DIR}"
+      OUTPUT_VARIABLE __GIT_DESCRIBE_VERSION
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+    set(LBANN_GIT_VERSION "${__GIT_DESCRIBE_VERSION}"
+      CACHE STRING "LBANN's version string as told by git.")
+  endif (__BUILDING_FROM_GIT_SOURCES)
+endif (__GIT_EXECUTABLE)
 
 if (CMAKE_HOST_SYSTEM_NAME MATCHES "Linux")
   set(LBANN_GNU_LINUX TRUE)
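
(For reference: with the flags above, git describe reports the nearest reachable tag plus the commit distance and an abbreviated hash, e.g. v0.96-3-gcd7350e, appending -dirty when the work tree has local modifications; --always makes it fall back to a bare abbreviated hash when no tag is reachable.)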
@@ -214,7 +233,7 @@ endif (LBANN_HAS_CUDA)
 # guarantee. There's no harm including it multiple times.
 find_library(DL_LIBRARY dl DOC "The dynamic loader library.")
 if (DL_LIBRARY)
-  message("Found dl: ${DL_LIBRARY}")
+  message(STATUS "Found dl: ${DL_LIBRARY}")
 else ()
   message(FATAL_ERROR
     "dl library not found! This is a required library.\n"
@@ -401,32 +420,47 @@ get_directory_property( DirDefs COMPILE_DEFINITIONS )
 # Configuration summary
 ################################################################
 
-message("== Configuration Summary ==")
-message(" PROJECT_SOURCE_DIR: ${PROJECT_SOURCE_DIR}")
-message(" PROJECT_BINARY_DIR: ${PROJECT_BINARY_DIR}")
-message(" CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
-message(" CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
+# NOTE: message() outputs to stderr by default. We now use a string to
+# maintain this information and then have cmake echo it to stdout. The
+# only side effects are that if you use the CMake GUI, you won't see
+# this output anymore (they only report stderr) and that if you add
+# something to the list, you must remember your newline!
+set(_str "== Configuration Summary ==\n")
+string(APPEND _str " PROJECT_SOURCE_DIR: ${PROJECT_SOURCE_DIR}\n"
+  " PROJECT_BINARY_DIR: ${PROJECT_BINARY_DIR}\n"
+  " CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}\n"
+  " CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}\n")
 if (CMAKE_BUILD_TYPE MATCHES None)
-  message(" CXX FLAGS: ${CMAKE_CXX_FLAGS}")
+  string(APPEND _str
+    " CXX FLAGS: ${CMAKE_CXX_FLAGS}\n")
 elseif (CMAKE_BUILD_TYPE MATCHES Release)
-  message(" CXX FLAGS: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}")
+  string(APPEND _str
+    " CXX FLAGS: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}\n")
 elseif (CMAKE_BUILD_TYPE MATCHES RelWithDebInfo)
-  message(" CXX FLAGS: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
+  string(APPEND _str
+    " CXX FLAGS: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}\n")
 elseif (CMAKE_BUILD_TYPE MATCHES Debug)
-  message(" CXX FLAGS: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}")
+  string(APPEND _str
+    " CXX FLAGS: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}\n")
 endif ()
-message(" LBANN_GNU_LINUX: ${LBANN_GNU_LINUX}")
-message(" LBANN_HAS_HYDROGEN: ${LBANN_HAS_HYDROGEN}")
-message(" LBANN_HAS_OPENCV: ${LBANN_HAS_OPENCV}")
-message(" LBANN_HAS_CUDA: ${LBANN_HAS_CUDA}")
-message(" LBANN_HAS_CUDNN: ${LBANN_HAS_CUDNN}")
-message(" LBANN_HAS_NCCL2: ${LBANN_HAS_NCCL2}")
-message(" LBANN_HAS_PROTOBUF: ${LBANN_HAS_PROTOBUF}")
-message(" LBANN_HAS_CNPY: ${LBANN_HAS_CNPY}")
-message(" LBANN_HAS_TBINF: ${LBANN_HAS_TBINF}")
-message(" LBANN_HAS_VTUNE: ${LBANN_HAS_VTUNE}")
-message(" LBANN_NVPROF: ${LBANN_NVPROF}")
-message(" LBANN_HAS_DOXYGEN: ${LBANN_HAS_DOXYGEN}")
-message(" LBANN_HAS_LBANN_PROTO:${LBANN_HAS_LBANN_PROTO}")
-message(" LBANN_HAS_ALUMINUM: ${LBANN_HAS_ALUMINUM}")
-message(" LBANN_HAS_CONDUIT: ${LBANN_HAS_CONDUIT}")
+string(APPEND _str
+  " LBANN_GNU_LINUX: ${LBANN_GNU_LINUX}\n"
+  " LBANN_HAS_HYDROGEN: ${LBANN_HAS_HYDROGEN}\n"
+  " LBANN_HAS_OPENCV: ${LBANN_HAS_OPENCV}\n"
+  " LBANN_HAS_CUDA: ${LBANN_HAS_CUDA}\n"
+  " LBANN_HAS_CUDNN: ${LBANN_HAS_CUDNN}\n"
+  " LBANN_HAS_NCCL2: ${LBANN_HAS_NCCL2}\n"
+  " LBANN_HAS_PROTOBUF: ${LBANN_HAS_PROTOBUF}\n"
+  " LBANN_HAS_CNPY: ${LBANN_HAS_CNPY}\n"
+  " LBANN_HAS_TBINF: ${LBANN_HAS_TBINF}\n"
+  " LBANN_HAS_VTUNE: ${LBANN_HAS_VTUNE}\n"
+  " LBANN_NVPROF: ${LBANN_NVPROF}\n"
+  " LBANN_HAS_DOXYGEN: ${LBANN_HAS_DOXYGEN}\n"
+  " LBANN_HAS_LBANN_PROTO:${LBANN_HAS_LBANN_PROTO}\n"
+  " LBANN_HAS_ALUMINUM: ${LBANN_HAS_ALUMINUM}\n"
+  " LBANN_HAS_CONDUIT: ${LBANN_HAS_CONDUIT}\n"
+  " LBANN_NO_OMP_FOR_DATA_READERS: ${LBANN_NO_OMP_FOR_DATA_READERS}\n")
+
+# Output to stdout
+execute_process(COMMAND ${CMAKE_COMMAND} -E echo "${_str}")
+set(_str)
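
(A practical consequence: cmake -E echo writes to stdout while plain message() writes to stderr, so the summary can now be captured with an ordinary redirect such as cmake .. > cmake-summary.txt.)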
56 changes: 56 additions & 0 deletions ReleaseNotes.txt
@@ -1,3 +1,59 @@
============================== (Pending) Release Notes: v0.97 ==============================
Support for new training algorithms:

Support for new network structures:

Support for new layers:

Performance optimizations:

Model portability & usability:

Internal features:

I/O & data readers:

Build system:

============================== Release Notes: v0.96 ==============================
Support for new layers:
- Log softmax
- Basic math functions
- Weights layer, which outputs a weights tensor
- L2 norm squared
- Binary cross entropy loss and sigmoid binary cross entropy loss
- Boolean accuracy, Boolean false negative rate, Boolean false positive rate
- Bilinear resize
- Variance and covariance
- Dilated and grouped convolution (GPU only)

Performance optimizations:
- Optimized GPU model-parallel softmax layer

Model portability & usability:
- Option for weight initialization with a user-provided list of values
- Callback to save any layer output as an image

Internal features:
- Provide a compile-time option to selectively disable OpenMP for the data-fetching loop
- Thrust calls no longer involve the default CUDA stream

I/O & data readers:
- Reworked jag_conduit data reader:
- Support the updated JAG simulation data output format
- Use direct HDF5 I/O for on-demand data loading with Conduit
- Ingest a unique set of data files per instance
- Allow exclusive data partitioning among multiple trainers
- Multi-channel images
- Normalization of JAG data
- Interface to select images of specific views and time indices
- Interface to describe how to slice JAG data
- Avoid redundant fetching and incoherent random number pulls in the group of local data readers
- Improved threading performance by preallocating scratch space for loading samples

Build system:
- Support cross-compilation configurations in superbuild and SetupProtobuf

============================== Release Notes: v0.95 ==============================
Support for new training algorithms:
- Generative Adversarial Networks (GAN)
1 change: 1 addition & 0 deletions bamboo/unit_tests/.gitignore
@@ -0,0 +1 @@
.cache
3 changes: 3 additions & 0 deletions bamboo/unit_tests/error/.gitignore
@@ -0,0 +1,3 @@
*
!.gitignore
!README.md
3 changes: 3 additions & 0 deletions bamboo/unit_tests/output/.gitignore
@@ -0,0 +1,3 @@
*
!.gitignore
!README.md
43 changes: 0 additions & 43 deletions bamboo/unit_tests/test_unit_conv_graph.py

This file was deleted.

41 changes: 41 additions & 0 deletions bamboo/unit_tests/test_unit_layer_covariance.py
@@ -0,0 +1,41 @@
import sys
sys.path.insert(0, '../common_python')
import tools
import pytest
import os

def skeleton_layer_covariance(cluster, executables, dir_name, compiler_name):
    # Build and run an LBANN experiment exercising the covariance layer;
    # fail the test if the run exits nonzero.
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    output_file_name = '%s/bamboo/unit_tests/output/layer_covariance_%s_output.txt' % (dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/layer_covariance_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name,
        data_filedir_default='', data_reader_name='synthetic',
        model_folder='tests/layer_tests', model_name='covariance', optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code = os.system(command)
    assert return_code == 0

def test_unit_layer_covariance_clang4(cluster, exes, dirname):
    skeleton_layer_covariance(cluster, exes, dirname, 'clang4')

def test_unit_layer_covariance_gcc4_check(cluster, exes, dirname):
    if cluster in ['surface']:
        pytest.skip('FIXME')
        # Surface Errors:
        # assert 34304 == 0
    skeleton_layer_covariance(cluster, exes, dirname, 'gcc4')

def test_unit_layer_covariance_gcc7(cluster, exes, dirname):
    skeleton_layer_covariance(cluster, exes, dirname, 'gcc7')

def test_unit_layer_covariance_intel18(cluster, exes, dirname):
    skeleton_layer_covariance(cluster, exes, dirname, 'intel18')

# Run with python -m pytest -s test_unit_layer_covariance.py -k 'test_unit_layer_covariance_exe' --exe=<executable>
def test_unit_layer_covariance_exe(cluster, dirname, exe):
    if exe is None:
        pytest.skip('Non-local testing')
    exes = {'exe': exe}
    skeleton_layer_covariance(cluster, exes, dirname, 'exe')
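
The cluster, exes, dirname, and exe arguments in these tests are pytest fixtures supplied by the suite's shared conftest, which is not part of this commit. A minimal, hypothetical sketch of how the --exe option used above could be wired up with stock pytest hooks (names illustrative, not the repository's actual conftest):

# conftest.py -- hypothetical sketch, not the real bamboo conftest
import pytest

def pytest_addoption(parser):
    # Register the --exe flag consumed by the *_exe tests above.
    parser.addoption('--exe', action='store', default=None,
                     help='path to a locally built LBANN executable')

@pytest.fixture
def exe(request):
    # Returns None when --exe is not passed, so the *_exe tests skip.
    return request.config.getoption('--exe')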
41 changes: 41 additions & 0 deletions bamboo/unit_tests/test_unit_layer_l2_norm2.py
@@ -0,0 +1,41 @@
import sys
sys.path.insert(0, '../common_python')
import tools
import pytest
import os

def skeleton_layer_l2_norm2(cluster, executables, dir_name, compiler_name):
    # Build and run an LBANN experiment exercising the L2 norm squared layer;
    # fail the test if the run exits nonzero.
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    output_file_name = '%s/bamboo/unit_tests/output/layer_l2_norm2_%s_output.txt' % (dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/layer_l2_norm2_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name,
        data_filedir_default='', data_reader_name='synthetic',
        model_folder='tests/layer_tests', model_name='l2_norm2', optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code = os.system(command)
    assert return_code == 0

def test_unit_layer_l2_norm2_clang4(cluster, exes, dirname):
    skeleton_layer_l2_norm2(cluster, exes, dirname, 'clang4')

def test_unit_layer_l2_norm2_gcc4_check(cluster, exes, dirname):
    if cluster in ['surface']:
        pytest.skip('FIXME')
        # Surface Errors:
        # assert 34304 == 0
    skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc4')

def test_unit_layer_l2_norm2_gcc7(cluster, exes, dirname):
    skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc7')

def test_unit_layer_l2_norm2_intel18(cluster, exes, dirname):
    skeleton_layer_l2_norm2(cluster, exes, dirname, 'intel18')

# Run with python -m pytest -s test_unit_layer_l2_norm2.py -k 'test_unit_layer_l2_norm2_exe' --exe=<executable>
def test_unit_layer_l2_norm2_exe(cluster, dirname, exe):
    if exe is None:
        pytest.skip('Non-local testing')
    exes = {'exe': exe}
    skeleton_layer_l2_norm2(cluster, exes, dirname, 'exe')
41 changes: 41 additions & 0 deletions bamboo/unit_tests/test_unit_layer_log_softmax.py
@@ -0,0 +1,41 @@
import sys
sys.path.insert(0, '../common_python')
import tools
import pytest
import os

def skeleton_layer_log_softmax(cluster, executables, dir_name, compiler_name):
    # Build and run an LBANN experiment exercising the log softmax layer;
    # fail the test if the run exits nonzero.
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)
    output_file_name = '%s/bamboo/unit_tests/output/layer_log_softmax_%s_output.txt' % (dir_name, compiler_name)
    error_file_name = '%s/bamboo/unit_tests/error/layer_log_softmax_%s_error.txt' % (dir_name, compiler_name)
    command = tools.get_command(
        cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name,
        data_filedir_default='', data_reader_name='synthetic',
        model_folder='tests/layer_tests', model_name='log_softmax', optimizer_name='sgd',
        output_file_name=output_file_name, error_file_name=error_file_name)
    return_code = os.system(command)
    assert return_code == 0

def test_unit_layer_log_softmax_clang4(cluster, exes, dirname):
    skeleton_layer_log_softmax(cluster, exes, dirname, 'clang4')

def test_unit_layer_log_softmax_gcc4_check(cluster, exes, dirname):
    if cluster in ['surface']:
        pytest.skip('FIXME')
        # Surface Errors:
        # assert 34304 == 0
    skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc4')

def test_unit_layer_log_softmax_gcc7(cluster, exes, dirname):
    skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc7')

def test_unit_layer_log_softmax_intel18(cluster, exes, dirname):
    skeleton_layer_log_softmax(cluster, exes, dirname, 'intel18')

# Run with python -m pytest -s test_unit_layer_log_softmax.py -k 'test_unit_layer_log_softmax_exe' --exe=<executable>
def test_unit_layer_log_softmax_exe(cluster, dirname, exe):
    if exe is None:
        pytest.skip('Non-local testing')
    exes = {'exe': exe}
    skeleton_layer_log_softmax(cluster, exes, dirname, 'exe')