diff --git a/CMakeLists.txt b/CMakeLists.txt index 393e86bbbe7..53cdfb092c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,26 +37,45 @@ endif () # Version setup # +set(LBANN_VERSION_MAJOR 0) +set(LBANN_VERSION_MINOR 96) + +set(LBANN_VERSION "${LBANN_VERSION_MAJOR}.${LBANN_VERSION_MINOR}") + # Check to see if we are in a git repo -execute_process( - COMMAND git rev-parse --is-inside-work-tree - WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" - OUTPUT_VARIABLE GIT_REPO - OUTPUT_STRIP_TRAILING_WHITESPACE) - -if (GIT_REPO) - # Get the git version so that we can embed it into the executable +find_program(__GIT_EXECUTABLE git) +mark_as_advanced(__GIT_EXECUTABLE) +if (__GIT_EXECUTABLE) + execute_process( - COMMAND git --git-dir .git describe --abbrev=7 --dirty --always --tags - WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" - OUTPUT_VARIABLE GIT_VERSION + COMMAND ${__GIT_EXECUTABLE} rev-parse --is-inside-work-tree + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE __BUILDING_FROM_GIT_SOURCES OUTPUT_STRIP_TRAILING_WHITESPACE) - set(${UPPER_PROJECT_NAME}_VERSION ${GIT_VERSION} - CACHE STRING "LBANN's version string") -else () - set(${UPPER_PROJECT_NAME}_VERSION v0.95 - CACHE STRING "LBANN's version string") -endif (GIT_REPO) + + if (__BUILDING_FROM_GIT_SOURCES) + # Get the git version so that we can embed it into the executable + execute_process( + COMMAND ${__GIT_EXECUTABLE} rev-parse --show-toplevel + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE __GIT_TOPLEVEL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${__GIT_EXECUTABLE} rev-parse --git-dir + WORKING_DIRECTORY "${__GIT_TOPLEVEL_DIR}" + OUTPUT_VARIABLE __GIT_GIT_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${__GIT_EXECUTABLE} --git-dir "${__GIT_GIT_DIR}" describe + --abbrev=7 --always --dirty --tags + WORKING_DIRECTORY "${__GIT_TOPLEVEL_DIR}" + OUTPUT_VARIABLE __GIT_DESCRIBE_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + + set(LBANN_GIT_VERSION "${__GIT_DESCRIBE_VERSION}" + CACHE STRING "LBANN's version string as told by git.") + endif (__BUILDING_FROM_GIT_SOURCES) +endif (__GIT_EXECUTABLE) if (CMAKE_HOST_SYSTEM_NAME MATCHES "Linux") set(LBANN_GNU_LINUX TRUE) @@ -214,7 +233,7 @@ endif (LBANN_HAS_CUDA) # guarantee. There's no harm including it multiple times. find_library(DL_LIBRARY dl DOC "The dynamic loader library.") if (DL_LIBRARY) - message("Found dl: ${DL_LIBRARY}") + message(STATUS "Found dl: ${DL_LIBRARY}") else () message(FATAL_ERROR "dl library not found! This is a required library.\n" @@ -401,32 +420,47 @@ get_directory_property( DirDefs COMPILE_DEFINITIONS ) # Configuration summary ################################################################ -message("== Configuration Summary ==") -message(" PROJECT_SOURCE_DIR: ${PROJECT_SOURCE_DIR}") -message(" PROJECT_BINARY_DIR: ${PROJECT_BINARY_DIR}") -message(" CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") -message(" CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") +# NOTE: message() outputs to stderr by default. We now use a string to +# maintain this information and then have cmake echo it to stdout. The +# only side effects are that if you use the CMake GUI, you won't see +# this output anymore (they only report stderr) and that if you add +# something to the list, you must remember your newline! 
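As a quick illustration of the behavior the note above relies on — not part of the patch, and the file name `demo.cmake` is hypothetical — a minimal standalone script:

```cmake
# demo.cmake -- run with: cmake -P demo.cmake
# message() with no mode keyword (NOTICE) writes to stderr, while
# `cmake -E echo` writes to stdout, so `cmake -P demo.cmake 2>/dev/null`
# prints only the second line.
message("this line goes to stderr")
execute_process(COMMAND ${CMAKE_COMMAND} -E echo "this line goes to stdout")
```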
+set(_str "== Configuration Summary ==\n") +string(APPEND _str " PROJECT_SOURCE_DIR: ${PROJECT_SOURCE_DIR}\n" + " PROJECT_BINARY_DIR: ${PROJECT_BINARY_DIR}\n" + " CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}\n" + " CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}\n") if (CMAKE_BUILD_TYPE MATCHES None) - message(" CXX FLAGS: ${CMAKE_CXX_FLAGS}") + string(APPEND _str + " CXX FLAGS: ${CMAKE_CXX_FLAGS}\n") elseif (CMAKE_BUILD_TYPE MATCHES Release) - message(" CXX FLAGS: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}") + string(APPEND _str + " CXX FLAGS: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}\n") elseif (CMAKE_BUILD_TYPE MATCHES RelWithDebInfo) - message(" CXX FLAGS: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") + string(APPEND _str + " CXX FLAGS: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}\n") elseif (CMAKE_BUILD_TYPE MATCHES Debug) - message(" CXX FLAGS: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}") + string(APPEND _str + " CXX FLAGS: ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}\n") endif () -message(" LBANN_GNU_LINUX: ${LBANN_GNU_LINUX}") -message(" LBANN_HAS_HYDROGEN: ${LBANN_HAS_HYDROGEN}") -message(" LBANN_HAS_OPENCV: ${LBANN_HAS_OPENCV}") -message(" LBANN_HAS_CUDA: ${LBANN_HAS_CUDA}") -message(" LBANN_HAS_CUDNN: ${LBANN_HAS_CUDNN}") -message(" LBANN_HAS_NCCL2: ${LBANN_HAS_NCCL2}") -message(" LBANN_HAS_PROTOBUF: ${LBANN_HAS_PROTOBUF}") -message(" LBANN_HAS_CNPY: ${LBANN_HAS_CNPY}") -message(" LBANN_HAS_TBINF: ${LBANN_HAS_TBINF}") -message(" LBANN_HAS_VTUNE: ${LBANN_HAS_VTUNE}") -message(" LBANN_NVPROF: ${LBANN_NVPROF}") -message(" LBANN_HAS_DOXYGEN: ${LBANN_HAS_DOXYGEN}") -message(" LBANN_HAS_LBANN_PROTO:${LBANN_HAS_LBANN_PROTO}") -message(" LBANN_HAS_ALUMINUM: ${LBANN_HAS_ALUMINUM}") -message(" LBANN_HAS_CONDUIT: ${LBANN_HAS_CONDUIT}") +string(APPEND _str + " LBANN_GNU_LINUX: ${LBANN_GNU_LINUX}\n" + " LBANN_HAS_HYDROGEN: ${LBANN_HAS_HYDROGEN}\n" + " LBANN_HAS_OPENCV: ${LBANN_HAS_OPENCV}\n" + " LBANN_HAS_CUDA: ${LBANN_HAS_CUDA}\n" + " LBANN_HAS_CUDNN: ${LBANN_HAS_CUDNN}\n" + " LBANN_HAS_NCCL2: ${LBANN_HAS_NCCL2}\n" + " LBANN_HAS_PROTOBUF: ${LBANN_HAS_PROTOBUF}\n" + " LBANN_HAS_CNPY: ${LBANN_HAS_CNPY}\n" + " LBANN_HAS_TBINF: ${LBANN_HAS_TBINF}\n" + " LBANN_HAS_VTUNE: ${LBANN_HAS_VTUNE}\n" + " LBANN_NVPROF: ${LBANN_NVPROF}\n" + " LBANN_HAS_DOXYGEN: ${LBANN_HAS_DOXYGEN}\n" + " LBANN_HAS_LBANN_PROTO:${LBANN_HAS_LBANN_PROTO}\n" + " LBANN_HAS_ALUMINUM: ${LBANN_HAS_ALUMINUM}\n" + " LBANN_HAS_CONDUIT: ${LBANN_HAS_CONDUIT}\n" + " LBANN_NO_OMP_FOR_DATA_READERS: ${LBANN_NO_OMP_FOR_DATA_READERS}\n") + +# Output to stdout +execute_process(COMMAND ${CMAKE_COMMAND} -E echo "${_str}") +set(_str) diff --git a/ReleaseNotes.txt b/ReleaseNotes.txt index 9efcd857bf5..19c0255cd9b 100644 --- a/ReleaseNotes.txt +++ b/ReleaseNotes.txt @@ -1,3 +1,59 @@ +============================== (Pending) Release Notes: v0.97 ============================== +Support for new training algorithms: + +Support for new network structures: + +Support for new layers: + +Performance optimizations: + +Model portability & usability: + +Internal features: + +I/O & data readers: + +Build system: + +============================== Release Notes: v0.96 ============================== +Support for new layers: + - Log softmax + - Basic math functions + - Weights layer, which outputs a weights tensor + - L2 norm squared + - Binary cross entropy loss and sigmoid binary cross entropy loss + - Boolean accuracy, Boolean false negative rate, Boolean false positive rate + - Bilinear resize + - Variance and covariance + - Dilated and 
grouped convolution (GPU only) + +Performance optimizations: + - Optimized GPU model-parallel softmax layer + +Model portability & usability: + - Option for weight initialization with user-provided list of values + - Callback to save any layer output as an image + +Internal features: + - Provide compile time option to selectively disable OpenMP for data fetching loop + - Thrust calls no longer involve the default CUDA stream + +I/O & data readers: + - Reworked jag_conduit data reader: + - Support the updated JAG simulation data output format + - Use direct HDF5 I/O for on-demand data loading with Conduit + - Ingest a unique set of data files per instance + - Allow exclusive data partitioning among multiple trainers + - Multi-channel images + - Normalization of JAG data + - Interface to select images of specific views and time indices + - Interface to describe how to slice JAG data + - Avoid redundant fetching and incoherent random number pulls in the group of local data readers + - Improved threading performance by preallocating scratch space for loading samples + +Build system: + - Support cross-compilation configurations in superbuild and SetupProtobuf + ============================== Release Notes: v0.95 ============================== Support for new training algorithms: - Generative Adversarial Networks (GAN) diff --git a/bamboo/unit_tests/.gitignore b/bamboo/unit_tests/.gitignore new file mode 100644 index 00000000000..16d3c4dbbfe --- /dev/null +++ b/bamboo/unit_tests/.gitignore @@ -0,0 +1 @@ +.cache diff --git a/bamboo/unit_tests/error/.gitignore b/bamboo/unit_tests/error/.gitignore new file mode 100644 index 00000000000..7c9d611b592 --- /dev/null +++ b/bamboo/unit_tests/error/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!README.md diff --git a/bamboo/unit_tests/output/.gitignore b/bamboo/unit_tests/output/.gitignore new file mode 100644 index 00000000000..7c9d611b592 --- /dev/null +++ b/bamboo/unit_tests/output/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!README.md diff --git a/bamboo/unit_tests/test_unit_conv_graph.py b/bamboo/unit_tests/test_unit_conv_graph.py deleted file mode 100644 index d6c4439aa6e..00000000000 --- a/bamboo/unit_tests/test_unit_conv_graph.py +++ /dev/null @@ -1,43 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os - -def skeleton_gradient_check_conv_graph(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) - output_file_name = '%s/bamboo/unit_tests/output/gradient_check_conv_graph_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/gradient_check_conv_graph_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=1, - dir_name=dir_name, data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', model_name='mnist_conv_graph', - optimizer_name='adam', - output_file_name=output_file_name, - error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 - -def test_unit_gradient_check_conv_graph_clang4(cluster, exes, dirname): - skeleton_gradient_check_conv_graph(cluster, exes, dirname, 'clang4') - -def test_unit_gradient_check_conv_graph_gcc4(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 35584 == 0 - 
skeleton_gradient_check_conv_graph(cluster, exes, dirname, 'gcc4') - -def test_unit_gradient_check_conv_graph_gcc7(cluster, exes, dirname): - skeleton_gradient_check_conv_graph(cluster, exes, dirname, 'gcc7') - -def test_unit_gradient_check_conv_graph_intel18(cluster, exes, dirname): - skeleton_gradient_check_conv_graph(cluster, exes, dirname, 'intel18') - -# Run with python -m pytest -s test_unit_conv_graph.py -k 'test_unit_gradient_check_conv_graph_exe' --exe= -def test_unit_gradient_check_conv_graph_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} - skeleton_gradient_check_conv_graph(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_covariance.py b/bamboo/unit_tests/test_unit_layer_covariance.py new file mode 100644 index 00000000000..41bdb9d985f --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_covariance.py @@ -0,0 +1,41 @@ +import sys +sys.path.insert(0, '../common_python') +import tools +import pytest +import os + +def skeleton_layer_covariance(cluster, executables, dir_name, compiler_name): + if compiler_name not in executables: + pytest.skip('default_exes[%s] does not exist' % compiler_name) + output_file_name = '%s/bamboo/unit_tests/output/layer_covariance_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/layer_covariance_%s_error.txt' % (dir_name, compiler_name) + command = tools.get_command( + cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + data_filedir_default='', data_reader_name='synthetic', + model_folder='tests/layer_tests', model_name='covariance', optimizer_name='sgd', + output_file_name=output_file_name, error_file_name=error_file_name) + return_code = os.system(command) + assert return_code == 0 + +def test_unit_layer_covariance_clang4(cluster, exes, dirname): + skeleton_layer_covariance(cluster, exes, dirname, 'clang4') + +def test_unit_layer_covariance_gcc4_check(cluster, exes, dirname): + if cluster in ['surface']: + pytest.skip('FIXME') + # Surface Errors: + # assert 34304 == 0 + skeleton_layer_covariance(cluster, exes, dirname, 'gcc4') + +def test_unit_layer_covariance_gcc7(cluster, exes, dirname): + skeleton_layer_covariance(cluster, exes, dirname, 'gcc7') + +def test_unit_layer_covariance_intel18(cluster, exes, dirname): + skeleton_layer_covariance(cluster, exes, dirname, 'intel18') + +# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_covariance_exe' --exe= +def test_unit_layer_covariance_exe(cluster, dirname, exe): + if exe == None: + pytest.skip('Non-local testing') + exes = {'exe' : exe} + skeleton_layer_covariance(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_l2_norm2.py b/bamboo/unit_tests/test_unit_layer_l2_norm2.py new file mode 100644 index 00000000000..29233e9ce18 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_l2_norm2.py @@ -0,0 +1,41 @@ +import sys +sys.path.insert(0, '../common_python') +import tools +import pytest +import os + +def skeleton_layer_l2_norm2(cluster, executables, dir_name, compiler_name): + if compiler_name not in executables: + pytest.skip('default_exes[%s] does not exist' % compiler_name) + output_file_name = '%s/bamboo/unit_tests/output/layer_l2_norm2_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/layer_l2_norm2_%s_error.txt' % (dir_name, compiler_name) + command = tools.get_command( + cluster=cluster, 
executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + data_filedir_default='', data_reader_name='synthetic', + model_folder='tests/layer_tests', model_name='l2_norm2', optimizer_name='sgd', + output_file_name=output_file_name, error_file_name=error_file_name) + return_code = os.system(command) + assert return_code == 0 + +def test_unit_layer_l2_norm2_clang4(cluster, exes, dirname): + skeleton_layer_l2_norm2(cluster, exes, dirname, 'clang4') + +def test_unit_layer_l2_norm2_gcc4_check(cluster, exes, dirname): + if cluster in ['surface']: + pytest.skip('FIXME') + # Surface Errors: + # assert 34304 == 0 + skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc4') + +def test_unit_layer_l2_norm2_gcc7(cluster, exes, dirname): + skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc7') + +def test_unit_layer_l2_norm2_intel18(cluster, exes, dirname): + skeleton_layer_l2_norm2(cluster, exes, dirname, 'intel18') + +# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l2_norm2_exe' --exe= +def test_unit_layer_l2_norm2_exe(cluster, dirname, exe): + if exe == None: + pytest.skip('Non-local testing') + exes = {'exe' : exe} + skeleton_layer_l2_norm2(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py new file mode 100644 index 00000000000..749cd34dc22 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -0,0 +1,41 @@ +import sys +sys.path.insert(0, '../common_python') +import tools +import pytest +import os + +def skeleton_layer_log_softmax(cluster, executables, dir_name, compiler_name): + if compiler_name not in executables: + pytest.skip('default_exes[%s] does not exist' % compiler_name) + output_file_name = '%s/bamboo/unit_tests/output/layer_log_softmax_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/layer_log_softmax_%s_error.txt' % (dir_name, compiler_name) + command = tools.get_command( + cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + data_filedir_default='', data_reader_name='synthetic', + model_folder='tests/layer_tests', model_name='log_softmax', optimizer_name='sgd', + output_file_name=output_file_name, error_file_name=error_file_name) + return_code = os.system(command) + assert return_code == 0 + +def test_unit_layer_log_softmax_clang4(cluster, exes, dirname): + skeleton_layer_log_softmax(cluster, exes, dirname, 'clang4') + +def test_unit_layer_log_softmax_gcc4_check(cluster, exes, dirname): + if cluster in ['surface']: + pytest.skip('FIXME') + # Surface Errors: + # assert 34304 == 0 + skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc4') + +def test_unit_layer_log_softmax_gcc7(cluster, exes, dirname): + skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc7') + +def test_unit_layer_log_softmax_intel18(cluster, exes, dirname): + skeleton_layer_log_softmax(cluster, exes, dirname, 'intel18') + +# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_log_softmax_exe' --exe= +def test_unit_layer_log_softmax_exe(cluster, dirname, exe): + if exe == None: + pytest.skip('Non-local testing') + exes = {'exe' : exe} + skeleton_layer_log_softmax(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py new file mode 100644 index 00000000000..dd1742a551c --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_softmax.py @@ -0,0 
+1,41 @@ +import sys +sys.path.insert(0, '../common_python') +import tools +import pytest +import os + +def skeleton_layer_softmax(cluster, executables, dir_name, compiler_name): + if compiler_name not in executables: + pytest.skip('default_exes[%s] does not exist' % compiler_name) + output_file_name = '%s/bamboo/unit_tests/output/layer_softmax_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/layer_softmax_%s_error.txt' % (dir_name, compiler_name) + command = tools.get_command( + cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + data_filedir_default='', data_reader_name='synthetic', + model_folder='tests/layer_tests', model_name='softmax', optimizer_name='sgd', + output_file_name=output_file_name, error_file_name=error_file_name) + return_code = os.system(command) + assert return_code == 0 + +def test_unit_layer_softmax_clang4(cluster, exes, dirname): + skeleton_layer_softmax(cluster, exes, dirname, 'clang4') + +def test_unit_layer_softmax_gcc4_check(cluster, exes, dirname): + if cluster in ['surface']: + pytest.skip('FIXME') + # Surface Errors: + # assert 34304 == 0 + skeleton_layer_softmax(cluster, exes, dirname, 'gcc4') + +def test_unit_layer_softmax_gcc7(cluster, exes, dirname): + skeleton_layer_softmax(cluster, exes, dirname, 'gcc7') + +def test_unit_layer_softmax_intel18(cluster, exes, dirname): + skeleton_layer_softmax(cluster, exes, dirname, 'intel18') + +# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_softmax_exe' --exe= +def test_unit_layer_softmax_exe(cluster, dirname, exe): + if exe == None: + pytest.skip('Non-local testing') + exes = {'exe' : exe} + skeleton_layer_softmax(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_layer_variance.py b/bamboo/unit_tests/test_unit_layer_variance.py new file mode 100644 index 00000000000..4b476aedf5b --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_variance.py @@ -0,0 +1,41 @@ +import sys +sys.path.insert(0, '../common_python') +import tools +import pytest +import os + +def skeleton_layer_variance(cluster, executables, dir_name, compiler_name): + if compiler_name not in executables: + pytest.skip('default_exes[%s] does not exist' % compiler_name) + output_file_name = '%s/bamboo/unit_tests/output/layer_variance_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/layer_variance_%s_error.txt' % (dir_name, compiler_name) + command = tools.get_command( + cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, + data_filedir_default='', data_reader_name='synthetic', + model_folder='tests/layer_tests', model_name='variance', optimizer_name='sgd', + output_file_name=output_file_name, error_file_name=error_file_name) + return_code = os.system(command) + assert return_code == 0 + +def test_unit_layer_variance_clang4(cluster, exes, dirname): + skeleton_layer_variance(cluster, exes, dirname, 'clang4') + +def test_unit_layer_variance_gcc4_check(cluster, exes, dirname): + if cluster in ['surface']: + pytest.skip('FIXME') + # Surface Errors: + # assert 34304 == 0 + skeleton_layer_variance(cluster, exes, dirname, 'gcc4') + +def test_unit_layer_variance_gcc7(cluster, exes, dirname): + skeleton_layer_variance(cluster, exes, dirname, 'gcc7') + +def test_unit_layer_variance_intel18(cluster, exes, dirname): + skeleton_layer_variance(cluster, exes, dirname, 'intel18') + +# Run with python -m pytest -s 
test_unit_ridge_regression.py -k 'test_unit_layer_variance_exe' --exe= +def test_unit_layer_variance_exe(cluster, dirname, exe): + if exe == None: + pytest.skip('Non-local testing') + exes = {'exe' : exe} + skeleton_layer_variance(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_mnist_conv_graph.py b/bamboo/unit_tests/test_unit_mnist_conv_graph.py new file mode 100644 index 00000000000..278bf62226a --- /dev/null +++ b/bamboo/unit_tests/test_unit_mnist_conv_graph.py @@ -0,0 +1,43 @@ +import sys +sys.path.insert(0, '../common_python') +import tools +import pytest +import os + +def skeleton_mnist_conv_graph(cluster, executables, dir_name, compiler_name): + if compiler_name not in executables: + pytest.skip('default_exes[%s] does not exist' % compiler_name) + output_file_name = '%s/bamboo/unit_tests/output/mnist_conv_graph_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/mnist_conv_graph_%s_error.txt' % (dir_name, compiler_name) + command = tools.get_command( + cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=1, + dir_name=dir_name, data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST', + data_reader_name='mnist', model_folder='tests', model_name='mnist_conv_graph', + optimizer_name='adam', + output_file_name=output_file_name, + error_file_name=error_file_name) + return_code = os.system(command) + assert return_code == 0 + +def test_unit_mnist_conv_graph_clang4(cluster, exes, dirname): + skeleton_mnist_conv_graph(cluster, exes, dirname, 'clang4') + +def test_unit_mnist_conv_graph_gcc4(cluster, exes, dirname): + if cluster in ['surface']: + pytest.skip('FIXME') + # Surface Errors: + # assert 35584 == 0 + skeleton_mnist_conv_graph(cluster, exes, dirname, 'gcc4') + +def test_unit_mnist_conv_graph_gcc7(cluster, exes, dirname): + skeleton_mnist_conv_graph(cluster, exes, dirname, 'gcc7') + +def test_unit_mnist_conv_graph_intel18(cluster, exes, dirname): + skeleton_mnist_conv_graph(cluster, exes, dirname, 'intel18') + +# Run with python -m pytest -s test_unit_conv_graph.py -k 'test_unit_mnist_conv_graph_exe' --exe= +def test_unit_mnist_conv_graph_exe(cluster, dirname, exe): + if exe == None: + pytest.skip('Non-local testing') + exes = {'exe' : exe} + skeleton_mnist_conv_graph(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py new file mode 100644 index 00000000000..14da3054905 --- /dev/null +++ b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py @@ -0,0 +1,41 @@ +import sys +sys.path.insert(0, '../common_python') +import tools +import pytest +import os + +def skeleton_mnist_ridge_regression(cluster, executables, dir_name, compiler_name): + if compiler_name not in executables: + pytest.skip('default_exes[%s] does not exist' % compiler_name) + output_file_name = '%s/bamboo/unit_tests/output/mnist_ridge_regression_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/mnist_ridge_regression_%s_error.txt' % (dir_name, compiler_name) + command = tools.get_command( + cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=1, dir_name=dir_name, + data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST', data_reader_name='mnist', + model_folder='tests', model_name='mnist_ridge_regression', optimizer_name='adam', + output_file_name=output_file_name, error_file_name=error_file_name) + return_code = os.system(command) + assert return_code 
== 0 + +def test_unit_mnist_ridge_regression_clang4(cluster, exes, dirname): + skeleton_mnist_ridge_regression(cluster, exes, dirname, 'clang4') + +def test_unit_mnist_ridge_regression_gcc4(cluster, exes, dirname): + if cluster in ['surface']: + pytest.skip('FIXME') + # Surface Errors: + # assert 34304 == 0 + skeleton_mnist_ridge_regression(cluster, exes, dirname, 'gcc4') + +def test_unit_mnist_ridge_regression_gcc7(cluster, exes, dirname): + skeleton_mnist_ridge_regression(cluster, exes, dirname, 'gcc7') + +def test_unit_mnist_ridge_regression_intel18(cluster, exes, dirname): + skeleton_mnist_ridge_regression(cluster, exes, dirname, 'intel18') + +# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_mnist_ridge_regression_exe' --exe= +def test_unit_mnist_ridge_regression_exe(cluster, dirname, exe): + if exe == None: + pytest.skip('Non-local testing') + exes = {'exe' : exe} + skeleton_mnist_ridge_regression(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py new file mode 100644 index 00000000000..af3e8f1a3d1 --- /dev/null +++ b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py @@ -0,0 +1,41 @@ +import sys +sys.path.insert(0, '../common_python') +import tools +import pytest +import os + +def skeleton_mnist_softmax_classifier(cluster, executables, dir_name, compiler_name): + if compiler_name not in executables: + pytest.skip('default_exes[%s] does not exist' % compiler_name) + output_file_name = '%s/bamboo/unit_tests/output/mnist_softmax_classifier_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/mnist_softmax_classifier_%s_error.txt' % (dir_name, compiler_name) + command = tools.get_command( + cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=1, dir_name=dir_name, + data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST', data_reader_name='mnist', + model_folder='tests', model_name='mnist_softmax_classifier', optimizer_name='adam', + output_file_name=output_file_name, error_file_name=error_file_name) + return_code = os.system(command) + assert return_code == 0 + +def test_unit_mnist_softmax_classifier_clang4(cluster, exes, dirname): + skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'clang4') + +def test_unit_mnist_softmax_classifier_gcc4(cluster, exes, dirname): + if cluster in ['surface']: + pytest.skip('FIXME') + # Surface Errors: + # assert 34304 == 0 + skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'gcc4') + +def test_unit_mnist_softmax_classifier_gcc7(cluster, exes, dirname): + skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'gcc7') + +def test_unit_mnist_softmax_classifier_intel18(cluster, exes, dirname): + skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'intel18') + +# Run with python -m pytest -s test_unit_softmax_classifier.py -k 'test_unit_mnist_softmax_classifier_exe' --exe= +def test_unit_mnist_softmax_classifier_exe(cluster, dirname, exe): + if exe == None: + pytest.skip('Non-local testing') + exes = {'exe' : exe} + skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_ridge_regression.py b/bamboo/unit_tests/test_unit_ridge_regression.py deleted file mode 100644 index 949c50730fc..00000000000 --- a/bamboo/unit_tests/test_unit_ridge_regression.py +++ /dev/null @@ -1,41 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os - -def 
skeleton_gradient_check(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - pytest.skip('default_exes[%s] does not exist' % compiler_name) - output_file_name = '%s/bamboo/unit_tests/output/gradient_check_ridge_regression_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/gradient_check_ridge_regression_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=1, dir_name=dir_name, - data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST', data_reader_name='mnist', - model_folder='tests', model_name='mnist_ridge_regression', optimizer_name='adam', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) - assert return_code == 0 - -def test_unit_gradient_check_clang4(cluster, exes, dirname): - skeleton_gradient_check(cluster, exes, dirname, 'clang4') - -def test_unit_gradient_check_gcc4(cluster, exes, dirname): - if cluster in ['surface']: - pytest.skip('FIXME') - # Surface Errors: - # assert 34304 == 0 - skeleton_gradient_check(cluster, exes, dirname, 'gcc4') - -def test_unit_gradient_check_gcc7(cluster, exes, dirname): - skeleton_gradient_check(cluster, exes, dirname, 'gcc7') - -def test_unit_gradient_check_intel18(cluster, exes, dirname): - skeleton_gradient_check(cluster, exes, dirname, 'intel18') - -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_gradient_check_exe' --exe= -def test_unit_gradient_check_exe(cluster, dirname, exe): - if exe == None: - pytest.skip('Non-local testing') - exes = {'exe' : exe} - skeleton_gradient_check(cluster, exes, dirname, 'exe') diff --git a/cmake/configure_files/lbann_config.hpp.in b/cmake/configure_files/lbann_config.hpp.in index c28aa82ea95..dfb7d54ca22 100644 --- a/cmake/configure_files/lbann_config.hpp.in +++ b/cmake/configure_files/lbann_config.hpp.in @@ -9,6 +9,7 @@ /* Version string for LBANN */ #define LBANN_VERSION @LBANN_VERSION@ +#cmakedefine LBANN_GIT_VERSION @LBANN_GIT_VERSION@ /* Defined if LBANN is in debug mode */ #cmakedefine LBANN_DEBUG @@ -25,6 +26,7 @@ #cmakedefine LBANN_HAS_ALUMINUM #cmakedefine LBANN_ALUMINUM_MPI_PASSTHROUGH #cmakedefine LBANN_HAS_CONDUIT +#cmakedefine LBANN_NO_OMP_FOR_DATA_READERS #cmakedefine LBANN_DETERMINISTIC diff --git a/cmake/modules/FindCONDUIT.cmake b/cmake/modules/FindCONDUIT.cmake index ebb0223d685..fc456244348 100644 --- a/cmake/modules/FindCONDUIT.cmake +++ b/cmake/modules/FindCONDUIT.cmake @@ -34,7 +34,7 @@ endif (NOT TARGET CONDUIT::CONDUIT) # Set the link libraries for the target set_property(TARGET CONDUIT::CONDUIT APPEND - PROPERTY INTERFACE_LINK_LIBRARIES conduit conduit_relay conduit_blueprint) + PROPERTY INTERFACE_LINK_LIBRARIES conduit conduit_relay conduit_blueprint conduit_relay_mpi) set_property(TARGET CONDUIT::CONDUIT APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${CONDUIT_INCLUDE_DIRS}") diff --git a/cmake/modules/FindHWLOC.cmake b/cmake/modules/FindHWLOC.cmake index aa3e1b73d24..d7131b0a5cc 100644 --- a/cmake/modules/FindHWLOC.cmake +++ b/cmake/modules/FindHWLOC.cmake @@ -12,7 +12,10 @@ if (MPI_FOUND) get_filename_component(_TMP_MPI_LIB_DIR "${lib}" DIRECTORY) list(APPEND _TMP_MPI_LIBRARY_DIRS ${_TMP_MPI_LIB_DIR}) endforeach () - list(REMOVE_DUPLICATES _TMP_MPI_LIBRARY_DIRS) + + if (_TMP_MPI_LIBRARY_DIRS) + list(REMOVE_DUPLICATES _TMP_MPI_LIBRARY_DIRS) + endif () endif (MPI_FOUND) # Find the library diff --git 
a/cmake/modules/SetupCUDAToolkit.cmake b/cmake/modules/SetupCUDAToolkit.cmake
index e897ef2b476..0940fea4491 100644
--- a/cmake/modules/SetupCUDAToolkit.cmake
+++ b/cmake/modules/SetupCUDAToolkit.cmake
@@ -1,7 +1,10 @@
 # This handles the non-compiler aspect of the CUDA toolkit.
 # NCCL and cuDNN are handled separately.
-# TODO CUB
+if (NOT CUDA_FOUND)
+  find_package(CUDA REQUIRED)
+endif ()
+
 find_package(CUB REQUIRED)
 find_package(NVTX REQUIRED)
 find_package(cuDNN REQUIRED)
@@ -11,7 +14,7 @@ if (NOT TARGET cuda::toolkit)
 endif ()
 
 set_property(TARGET cuda::toolkit APPEND PROPERTY
-  INTERFACE_LINK_LIBRARIES cuda::cub cuda::cudnn cuda::nvtx "${CUDA_CUBLAS_LIBRARIES}")
+  INTERFACE_LINK_LIBRARIES cuda::cub cuda::cudnn cuda::nvtx "${CUDA_CUBLAS_LIBRARIES}" "${CUDA_CUDA_LIBRARY}")
 
 set_property(TARGET cuda::toolkit APPEND PROPERTY
   INTERFACE_COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CUDA>:-arch=sm_30>)
diff --git a/cmake/modules/SetupElemental.cmake b/cmake/modules/SetupElemental.cmake
index 6e09406d2e0..d98c17f8dd5 100644
--- a/cmake/modules/SetupElemental.cmake
+++ b/cmake/modules/SetupElemental.cmake
@@ -8,39 +8,13 @@
 # Hydrogen, this file is no longer necessary as it's just one
 # find_package() line.
 
-find_package(Hydrogen NO_MODULE
+set(_MIN_H_VERSION 1.0.0)
+find_package(Hydrogen ${_MIN_H_VERSION} NO_MODULE
   HINTS ${Hydrogen_DIR} ${HYDROGEN_DIR}
   $ENV{Hydrogen_DIR} $ENV{HYDROGEN_DIR}
   PATH_SUFFIXES lib/cmake/hydrogen
-  NO_DEFUALT_PATH)
-find_package(Hydrogen NO_MODULE)
-
-if (Hydrogen_FOUND)
-  message(STATUS "Found Hydrogen: ${Hydrogen_DIR}")
-  set(LBANN_HAS_HYDROGEN TRUE)
-else ()
-  set(LBANN_HAS_HYDROGEN FALSE)
-
-  find_package(Elemental NO_MODULE
-    PATH_SUFFIXES lib/cmake/elemental)
-
-  if (Elemental_FOUND)
-    set(HYDROGEN_LIBRARIES "${Elemental_LIBRARIES}")
-    message(STATUS "Found Elemental: ${Elemental_DIR}")
-
-    if (TARGET El)
-      set_property(TARGET El PROPERTY
-        INTERFACE_INCLUDE_DIRECTORIES ${Elemental_INCLUDE_DIRS})
-    endif ()
-
-    set(LBANN_HAS_ELEMENTAL TRUE)
-  else ()
-    message(FATAL_ERROR "Neither Hydrogen nor Elemental was found! "
-      "Try setting Hydrogen_DIR or Elemental_DIR and try again!")
-
-    set(LBANN_HAS_ELEMENTAL FALSE)
-  endif (Elemental_FOUND)
-endif (Hydrogen_FOUND)
-
-if (NOT LBANN_HAS_HYDROGEN AND NOT LBANN_HAS_ELEMENTAL)
-  message(FATAL_ERROR "LBANN requires Hydrogen or Elemental.")
+  NO_DEFAULT_PATH)
+if (NOT Hydrogen_FOUND)
+  find_package(Hydrogen ${_MIN_H_VERSION} NO_MODULE REQUIRED)
 endif ()
+
+set(LBANN_HAS_HYDROGEN TRUE)
diff --git a/cmake/modules/SetupProtobuf.cmake b/cmake/modules/SetupProtobuf.cmake
index 5d09b95714c..f61b9ef744f 100644
--- a/cmake/modules/SetupProtobuf.cmake
+++ b/cmake/modules/SetupProtobuf.cmake
@@ -1,22 +1,50 @@
 set(PROTOBUF_MIN_VERSION "3.0.0")
 
-find_package(Protobuf REQUIRED CONFIG
-  HINTS "${PROTOBUF_DIR}/lib64/cmake/protobuf" "${PROTOBUF_DIR}/lib/cmake/protobuf")
+# On cross-compilation machines, we want to use the module because we
+# will use the host protoc and the target libprotobuf. In this case,
+# users should set Protobuf_PROTOC_EXECUTABLE=/path/to/host/bin/protoc
+# and set PROTOBUF_DIR=/path/to/target/protobuf/prefix.
+option(${PROJECT_NAME}_USE_PROTOBUF_MODULE
+  "Use the FindProtobuf module instead of Protobuf's config file."
OFF) -if(NOT PROTOBUF_FOUND) +if (${PROJECT_NAME}_USE_PROTOBUF_MODULE) if (PROTOBUF_DIR) + set(__remove_protobuf_from_paths TRUE) list(APPEND CMAKE_LIBRARY_PATH ${PROTOBUF_DIR}/lib) list(APPEND CMAKE_INCLUDE_PATH ${PROTOBUF_DIR}/include) list(APPEND CMAKE_PREFIX_PATH ${PROTOBUF_DIR}) endif () - find_package(Protobuf ${PROTOBUF_MIN_VERSION} REQUIRED) - if (PROTOBUF_DIR) + # At this point, throw an error if Protobuf is not found. + find_package(Protobuf "${PROTOBUF_MIN_VERSION}" MODULE) + + if (__remove_protobuf_from_paths) list(REMOVE_ITEM CMAKE_LIBRARY_PATH ${PROTOBUF_DIR}/lib) list(REMOVE_ITEM CMAKE_INCLUDE_PATH ${PROTOBUF_DIR}/include) list(REMOVE_ITEM CMAKE_PREFIX_PATH ${PROTOBUF_DIR}) + set(__remove_protobuf_from_paths) endif () + +else () + set(protobuf_MODULE_COMPATIBLE ON) + set(protobuf_BUILD_SHARED_LIBS ON) + + find_package(Protobuf "${PROTOBUF_MIN_VERSION}" CONFIG + HINTS + "${Protobuf_DIR}/lib64/cmake/protobuf" + "${PROTOBUF_DIR}/lib64/cmake/protobuf" + "${Protobuf_DIR}/lib/cmake/protobuf" + "${PROTOBUF_DIR}/lib/cmake/protobuf" + "$ENV{Protobuf_DIR}/lib64/cmake/protobuf" + "$ENV{PROTOBUF_DIR}/lib64/cmake/protobuf" + "$ENV{Protobuf_DIR}/lib/cmake/protobuf" + "$ENV{PROTOBUF_DIR}/lib/cmake/protobuf") endif () + +if(NOT PROTOBUF_FOUND AND NOT Protobuf_FOUND) + message(FATAL_ERROR "Protobuf not found.") +endif () + # Setup the imported target for old versions of CMake if (NOT TARGET protobuf::libprotobuf) add_library(protobuf::libprotobuf INTERFACE IMPORTED) @@ -27,4 +55,5 @@ if (NOT TARGET protobuf::libprotobuf) INTERFACE_INCLUDE_DIRECTORIES "${PROTOBUF_INCLUDE_DIRS}") endif () -set(LBANN_HAS_PROTOBUF ${PROTOBUF_FOUND}) +# This can just be "TRUE" since protobuf is REQUIRED above. +set(LBANN_HAS_PROTOBUF TRUE) diff --git a/experiments/run_lbann_experiment.sh b/experiments/run_lbann_experiment.sh index c8d8a5a38d8..cf0ace8d59f 100755 --- a/experiments/run_lbann_experiment.sh +++ b/experiments/run_lbann_experiment.sh @@ -156,8 +156,8 @@ if [ -n "${IMAGENET_CLASSES}" ]; then catalyst|flash|quartz|surface|pascal) case ${IMAGENET_CLASSES} in 10|100|300|1000) - IMAGENET_DIR=/p/lscratchf/brainusr/datasets/ILSVRC2012 - DATASET_TARBALLS="${IMAGENET_DIR}/resized_256x256/train.tar ${IMAGENET_DIR}/resized_256x256/val.tar ${IMAGENET_DIR}/labels.tar" + IMAGENET_DIR=/p/lscratchh/brainusr/datasets/ILSVRC2012 + DATASET_TARBALLS="${IMAGENET_DIR}/original/train.tar ${IMAGENET_DIR}/original/val.tar ${IMAGENET_DIR}/labels.tar" IMAGENET_SUFFIX=_c0-$((${IMAGENET_CLASSES}-1)) if [ "${IMAGENET_CLASSES}" -eq "1000" ]; then IMAGENET_SUFFIX= @@ -170,9 +170,9 @@ if [ -n "${IMAGENET_CLASSES}" ]; then TEST_DATASET_LABELS=${CACHE_DIR}/labels/val${IMAGENET_SUFFIX}.txt ;; *) - TRAIN_DATASET_DIR=${IMAGENET_DIR}/resized_256x256/train/ + TRAIN_DATASET_DIR=${IMAGENET_DIR}/original/train/ TRAIN_DATASET_LABELS=${IMAGENET_DIR}/labels/train${IMAGENET_SUFFIX}.txt - TEST_DATASET_DIR=${IMAGENET_DIR}/resized_256x256/val/ + TEST_DATASET_DIR=${IMAGENET_DIR}/original/val/ TEST_DATASET_LABELS=${IMAGENET_DIR}/labels/val${IMAGENET_SUFFIX}.txt ;; esac @@ -190,7 +190,7 @@ if [ -n "${IMAGENET_CLASSES}" ]; then ;; ray) IMAGENET_DIR=/p/gscratchr/brainusr/datasets/ILSVRC2012 - DATASET_TARBALLS="${IMAGENET_DIR}/resized_256x256/train.tar ${IMAGENET_DIR}/resized_256x256/val.tar ${IMAGENET_DIR}/labels.tar" + DATASET_TARBALLS="${IMAGENET_DIR}/original/train.tar ${IMAGENET_DIR}/original/val.tar ${IMAGENET_DIR}/labels.tar" IMAGENET_SUFFIX=_c0-$((${IMAGENET_CLASSES}-1)) if [ "${IMAGENET_CLASSES}" -eq "1000" ]; then IMAGENET_SUFFIX= @@ -203,9 +203,9 
@@ if [ -n "${IMAGENET_CLASSES}" ]; then
                 TEST_DATASET_LABELS=${CACHE_DIR}/labels/val${IMAGENET_SUFFIX}.txt
                 ;;
             *)
-                TRAIN_DATASET_DIR=${IMAGENET_DIR}/resized_256x256/train/
+                TRAIN_DATASET_DIR=${IMAGENET_DIR}/original/train/
                 TRAIN_DATASET_LABELS=${IMAGENET_DIR}/labels/train${IMAGENET_SUFFIX}.txt
-                TEST_DATASET_DIR=${IMAGENET_DIR}/resized_256x256/val/
+                TEST_DATASET_DIR=${IMAGENET_DIR}/original/val/
                 TEST_DATASET_LABELS=${IMAGENET_DIR}/labels/val${IMAGENET_SUFFIX}.txt
                 ;;
         esac
diff --git a/include/lbann/callbacks/callback_save_images.hpp b/include/lbann/callbacks/callback_save_images.hpp
index df1c012c254..443f39ad49a 100644
--- a/include/lbann/callbacks/callback_save_images.hpp
+++ b/include/lbann/callbacks/callback_save_images.hpp
@@ -22,36 +22,32 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 // implied. See the License for the specific language governing
 // permissions and limitations under the license.
-//
-// lbann_callback_save_images .hpp .cpp - Callbacks to save images, currently used in autoencoder
 ////////////////////////////////////////////////////////////////////////////////
 
 #ifndef LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED
 #define LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED
 
-#include <utility>
-
+#include <string>
+#include <vector>
 #include "lbann/callbacks/callback.hpp"
-#include "lbann/data_readers/data_reader.hpp"
 
 namespace lbann {
 
-/**
- * Save images to file
+/** Save layer outputs as image files.
+ *  Image files are in the form
+ *  "<image prefix><tag>-<layer name>.<image format>".
  */
 class lbann_callback_save_images : public lbann_callback {
- public:
-  /**
-   * @param data reader type e.g., imagenet, mnist, cifar10....
-   * @param image_dir directory to save image
-   * @param layer_names list of layers from which to save images
-   * @param image extension e.g., jpg, png, pgm......
+public:
+
+  /** Constructor.
+   *  @param layer_names  List of layer names to save as images.
+   *  @param image_format Image file format (e.g. jpg, png, pgm).
+   *  @param image_prefix Prefix for image file names.
    */
-  lbann_callback_save_images(generic_data_reader *reader, std::string image_dir,
-                             std::vector<std::string> layer_names,
-                             std::string extension="jpg") :
-    lbann_callback(), m_image_dir(std::move(image_dir)), m_extension(std::move(extension)),
-    m_reader(reader), m_layer_names(layer_names) {}
+  lbann_callback_save_images(std::vector<std::string> layer_names,
+                             std::string image_format = "jpg",
+                             std::string image_prefix = "");
   lbann_callback_save_images(const lbann_callback_save_images&) = default;
   lbann_callback_save_images& operator=(
     const lbann_callback_save_images&) = default;
@@ -62,15 +58,20 @@ class lbann_callback_save_images : public lbann_callback {
   void on_phase_end(model *m) override;
   void on_test_end(model *m) override;
   std::string name() const override { return "save images"; }
- private:
-  std::string m_image_dir; //directory to save images
-  std::string m_extension; //image extension; pgm, jpg, png etc
-  generic_data_reader *m_reader;
-  /** List of layers at which to save images*/
+
+private:
+
+  /** List of layer names to save as images. */
   std::vector<std::string> m_layer_names;
-  void save_image(model& m, std::string tag);
+  /** Image file format.
+   *  Valid options: jpg, png, pgm.
+   */
+  std::string m_image_format;
+  /** Prefix for saved image files.
+   */
+  std::string m_image_prefix;
+
 };
 
-}  // namespace lbann
+} // namespace lbann
 
 #endif // LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED
diff --git a/include/lbann/callbacks/callback_timer.hpp b/include/lbann/callbacks/callback_timer.hpp
index 8b0ebf82a56..2dec66c762c 100644
--- a/include/lbann/callbacks/callback_timer.hpp
+++ b/include/lbann/callbacks/callback_timer.hpp
@@ -22,16 +22,15 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 // implied. See the License for the specific language governing
 // permissions and limitations under the license.
-//
-// lbann_callback_timer .hpp .cpp - Callback hooks to time training
 ////////////////////////////////////////////////////////////////////////////////
 
 #ifndef LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED
 #define LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED
 
+#include "lbann/callbacks/callback.hpp"
 #include <chrono>
+#include <map>
 #include <vector>
-#include "lbann/callbacks/callback.hpp"
 
 namespace lbann {
 
@@ -41,64 +40,64 @@ namespace lbann {
  * master process in each model.
 */
 class lbann_callback_timer : public lbann_callback {
- public:
+public:
 
-  /** Constructor. */
-  lbann_callback_timer(lbann_summary *summarizer = nullptr) :
-    lbann_callback(1, summarizer) {}
-  /** Copy constructor. */
+  lbann_callback_timer(lbann_summary *summarizer = nullptr)
+    : lbann_callback(1, summarizer) {}
   lbann_callback_timer(const lbann_callback_timer&) = default;
-  /** Copy assignment operator. */
   lbann_callback_timer& operator=(const lbann_callback_timer&) = default;
-  /** Copy function. */
   lbann_callback_timer* copy() const override {
     return new lbann_callback_timer(*this);
   }
 
   /** Start timing for a training epoch. */
-  void on_epoch_begin(model *m) override { timing_begin(m); }
+  void on_epoch_begin(model *m) override { timing_begin(*m); }
   /** Report timing for a training epoch. */
-  void on_epoch_end(model *m) override { timing_end(m); }
+  void on_epoch_end(model *m) override { timing_end(*m); }
   /** Start timing for validation. */
-  void on_validation_begin(model *m) override { timing_begin(m); }
+  void on_validation_begin(model *m) override { timing_begin(*m); }
   /** Report timing for validation. */
-  void on_validation_end(model *m) override { timing_end(m); }
+  void on_validation_end(model *m) override { timing_end(*m); }
   /** Start timing for testing. */
-  void on_test_begin(model *m) override { timing_begin(m); }
+  void on_test_begin(model *m) override { timing_begin(*m); }
   /** Report timing for testing. */
-  void on_test_end(model *m) override { timing_end(m); }
+  void on_test_end(model *m) override { timing_end(*m); }
   /** Record training mini-batch start time. */
-  void on_batch_begin(model *m) override { batch_timing_begin(m); }
+  void on_batch_begin(model *m) override { batch_timing_begin(*m); }
   /** Record training mini-batch run time. */
-  void on_batch_end(model *m) override { batch_timing_end(m); }
+  void on_batch_end(model *m) override { batch_timing_end(*m); }
   /** Record evaluation mini-batch start time. */
-  void on_batch_evaluate_begin(model *m) override { batch_timing_begin(m); }
+  void on_batch_evaluate_begin(model *m) override { batch_timing_begin(*m); }
   /** Record evaluation mini-batch run time. */
-  void on_batch_evaluate_end(model *m) override { batch_timing_end(m); }
 
+  void on_batch_evaluate_end(model *m) override { batch_timing_end(*m); }
 
   /** Callback name. */
   std::string name() const override { return "timer"; }
 
- private:
+private:
 
-  /** Start time for the current timing.
 */
-  EvalType m_start_time;
-  /** Start time for the current mini-batch. */
-  EvalType m_batch_start_time;
-  /** History of mini-batch times for the current timing. */
-  std::vector<EvalType> m_batch_times;
+  /** Timing session start times. */
+  std::map<execution_mode, EvalType> m_start_times;
+  /** Mini-batch timing session start times. */
+  std::map<execution_mode, EvalType> m_batch_start_times;
+  /** Mini-batch times. */
+  std::map<execution_mode, std::vector<EvalType>> m_batch_times;
 
-  /** Start timing. */
-  void timing_begin(model *m);
-  /** Print timing results to standard output. */
-  void timing_end(model *m);
-  /** Start mini-batch timing. */
-  void batch_timing_begin(model *m);
-  /** Record mini-batch timing. */
-  void batch_timing_end(model *m);
+  /** Start timing session. */
+  void timing_begin(const model& m);
+  /** End timing session.
+   *  Prints results to standard output.
+   */
+  void timing_end(model& m);
+  /** Start mini-batch timing session. */
+  void batch_timing_begin(const model& m);
+  /** End mini-batch timing session.
+   *  Prints results to standard output.
+   */
+  void batch_timing_end(const model& m);
 
 };
 
-}  // namespace lbann
+} // namespace lbann
 
 #endif // LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED
diff --git a/include/lbann/comm.hpp b/include/lbann/comm.hpp
index a196be42037..5c73a097da1 100644
--- a/include/lbann/comm.hpp
+++ b/include/lbann/comm.hpp
@@ -84,6 +84,41 @@ struct request {
 
 } // namespace Al
 
+/* Notes on Synchronization
+ *
+ * The updated interface exposes a synchronization handle/device
+ * tagging mechanism used by Hydrogen: El::SyncInfo<D>, where D is an
+ * El::Device. When operating on Matrix objects, this should be
+ * handled automagically, assuming the Matrix is setup properly. Users
+ * must be aware of this when making MPI calls through Hydrogen or
+ * through lbann_comm with raw data buffers (T[]).
+ *
+ * When dealing with El::Matrix objects, users should be aware of the
+ * following. There is no synchronization for CPU objects
+ * (El::SyncInfo<El::Device::CPU> is an empty struct), but GPU Matrix
+ * objects now have an associated stream and event. These are
+ * GPUManager::Stream() and GPUManager::Event() by default, resp., but
+ * can be overridden by a user. Note: the Matrix never owns these; it
+ * will not free these resources at destruction. There are many
+ * methods in which multiple El::Matrix objects might interact. This
+ * should work properly; otherwise, report bugs to benson31.
+ *
+ * When dealing with raw data (T[]), users should be aware of the
+ * following. In the near future, all El::mpi functions will have an
+ * El::SyncInfo object as their last parameter, and it will be a
+ * required parameter. In lbann_comm, this means that when the call
+ * trickles down to an El::mpi function, an appropriate El::SyncInfo
+ * must be available. Since many of LBANN's uses of this interface are
+ * for communicating CPU buffers, there is a "shortcut" API that assumes
+ * the data is CPU memory, thus providing the default
+ * El::SyncInfo<El::Device::CPU> object to El::mpi. If a user wishes
+ * to communicate GPU data, they must use the "full" API, which adds a
+ * final El::SyncInfo<El::Device::GPU> parameter to the function. This
+ * ensures the appropriate synchronization semantics, especially when
+ * working with Aluminum as the communication frontend.
+ */
+
+
 /**
  * Manage communication.
  * This supports separate models, each of which are split over potentially
@@ -100,7 +135,7 @@ class lbann_comm {
    * defaulting to every process in one model.
   */
   lbann_comm(int procs_per_model = 0,
-             const El::mpi::Comm world = El::mpi::COMM_WORLD);
+             El::mpi::Comm world = El::mpi::COMM_WORLD);
   /** Don't allow copying; it doesn't make sense for the communicator. */
   lbann_comm(const lbann_comm&) = delete;
   /** Don't allow assignment; it doesn't make sense for the communicator. */
@@ -154,6 +189,10 @@
   inline Grid& get_model_grid() {
     return *grid;
   }
+  /** Return a read-only grid to use for this model. */
+  inline const Grid& get_model_grid() const {
+    return *grid;
+  }
   /** Return the total number of models. */
   inline int get_num_models() const {
     return num_models;
@@ -202,12 +241,12 @@
 
   /// Broadcast a scalar value over an arbitrary communicator
   template < typename T, bool S = is_instantiated_El_mpi_type<T>::value >
-  void broadcast(int root, T& val, const El::mpi::Comm c);
+  void broadcast(int root, T& val, El::mpi::Comm c);
 
   template <typename T>
-  void broadcast_custom(int root, T& val, const El::mpi::Comm c) const;
+  void broadcast_custom(int root, T& val, El::mpi::Comm c) const;
   template <typename T>
-  void broadcast_native(int root, T& val, const El::mpi::Comm c) const;
+  void broadcast_native(int root, T& val, El::mpi::Comm c) const;
 
   /// World broadcast of a scalar.
   template <typename T>
@@ -229,33 +268,59 @@
    * Broadcast a buffer over an arbitrary communicator assuming that
    * the buffer space is already allocated.
    */
-  template < typename T, bool S = is_instantiated_El_mpi_type<T>::value >
-  void broadcast(const int root, T* data, const int count, const El::mpi::Comm c);
+
+  // Default to cpu memory
+  template <typename T>
+  void broadcast(const int root, T* data, const int count, El::mpi::Comm c) {
+    broadcast(root, data, count, std::move(c), El::SyncInfo<El::Device::CPU>{});
+  }
+
+  template < typename T, El::Device D, bool S = is_instantiated_El_mpi_type<T>::value >
+  void broadcast(const int root, T* data, const int count, El::mpi::Comm c,
+                 El::SyncInfo<D> const& syncInfo);
 
   /// World broadcast of a buffer.
   template <typename T>
   void world_broadcast(const int root, T* data, const int count) {
-    broadcast(root, data, count, get_world_comm());
+    world_broadcast(root, data, count, El::SyncInfo<El::Device::CPU>{});
+  }
+
+  template <typename T, El::Device D>
+  void world_broadcast(const int root, T* data, const int count,
+                       El::SyncInfo<D> const& syncInfo) {
+    broadcast(root, data, count, get_world_comm(), syncInfo);
   }
   /// Inter-model broadcast of a buffer.
   template <typename T>
   void intermodel_broadcast(const int root, T* data, const int count) {
-    broadcast(root, data, count, get_intermodel_comm());
+    intermodel_broadcast(root, data, count, El::SyncInfo<El::Device::CPU>{});
+  }
+  template <typename T, El::Device D>
+  void intermodel_broadcast(const int root, T* data, const int count,
+                            El::SyncInfo<D> const& syncInfo) {
+    broadcast(root, data, count, get_intermodel_comm(), syncInfo);
   }
   /// Within-model broadcast of a buffer.
   template <typename T>
   void model_broadcast(const int root, T* data, const int count) {
-    broadcast(root, data, count, get_model_comm());
+    model_broadcast(root, data, count, El::SyncInfo<El::Device::CPU>{});
+  }
+
+  template <typename T, El::Device D>
+  void model_broadcast(const int root, T* data, const int count,
+                       El::SyncInfo<D> const& syncInfo) {
+    broadcast(root, data, count, get_model_comm(), syncInfo);
   }
 
   /**
   * Resize vector<> over an arbitrary communicator to match the one on root.
   */
   template <typename T>
-  size_t resize(const int root, std::vector<T> &data, const El::mpi::Comm c) {
+  size_t resize(const int root, std::vector<T> &data, El::mpi::Comm c) {
+    auto const rank_c = El::mpi::Rank(c);
     size_t count = data.size();
-    El::mpi::Broadcast(&count, 1, root, c);
-    count_bytes_broadcast(sizeof(size_t), El::mpi::Rank(c), root);
+    El::mpi::Broadcast(&count, 1, root, std::move(c), El::SyncInfo<El::Device::CPU>{});
+    count_bytes_broadcast(sizeof(size_t), rank_c, root);
     data.resize(count);
     return count;
   }
@@ -265,12 +330,12 @@ class lbann_comm {
   * vector<> for non-root processes will be resized as needed.
   */
   template <typename T>
-  void broadcast(const int root, std::vector<T> &data, const El::mpi::Comm c) {
+  void broadcast(const int root, std::vector<T> &data, El::mpi::Comm c) {
     const int count = static_cast<int>(resize(root, data, c));
     if (count <= 0) {
       return;
     }
-    broadcast(root, data.data(), count, c);
+    broadcast(root, data.data(), count, std::move(c), El::SyncInfo<El::Device::CPU>{});
   }
   /// Broadcast vector<> to world.
   template <typename T>
@@ -306,10 +371,16 @@
   /** Allgather over an arbitrary communicator */
   template <typename T>
   void all_gather(const T* src, int src_count, T* rcv, int rcv_count, El::mpi::Comm c) {
-    El::mpi::AllGather(src, src_count, rcv, rcv_count, c);
+    all_gather(src, src_count, rcv, rcv_count, std::move(c),
+               El::SyncInfo<El::Device::CPU>{});
+  }
+  template <typename T, El::Device D>
+  void all_gather(const T* src, int src_count, T* rcv, int rcv_count, El::mpi::Comm c,
+                  El::SyncInfo<D> const& syncInfo) {
+    El::mpi::AllGather(src, src_count, rcv, rcv_count, std::move(c), syncInfo);
   }
 
-  /**
+  /**
   * Allgatherv over an arbitrary communicator;
   * all vectors must be correctly sized prior to entry.
   */
   template <typename T>
   void all_gather(std::vector<T> &src, std::vector<T> &rcs, std::vector<int> &rcv_counts, std::vector<int> &rcv_disp, El::mpi::Comm c) {
     if (src.size() != rcv_counts.size()) {
       std::stringstream err;
       err << __FILE__ << " " << __LINE__ << " :: "
           << "this doesn't work!";
       lbann_comm_abort(err.str());
     }
-    El::mpi::AllGather(src.data(), src.size(), rcs.data(), rcv_counts.data(), rcv_disp.data(), c);
+    El::mpi::AllGather(src.data(), src.size(), rcs.data(), rcv_counts.data(), rcv_disp.data(), std::move(c), El::SyncInfo<El::Device::CPU>{});
   }
-  /**
+  /**
   * Allgatherv over a model communicator;
   * all vectors must be correctly sized prior to entry.
   */
   template <typename T>
-  void model_all_gather(std::vector<T> &src, std::vector<T> &rcs, std::vector<int> &rcv_counts, std::vector<int> &rcv_disp, const El::mpi::Comm c) {
+  void model_all_gather(std::vector<T> &src, std::vector<T> &rcs, std::vector<int> &rcv_counts, std::vector<int> &rcv_disp) {
     all_gather(src, rcs, rcv_counts, rcv_disp, get_model_comm());
   }
-  /**
+  /**
   * Allgather for a single element over an arbitrary communicator;
   * std::vector<T> &data must be correctly sized prior to entry.
   */
   template <typename T>
-  void all_gather(T &src, std::vector<T> &data, const El::mpi::Comm c) {
-    El::mpi::AllGather(&src, 1, data.data(), 1, c);
+  void all_gather(T &src, std::vector<T> &data, El::mpi::Comm c) {
+    El::mpi::AllGather(&src, 1, data.data(), 1, std::move(c),
+                       El::SyncInfo<El::Device::CPU>{});
   }
-  /**
+  /**
   * Allgather for a single element over the model communicator;
   * std::vector<T> &data must be correctly sized prior to entry.
   */
@@ -357,7 +429,7 @@
   /** Within-model scalar gather (for root processes). */
   template <typename T>
   void model_gather(T snd, T* rcv) {
-    gather(snd, get_model_master(), model_comm);
+    gather(snd, rcv, model_comm);
   }
   /** Within-model scalar-array gather (for non-root processes).
   */
   template <typename T>
@@ -373,7 +445,7 @@ template
   void model_gatherv(T* snd, int count, int root) {
     bytes_sent += sizeof(T) * count;
-    El::mpi::Gather(snd, count, (T *) NULL, (int *) nullptr, (int *) nullptr, root,
+    El::mpi::Gather(snd, count, nullptr, nullptr, nullptr, root,
                     model_comm);
   }
   template <typename T>
@@ -407,47 +479,69 @@ template
   }
   /** Scalar gather (for non-root processes). */
   template <typename T>
-  void gather(T snd, int root, const El::mpi::Comm c) {
+  void gather(T snd, int root, El::mpi::Comm c) {
     bytes_sent += sizeof(T);
-    El::mpi::Gather(&snd, 1, (T*) nullptr, 0, root, c);
+    El::mpi::Gather(&snd, 1, (T*) nullptr, 0, root, std::move(c),
+                    El::SyncInfo<El::Device::CPU>{});
   }
   /** Scalar gather (for root processes). */
   template <typename T>
-  void gather(T snd, T *rcv, const El::mpi::Comm c) {
-    El::mpi::Gather(&snd, 1, rcv, 1, El::mpi::Rank(c), c);
-    bytes_received += sizeof(T) * (El::mpi::Size(c) - 1);
+  void gather(T snd, T *rcv, El::mpi::Comm c) {
+    auto const size_c = El::mpi::Size(c);
+    auto const rank_c = El::mpi::Rank(c);
+    El::mpi::Gather(&snd, 1, rcv, 1, rank_c, std::move(c),
+                    El::SyncInfo<El::Device::CPU>{});
+    bytes_received += sizeof(T) * (size_c - 1);
   }
   /** Scalar gather (for root processes). */
   template <typename T>
-  void gather(T snd, std::vector<T>& rcv, const El::mpi::Comm c) {
-    gather(snd, rcv.data(), c);
+  void gather(T snd, std::vector<T>& rcv, El::mpi::Comm c) {
+    gather(snd, rcv.data(), std::move(c));
   }
   /** Scalar-array gather (for non-root processes). */
   template <typename T>
-  void gather(T *snd, int count, int root, const El::mpi::Comm c) {
+  void gather(T *snd, int count, int root, El::mpi::Comm c)
+  {
+    gather(snd, count, root, std::move(c),
+           El::SyncInfo<El::Device::CPU>{});
+  }
+  template <typename T, El::Device D>
+  void gather(T *snd, int count, int root, El::mpi::Comm c,
+              El::SyncInfo<D> const& syncInfo) {
     bytes_sent += sizeof(T) * count;
-    El::mpi::Gather(snd, count, (T*) nullptr, 0, root, c);
+    El::mpi::Gather(snd, count, (T*) nullptr, 0, root, std::move(c),
+                    syncInfo);
   }
   /** Scalar-array gather (for root processes). */
   template <typename T>
-  void gather(T *snd, int count, T *rcv, const El::mpi::Comm c) {
-    El::mpi::Gather(snd, count, rcv, count, El::mpi::Rank(c), c);
-    bytes_received += sizeof(T) * count * (El::mpi::Size(c) - 1);
+  void gather(T *snd, int count, T *rcv, El::mpi::Comm c) {
+    gather(snd, count, rcv, std::move(c), El::SyncInfo<El::Device::CPU>{});
+  }
+  template <typename T, El::Device D>
+  void gather(T *snd, int count, T *rcv, El::mpi::Comm c,
+              El::SyncInfo<D> const& syncInfo) {
+    auto const size_c = El::mpi::Size(c);
+    auto const rank_c = El::mpi::Rank(c);
+    El::mpi::Gather(snd, count, rcv, count, rank_c, std::move(c), syncInfo);
+    bytes_received += sizeof(T) * count * (size_c - 1);
   }
   /** Scalar scatter (for non-root processes). */
   template <typename T>
-  T scatter(int root, const El::mpi::Comm c) {
+  T scatter(int root, El::mpi::Comm c) {
     T val = {};
-    El::mpi::Scatter((T*) nullptr, 1, &val, 1, root, c);
+    El::mpi::Scatter((T*) nullptr, 1, &val, 1, root, std::move(c),
+                     El::SyncInfo<El::Device::CPU>{});
     bytes_received += sizeof(T);
     return val;
   }
   /** Scalar scatter (for root processes). */
   template <typename T>
-  T scatter(T *snd, const El::mpi::Comm c) {
+  T scatter(T *snd, El::mpi::Comm c) {
     bytes_sent += sizeof(T) * (El::mpi::Size(c) - 1);
     T val = {};
-    El::mpi::Scatter(snd, 1, &val, 1, El::mpi::Rank(c), c);
+    auto root = El::mpi::Rank(c);
+    El::mpi::Scatter(snd, 1, &val, 1, root, std::move(c),
+                     El::SyncInfo<El::Device::CPU>{});
     return val;
   }
   /** Inter-model reduce (for non-root processes). */
   template <typename T>
@@ -482,30 +576,65 @@ template
   }
   /** Scalar reduce (for non-root processes).
*/ template - void reduce(T snd, int root, const El::mpi::Comm c, El::mpi::Op op = El::mpi::SUM) { + void reduce(T snd, int root, El::mpi::Comm c, El::mpi::Op op = El::mpi::SUM) { bytes_sent += sizeof(T); - El::mpi::Reduce(&snd, (T*) NULL, 1, op, root, c); + El::mpi::Reduce(&snd, (T*) NULL, 1, op, root, std::move(c), + El::SyncInfo{}); } /** Scalar reduce (for root processes). */ template - T reduce(T snd, const El::mpi::Comm c, El::mpi::Op op = El::mpi::SUM) { + T reduce(T snd, El::mpi::Comm c, El::mpi::Op op = El::mpi::SUM) { T val = {}; - El::mpi::Reduce(&snd, &val, 1, op, El::mpi::Rank(c), c); - bytes_received += sizeof(T) * (El::mpi::Size(c) - 1); + auto const size_c = El::mpi::Size(c); + auto const rank_c = El::mpi::Rank(c); + El::mpi::Reduce(&snd, &val, 1, op, rank_c, std::move(c), + El::SyncInfo{}); + bytes_received += sizeof(T) * (size_c - 1); return val; } + /** Scalar-array reduce (for non-root processes). */ + // Op is "SUM" template - void reduce(T *snd, int count, int root, const El::mpi::Comm c, El::mpi::Op op = El::mpi::SUM) { + void reduce(T *snd, int count, int root, El::mpi::Comm c) { + reduce(snd, count, root, std::move(c), El::mpi::SUM, + El::SyncInfo{}); + } + template + void reduce(T *snd, int count, int root, El::mpi::Comm c, El::SyncInfo const& syncInfo) { + reduce(snd, count, root, std::move(c), El::mpi::SUM, syncInfo); + } + + template + void reduce(T *snd, int count, int root, El::mpi::Comm c, El::mpi::Op op) { + reduce(snd, count, root, std::move(c), op, El::SyncInfo{}); + } + template + void reduce(T *snd, int count, int root, El::mpi::Comm c, El::mpi::Op op, El::SyncInfo const& syncInfo) { bytes_sent += sizeof(T) * count; - El::mpi::Reduce(snd, (T*) NULL, count, op, root, c); + El::mpi::Reduce(snd, (T*) NULL, count, op, root, std::move(c), syncInfo); } /** Scalar-array reduce (for root processes). */ + template + void reduce(T *snd, int count, T *rcv, El::mpi::Comm c, El::SyncInfo const& syncInfo) { + reduce(snd, count, rcv, std::move(c), El::mpi::SUM, syncInfo); + } template - void reduce(T *snd, int count, T *rcv, const El::mpi::Comm c, El::mpi::Op op = El::mpi::SUM) { - if (snd == rcv) { snd = (T*) MPI_IN_PLACE; } - El::mpi::Reduce(snd, rcv, count, op, El::mpi::Rank(c), c); - bytes_received += sizeof(T) * count * (El::mpi::Size(c) - 1); + void reduce(T *snd, int count, T *rcv, El::mpi::Comm c) { + reduce(snd, count, rcv, std::move(c), El::mpi::SUM, El::SyncInfo{}); + } + + template + void reduce(T *snd, int count, T *rcv, El::mpi::Comm c, El::mpi::Op op) { + reduce(snd, count, rcv, std::move(c), op, El::SyncInfo{}); + } + template + void reduce(T *snd, int count, T *rcv, El::mpi::Comm c, El::mpi::Op op, El::SyncInfo const& syncInfo) { + if (snd == rcv) { snd = (T*)MPI_IN_PLACE; } + auto const rank_c = El::mpi::Rank(c); + auto const size_c = El::mpi::Size(c); + El::mpi::Reduce(snd, rcv, count, op, rank_c, std::move(c), syncInfo); + bytes_received += sizeof(T) * count * (size_c - 1); } /** Inter-model all-reduce. */ template @@ -524,15 +653,20 @@ class lbann_comm { } /** Scalar allreduce. 
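Usage sketch (illustrative): every rank receives the reduced value; the op defaults to SUM and can be overridden.

    int local_count = 1;  // per-rank value (illustrative)
    int global_sum = comm->allreduce(local_count, comm->get_model_comm());
    int global_max = comm->allreduce(local_count, comm->get_model_comm(), El::mpi::MAX);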
*/ template - T allreduce(T snd, const El::mpi::Comm c, El::mpi::Op op = El::mpi::SUM) { + T allreduce(T snd, El::mpi::Comm c, El::mpi::Op op = El::mpi::SUM) { + auto const size_c = El::mpi::Size(c); bytes_sent += sizeof(T); - allreduce(&snd, 1, c, op); - bytes_received += sizeof(T) * (El::mpi::Size(c) - 1); + allreduce(&snd, 1, std::move(c), op); + bytes_received += sizeof(T) * (size_c - 1); return snd; } + + // FIXME (trb): Based on the backend choice of "MPIBackend", I'm + // assuming this is intended as a CPU-only call. /** Scalar-array allreduce. */ template - void allreduce(T *snd, int count, T *rcv, const El::mpi::Comm c, El::mpi::Op op = El::mpi::SUM) { + void allreduce(T *snd, int count, T *rcv, El::mpi::Comm c, El::mpi::Op op = El::mpi::SUM) { + auto const size_c = El::mpi::Size(c); bytes_sent += count * sizeof(T); #ifdef LBANN_HAS_ALUMINUM #ifdef LBANN_ALUMINUM_MPI_PASSTHROUGH @@ -541,15 +675,17 @@ class lbann_comm { ::Al::AllreduceAlgorithm algo = ::Al::AllreduceAlgorithm::automatic; #endif ::Al::Allreduce<::Al::MPIBackend>( - snd, rcv, count, mpi_op_to_al_op(op), *get_al_comm(c), algo); + snd, rcv, count, mpi_op_to_al_op(op), c.template GetComm<::Al::MPIBackend>(), algo); #else - El::mpi::AllReduce(snd, rcv, count, op, c); + El::mpi::AllReduce(snd, rcv, count, op, std::move(c), + El::SyncInfo{}); #endif - bytes_received += count * sizeof(T) * (El::mpi::Size(c) - 1); + bytes_received += count * sizeof(T) * (size_c - 1); } /** In-place scalar-array allreduce. */ template - void allreduce(T *data, int count, const El::mpi::Comm c, El::mpi::Op op = El::mpi::SUM) { + void allreduce(T *data, int count, El::mpi::Comm c, El::mpi::Op op = El::mpi::SUM) { + auto const size_c = El::mpi::Size(c); bytes_sent += count * sizeof(T); #ifdef LBANN_HAS_ALUMINUM #ifdef LBANN_ALUMINUM_MPI_PASSTHROUGH @@ -558,26 +694,27 @@ class lbann_comm { ::Al::AllreduceAlgorithm algo = ::Al::AllreduceAlgorithm::automatic; #endif ::Al::Allreduce<::Al::MPIBackend>( - data, count, mpi_op_to_al_op(op), *get_al_comm(c), algo); + data, count, mpi_op_to_al_op(op), c.template GetComm<::Al::MPIBackend>(), algo); #else - El::mpi::AllReduce(data, count, op, c); + El::mpi::AllReduce(data, count, op, std::move(c), + El::SyncInfo{}); #endif - bytes_received += count * sizeof(T) * (El::mpi::Size(c) - 1); + bytes_received += count * sizeof(T) * (size_c - 1); } /** Matrix allreduce. */ void allreduce(AbsMat& m, - const El::mpi::Comm c, + El::mpi::Comm c, El::mpi::Op op = El::mpi::SUM); /** Matrix allreduce. */ void allreduce(AbsDistMat& m, - const El::mpi::Comm c, + El::mpi::Comm c, El::mpi::Op op = El::mpi::SUM); /** Non-blocking matrix allreduce. * If LBANN has not been built with Aluminum, then this calls a * blocking matrix allreduce. */ void nb_allreduce(AbsMat& m, - const El::mpi::Comm c, + El::mpi::Comm c, Al::request& req, El::mpi::Op op = El::mpi::SUM); /** Non-blocking matrix allreduce. @@ -585,7 +722,7 @@ class lbann_comm { * blocking matrix allreduce. */ void nb_allreduce(AbsDistMat& m, - const El::mpi::Comm c, + El::mpi::Comm c, Al::request& req, El::mpi::Op op = El::mpi::SUM); /** Non-blocking in-place scalar-array allreduce. @@ -594,16 +731,16 @@ class lbann_comm { * This currently only supports host pointers (i.e. the MPI backend). 
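Usage sketch (assumes a completion call such as `wait()` on lbann_comm, and that the buffer stays live until completion):

    std::vector<float> grads(1024, 0.f);  // host buffer (illustrative)
    Al::request req;
    comm->nb_allreduce(grads.data(), static_cast<int>(grads.size()),
                       comm->get_model_comm(), req);
    // ... overlap independent work here ...
    comm->wait(req);  // assumed completion call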
*/ template - void nb_allreduce(T *data, int count, const El::mpi::Comm c, Al::request& req, + void nb_allreduce(T *data, int count, El::mpi::Comm c, Al::request& req, El::mpi::Op op = El::mpi::SUM) { #ifdef LBANN_HAS_ALUMINUM bytes_sent += count * sizeof(T); req.mpi_req = Al::mpi_null_req; ::Al::NonblockingAllreduce<::Al::MPIBackend>( - data, count, mpi_op_to_al_op(op), *get_al_comm(c), req.mpi_req); + data, count, mpi_op_to_al_op(op), c.template GetComm<::Al::MPIBackend>(), req.mpi_req); bytes_received += count * sizeof(T) * (El::mpi::Size(c) - 1); #else - allreduce(data, count, c, op); + allreduce(data, count, std::move(c), op); #endif // LBANN_HAS_ALUMINUM } @@ -636,11 +773,16 @@ class lbann_comm { /** Send a buffer to rank in model. */ template void send(const T *data, int count, int model, int rank) { + send(data, count, model, rank, El::SyncInfo{}); + } + template + void send(const T *data, int count, int model, int rank, El::SyncInfo const& syncInfo) { bytes_sent += sizeof(T) * count; - El::mpi::Send(data, count, get_world_rank(model, rank), get_world_comm()); + El::mpi::Send(data, count, get_world_rank(model, rank), get_world_comm(), syncInfo); } - template void send(const T *data, int count, int model) { - send(data, count, model, rank_in_model); + template + void send(const T *data, int count, int model, El::SyncInfo const& syncInfo) { + send(data, count, model, rank_in_model, syncInfo); } void send(const AbsMat& mat, int model, int rank); void send(const DistMat& mat, int model, int rank); @@ -660,9 +802,9 @@ class lbann_comm { } template void nb_tagged_send(const T *data, int count, int rank, int tag, - El::mpi::Request& req, const El::mpi::Comm c) { + El::mpi::Request& req, El::mpi::Comm c) { bytes_sent += sizeof(T) * count; - El::mpi::TaggedISend(data, count, rank, tag, c, req); + El::mpi::TaggedISend(data, count, rank, tag, std::move(c), req); } template void nb_send(const T *data, int count, int model, El::mpi::Request& req) { @@ -681,12 +823,23 @@ class lbann_comm { /** Corresponding receive to send. */ template void recv(T *data, int count, int model, int rank) { - El::mpi::Recv(data, count, get_world_rank(model, rank), get_world_comm()); - bytes_received += sizeof(T) * count; + recv(data, count, model, rank, El::SyncInfo{}); } template void recv(T *data, int count, int model) { recv(data, count, model, rank_in_model); } + template void recv(T *data, int count) { + recv(data, count, El::SyncInfo{}); + } + template + void recv(T *data, int count, int model, int rank, El::SyncInfo const& syncInfo) { + El::mpi::Recv(data, count, get_world_rank(model, rank), get_world_comm(), syncInfo); + bytes_received += sizeof(T) * count; + } + template + void recv(T *data, int count, int model, El::SyncInfo const& syncInfo) { + recv(data, count, model, rank_in_model, syncInfo); + } void recv(AbsMat& mat, int model, int rank); void recv(DistMat& mat, int model, int rank); void recv(AbsMat& mat, int model) { @@ -696,8 +849,9 @@ class lbann_comm { recv(mat, model, rank_in_model); } /** As above, but receive from anyone. 
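Usage sketch of the receive-from-anyone overload declared below (the CPU SyncInfo argument is an assumption):

    const int count = 128;  // illustrative
    std::vector<int> buf(count);
    comm->recv(buf.data(), count, El::SyncInfo<El::Device::CPU>{});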
*/ - template void recv(T *data, int count) { - El::mpi::Recv(data, count, El::mpi::ANY_SOURCE, get_world_comm()); + template + void recv(T *data, int count, El::SyncInfo const& syncInfo) { + El::mpi::Recv(data, count, El::mpi::ANY_SOURCE, get_world_comm(), syncInfo); bytes_received += sizeof(T) * count; } void recv(AbsMat& mat); @@ -712,8 +866,8 @@ class lbann_comm { } template void nb_tagged_recv( T *data, int count, int rank, int tag, - El::mpi::Request& req, const El::mpi::Comm c) { - El::mpi::TaggedIRecv(data, count, rank, tag, c, req); + El::mpi::Request& req, El::mpi::Comm c) { + El::mpi::TaggedIRecv(data, count, rank, tag, std::move(c), req); bytes_received += sizeof(T) * count; } @@ -737,20 +891,37 @@ class lbann_comm { void nb_recv(DistMat& mat, El::mpi::Request& req); /** Send/recv to/from ranks. */ - template + template void sendrecv(const T *snd, int send_count, int send_model, int send_rank, T *rcv, int recv_count, int recv_model, int recv_rank) { + sendrecv(snd, send_count, send_model, send_rank, + rcv, recv_count, recv_model, recv_rank, + El::SyncInfo{}); + } + template + void sendrecv(const T *snd, int send_count, int send_model, + T *rcv, int recv_count, int recv_model) { + sendrecv(snd, send_count, send_model, rank_in_model, + rcv, recv_count, recv_model, rank_in_model, + El::SyncInfo{}); + } + + template + void sendrecv(const T *snd, int send_count, int send_model, int send_rank, + T *rcv, int recv_count, int recv_model, int recv_rank, + El::SyncInfo const& syncInfo) { bytes_sent += sizeof(T) * send_count; bytes_received += sizeof(T) * recv_count; El::mpi::SendRecv(snd, send_count, get_world_rank(send_model, send_rank), rcv, recv_count, get_world_rank(recv_model, recv_rank), - get_world_comm()); + get_world_comm(), syncInfo); } - template + template void sendrecv(const T *snd, int send_count, int send_model, - T *rcv, int recv_count, int recv_model) { + T *rcv, int recv_count, int recv_model, + El::SyncInfo const& syncInfo) { sendrecv(snd, send_count, send_model, rank_in_model, - rcv, recv_count, recv_model, rank_in_model); + rcv, recv_count, recv_model, rank_in_model, syncInfo); } /** Determine the size (count) of an incoming message. */ @@ -988,7 +1159,7 @@ class lbann_comm { */ template void pe_ring_allreduce( - const El::mpi::Comm comm, DMat& mat, int max_recv_count, + const El::mpi::Comm comm, DMat& mat, int max_recv_count, std::function send_transform, std::function recv_transform, std::function recv_apply_transform, @@ -1085,19 +1256,6 @@ class lbann_comm { allreduce_algorithm::DYNAMIC; #ifdef LBANN_HAS_ALUMINUM - using al_comms_key_type = std::pair; - using al_comms_val_type = std::unique_ptr<::Al::MPICommunicator>; - std::map m_al_comms; - - /** Get an Aluminum communicator. - * The communicator will have the same process configuration as the - * Elemental communicator c and use the backend corresponding to - * type index t. An Aluminum communicator will be created if - * needed. - */ - ::Al::MPICommunicator* get_al_comm( - El::mpi::Comm c, std::type_index t = std::type_index(typeid(Al::mpi_backend))); - /** Convert an MPI_Op to an Aluminum reduction operator. 
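For orientation, a sketch of the kind of mapping such a helper performs (the actual implementation lives in the source file and may differ):

    //   El::mpi::SUM  -> ::Al::ReductionOperator::sum
    //   El::mpi::PROD -> ::Al::ReductionOperator::prod
    //   El::mpi::MIN  -> ::Al::ReductionOperator::min
    //   El::mpi::MAX  -> ::Al::ReductionOperator::max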
*/ ::Al::ReductionOperator mpi_op_to_al_op(El::mpi::Op op); #endif @@ -1151,40 +1309,43 @@ class lbann_comm { }; template -void lbann_comm::broadcast(int root, T& val, const El::mpi::Comm c) { +void lbann_comm::broadcast(int root, T& val, El::mpi::Comm c) { + auto const rank_c = El::mpi::Rank(c); if (S) { // Avoid linking error from uninstantiated El::mpi routine if !S by converting T to El::byte using TT = typename interpret_as_byte_if_needed::type; - broadcast_native(root, reinterpret_cast(val), c); + broadcast_native(root, reinterpret_cast(val), std::move(c)); } else { - broadcast_custom(root, val, c); + broadcast_custom(root, val, std::move(c)); } - count_bytes_broadcast(sizeof(T), El::mpi::Rank(c), root); + count_bytes_broadcast(sizeof(T), rank_c, root); } template -void lbann_comm::broadcast_native(int root, T& val, const El::mpi::Comm c) const { - El::mpi::Broadcast(val, root, c); +void lbann_comm::broadcast_native(int root, T& val, El::mpi::Comm c) const { + El::mpi::Broadcast(val, root, std::move(c), El::SyncInfo{}); } template -void lbann_comm::broadcast_custom(int root, T& val, const El::mpi::Comm c) const { +void lbann_comm::broadcast_custom(int root, T& val, El::mpi::Comm c) const { const int bytes = static_cast(sizeof(T)); - El::mpi::Broadcast(reinterpret_cast(&val), bytes, root, c); + El::mpi::Broadcast(reinterpret_cast(&val), bytes, root, std::move(c), + El::SyncInfo{}); } -template -void lbann_comm::broadcast(const int root, T* data, const int count, const El::mpi::Comm c) { +template +void lbann_comm::broadcast(const int root, T* data, const int count, El::mpi::Comm c, El::SyncInfo const& syncInfo) { + auto const rank_c = El::mpi::Rank(c); const int size = static_cast(S? count : sizeof(T)*count); // Avoid linking error from uninstantiated El::mpi routine if !S by converting T to El::byte using TT = typename interpret_as_byte_if_needed::type; - El::mpi::Broadcast(reinterpret_cast(data), size, root, c); - count_bytes_broadcast(sizeof(T)*count, El::mpi::Rank(c), root); + El::mpi::Broadcast(reinterpret_cast(data), size, root, std::move(c), syncInfo); + count_bytes_broadcast(sizeof(T)*count, rank_c, root); } /// Broadcast std::string over an arbitrary communicator. template<> -void lbann_comm::broadcast(const int root, std::string& str, const El::mpi::Comm c); +void lbann_comm::broadcast(const int root, std::string& str, El::mpi::Comm c); /** Get the current rank within MPI_COMM_WORLD. * This function is safe to call even if MPI has not initialized or diff --git a/include/lbann/data_readers/CMakeLists.txt b/include/lbann/data_readers/CMakeLists.txt index ba1f6546927..425f7e3672a 100644 --- a/include/lbann/data_readers/CMakeLists.txt +++ b/include/lbann/data_readers/CMakeLists.txt @@ -21,6 +21,7 @@ set_full_path(THIS_DIR_HEADERS data_reader_merge_features.hpp data_reader_merge_samples.hpp data_reader_mnist.hpp + data_reader_moving_mnist.hpp data_reader_nci.hpp data_reader_numpy.hpp data_reader_pilot2_molecular.hpp diff --git a/include/lbann/data_readers/cv_utils.hpp b/include/lbann/data_readers/cv_utils.hpp index 1ed7134b748..76832c876a6 100644 --- a/include/lbann/data_readers/cv_utils.hpp +++ b/include/lbann/data_readers/cv_utils.hpp @@ -111,7 +111,7 @@ class cv_utils { * return type. Avoiding the extra access to the underlying filesystem may * result in a better performance. 
*/ - static cv::Mat lbann_imread(const std::string& img_file_path, int flags, std::vector& buf); + static cv::Mat lbann_imread(const std::string& img_file_path, int flags, std::vector& buf, cv::Mat* image = nullptr); }; diff --git a/include/lbann/data_readers/data_reader.hpp b/include/lbann/data_readers/data_reader.hpp index d6246601775..e9e77942144 100644 --- a/include/lbann/data_readers/data_reader.hpp +++ b/include/lbann/data_readers/data_reader.hpp @@ -36,6 +36,7 @@ #include "lbann/io/file_io.hpp" #include "lbann/io/persist.hpp" #include "lbann/data_readers/image_preprocessor.hpp" +#include "lbann/utils/options.hpp" #include #include #include @@ -62,6 +63,9 @@ class model; class generic_data_reader : public lbann_image_preprocessor { public: + #define JAG_NOOP_VOID if (m_jag_partitioned) { return; } + #define JAG_NOOP_INT if (m_jag_partitioned) { return 0; } + /** * ctor */ @@ -92,7 +96,9 @@ class generic_data_reader : public lbann_image_preprocessor { m_is_partitioned(false), m_partition_overlap(0), m_partition_mode(0), - m_procs_per_partition(1) + m_procs_per_partition(1), + m_jag_partitioned(false), + m_model(nullptr) {} generic_data_reader(const generic_data_reader&) = default; generic_data_reader& operator=(const generic_data_reader&) = default; @@ -232,6 +238,13 @@ class generic_data_reader : public lbann_image_preprocessor { */ virtual void set_role(std::string role) { m_role = role; + if (options::get()->has_string("jag_partitioned") + && get_role() == "train") { + m_jag_partitioned = true; + if (is_master()) { + std::cerr << "USING JAG DATA PARTITIONING\n"; + } + } } /** @@ -255,7 +268,7 @@ class generic_data_reader : public lbann_image_preprocessor { * If the base offset is not specified set it to 0 * If the stride is not specified set it to batch size */ - void setup(); + virtual void setup(); /** Return this data_reader's type */ virtual std::string get_type() const = 0; @@ -284,6 +297,15 @@ class generic_data_reader : public lbann_image_preprocessor { */ virtual bool update(bool is_active_reader); + /** + * This is called at the end of update; it permits data readers to + * perform actions that are specific to their data sets, for example, + * data_reader_jag_conduit_hdf5 has the 'primary' data reader + * broadcast its shuffled indices to the other data readers. In general, + * most data readers will probably not override this method. + * It may also be called outside of update. + */ + /// Return the number of labels (classes) in this dataset. virtual int get_num_labels() const { return 0; } @@ -304,12 +326,23 @@ virtual int get_linearized_response_size() const { return 1; } + /// Get the linearized size of what is identified by desc. + virtual int get_linearized_size(const std::string& desc) const { + if (desc == "data") { + return get_linearized_data_size(); + } else if (desc == "label") { + return get_linearized_label_size(); + } else if (desc == "response") { + return get_linearized_response_size(); + } + return 0; + } /// Get the dimensions of the data. virtual const std::vector get_data_dims() const { return std::vector(0); } /// True if the data reader's current position is valid. - bool position_valid() const { + virtual bool position_valid() const { return (m_current_pos < (int)m_shuffled_indices.size()); } /// True if the data reader is at the start of an epoch.
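Usage sketch of the string-keyed size query added above (the `get_data_reader()` accessor is hypothetical):

    generic_data_reader* dr = get_data_reader();  // hypothetical accessor
    int data_size  = dr->get_linearized_size("data");      // == get_linearized_data_size()
    int label_size = dr->get_linearized_size("label");     // == get_linearized_label_size()
    int resp_size  = dr->get_linearized_size("response");  // == get_linearized_response_size()
    int unknown    = dr->get_linearized_size("foo");       // unrecognized keys yield 0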
@@ -341,6 +374,7 @@ class generic_data_reader : public lbann_image_preprocessor { } /// Set the mini batch size across all models (global) void set_global_mini_batch_size(const int s) { + JAG_NOOP_VOID m_global_mini_batch_size = s; } /// Return the mini_batch_size across all models (global) @@ -349,6 +383,7 @@ class generic_data_reader : public lbann_image_preprocessor { } /// Set the mini batch stride void set_stride_to_next_mini_batch(const int s) { + JAG_NOOP_VOID m_stride_to_next_mini_batch = s; } /// Return the mini batch stride. @@ -357,6 +392,7 @@ class generic_data_reader : public lbann_image_preprocessor { } /// Set the sample stride void set_sample_stride(const int s) { + JAG_NOOP_VOID m_sample_stride = s; } /// Return the sample stride. @@ -372,7 +408,8 @@ class generic_data_reader : public lbann_image_preprocessor { return m_iteration_stride; } /// Return the base offset. - void set_base_offset(const int s) { + virtual void set_base_offset(const int s) { + JAG_NOOP_VOID m_base_offset = s; } /// Return the base offset. @@ -381,6 +418,7 @@ class generic_data_reader : public lbann_image_preprocessor { } /// Set the model offset void set_model_offset(const int s) { + JAG_NOOP_VOID m_model_offset = s; } /// Return the model offset. @@ -389,6 +427,7 @@ class generic_data_reader : public lbann_image_preprocessor { } /// Set the last mini batch size void set_last_mini_batch_size(const int s) { + JAG_NOOP_VOID m_last_mini_batch_size = s; } /// Return the last mini batch size @@ -397,6 +436,7 @@ class generic_data_reader : public lbann_image_preprocessor { } /// Set the last mini batch size across all models (global) void set_global_last_mini_batch_size(const int s) { + JAG_NOOP_VOID m_global_last_mini_batch_size = s; } /// Return the last mini batch size across all models (global) @@ -405,6 +445,7 @@ class generic_data_reader : public lbann_image_preprocessor { } /// Set the world master mini batch adjustment (global) void set_world_master_mini_batch_adjustment(const int s) { + JAG_NOOP_VOID m_world_master_mini_batch_adjustment = s; } /// Return the world master mini batch adjustment (global) @@ -413,6 +454,7 @@ class generic_data_reader : public lbann_image_preprocessor { } /// Set the last mini batch stride void set_stride_to_last_mini_batch(const int s) { + JAG_NOOP_VOID m_stride_to_last_mini_batch = s; } /// Return the last mini batch stride @@ -428,7 +470,7 @@ class generic_data_reader : public lbann_image_preprocessor { return m_num_parallel_readers; } /// Set the starting mini-batch index for the epoch - void set_reset_mini_batch_index(const int s) { + virtual void set_reset_mini_batch_index(const int s) { m_reset_mini_batch_index = s; } /// Return the starting mini-batch index for the epoch @@ -460,7 +502,7 @@ class generic_data_reader : public lbann_image_preprocessor { return &m_shuffled_indices[0]; } /// Get the number of samples in this dataset. - int get_num_data() const { + virtual int get_num_data() const { return (int)m_shuffled_indices.size(); } /// Get the number of unused samples in this dataset. @@ -512,7 +554,7 @@ class generic_data_reader : public lbann_image_preprocessor { /** * Select the appropriate subset of data based on settings. 
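A sketch of the split this performs (illustrative only; the real logic also accounts for partitioning):

    // With N shuffled indices and validation fraction p, roughly N*p
    // indices move to the unused set; use_unused_index_set() can later
    // swap them back in.
    const size_t N = m_shuffled_indices.size();
    const size_t n_unused = static_cast<size_t>(N * get_validation_percent());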
*/ - void select_subset_of_data(); + virtual void select_subset_of_data(); /// called by select_subset_of_data() if data set is partitioned void select_subset_of_data_partitioned(); @@ -521,7 +563,7 @@ * Replace the shuffled index set with the unused index set, emptying the * unused set. */ - void use_unused_index_set(); + virtual void use_unused_index_set(); /// partition the dataset amongst the models void set_partitioned(bool is_partitioned=true, double overlap=0.0, int mode=0); @@ -666,6 +708,12 @@ /// support of data store functionality void set_data_store(generic_data_store *g); + void set_model(model *m) { m_model = m; } + + /// experimental; used to ensure all readers for jag_conduit_hdf5 + /// have identical shuffled indices + virtual void post_update() {} + protected: /** @@ -686,13 +734,11 @@ */ double get_validation_percent() const; - protected: + int m_rank; - int m_rank; + generic_data_store *m_data_store; - generic_data_store *m_data_store; - - lbann_comm *m_comm; + lbann_comm *m_comm; /** * Fetch a single sample into a matrix. @@ -838,6 +884,16 @@ int m_procs_per_partition; std::vector> m_thread_buffer; + + /// special handling for 1B jag; each reader + /// owns a unique subset of the data + bool m_jag_partitioned; + + /// called by fetch_data a single time if m_jag_partitioned = true; + /// this sets various member variables (num_iterations, m_reset_mini_batch_index, + /// etc.) + void set_jag_variables(int mb_size); + model *m_model; }; template diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp index 7d7066f277b..1043e02483d 100644 --- a/include/lbann/data_readers/data_reader_image.hpp +++ b/include/lbann/data_readers/data_reader_image.hpp @@ -54,6 +54,8 @@ class image_data_reader : public generic_data_reader { // dataset specific functions void load() override; + void setup() override; + int get_num_labels() const override { return m_num_labels; } @@ -112,6 +114,7 @@ int m_image_num_channels; ///< number of image channels int m_image_linearized_size; ///< linearized image size int m_num_labels; ///< number of labels + std::vector m_thread_cv_buffer; }; } // namespace lbann diff --git a/include/lbann/data_readers/data_reader_jag.hpp b/include/lbann/data_readers/data_reader_jag.hpp index a0dcf9fa6e8..3a2de20d76b 100644 --- a/include/lbann/data_readers/data_reader_jag.hpp +++ b/include/lbann/data_readers/data_reader_jag.hpp @@ -67,9 +67,9 @@ class data_reader_jag : public generic_data_reader { } /// Choose which data to use for independent variable - void set_independent_variable_type(const std::vector independent); + void set_independent_variable_type(const std::vector< std::vector >& independent); /// Choose which data to use for dependent variable - void set_dependent_variable_type(const std::vector dependent); + void set_dependent_variable_type(const std::vector< std::vector >& dependent); /// Tell which data to use for independent variable std::vector get_independent_variable_type() const; @@ -134,6 +134,7 @@ /// check if type t is used in either the independent or the dependent variable bool is_used(const variable_t t) const; + using
generic_data_reader::get_linearized_size; /// Return the linearized size of a particular JAG variable type size_t get_linearized_size(const variable_t t) const; /// Return the dimension of a particular JAG variable type diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index f0d65dedf7c..814305aac91 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -33,22 +33,56 @@ #include "lbann/data_readers/opencv.hpp" #include "data_reader.hpp" #include "conduit/conduit.hpp" -#include "conduit/conduit_relay.hpp" +#include "hdf5.h" #include "lbann/data_readers/cv_process.hpp" #include #include #include +#include +#include namespace lbann { /** - * Loads the pairs of JAG simulation inputs and results from a conduit-wrapped hdf5 file + * Stores the handles of open hdf5 files, and closes them at the end of + * this container object's lifetime. + */ +class hdf5_file_handles { + protected: + std::unordered_map m_open_hdf5_files; + std::map m_open_hdf5_handles; + + public: + ~hdf5_file_handles(); + /// Add a handle that corresponds to the filename fname + bool add(const std::string fname, hid_t hnd); + /** + * Returns the handle that corresponds to the given file name. + * Returns a negative value if not found. + */ + hid_t get(const std::string& fname) const; + + std::string get(const hid_t h) const; + + /// Returns read-only access to the internal data + const std::unordered_map& get() const { return m_open_hdf5_files; } +}; + + +/** + * Loads JAG simulation parameters and results from hdf5 files using conduit interfaces */ class data_reader_jag_conduit : public generic_data_reader { public: using ch_t = float; ///< jag output image channel type + using conduit_ch_t = conduit::float32_array; ///< conduit type for ch_t array wrapper using scalar_t = double; ///< jag scalar output type using input_t = double; ///< jag input parameter type + /// Type for the pair of the key string of a sample and the handle of the file that contains it + using sample_locator_t = std::pair; + using sample_map_t = std::vector; ///< valid sample map type + /// linear transform on X defined as: first * X + second => X' + using linear_transform_t = std::pair; /** * Dependent/independent variable types */ @@ -75,9 +109,9 @@ class data_reader_jag_conduit : public generic_data_reader { } /// Choose which data to use for independent variable - void set_independent_variable_type(const std::vector independent); + void set_independent_variable_type(const std::vector< std::vector >& independent); /// Choose which data to use for dependent variable - void set_dependent_variable_type(const std::vector dependent); + void set_dependent_variable_type(const std::vector< std::vector >& dependent); /// Tell which data to use for independent variable std::vector get_independent_variable_type() const; @@ -86,6 +120,10 @@ /// Set the image dimension void set_image_dims(const int width, const int height, const int ch = 1); + /// Choose images to use. e.g.
by measurement views and time indices + void set_image_choices(const std::vector image_keys); + /// Report the image choices + const std::vector& get_image_choices() const; /// Add a scalar key to filter out void add_scalar_filter(const std::string& key); @@ -110,20 +148,67 @@ class data_reader_jag_conduit : public generic_data_reader { /// Report the selected simulation input parameters const std::vector& get_input_choices() const; - /// Load data and do data reader's chores. #ifndef _JAG_OFFLINE_TOOL_MODE_ + /// Load data and do data reader's chores. void load() override; + /// True if the data reader's current position is valid. + bool position_valid() const override; + /// Return the base offset. + void set_base_offset(const int s) override; + /// Set the starting mini-batch index for the epoch + void set_reset_mini_batch_index(const int s) override; + /// Get the number of samples in this dataset. + int get_num_data() const override; + /// Select the appropriate subset of data based on settings. + void select_subset_of_data() override; + /// Replace the sample indices with the unused sample indices. + void use_unused_index_set() override; + /// Set the type of io_buffer that will rely on this reader + void set_io_buffer_type(const std::string io_buffer); + + /// Set the id of this local instance + void set_local_id(const std::string role); + /// Get the id of this local instance + int get_local_id(const std::string role) const; + /// Set the set of open hdf5 data files + void set_open_hdf5_files(std::shared_ptr& f); + /// Get the set of open hdf5 data files + std::shared_ptr& get_open_hdf5_files(); + /// Set the leader of local data reader group + void set_leading_reader(data_reader_jag_conduit* r); + /// Get the leader of local data reader group + data_reader_jag_conduit* get_leading_reader(); #else - void load_conduit(const std::string conduit_file_path); + /// Load a data file + void load_conduit(const std::string conduit_file_path, size_t& idx); + /// See if the image size is consistent with the linearized size + void check_image_data(); + /** Manually set m_global_num_samples_to_use and m_local_num_samples_to_use + * to avoid calling determine_num_samples_to_use(); + */ + void set_num_samples(size_t ns); #endif // _JAG_OFFLINE_TOOL_MODE_ - /// Return the number of samples - size_t get_num_samples() const; + /// Fetch data of a mini-batch or reuse it from the cache of the leading reader + int fetch_data(CPUMat& X) override; + /// Fetch responses of a mini-batch or reuse it from the cache of the leading reader + int fetch_responses(CPUMat& Y) override; + /// Fetch labels of a mini-batch or reuse it from the cache of the leading reader + int fetch_labels(CPUMat& Y) override; + + /// Return the number of valid samples locally available + size_t get_num_valid_local_samples() const; + /// Allow read-only access to m_valid_samples member data + const sample_map_t& get_valid_local_samples() const; + /// Allow read-only access to m_unused_samples member data + const sample_map_t& get_valid_local_samples_unused() const; /// Return the number of measurement views unsigned int get_num_img_srcs() const; /// Return the linearized size of an image size_t get_linearized_image_size() const; + /// Return the linearized size of a single channel image + size_t get_linearized_1ch_image_size() const; /// Return the linearized size of scalar outputs size_t get_linearized_scalar_size() const; /// Return the linearized size of inputs @@ -141,8 +226,18 @@ class data_reader_jag_conduit : public 
generic_data_reader { /// Return the dimension of data const std::vector get_data_dims() const override; + /// Return the slice points for linearized independent variables + std::vector get_slice_points_independent() const; + /// Return the slice points for linearized dependent variables + std::vector get_slice_points_dependent() const; + int get_num_labels() const override; int get_linearized_label_size() const override; + int get_linearized_size(const std::string& desc) const override; + + void set_split_image_channels(); + void unset_split_image_channels(); + bool check_split_image_channels() const; /// Show the description std::string get_description() const; @@ -165,9 +260,6 @@ class data_reader_jag_conduit : public generic_data_reader { template static size_t add_val(const std::string key, const conduit::Node& n, std::vector& vals); - /// Check if the simulation was successful - int check_exp_success(const size_t sample_id) const; - void save_image(Mat& pixels, const std::string filename, bool do_scale = true) override; #ifndef _JAG_OFFLINE_TOOL_MODE_ @@ -176,15 +268,21 @@ class data_reader_jag_conduit : public generic_data_reader { #endif // _JAG_OFFLINE_TOOL_MODE_ /// A untiliy function to convert the pointer to image data into an opencv image - static cv::Mat cast_to_cvMat(const std::pair img, const int height); + static cv::Mat cast_to_cvMat(const std::pair img, + const int height, const int num_ch=1); /// A utility function to convert a JAG variable type to name string static std::string to_string(const variable_t t); - /// print the schema of the all the samples - void print_schema() const; /// print the schema of the specific sample identified by a given id void print_schema(const size_t i) const; + void clear_image_normalization_params(); + void clear_scalar_normalization_params(); + void clear_input_normalization_params(); + void add_image_normalization_param(const linear_transform_t& t); + void add_scalar_normalization_param(const linear_transform_t& t); + void add_input_normalization_param(const linear_transform_t& t); + protected: virtual void set_defaults(); virtual bool replicate_processor(const cv_process& pp); @@ -200,17 +298,29 @@ class data_reader_jag_conduit : public generic_data_reader { bool filter(const std::set& key_filter, const std::vector& prefix_filter, const std::string& name) const; + using generic_data_reader::get_linearized_size; /// Return the linearized size of a particular JAG variable type size_t get_linearized_size(const variable_t t) const; /// Return the dimension of a particular JAG variable type const std::vector get_dims(const variable_t t) const; - /// A utility function to make a string to show all the variable types in a vector + /// Return the slice points for linearized data or responses + std::vector get_slice_points(const std::vector< std::vector >& var) const; + /// A utility function to make a string to show all the variable types static std::string to_string(const std::vector& vec); + /// A utility function to make a string to show all the groups of variable types + static std::string to_string(const std::vector< std::vector >& vec); virtual std::vector create_datum_views(CPUMat& X, const std::vector& sizes, const int mb_idx) const; + /// Export cached data minibatch + int reuse_data(CPUMat& X); + /// Export cached responses minibatch + int reuse_responses(CPUMat& Y); + /// Export cached labels minibatch + int reuse_labels(CPUMat& Y); + bool fetch(CPUMat& X, int data_id, int mb_idx, int tid, const variable_t vt, const std::string 
tag); bool fetch_datum(CPUMat& X, int data_id, int mb_idx, int tid) override; @@ -218,16 +328,41 @@ class data_reader_jag_conduit : public generic_data_reader { bool fetch_response(CPUMat& Y, int data_id, int mb_idx, int tid) override; bool fetch_label(CPUMat& X, int data_id, int mb_idx, int tid) override; #ifndef _JAG_OFFLINE_TOOL_MODE_ - /// Load a conduit-packed hdf5 data file - void load_conduit(const std::string conduit_file_path); + /// Shuffle sample indices + void shuffle_indices() override; + /** + * Compute the number of parallel readers based on the type of io_buffer, + * the mini batch size, and the requested number of parallel readers. + * This is done before populating the sample indices. + */ + int compute_max_num_parallel_readers(); + /** + * Check if there is a sufficient number of samples for the given number of + * data readers with distributed io buffer, based on the number of samples, + * the number of models and the mini batch size. + */ + bool check_num_parallel_readers(long data_set_size); + /// Determine the number of samples to use + void determine_num_samples_to_use(); + /** + * Approximate an even distribution of samples by using as many samples + * as are commonly available to every data reader instead of using + * all the available samples. + */ + void adjust_num_samples_to_use(); + /** + * Populate m_shuffled_indices such that each data reader can + * access local data using local indices. + */ + void populate_shuffled_indices(const size_t num_samples); + /// Load a data file + void load_conduit(const std::string conduit_file_path, size_t& idx); + /// See if the image size is consistent with the linearized size + void check_image_data(); #endif // _JAG_OFFLINE_TOOL_MODE_ - /// Obtain the number of image measurement views - void set_num_img_srcs(); /// Obtain the linearized size of images of a sample from the meta info void set_linearized_image_size(); - /// See if the image size is consistent with the linearized size - void check_image_size(); /// Make sure that the keys to choose scalar outputs are valid void check_scalar_keys(); /// Make sure that the keys to choose scalar outputs are valid @@ -242,32 +377,43 @@ */ static bool check_non_numeric(const std::string key); - /// Choose the image closest to the bang time among those associated with the i-th sample - std::vector choose_image_near_bang_time(const size_t i) const; - /// Allow const access to the conduit data structure - const conduit::Node& get_conduit_node(const std::string key) const; + static const conduit::Node& get_conduit_node(const conduit::Node& n_base, const std::string key); + /** Load the conduit node with the data of the sample i identified by key * from the file that contains the sample.
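Usage sketch (the key layout is an assumption):

    conduit::Node node;
    if (load_conduit_node(i, "inputs", node)) {
      // node now holds the sample's "inputs" subtree, read from the
      // HDF5 file that owns sample i.
    }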
+ */ + bool load_conduit_node(const size_t i, const std::string& key, conduit::Node& node) const; + /// Check if a key exist for sample i + bool has_conduit_path(const size_t i, const std::string& key) const; - /// Obtain the pointers to read-only image data - std::vector< std::pair > get_image_ptrs(const size_t i) const; + /// Obtain image data + std::vector< std::vector > get_image_data(const size_t i) const; protected: - /// independent variable type + /// The flat list of independent variable types std::vector m_independent; - /// dependent variable type + /// The list of independent variable types grouped for slicing + std::vector< std::vector > m_independent_groups; + /// The flat list of dependent variable types std::vector m_dependent; + /// The list of independent variable types grouped for slicing + std::vector< std::vector > m_dependent_groups; int m_image_width; ///< image width int m_image_height; ///< image height int m_image_num_channels; ///< number of image channels size_t m_image_linearized_size; ///< The linearized size of an image + size_t m_1ch_image_linearized_size; ///< The linearized size of a single channel image unsigned int m_num_img_srcs; ///< number of views result in images + bool m_split_channels; ///< Whether to export a separate image per channel /// Whether data have been loaded bool m_is_data_loaded; int m_num_labels; ///< number of labels + /// Allow image selection by the view and the time index + std::vector m_emi_image_keys; /// Keys to select a set of scalar simulation outputs to use. By default, use all. std::vector m_scalar_keys; /// Keys to select a set of simulation input parameters to use. By default, use all. @@ -276,9 +422,6 @@ class data_reader_jag_conduit : public generic_data_reader { /// preprocessor duplicated for each omp thread std::vector > m_pps; - /// data wrapped in a conduit structure - conduit::Node m_data; - /** * Set of keys that are associated with non_numerical values. * Such a variable requires a specific method for mapping to a numeric value. @@ -303,15 +446,61 @@ class data_reader_jag_conduit : public generic_data_reader { std::vector m_input_prefix_filter; /** - * maps integers to sample IDs. In the future the sample IDs may - * not be integers; also, this map only includes sample IDs that - * have /performance/success = 1 + * maps integers to sample IDs and the handle of the file that contains it. + * In the future the sample IDs may not be integers; also, this map only + * includes sample IDs that have /performance/success = 1 */ - std::unordered_map m_success_map; + sample_map_t m_valid_samples; + /// To support validation_percent + sample_map_t m_unused_samples; - std::set m_emi_selectors; -}; + /** + * The number of local samples that are selected to use. + * This is less than or equal to the number of valid samples locally available. + */ + size_t m_local_num_samples_to_use; + /** + * The total number of samples to use. + * This is the sum of m_local_num_samples_to_use. + */ + size_t m_global_num_samples_to_use; + + /** + * io_buffer type that will rely on this reader. + * e.g. 
distributed_io_buffer, partitioned_io_buffer + */ + std::string m_io_buffer_type; + /// The number of local instances of this reader type + static std::unordered_map m_num_local_readers; + /// locally addressable id in case of multiple data reader instances attached to a model + int m_local_reader_id; + + /// Shared set of the handles of open HDF5 files + std::shared_ptr m_open_hdf5_files; + + /** + * The leading data reader among the local readers, which actually does the + * file IO and data shuffling. + */ + data_reader_jag_conduit* m_leading_reader; + + CPUMat m_data_cache; + CPUMat m_response_cache; + CPUMat m_label_cache; + int m_cached_data_mb_size; + int m_cached_response_mb_size; + int m_cached_label_mb_size; + + /// temporary normalization parameters based on linear transforms + std::vector m_image_normalization_params; + std::vector m_scalar_normalization_params; + std::vector m_input_normalization_params; + /** temporary image normalization + * The inputs are the image to normalize, the image source id and the channel id. + */ + void image_normalization(cv::Mat& img, size_t i, size_t ch) const; +}; /** * To facilitate the type comparison between a C++ native type and a conduit type id. diff --git a/include/lbann/data_readers/data_reader_jag_conduit_hdf5.hpp b/include/lbann/data_readers/data_reader_jag_conduit_hdf5.hpp index 0edb9abfa3c..6f7b3890eaf 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit_hdf5.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit_hdf5.hpp @@ -81,8 +81,12 @@ class data_reader_jag_conduit_hdf5 : public generic_data_reader { /// Return the number of measurement views unsigned int get_num_img_srcs() const; - /// Return the linearized size of an image + /// Return the number of channels in an image + unsigned int get_num_channels() const; + /// Return the linearized size of an image size_t get_linearized_image_size() const; + /// Return the linearized size of one channel in the image + size_t get_linearized_channel_size() const; /// Return the linearized size of scalar outputs size_t get_linearized_scalar_size() const; /// Return the linearized size of inputs @@ -107,19 +111,7 @@ std::string get_description() const; /// Return the image simulation output of the i-th sample - std::vector get_cv_images(const size_t i) const; - - /** - * Return the images of the i-th sample as an 1-D vector of lbann::DataType - * There is one image per view, each of which is taken at closest to the bang time.
- */ - std::vector get_images(const size_t i) const; - - /// Return the scalar simulation output data of the i-th sample - std::vector get_scalars(const size_t i) const; - - /// Return the simulation input parameters of the i-th sample - std::vector get_inputs(const size_t i) const; + std::vector get_cv_images(const size_t i, int tid) const; template static size_t add_val(const std::string key, const conduit::Node& n, std::vector& vals); @@ -129,46 +121,37 @@ class data_reader_jag_conduit_hdf5 : public generic_data_reader { /// A untiliy function to convert the pointer to image data into an opencv image static cv::Mat cast_to_cvMat(const std::pair img, const int height); - /// A utility function to convert a JAG variable type to name string - static std::string to_string(const variable_t t); void set_image_dims(const int width, const int height, const int ch=1); - void set_use_images(bool b) { m_use_images = b; } - void set_use_inputs(bool b) { m_use_inputs = b; } - void set_use_scalars(bool b) { m_use_scalars = b; } + void set_scalar_keys(const std::string &keys) { m_scalar_keys = keys; } + void set_input_keys(const std::string &keys) { m_input_keys = keys; } + void set_image_views(const std::string &views) { m_image_views = views; } + void set_image_channels(const std::string &channels) { m_image_channels = channels; } + + void post_update() override; protected: + + friend jag_store; + virtual void set_defaults(); virtual bool replicate_processor(const cv_process& pp); virtual void copy_members(const data_reader_jag_conduit_hdf5& rhs); - static std::string to_string(const std::vector& vec); - + bool fetch_datum(CPUMat& X, int data_id, int mb_idx, int tid); virtual std::vector create_datum_views(CPUMat& X, const std::vector& sizes, const int mb_idx) const; - bool fetch(CPUMat& X, int data_id, int mb_idx, int tid, - const variable_t vt, const std::string tag); - bool fetch_datum(CPUMat& X, int data_id, int mb_idx, int tid) override; - bool fetch_response(CPUMat& Y, int data_id, int mb_idx, int tid) override; bool fetch_label(CPUMat& X, int data_id, int mb_idx, int tid) override; -#ifndef _JAG_OFFLINE_TOOL_MODE_ - /// Load a conduit-packed hdf5 data file - void load_conduit(const std::string conduit_file_path); -#endif // _JAG_OFFLINE_TOOL_MODE_ - /// Check if the given sample id is valid bool check_sample_id(const size_t i) const; /// Choose the image closest to the bang time among those associated with the i-th sample std::vector choose_image_near_bang_time(const size_t i) const; - /// Obtain the pointers to read-only image data - std::vector< std::pair > get_image_ptrs(const size_t i) const; - jag_store * get_jag_store() const { return m_jag_store; } int m_image_width; ///< image width @@ -180,11 +163,6 @@ class data_reader_jag_conduit_hdf5 : public generic_data_reader { int m_num_labels; ///< number of labels - /// Keys to select a set of scalar simulation outputs to use. By default, use all. - std::vector m_scalar_keys; - /// Keys to select a set of simulation input parameters to use. By default, use all. 
- std::vector m_input_keys; - /// preprocessor duplicated for each omp thread std::vector > m_pps; @@ -216,9 +194,12 @@ class data_reader_jag_conduit_hdf5 : public generic_data_reader { std::set m_emi_selectors; - bool m_use_scalars; - bool m_use_inputs; - bool m_use_images; + std::string m_scalar_keys; + std::string m_input_keys; + std::string m_image_views; + std::string m_image_channels; + + data_reader_jag_conduit_hdf5* m_primary_reader; }; diff --git a/include/lbann/data_readers/data_reader_moving_mnist.hpp b/include/lbann/data_readers/data_reader_moving_mnist.hpp new file mode 100644 index 00000000000..13a80c06dfd --- /dev/null +++ b/include/lbann/data_readers/data_reader_moving_mnist.hpp @@ -0,0 +1,86 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_DATA_READER_MOVING_MNIST_HPP +#define LBANN_DATA_READER_MOVING_MNIST_HPP + +#include "data_reader.hpp" + +namespace lbann { + +class moving_mnist_reader : public generic_data_reader { +public: + moving_mnist_reader(El::Int num_frames, + El::Int image_height, + El::Int image_width, + El::Int num_objects); + moving_mnist_reader(const moving_mnist_reader&) = default; + moving_mnist_reader& operator=(const moving_mnist_reader&) = default; + ~moving_mnist_reader() override = default; + moving_mnist_reader* copy() const override { return new moving_mnist_reader(*this); } + + std::string get_type() const override { + return "moving_mnist_reader"; + } + + void load() override; + + const std::vector get_data_dims() const override; + int get_num_labels() const override; + int get_linearized_data_size() const override; + int get_linearized_label_size() const override; + +protected: + bool fetch_datum(CPUMat& X, int data_id, int mb_idx, int tid) override; + bool fetch_label(CPUMat& Y, int data_id, int mb_idx, int tid) override; + +private: + + /** Number of frames. */ + El::Int m_num_frames; + /** Frame height. */ + El::Int m_image_height; + /** Frame width. */ + El::Int m_image_width; + /** Number of MNIST digits in each frame. */ + El::Int m_num_objects; + + /** Number of MNIST samples. */ + El::Int m_num_raw_images = 0; + /** MNIST image height. */ + El::Int m_raw_image_height = 0; + /** MNIST image width. */ + El::Int m_raw_image_width = 0; + /** Raw MNIST image data. */ + std::vector m_raw_image_data; + /** Raw MNIST label data.
*/ + std::vector m_raw_label_data; + +}; + +} // namespace lbann + +#endif // LBANN_DATA_READER_MOVING_MNIST_HPP diff --git a/include/lbann/data_readers/data_reader_synthetic.hpp b/include/lbann/data_readers/data_reader_synthetic.hpp index f12f24e688f..6bab221d1dc 100644 --- a/include/lbann/data_readers/data_reader_synthetic.hpp +++ b/include/lbann/data_readers/data_reader_synthetic.hpp @@ -42,6 +42,8 @@ class data_reader_synthetic : public generic_data_reader { data_reader_synthetic(int num_samples, int num_features, bool shuffle = true); data_reader_synthetic(int num_samples, std::vector dims, int num_labels, bool shuffle = true); + data_reader_synthetic(int num_samples, std::vector dims, + std::vector response_dims, bool shuffle = true); data_reader_synthetic(const data_reader_synthetic&) = default; data_reader_synthetic& operator=(const data_reader_synthetic&) = default; ~data_reader_synthetic() override {} @@ -53,21 +55,30 @@ class data_reader_synthetic : public generic_data_reader { } void load() override; - - int get_num_labels() const override { return m_num_labels; } int get_linearized_data_size() const override { return std::accumulate(m_dimensions.begin(), m_dimensions.end(), 1, std::multiplies()); } + int get_linearized_response_size() const override { + return std::accumulate(m_response_dimensions.begin(), + m_response_dimensions.end(), 1, + std::multiplies()); + } const std::vector get_data_dims() const override { return m_dimensions; } + int get_num_labels() const override { return m_num_labels; } + int get_num_responses() const override { + return get_linearized_response_size(); + } + protected: bool fetch_datum(CPUMat& X, int data_id, int mb_idx, int tid) override; - bool fetch_label(Mat& Y, int data_id, int mb_idx, int tid) override; + bool fetch_label(CPUMat& Y, int data_id, int mb_idx, int tid) override; + bool fetch_response(CPUMat& Y, int data_id, int mb_idx, int tid) override; private: /** Number of samples in the dataset. */ @@ -76,6 +87,8 @@ class data_reader_synthetic : public generic_data_reader { int m_num_labels; /** Shape of the data. */ std::vector m_dimensions; + /** Shape of the responses. 
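Usage sketch of the new response-aware constructor declared above (argument values are illustrative):

    std::vector<int> dims = {3, 32, 32};     // data shape
    std::vector<int> response_dims = {10};   // response shape
    data_reader_synthetic reader(1024, dims, response_dims, true);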
*/ + std::vector m_response_dimensions; }; } // namespace lbann diff --git a/include/lbann/data_readers/image_utils.hpp b/include/lbann/data_readers/image_utils.hpp index 88d95cd30e9..b9580795fef 100644 --- a/include/lbann/data_readers/image_utils.hpp +++ b/include/lbann/data_readers/image_utils.hpp @@ -57,26 +57,26 @@ class image_utils { #endif // LBANN_HAS_OPENCV // new function, to support sharded data reader and data store functionality - static bool load_image(std::vector& image_buf, int& Width, int& Height, int& Type, cv_process& pp, ::Mat& data); + static bool load_image(std::vector& image_buf, int& Width, int& Height, int& Type, cv_process& pp, ::Mat& data, cv::Mat* cv_buf = nullptr); // new function, to support sharded data reader and data store functionality static bool load_image(std::vector& image_buf, - int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector<::Mat>& data); + int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector<::Mat>& data, cv::Mat* cv_buf = nullptr); // load/save an image into/from an LBANN data block of El::Matrix type // Use a thread-safe temporary buffer for decoding the image /// Load an image from a file and put it into an LBANN Mat data block - static bool load_image(const std::string& filename, int& Width, int& Height, int& Type, cv_process& pp, ::Mat& data, std::vector& buf); + static bool load_image(const std::string& filename, int& Width, int& Height, int& Type, cv_process& pp, ::Mat& data, std::vector& buf, cv::Mat* cv_buf = nullptr); /// Load an image from a file, extract patches from it and put them into LBANN Mat data blocks - static bool load_image(const std::string& filename, int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector<::Mat>& data, std::vector& buf); + static bool load_image(const std::string& filename, int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector<::Mat>& data, std::vector& buf, cv::Mat* cv_buf = nullptr); /// Save an image using data from an LBANN Mat data block static bool save_image(const std::string& filename, const int Width, const int Height, const int Type, cv_process& pp, const ::Mat& data); // import/export via a buffer of std::vector containing the raw bytes of an image file /// Import an image from a file buffer (inbuf) and put it into an LBANN Mat data block - static bool import_image(cv::InputArray inbuf, int& Width, int& Height, int& Type, cv_process& pp, ::Mat& data); + static bool import_image(cv::InputArray inbuf, int& Width, int& Height, int& Type, cv_process& pp, ::Mat& data, cv::Mat* cv_buf = nullptr); /// Import an image from a file buffer (inbuf), extract patches from it and put them into LBANN Mat data blocks - static bool import_image(cv::InputArray inbuf, int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector<::Mat>& data); + static bool import_image(cv::InputArray inbuf, int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector<::Mat>& data, cv::Mat* cv_buf = nullptr); /// Export an image using data from an LBANN Mat block into a file buffer (outbuf) static bool export_image(const std::string& fileExt, std::vector& outbuf, const int Width, const int Height, const int Type, cv_process& pp, const ::Mat& data); }; diff --git a/include/lbann/data_store/jag_store.hpp b/include/lbann/data_store/jag_store.hpp index 2d2469e57a6..da3770c320b 100644 --- a/include/lbann/data_store/jag_store.hpp +++ b/include/lbann/data_store/jag_store.hpp @@ -31,6 +31,7 @@ #ifdef LBANN_HAS_CONDUIT +#include
"lbann/utils/timer.hpp" #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" #include "lbann/data_readers/data_reader_jag_conduit_hdf5.hpp" @@ -39,15 +40,26 @@ #include #include #include "lbann/comm.hpp" +#include "hdf5.h" namespace lbann { +class data_reader_jag_conduit_hdf5; + /** * Loads the pairs of JAG simulation inputs and results from a conduit-wrapped hdf5 file */ class jag_store { public: + #define METADATA_FN "metadata.txt" + #define IMAGE_SIZE_PER_CHANNEL 4096 + #define NUM_IMAGE_CHANNELS 4 + #define MAX_SAMPLES_PER_BINARY_FILE 1000 + //#define MAX_SAMPLES_PER_BINARY_FILE 10000 + #define BINARY_FILE_BASENAME "converted" + #define FILES_PER_DIR 1000 + jag_store(); jag_store(const jag_store&) = default; @@ -58,63 +70,56 @@ class jag_store { void set_comm(lbann_comm *comm) { m_comm = comm; + m_num_procs_in_world = m_comm->get_procs_in_world(); + m_rank_in_world = m_comm->get_rank_in_world(); } /// Returns the requested inputs - const std::vector & fetch_inputs(size_t sample_id) const { + const std::vector & fetch_inputs(size_t sample_id, size_t tid) const { check_sample_id(sample_id); - return m_data_inputs[sample_id]; + return m_data_inputs[tid]; } /// Returns the requested scalars - const std::vector & fetch_scalars (size_t sample_id) const { + const std::vector & fetch_scalars (size_t sample_id, size_t tid) const { check_sample_id(sample_id); - return m_data_scalars[sample_id]; + return m_data_scalars[tid]; } /// Returns the requested images - const std::vector> & fetch_images(size_t sample_id) { + const std::vector> & fetch_views(size_t sample_id, size_t tid) { check_sample_id(sample_id); - return m_data_images[sample_id]; + return m_data_images[tid]; } - /** - * Load all keys from the "input" section of the bundle. - * This must be called before calling setup() - */ - void load_inputs(); - - /** - * Load all keys from the "scalars" section of the bundle. - * This must be called before calling setup() - */ - void load_scalars(); - - /** - * Load the requested images. - * This must be called before calling setup() - */ - void load_images(const std::vector &keys); - - /** - * Loads data using the hdf5 conduit API from one or more conduit files. - * "num_stores" and "my_rank" are used to determine which of the files - * (in the conduit_filenames list) will be used. 
-  /**
-   * Load all keys from the "input" section of the bundle.
-   * This must be called before calling setup()
-   */
-  void load_inputs();
-
-  /**
-   * Load all keys from the "scalars" section of the bundle.
-   * This must be called before calling setup()
-   */
-  void load_scalars();
-
-  /**
-   * Load the requested images.
-   * This must be called before calling setup()
-   */
-  void load_images(const std::vector &keys);
-
-  /**
-   * Loads data using the hdf5 conduit API from one or more conduit files.
-   * "num_stores" and "my_rank" are used to determine which of the files
-   * (in the conduit_filenames list) will be used. This functionality is
-   * needed when the jag_store is used in conjunction with
-   * data_store_jag_conduit
-   */
-  void setup(const std::vector conduit_filenames,
+  void setup(data_reader_jag_conduit_hdf5 *reader,
              bool num_stores = 1,
              int my_rank = 0);

   void set_image_size(size_t n) { m_image_size = n; }

   size_t get_linearized_data_size() const;
+  size_t get_linearized_image_size() const { return 4096*4; }
+  //size_t get_linearized_image_size() const { return m_image_size; }
+  size_t get_linearized_channel_size() const { return IMAGE_SIZE_PER_CHANNEL; }
+
+  /// returns the total number of channels in a view (image)
+  /// Note: probably should be deleted, since we can choose which
+  /// channels to use
+  //size_t get_num_channels() const { return NUM_IMAGE_CHANNELS; }
+
+  size_t get_linearized_scalar_size() const { return m_scalars_to_use.size(); }
+  size_t get_linearized_input_size() const { return m_inputs_to_use.size(); }
+
+  /// returns the number of views (images) that we're actually using
+  /// (so currently may be 0, 1, 2, or 3)
+  size_t get_num_img_srcs() const { return m_image_views_to_use.size(); }

-  size_t get_linearized_image_size() const { return m_image_size; }
-  size_t get_linearized_scalar_size() const { return m_scalars_to_load.size(); }
-  size_t get_linearized_input_size() const { return m_inputs_to_load.size(); }
-  size_t get_num_img_srcs() const { return m_images_to_load.size(); }
+  /// returns the number of channels that we're actually using per view,
+  /// i.e., may be 1, 2, 3, or 4
+  size_t get_num_channels_per_view() const { return m_image_channels_to_use.size(); }
+
+  /// returns the number of channels that we're actually using, times num_views
+  size_t get_total_num_channels() const { return get_num_img_srcs() * get_num_channels_per_view(); }

   const std::vector & get_linearized_data_sizes() const { return m_data_sizes; }

@@ -122,41 +127,135 @@ class jag_store {

   size_t get_num_samples() const { return m_num_samples; }

-  private:
-
-  bool m_is_setup;
+  void load_data(int data_id, int tid) {
+    check_sample_id(data_id);
+    if (m_mode == 1) {
+      load_data_conduit(data_id, tid);
+    } else if (m_mode == 2) {
+      load_data_binary(data_id, tid);
+    }
+  }

-  bool m_load_inputs;
+  private:

-  bool m_load_scalars;
+  /// one of these is called by load_data()
+  void load_data_conduit(int data_id, int tid);
+  void load_data_binary(int data_id, int tid);

   size_t m_image_size;

   size_t m_num_samples;

-  bool m_run_tests;
+  lbann_comm *m_comm;
+
+  int m_num_procs_in_world;

-  std::unordered_set m_valid_samples;
+  int m_rank_in_world;

-  std::unordered_map m_id_to_name;
+  bool m_master;

-  std::vector m_inputs_to_load;
-  std::vector m_scalars_to_load;
-  std::vector m_images_to_load;
+  data_reader_jag_conduit_hdf5 *m_reader;

+  /// next three will contain the actual sample data;
+  /// they are filled in by one of the load_data_XX methods;
+  /// each thread has a separate set of buffers
   std::vector> m_data_inputs;
   std::vector> m_data_scalars;
   std::vector>> m_data_images;

-  lbann_comm *m_comm;
-
-  void get_default_keys(std::string &filename, std::string &sample_id, std::string key1, bool master);
+  /// next four are called by setup()
+  void build_data_sizes();
+  void load_variable_names();
+  void report_linearized_sizes();
+  void allocate_memory();
+
+  /// these hold the names of the dependent and independent variables
+  /// that we're using
+  std::vector m_inputs_to_use;
+  std::vector m_scalars_to_use;
+  std::vector m_image_views_to_use;
+  std::vector m_image_channels_to_use;
+
+  /// these fill in the above four variables;
+  /// they are called by load_variable_names()
+  void load_inputs_to_use(const std::string &keys);
+  void load_scalars_to_use(const std::string &keys);
+  void load_image_views_to_use(const std::string &keys);
+  void load_image_channels_to_use(const std::string &keys);

   std::vector m_data_sizes;
-  void build_data_sizes();
+  void check_entry(std::string &e) {
+    if (m_key_map.find(e) == m_key_map.end()) {
+      throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: m_key_map is missing entry: " + e);
+    }
+  }

-  void run_tests(const std::vector &conduit_filenames);
+  /// one of the next three methods is called by setup(), depending
+  /// on the value of --mode=
+  int m_mode;
+  void setup_conduit(); // mode = 1
+  void setup_binary();  // mode = 2
+  void setup_testing(); // mode = 3
+
+  size_t m_max_samples;
+
+  /// next three are used when reading samples from conduit files
+  std::vector m_conduit_filenames;
+  std::vector m_data_id_to_conduit_filename_idx;
+  std::vector m_data_id_to_sample_id;
+
+
+  // these are used when reading samples from binary formatted files
+  std::vector> m_scratch;
+  std::unordered_map m_key_map;
+  // maps a shuffled index to
+  std::unordered_map> m_sample_map;
+  std::unordered_map m_sample_id_to_global_idx;
+  std::vector m_binary_filenames;
+  // maps global idx (i.e.: shuffled indices subscript) to sample ID
+  // (e.g.: 0.9.99.57:1)
+  std::unordered_map m_sample_id_map;
+  size_t m_sample_len;
+  std::vector> m_streams;
+  void read_key_map(const std::string &filename);
+
+  /// methods and variables for dealing with normalization
+  void load_normalization_values();
+  void load_normalization_values_impl(
+      std::vector> &values,
+      const std::vector &variables);
+
+  std::vector> m_normalize_inputs;
+  std::vector> m_normalize_scalars;
+  std::vector> m_normalize_views;
+
+  // magic numbers (from Rushil); these are for normalizing the images
+  // 0.035550589898738466
+  // 0.0012234476453273034
+  // 1.0744965260584181e-05
+  // 2.29319120949361e-07
+
+  // testing and other special methods: if these are invoked, something
+  // special happens and then the code exits; in this case a model is not run
+  void compute_min_max();
+  void compute_bandwidth();
+  void build_conduit_index(const std::vector &filenames);
+  void compute_bandwidth_binary();
+  void convert_conduit_to_binary(const std::vector &filenames);
+  void test_converted_files();
+
+  /// functions and variables for converting conduit files to a binary format;
+  /// these are used by convert_conduit_to_binary
+  void write_binary_metadata(std::string dir);
+  void write_binary(const std::vector &input, const std::string &dir);
+  std::ofstream m_name_file;
+  size_t m_global_file_idx;
+  size_t m_num_converted_samples;
+  void open_binary_file_for_output(const std::string &dir);
+  std::ofstream m_binary_output_file;
+  std::ofstream m_binary_output_file_names;
+  std::string m_binary_output_filename;
 };

 } // end of namespace lbann
diff --git a/include/lbann/io/data_buffers/distributed_io_buffer.hpp b/include/lbann/io/data_buffers/distributed_io_buffer.hpp
index 11a278167ca..ead8bc25953 100644
--- a/include/lbann/io/data_buffers/distributed_io_buffer.hpp
+++ b/include/lbann/io/data_buffers/distributed_io_buffer.hpp
@@ -132,6 +132,8 @@ class distributed_io_buffer : public generic_io_buffer {
   void calculate_num_iterations_per_epoch_spanning_models(int max_mini_batch_size, generic_data_reader *data_reader) override;
   void calculate_num_iterations_per_epoch_single_model(int max_mini_batch_size,
generic_data_reader *data_reader) override; int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers) const override; + static int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers, const lbann_comm* comm); + static bool check_num_parallel_readers(long data_set_size, int mini_batch_size, int num_parallel_readers, const lbann_comm* comm); data_buffer *get_data_buffer(const execution_mode mode) const { data_buffer *data_buffer = nullptr; diff --git a/include/lbann/io/data_buffers/partitioned_io_buffer.hpp b/include/lbann/io/data_buffers/partitioned_io_buffer.hpp index 7563dce6d9a..5a9a033d7dd 100644 --- a/include/lbann/io/data_buffers/partitioned_io_buffer.hpp +++ b/include/lbann/io/data_buffers/partitioned_io_buffer.hpp @@ -58,6 +58,7 @@ class partitioned_io_buffer : public generic_io_buffer { void calculate_num_iterations_per_epoch_spanning_models(int max_mini_batch_size, generic_data_reader *data_reader) override; void calculate_num_iterations_per_epoch_single_model(int max_mini_batch_size, generic_data_reader *data_reader) override; int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers) const override; + static int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers, const lbann_comm* comm); std::vector M_local; }; diff --git a/include/lbann/layers/CMakeLists.txt b/include/lbann/layers/CMakeLists.txt index b85cec6751c..0cc71271bcb 100644 --- a/include/lbann/layers/CMakeLists.txt +++ b/include/lbann/layers/CMakeLists.txt @@ -5,9 +5,12 @@ set_full_path(THIS_DIR_HEADERS # Add the subdirectories add_subdirectory(activations) +add_subdirectory(image) add_subdirectory(io) add_subdirectory(learning) add_subdirectory(loss) +add_subdirectory(math) +add_subdirectory(misc) add_subdirectory(regularizers) add_subdirectory(transform) diff --git a/include/lbann/layers/activations/CMakeLists.txt b/include/lbann/layers/activations/CMakeLists.txt index 46bc6e7a952..29cd16fbee0 100644 --- a/include/lbann/layers/activations/CMakeLists.txt +++ b/include/lbann/layers/activations/CMakeLists.txt @@ -1,10 +1,8 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS activation.hpp - atan.hpp bent_identity.hpp elu.hpp - exponential.hpp identity.hpp leaky_relu.hpp relu.hpp @@ -12,11 +10,9 @@ set_full_path(THIS_DIR_HEADERS sigmoid.hpp smooth_relu.hpp softmax.hpp + log_softmax.hpp softplus.hpp swish.hpp - tanh.hpp - power.hpp - log.hpp ) # Propagate the files up the tree diff --git a/include/lbann/layers/activations/abs.hpp b/include/lbann/layers/activations/abs.hpp deleted file mode 100644 index 7ca28c4240a..00000000000 --- a/include/lbann/layers/activations/abs.hpp +++ /dev/null @@ -1,114 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_LAYER_ACTIVATION_ABS_HPP_INCLUDED -#define LBANN_LAYER_ACTIVATION_ABS_HPP_INCLUDED - -#include "lbann/layers/activations/activation.hpp" - -namespace lbann { - -#ifdef LBANN_HAS_GPU -namespace abs_cuda { - void fp(int height, int width, - const DataType* input, - int input_leading_dim, - DataType* output, - int output_leading_dim); - void bp(int height, int width, - const DataType* input, - int input_leading_dim, - const DataType* gradient_wrt_output, - int gradient_wrt_output_leading_dim, - DataType* gradient_wrt_input, - int gradient_wrt_input_leading_dim); -} // namespace abs_cuda -#endif // LBANN_HAS_GPU - -/** Absolute value. */ -template -class abs_layer : public entrywise_activation_layer { - public: - abs_layer(lbann_comm *comm) : entrywise_activation_layer(comm) { } - abs_layer* copy() const override { return new abs_layer(*this); } - std::string get_type() const override { return "abs"; } - data_layout get_data_layout() const override { return T_layout; } - El::Device get_device_allocation() const override { return Dev; } - std::string get_description() const override { - return std::string {} - + " abs" + " dataLayout: " - + this->get_data_layout_string(get_data_layout()); - } - - protected: - - DataType activation(DataType x) const override { - return std::abs(x); - } - - DataType activation_derivative(DataType x) const override { - if (x > DataType(0)) { - return 1; - } else if (x < DataType(0)) { - return -1; - } else { - return 0; - } - } - - void fp_compute_gpu() override { -#ifndef LBANN_HAS_GPU - LBANN_ERROR("CUDA not detected"); -#else - abs_cuda::fp(get_output_size(), - get_prev_activations().LocalWidth(), - get_prev_activations().LockedBuffer(), - get_prev_activations().LDim(), - get_activations().Buffer(), - get_activations().LDim()); -#endif // LBANN_HAS_GPU - } - - void bp_compute_gpu() override { -#ifndef LBANN_HAS_GPU - LBANN_ERROR("CUDA not detected"); -#else - abs_cuda::bp(get_output_size(), - get_prev_activations().LocalWidth(), - get_prev_activations().LockedBuffer(), - get_prev_activations().LDim(), - get_prev_error_signals().LockedBuffer(), - get_prev_error_signals().LDim(), - get_error_signals().Buffer(), - get_error_signals().LDim()); -#endif // LBANN_HAS_GPU - } - -}; - -} // namespace lbann - -#endif // LBANN_LAYER_ACTIVATION_ABS_HPP_INCLUDED diff --git a/include/lbann/layers/activations/exponential.hpp b/include/lbann/layers/activations/exponential.hpp deleted file mode 100644 index cd5ad20fcb2..00000000000 --- a/include/lbann/layers/activations/exponential.hpp +++ /dev/null @@ -1,55 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. 
For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef EXPONENTIAL_HPP_INCLUDED -#define EXPONENTIAL_HPP_INCLUDED - -#include "lbann/layers/activations/activation.hpp" - -namespace lbann { - -/** Exponential activation function. */ -template -class exponential_layer : public entrywise_activation_layer { - public: - exponential_layer(lbann_comm *comm) : entrywise_activation_layer(comm) {} - exponential_layer* copy() const override { return new exponential_layer(*this); } - std::string get_type() const override { return "exponential"; } - data_layout get_data_layout() const override { return T_layout; } - El::Device get_device_allocation() const override { return Dev; } - - protected: - DataType activation(DataType x) const override { - return std::exp(x); - } - DataType activation_derivative(DataType x) const override { - return std::exp(x); - } -}; - -} // namespace lbann - -#endif // EXPONENTIAL_HPP_INCLUDED diff --git a/include/lbann/layers/activations/l2_loss.hpp b/include/lbann/layers/activations/l2_loss.hpp deleted file mode 100644 index f2a79a81221..00000000000 --- a/include/lbann/layers/activations/l2_loss.hpp +++ /dev/null @@ -1,64 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef L2_NORM_HPP_INCLUDED -#define L2_NORM_HPP_INCLUDED - -#include "lbann/layers/activations/activation.hpp" - -namespace lbann { - -/** Half the L2 loss of a tensor without the sqrt. 
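 * Entrywise: an input x of 3 contributes 4.5, and the backprop
 * derivative is simply x itself.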
- * (x^2)/2 - * @todo relocate to loss layer as part of github issue 154 - */ -template -class l2_loss_layer : public entrywise_activation_layer { - public: - l2_loss_layer(lbann_comm *comm) : entrywise_activation_layer(comm) { } - l2_loss_layer* copy() const override { return new l2_loss_layer(*this); } - std::string get_type() const override { return "l2_loss"; } - data_layout get_data_layout() const override { return T_layout; } - El::Device get_device_allocation() const override { return Dev; } - std::string get_description() const override { - return std::string {} - + " l2_loss" + " dataLayout: " - + this->get_data_layout_string(get_data_layout()); - } - - protected: - DataType activation(DataType x) const override { - return DataType(0.5)*x*x; - } - - DataType activation_derivative(DataType x) const override { - return x; - } -}; - -} // namespace lbann - -#endif // L2_NORM_HPP_INCLUDED diff --git a/include/lbann/layers/activations/log.hpp b/include/lbann/layers/activations/log.hpp deleted file mode 100644 index 5bd0fe2714f..00000000000 --- a/include/lbann/layers/activations/log.hpp +++ /dev/null @@ -1,92 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LOG_HPP_INCLUDED -#define LOG_HPP_INCLUDED - -#include "lbann/layers/activations/activation.hpp" - -#define LBANN_ENABLE_LOG_CUTOFF - -namespace lbann { - -/** Logarithm function. 
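 * Computes log_b(x) = ln(x) / ln(b) via the cached factor 1/ln(b); as a
 * quick numeric check, with base b = 10, activation(100) = ln(100)/ln(10) = 2.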
*/ -template -class log_layer : public entrywise_activation_layer { - public: - log_layer(lbann_comm *comm, DataType base = std::exp(1.0)) - : entrywise_activation_layer(comm), - m_base(base), - m_inv_log_base(1/std::log(base)), - m_min_input(std::max(m_inv_log_base * std::sqrt(std::numeric_limits::min()), - m_inv_log_base / std::sqrt(std::numeric_limits::max()))) { - if (m_base <= DataType(0)) { - std::stringstream err; - err << "log base (" << m_base << ") is not positive"; - LBANN_ERROR(err.str()); - } - } - log_layer* copy() const override { return new log_layer(*this); } - std::string get_type() const override { return "log"; } - data_layout get_data_layout() const override { return T_layout; } - El::Device get_device_allocation() const override { return Dev; } - - protected: - DataType activation(DataType x) const override { - if (x < m_min_input) { - #ifdef LBANN_ENABLE_LOG_CUTOFF - x = m_min_input; - #else - LBANN_ERROR("invalid input"); - #endif // LBANN_ENABLE_LOG_CUTOFF - } - return std::log(x) * m_inv_log_base; - } - DataType activation_derivative(DataType x) const override { - if (x < m_min_input) { - #ifdef LBANN_ENABLE_LOG_CUTOFF - return DataType(0); - #else - LBANN_ERROR("invalid input"); - #endif // LBANN_ENABLE_LOG_CUTOFF - } - return m_inv_log_base / x; - } - - private: - - /** Logarithm base. */ - const DataType m_base; - /** 1 / ln(m_base). */ - const DataType m_inv_log_base; - /** Minimum input value. */ - const DataType m_min_input; - -}; - -} // namespace lbann - -#endif // LOG_HPP_INCLUDED diff --git a/include/lbann/layers/activations/log_softmax.hpp b/include/lbann/layers/activations/log_softmax.hpp new file mode 100644 index 00000000000..a1afceb4526 --- /dev/null +++ b/include/lbann/layers/activations/log_softmax.hpp @@ -0,0 +1,120 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_ACTIVATION_LOG_SOFTMAX_HPP_INCLUDED +#define LBANN_LAYER_ACTIVATION_LOG_SOFTMAX_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" +#include "lbann/utils/cudnn.hpp" + +namespace lbann { + +/** Log softmax layer. 
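 * Computing the result in this fused form is numerically safer than taking
 * the log of an already-computed softmax, whose entries can underflow to zero.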
+ * The softmax function is defined: + * \f[ \text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}} \f] + * This layer computes: + * \f[ \log \text{softmax}(x)_i = x_i - \log \sum_j e^{x_j} \f] + */ +template +class log_softmax_layer : public Layer { +public: + + log_softmax_layer(lbann_comm *comm) + : Layer(comm) +#ifdef LBANN_HAS_CUDNN + , m_tensors_cudnn_desc(this) +#endif // LBANN_HAS_CUDNN + {} + + log_softmax_layer(const log_softmax_layer& other) + : Layer(other), + m_workspace(other.m_workspace ? + other.m_workspace->Copy() : nullptr) +#ifdef LBANN_HAS_CUDNN + , m_tensors_cudnn_desc(other.m_tensors_cudnn_desc) +#endif // LBANN_HAS_CUDNN + { +#ifdef LBANN_HAS_CUDNN + m_tensors_cudnn_desc.set_layer(this); +#endif // LBANN_HAS_CUDNN + } + + log_softmax_layer& operator=(const log_softmax_layer& other) { + Layer::operator=(other); + m_workspace.reset(other.m_workspace ? + other.m_workspace->Copy() : nullptr); +#ifdef LBANN_HAS_CUDNN + m_tensors_cudnn_desc = other.m_tensors_cudnn_desc; + m_tensors_cudnn_desc.set_layer(this); +#endif // LBANN_HAS_CUDNN + return *this; + } + + ~log_softmax_layer() = default; + + log_softmax_layer* copy() const override { return new log_softmax_layer(*this); } + std::string get_type() const override { return "log softmax"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + + void setup_matrices(const El::Grid& grid) override { + Layer::setup_matrices(grid); + auto dist = get_prev_activations().DistData(); + dist.colDist = El::STAR; + m_workspace.reset(AbsDistMat::Instantiate(dist)); +#ifdef HYDROGEN_HAVE_CUB + if (m_workspace->GetLocalDevice() == El::Device::GPU) { + m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool + } +#endif // HYDROGEN_HAVE_CUB + } + + void fp_setup_outputs(El::Int mini_batch_size) override { + Layer::fp_setup_outputs(mini_batch_size); + const auto& dist_data = get_prev_activations().DistData(); + m_workspace->Empty(false); + m_workspace->AlignWith(dist_data); + m_workspace->Resize(1, mini_batch_size); + } + + void fp_compute() override; + void bp_compute() override; + +private: + + /** Workspace for column-wise reductions. */ + std::unique_ptr m_workspace; + +#ifdef LBANN_HAS_CUDNN + /** Tensor cuDNN descriptors. */ + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; +#endif // LBANN_HAS_CUDNN + +}; + +} // namespace lbann + +#endif // LBANN_LAYER_ACTIVATION_LOG_SOFTMAX_HPP_INCLUDED diff --git a/include/lbann/layers/activations/power.hpp b/include/lbann/layers/activations/power.hpp deleted file mode 100644 index 2c5e692e125..00000000000 --- a/include/lbann/layers/activations/power.hpp +++ /dev/null @@ -1,70 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef POWER_HPP_INCLUDED -#define POWER_HPP_INCLUDED - -#include "lbann/layers/activations/activation.hpp" - -namespace lbann { - -/** Power function. */ -template -class power_layer : public entrywise_activation_layer { - public: - power_layer(lbann_comm *comm, EvalType exponent) - : entrywise_activation_layer(comm), m_exponent(exponent) {} - power_layer* copy() const override { return new power_layer(*this); } - std::string get_type() const override { return "power"; } - data_layout get_data_layout() const override { return T_layout; } - El::Device get_device_allocation() const override { return Dev; } - - protected: - DataType activation(DataType x) const override { - if (m_exponent == EvalType(2)) { - return x * x; - } else { - return std::pow(x, m_exponent); - } - } - DataType activation_derivative(DataType x) const override { - if (m_exponent == EvalType(2)) { - return 2 * x; - } else { - return m_exponent * std::pow(x, m_exponent - EvalType(1)); - } - } - - private: - - /** Exponent for power function. */ - const EvalType m_exponent; - -}; - -} // namespace lbann - -#endif // POWER_HPP_INCLUDED diff --git a/include/lbann/layers/activations/relu.hpp b/include/lbann/layers/activations/relu.hpp index 7b97c60229a..3ff89444ec5 100644 --- a/include/lbann/layers/activations/relu.hpp +++ b/include/lbann/layers/activations/relu.hpp @@ -27,44 +27,25 @@ #ifndef LBANN_LAYER_ACTIVATION_RELU_HPP_INCLUDED #define LBANN_LAYER_ACTIVATION_RELU_HPP_INCLUDED -#include "lbann/layers/activations/activation.hpp" +#include "lbann/layers/layer.hpp" namespace lbann { -/** Rectified linear unit activation function. +/** Rectified linear unit activation function layer. * \f[ ReLU(x) = \text{max}(x, 0) \f] * See https://en.wikipedia.org/wiki/Rectifier_(neural_networks) */ template -class relu_layer : public entrywise_activation_layer { +class relu_layer : public Layer { public: - - relu_layer(lbann_comm *comm) : entrywise_activation_layer(comm) {} + relu_layer(lbann_comm *comm) : Layer(comm) {} relu_layer* copy() const override { return new relu_layer(*this); } std::string get_type() const override { return "ReLU"; } - - /** Returns description of ctor params */ - std::string get_description() const override { - return std::string {} + - " relu" + " dataLayout: " + this->get_data_layout_string(get_data_layout()); - } - data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - protected: - - DataType activation(DataType x) const override { - return x > DataType(0) ? x : DataType(0); - } - - DataType activation_derivative(DataType x) const override { - return x > DataType(0) ? 
DataType(1) : DataType(0); - } - void fp_compute() override; void bp_compute() override; - }; } // namespace lbann diff --git a/include/lbann/layers/activations/sigmoid.hpp b/include/lbann/layers/activations/sigmoid.hpp index b25d8892e4c..83a26e7bf56 100644 --- a/include/lbann/layers/activations/sigmoid.hpp +++ b/include/lbann/layers/activations/sigmoid.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYER_ACTIVATION_SIGMOID_HPP_INCLUDED #define LBANN_LAYER_ACTIVATION_SIGMOID_HPP_INCLUDED -#include "lbann/layers/activations/activation.hpp" +#include "lbann/layers/layer.hpp" #include "lbann/utils/cuda.hpp" // Output is strictly in (0,1) to avoid numerical issues @@ -35,53 +35,21 @@ namespace lbann { -/** Sigmoid activation function. +/** Sigmoid function layer. + * \f[ \sigma(x) = \frac{1}{1 + e^{-x}} \f] * See https://en.wikipedia.org/wiki/Sigmoid_function */ template -class sigmoid_layer : public entrywise_activation_layer { +class sigmoid_layer : public Layer { public: - sigmoid_layer(lbann_comm *comm) : entrywise_activation_layer(comm) {} - + sigmoid_layer(lbann_comm *comm) : Layer(comm) {} sigmoid_layer* copy() const override { return new sigmoid_layer(*this); } std::string get_type() const override { return "sigmoid"; } data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - protected: - - DataType activation(DataType x) const override { - constexpr DataType one = 1; - DataType y = 1 / (one + std::exp(-x)); -#ifdef LBANN_ENABLE_SIGMOID_CUTOFF - if (y <= eps) { y = eps; } - else if (y >= one - eps) { y = one - eps; } -#endif // LBANN_ENABLE_SIGMOID_CUTOFF - return y; - } - - DataType activation_derivative(DataType x) const override { - constexpr DataType one = 1; - const auto& y = activation(x); -#ifdef LBANN_ENABLE_SIGMOID_CUTOFF - if (y <= eps || y >= one - eps) { return DataType(0); } -#endif // LBANN_ENABLE_SIGMOID_CUTOFF - return y * (one - y); - } - void fp_compute() override; void bp_compute() override; - -private: - -#ifdef LBANN_ENABLE_SIGMOID_CUTOFF - /** Cutoff value for output. - * If sigmoid cutoff is enabled, outputs are guaranteed to be in - * the interval [eps, 1-eps]. - */ - static constexpr DataType eps = std::numeric_limits::epsilon(); -#endif // LBANN_ENABLE_SIGMOID_CUTOFF - }; } // namespace lbann diff --git a/include/lbann/layers/activations/sigmoid_bce_with_logits.hpp b/include/lbann/layers/activations/sigmoid_bce_with_logits.hpp deleted file mode 100644 index 2ed297724bb..00000000000 --- a/include/lbann/layers/activations/sigmoid_bce_with_logits.hpp +++ /dev/null @@ -1,84 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef SIGMOID_BCE_WITH_LOGITS_HPP_INCLUDED -#define SIGMOID_BCE_WITH_LOGITS_HPP_INCLUDED - -#include "lbann/layers/activations/activation.hpp" - -namespace lbann { - -/** Compute logistic loss. - * @param label - label for ground truth 0/1 - * sigmoid_cross_entropy_with_logits (https://www.tensorflow.org/api_docs/python/tf/nn/sigmoid_cross_entropy_with_logits) - * @check that m_true_label is zero or 1 - */ -template -class sigmoid_bce_with_logits_layer : public entrywise_activation_layer { - private: - int m_ground_truth_label; - public: - sigmoid_bce_with_logits_layer(lbann_comm *comm, int ground_truth_label) : entrywise_activation_layer(comm), - m_ground_truth_label(ground_truth_label) {} - sigmoid_bce_with_logits_layer* copy() const override { return new sigmoid_bce_with_logits_layer(*this); } - std::string get_type() const override { return "sigmoid_bce_with_logits"; } - data_layout get_data_layout() const override { return T_layout; } - El::Device get_device_allocation() const override { return Dev; } - std::string get_description() const override { - return std::string {} - + "sigmod_bce_with_logits " + " gt_label: " + std::to_string(m_ground_truth_label) - + " dataLayout: " + this->get_data_layout_string(get_data_layout()); - } - - protected: - DataType activation(DataType x) const override { - // Note: This formulation has very good numerical accuracy if - // ground truth is exactly zero or one, but also may introduce - // denormalized floats. - if (x >= DataType(0)) { - return x * (DataType(1) - m_ground_truth_label) + std::log1p(std::exp(-x)); - } else { - return -x * m_ground_truth_label + std::log1p(std::exp(x)); - } - } - - DataType activation_derivative(DataType x) const override { - // Note: This formulation has very good numerical accuracy if - // ground truth is exactly zero or one, but also may introduce - // denormalized floats. - const DataType one = DataType(1); - const DataType one_minus_truth = one - m_ground_truth_label; - if (x >= DataType(0)) { - return (one_minus_truth - m_ground_truth_label * std::exp(-x)) / (one + std::exp(-x)); - } else { - return (one_minus_truth * std::exp(x) - m_ground_truth_label) / (one + std::exp(x)); - } - } -}; - -} // namespace lbann - -#endif // SIGMOID_BCE_WITH_LOGITS_HPP_INCLUDED diff --git a/include/lbann/layers/activations/softmax.hpp b/include/lbann/layers/activations/softmax.hpp index 62f26b763cb..db5a5f82dab 100644 --- a/include/lbann/layers/activations/softmax.hpp +++ b/include/lbann/layers/activations/softmax.hpp @@ -24,291 +24,101 @@ // permissions and limitations under the license. 
//////////////////////////////////////////////////////////////////////////////// -#ifndef LBANN_LAYER_SOFTMAX_HPP_INCLUDED -#define LBANN_LAYER_SOFTMAX_HPP_INCLUDED +#ifndef LBANN_LAYER_ACTIVATION_SOFTMAX_HPP_INCLUDED +#define LBANN_LAYER_ACTIVATION_SOFTMAX_HPP_INCLUDED -#include "lbann/layers/activations/activation.hpp" #include "lbann/layers/layer.hpp" -#include "lbann/io/file_io.hpp" -#include "lbann/utils/random.hpp" -#include "lbann/models/model.hpp" -#include "lbann/utils/cuda.hpp" #include "lbann/utils/cudnn.hpp" -#include -#include -#include - -// Output has minimum value to avoid denormalized floats +// Threshold outputs to a minimum value. +// If enabled, the minimum output value is sqrt(min), where min is the +// minimum, normalized, positive value (~1e-19 for float and ~1e-154 +// for double). The gradients w.r.t. input will be inaccurate, on the +// order of the minimum output value. #define LBANN_ENABLE_SOFTMAX_CUTOFF namespace lbann { -#ifdef LBANN_HAS_GPU -namespace softmax_cuda { -/** Apply minimum cutoff to activation entries. - * A minimum output value helps avoid denormalized floats. Data is - * assumed to be on GPU. - */ -void fp_cutoff(int height, int width, - DataType* output, - int output_leading_dim, - DataType cutoff, - cudaStream_t stream); -/** Error signal correction if activations have minimum cutoff. - * Data is assumed to be on GPU. - */ -void bp_cutoff(int height, int width, - const DataType* output, - int output_leading_dim, - DataType* gradient_wrt_input, - int gradient_wrt_input_leading_dim, - DataType cutoff, - cudaStream_t stream); -/** Compute the maximum entry in input for each column. - * Data is assumed to be on the GPU. - */ -void max_local_col_entry(int height, int width, - const DataType * __restrict__ input, - int input_ldim, - DataType * __restrict__ workspace, - cudaStream_t stream); -/** Exponentiate the (shifted) input, and compute its sum. - * Data is assumed to be on the GPU. +/** Softmax layer. + * \f[ \text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}} \f] */ -void exp_and_col_sum(int height, int width, - const DataType * __restrict__ input, - int intput_ldim, - DataType * __restrict__ output, - int output_ldim, - DataType * __restrict__ workspace, - cudaStream_t stream); -/** Divide each entry in a column by the pre-computed sum of the column. - * Apply a minimum cutoff to the entries if needed. - * Data is assumed to be on the GPU. - */ -void div_by_col_sums_and_cutoff(int height, int width, - DataType * __restrict__ output, - int output_ldim, - const DataType * __restrict__ workspace, - const DataType cutoff, - cudaStream_t stream); -/** Compute the gradient w.r.t. the input and apply a cutoff if needed. - * Data is assumed to be on the GPU. - */ -void grad_wrt_input_and_cutoff(int height, int width, - const DataType * __restrict__ output, - int output_ldim, - const DataType * __restrict__ workspace, - const DataType * __restrict__ grad_wrt_output, - int grad_wrt_output_ldim, - DataType * __restrict__ grad_wrt_input, - int grad_wrt_input_ldim, - const DataType cutoff, - cudaStream_t stream); -} // namespace softmax_cuda -#endif // LBANN_HAS_CUDA - -/** Softmax layer. */ -template -class softmax_layer : public activation_layer { - - private: - - /** Workspace for column-wise reductions. */ - AbsDistMat *m_workspace; - - /** Lower bound for outputs. - * This should be sufficiently large to avoid denormalized - * floats. - */ - DataType m_min_output; - -#ifdef LBANN_HAS_CUDNN - /** Tensor cuDNN descriptors. 
*/ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; -#endif // LBANN_HAS_CUDNN - - public: +template +class softmax_layer : public Layer { +public: softmax_layer(lbann_comm *comm) - : activation_layer(comm), - m_workspace(nullptr), - m_min_output(std::sqrt(std::numeric_limits::min())) + : Layer(comm) #ifdef LBANN_HAS_CUDNN , m_tensors_cudnn_desc(this) #endif // LBANN_HAS_CUDNN {} softmax_layer(const softmax_layer& other) - : activation_layer(other), - m_min_output(other.m_min_output) + : Layer(other), + m_workspace(other.m_workspace ? + other.m_workspace->Copy() : nullptr) #ifdef LBANN_HAS_CUDNN , m_tensors_cudnn_desc(other.m_tensors_cudnn_desc) #endif // LBANN_HAS_CUDNN { - - // Matrix deep copy - m_workspace = other.m_workspace; - if (m_workspace != nullptr) { m_workspace = m_workspace->Copy(); } - #ifdef LBANN_HAS_CUDNN m_tensors_cudnn_desc.set_layer(this); #endif // LBANN_HAS_CUDNN - } softmax_layer& operator=(const softmax_layer& other) { - activation_layer::operator=(other); - m_min_output = other.m_min_output; - - // Deep matrix copy - if (m_workspace != nullptr) { delete m_workspace; } - m_workspace = other.m_workspace; - if (m_workspace != nullptr) { m_workspace = m_workspace->Copy(); } - + Layer::operator=(other); + m_workspace.reset(other.m_workspace ? + other.m_workspace->Copy() : nullptr); #ifdef LBANN_HAS_CUDNN - // Copy cuDNN objects m_tensors_cudnn_desc = other.m_tensors_cudnn_desc; m_tensors_cudnn_desc.set_layer(this); #endif // LBANN_HAS_CUDNN - + return *this; } - ~softmax_layer() { - if (m_workspace != nullptr) { delete m_workspace; } - } + ~softmax_layer() = default; softmax_layer* copy() const override { return new softmax_layer(*this); } std::string get_type() const override { return "softmax"; } - - std::string get_description() const override { - return std::string {} + " softmax" + " dataLayout: " - + this->get_data_layout_string(get_data_layout()); - } - - data_layout get_data_layout() const override { return T_layout; } - - El::Device get_device_allocation() const override { return Dev; } - - void setup_matrices(const El::Grid& grid) override; - - void setup_data() override { - activation_layer::setup_data(); - const int mini_batch_size = this->m_model->get_max_mini_batch_size(); - m_workspace->Resize(1, mini_batch_size); + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + + void setup_matrices(const El::Grid& grid) override { + Layer::setup_matrices(grid); + auto dist = get_prev_activations().DistData(); + dist.colDist = El::STAR; + m_workspace.reset(AbsDistMat::Instantiate(dist)); +#ifdef HYDROGEN_HAVE_CUB + if (m_workspace->GetLocalDevice() == El::Device::GPU) { + m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool + } +#endif // HYDROGEN_HAVE_CUB } void fp_setup_outputs(El::Int mini_batch_size) override { - activation_layer::fp_setup_outputs(mini_batch_size); + Layer::fp_setup_outputs(mini_batch_size); + const auto& dist_data = get_prev_activations().DistData(); + m_workspace->Empty(false); + m_workspace->AlignWith(dist_data); m_workspace->Resize(1, mini_batch_size); } void fp_compute() override; void bp_compute() override; - virtual void fp_compute_cpu() { - - // Local matrices - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); - auto& local_workspace = m_workspace->Matrix(); - - // Matrix parameters - const El::Int local_height = local_input.Height(); - const El::Int local_width = 
local_input.Width(); +private: - // Find maximum entry in each column - if (local_height == 0) { - // When there's no local data, fill the workspace with a small value so - // the maximum across processors is still computed correctly. - El::Fill(local_workspace, std::numeric_limits::lowest()); - } else { - #pragma omp parallel for - for (El::Int col = 0; col < local_width; ++col) { - DataType max_entry = local_input(0, col); - for (El::Int row = 1; row < local_height; ++row) { - max_entry = std::max(max_entry, local_input(row, col)); - } - local_workspace(0, col) = max_entry; - } - } - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm(), - El::mpi::MAX); - - // Exponentiate activations and compute column sums - // Note: Subtracting by the column max prevents activations from - // blowing up. Large negative values underflow to 0. - #pragma omp parallel for - for (El::Int col = 0; col < local_width; ++col) { - const DataType shift = local_workspace(0, col); - DataType sum = 0; - for (El::Int row = 0; row < local_height; ++row) { - const DataType x = local_input(row, col); - const DataType y = std::exp(x - shift); - local_output(row, col) = y; - sum += y; - } - local_workspace(0, col) = sum; - } - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm()); - - // Divide activations by column sums - // Note: Small values are rounded to minimum output value to avoid - // denormalized floats. - #pragma omp parallel for - for (El::Int col = 0; col < local_width; ++col) { - const DataType scale = DataType(1) / local_workspace(0, col); - for (El::Int row = 0; row < local_height; ++row) { - DataType& y = local_output(row, col); - y *= scale; - #ifdef LBANN_ENABLE_SOFTMAX_CUTOFF - y = std::max(y, m_min_output); - #endif // LBANN_ENABLE_SOFTMAX_CUTOFF - } - } - - } - - virtual void bp_compute_cpu() { - - // Local matrices - const DMat& local_output = get_local_activations(); - const DMat& local_gradient_wrt_output = get_local_prev_error_signals(); - DMat& local_gradient_wrt_input = get_local_error_signals(); - DMat& local_workspace = m_workspace->Matrix(); - - // Matrix parameters - const El::Int local_height = local_output.Height(); - const El::Int local_width = local_output.Width(); - - // Compute dot products between output and gradient w.r.t. output - for (El::Int col = 0; col < local_width; ++col) { - const auto& y = local_output(El::ALL, El::IR(col)); - const auto& dy = local_gradient_wrt_output(El::ALL, El::IR(col)); - local_workspace(0, col) = El::Dot(y, dy); - } - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm()); - - // Compute gradient w.r.t. input - #pragma omp parallel for - for (El::Int col = 0; col < local_width; ++col) { - const DataType y_dot_dy = local_workspace(0, col); - for (El::Int row = 0; row < local_height; ++row) { - const DataType y = local_output(row, col); - const DataType dy = local_gradient_wrt_output(row, col); - DataType dx = y * (dy - y_dot_dy); - #ifdef LBANN_ENABLE_SOFTMAX_CUTOFF - if (y <= m_min_output) { dx = DataType(0); } - #endif - local_gradient_wrt_input(row, col) = dx; - } - } + /** Workspace for column-wise reductions. */ + std::unique_ptr m_workspace; - } +#ifdef LBANN_HAS_CUDNN + /** Tensor cuDNN descriptors. 
*/ + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; +#endif // LBANN_HAS_CUDNN }; } // namespace lbann -#endif // LBANN_LAYER_SOFTMAX_HPP_INCLUDED +#endif // LBANN_LAYER_ACTIVATION_SOFTMAX_HPP_INCLUDED diff --git a/include/lbann/layers/activations/tanh.hpp b/include/lbann/layers/activations/tanh.hpp deleted file mode 100644 index 6f4eda5fa6f..00000000000 --- a/include/lbann/layers/activations/tanh.hpp +++ /dev/null @@ -1,137 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_LAYER_ACTIVATION_TANH_HPP_INCLUDED -#define LBANN_LAYER_ACTIVATION_TANH_HPP_INCLUDED - -#include "lbann/layers/activations/activation.hpp" -#include "lbann/utils/cudnn.hpp" - -namespace lbann { - -/** Hyperbolic tangent. */ -template -class tanh_layer : public entrywise_activation_layer { - - private: -#ifdef LBANN_HAS_CUDNN - /** Activation cuDNN descriptor. */ - cudnnActivationDescriptor_t m_activation_cudnn_desc; - /** Tensor cuDNN descriptors. 
*/ - cudnn::entrywise_layer_tensor_manager m_tensors_cudnn_desc; -#endif // LBANN_HAS_CUDNN - - public: - - tanh_layer(lbann_comm *comm) - : entrywise_activation_layer(comm) -#ifdef LBANN_HAS_CUDNN - , m_activation_cudnn_desc(nullptr), - m_tensors_cudnn_desc(this) -#endif // LBANN_HAS_CUDNN - {} - - tanh_layer(const tanh_layer& other) - : entrywise_activation_layer(other) -#ifdef LBANN_HAS_CUDNN - , m_activation_cudnn_desc(nullptr), - m_tensors_cudnn_desc(other.m_tensors_cudnn_desc) -#endif // LBANN_HAS_CUDNN - { -#ifdef LBANN_HAS_CUDNN - cudnn::copy_activation_desc(other.m_activation_cudnn_desc, - m_activation_cudnn_desc); - m_tensors_cudnn_desc.set_layer(this); -#endif // LBANN_HAS_CUDNN - } - - tanh_layer& operator=(const tanh_layer& other) { - entrywise_activation_layer::operator=(other); -#ifdef LBANN_HAS_CUDNN - cudnn::copy_activation_desc(other.m_activation_cudnn_desc, - m_activation_cudnn_desc); - m_tensors_cudnn_desc = other.m_tensors_cudnn_desc; - m_tensors_cudnn_desc.set_layer(this); -#endif // LBANN_HAS_CUDNN - return *this; - } - - ~tanh_layer() { -#ifdef LBANN_HAS_CUDNN - if (m_activation_cudnn_desc != nullptr) { - cudnnDestroyActivationDescriptor(m_activation_cudnn_desc); - } -#endif // LBANN_HAS_CUDNN - } - - tanh_layer* copy() const override { return new tanh_layer(*this); } - std::string get_type() const override { return "tanh"; } - - /** Returns description of ctor params */ - std::string get_description() const override { - return std::string {} + - " tanh" + " dataLayout: " + this->get_data_layout_string(get_data_layout()); - } - - data_layout get_data_layout() const override { return T_layout; } - El::Device get_device_allocation() const override { return Dev; } - - void setup_gpu() override { - entrywise_activation_layer::setup_gpu(); -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - if (m_activation_cudnn_desc != nullptr) { - CHECK_CUDNN(cudnnDestroyActivationDescriptor(m_activation_cudnn_desc)); - m_activation_cudnn_desc = nullptr; - } - CHECK_CUDNN(cudnnCreateActivationDescriptor(&m_activation_cudnn_desc)); - CHECK_CUDNN(cudnnSetActivationDescriptor(m_activation_cudnn_desc, - CUDNN_ACTIVATION_TANH, - CUDNN_PROPAGATE_NAN, - 0.0)); -#endif // LBANN_HAS_CUDNN - } - - protected: - - DataType activation(DataType x) const override { - return std::tanh(x); - } - - DataType activation_derivative(DataType x) const override { - const auto& coshx = std::cosh(x); - return 1 / (coshx * coshx); - } - - void fp_compute() override; - void bp_compute() override; - -}; - -} // namespace lbann - -#endif // LBANN_LAYER_ACTIVATION_TANH_HPP_INCLUDED diff --git a/include/lbann/layers/image/CMakeLists.txt b/include/lbann/layers/image/CMakeLists.txt new file mode 100644 index 00000000000..b9a997aae6c --- /dev/null +++ b/include/lbann/layers/image/CMakeLists.txt @@ -0,0 +1,7 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + bilinear_resize.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/layers/image/bilinear_resize.hpp b/include/lbann/layers/image/bilinear_resize.hpp new file mode 100644 index 00000000000..cc64e437453 --- /dev/null +++ b/include/lbann/layers/image/bilinear_resize.hpp @@ -0,0 +1,110 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. 
Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_IMAGE_BILINEAR_RESIZE_HPP_INCLUDED +#define LBANN_LAYERS_IMAGE_BILINEAR_RESIZE_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" + +namespace lbann { + +/** Bilinear resize layer. + * Tensors are assumed to be image data in CHW format. Gradients are + * not propagated during backprop. + */ +template +class bilinear_resize_layer : public Layer { +public: + + bilinear_resize_layer(lbann_comm *comm, El::Int height, El::Int width) + : Layer(comm), m_height(height), m_width(width) { + static_assert(Layout == data_layout::DATA_PARALLEL, + "bilinear_resize_layer only supports DATA_PARALLEL"); + } + + bilinear_resize_layer* copy() const override { + return new bilinear_resize_layer(*this); + } + std::string get_type() const override { return "bilinear resize"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + + void fp_compute() override; + +protected: + + void setup_dims() override { + Layer::setup_dims(); + + // Get input dimensions + auto dims = get_input_dims(); + const auto& num_dims = dims.size(); + + // Check that dimensions are valid + std::stringstream err; + if (num_dims < 2) { + err << get_type() << " layer \"" << get_name() << "\" " + << "expects input with at least two dimensions, " + << "but input dimensions are "; + for (size_t i = 0; i < num_dims; ++i) { + err << (i > 0 ? " x " : "") << dims[i]; + } + LBANN_ERROR(err.str()); + } else if (m_height <= 0) { + err << get_type() << " layer \"" << get_name() << "\" " + << "attempted to resize with " + << "negative height (" << m_height << ")"; + LBANN_ERROR(err.str()); + } else if (m_width <= 0) { + err << get_type() << " layer \"" << get_name() << "\" " + << "attempted to resize with " + << "negative width (" << m_width << ")"; + LBANN_ERROR(err.str()); + } + + // Resize output tensor + dims[num_dims-2] = m_height; + dims[num_dims-1] = m_width; + set_output_dims(dims); + + } + +private: + + /** Output image height. + * Data is assumed to be in CHW format. + */ + El::Int m_height; + /** Output image width. + * Data is assumed to be in CHW format. 
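 * As a hedged illustration of one common bilinear convention (an assumption
 * here, not stated in this header): output pixel (r, c) maps to input
 * location ((r + 0.5) * H_in / m_height - 0.5, (c + 0.5) * W_in / m_width - 0.5)
 * and blends the four nearest input pixels.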
+ */ + El::Int m_width; + +}; + +} // namespace lbann + +#endif // LBANN_LAYERS_IMAGE_BILINEAR_RESIZE_HPP_INCLUDED diff --git a/include/lbann/layers/io/input/generic_input_layer.hpp b/include/lbann/layers/io/input/generic_input_layer.hpp index 9a2d0c93a42..73ccfc93439 100644 --- a/include/lbann/layers/io/input/generic_input_layer.hpp +++ b/include/lbann/layers/io/input/generic_input_layer.hpp @@ -561,6 +561,7 @@ class generic_input_layer : public io_layer { it = m_data_readers.find(execution_mode::training); if ((it != m_data_readers.end()) && it->second) { linearized_data_size = (it->second)->get_linearized_data_size(); + std::cerr << "XX >>>>>> linearized_data_size: " << linearized_data_size << "\n"; } it = m_data_readers.find(execution_mode::validation); diff --git a/include/lbann/layers/learning/base_convolution.hpp b/include/lbann/layers/learning/base_convolution.hpp index e7effff3256..443a84b4103 100644 --- a/include/lbann/layers/learning/base_convolution.hpp +++ b/include/lbann/layers/learning/base_convolution.hpp @@ -57,6 +57,14 @@ class base_convolution_layer : public learning_layer { std::vector m_pads; /** Convolution strides. */ std::vector m_strides; + /** Convolution dilations. */ + std::vector m_dilations; + /** Convolution groups. + * The channels are split into this many independent groups when performing + * convolution. The default convolution operation has one group, and a + * depthwise convolution has as many groups as there are input channels. + */ + int m_num_groups; /** Scaling factor for bias term. * If the scaling factor is zero, bias is not applied. @@ -95,12 +103,16 @@ class base_convolution_layer : public learning_layer { const std::vector conv_dims, const std::vector pads, const std::vector strides, + const std::vector dilations, + int groups, bool has_bias) : learning_layer(comm), m_kernel_dims(conv_dims), m_kernel_size(0), m_pads(pads), m_strides(strides), + m_dilations(dilations), + m_num_groups(groups), m_bias_scaling_factor(has_bias ? 
DataType(1) : DataType(0)), m_kernel_gradient(this->m_comm->get_model_grid()), m_bias_gradient(this->m_comm->get_model_grid()) @@ -112,17 +124,40 @@ class base_convolution_layer : public learning_layer { #endif // LBANN_HAS_CUDNN { + bool nonunit_dilation = false; + for (const auto& d : m_dilations) { + if (d != 1) { + nonunit_dilation = true; + break; + } + } + if (Dev == El::Device::CPU && nonunit_dilation) { + std::stringstream err; + err << "layer \"" << get_name() << "\" " + << "has nonunit dilation which is only supported on GPUs"; + LBANN_ERROR(err.str()); + } + if (Dev == El::Device::CPU && m_num_groups > 1) { + std::stringstream err; + err << "layer \"" << get_name() << "\" " + << "has nonunit groups " << m_num_groups + << " which is only supported on GPUs"; + LBANN_ERROR(err.str()); + } + // Check dimensions of convolution parameters if ((int) m_kernel_dims.size() != num_data_dims || (int) m_pads.size() != num_data_dims - || (int) m_strides.size() != num_data_dims) { + || (int) m_strides.size() != num_data_dims + || (int) m_dilations.size() != num_data_dims) { std::stringstream err; err << "layer \"" << get_name() << "\" " << "has an invalid number of convolution parameters " << "(expected " << num_data_dims << " parameters, " << "conv_dims has " << m_kernel_dims.size() << ", " << "pads has " << m_pads.size() << ", " - << "strides has " << m_strides.size() << ")"; + << "strides has " << m_strides.size() << ", " + << "dilations has " << m_dilations.size() << ")"; LBANN_ERROR(err.str()); } @@ -137,6 +172,8 @@ class base_convolution_layer : public learning_layer { m_kernel_size(other.m_kernel_size), m_pads(other.m_pads), m_strides(other.m_strides), + m_dilations(other.m_dilations), + m_num_groups(other.m_num_groups), m_bias_scaling_factor(other.m_bias_scaling_factor), m_kernel_gradient(other.m_kernel_gradient), m_bias_gradient(other.m_bias_gradient) @@ -164,6 +201,8 @@ class base_convolution_layer : public learning_layer { m_kernel_size = other.m_kernel_size; m_pads = other.m_pads; m_strides = other.m_strides; + m_dilations = other.m_dilations; + m_num_groups = other.m_num_groups; m_bias_scaling_factor = other.m_bias_scaling_factor; m_kernel_gradient = other.m_kernel_gradient; m_bias_gradient = other.m_bias_gradient; @@ -235,7 +274,7 @@ class base_convolution_layer : public learning_layer { } auto& kernel_weights = *this->m_weights[0]; auto& bias_weights = *this->m_weights[1]; - + // Initialize variance scaling initialization auto* cast_initializer = dynamic_cast(kernel_weights.get_initializer()); @@ -279,7 +318,7 @@ class base_convolution_layer : public learning_layer { LBANN_ERROR(err.str()); } } - + } /// Initialize GPU objects @@ -300,16 +339,16 @@ class base_convolution_layer : public learning_layer { m_kernel_dims.data())); // Set convolution descriptor - // Note: upscales are not supported as of cuDNN v5.1 CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&m_convolution_cudnn_desc)); - std::vector upscales(output_dims.size() - 1, 1); CHECK_CUDNN(cudnnSetConvolutionNdDescriptor(m_convolution_cudnn_desc, m_pads.size(), m_pads.data(), m_strides.data(), - upscales.data(), + m_dilations.data(), CUDNN_CROSS_CORRELATION, cudnn::get_data_type())); + CHECK_CUDNN(cudnnSetConvolutionGroupCount(m_convolution_cudnn_desc, + m_num_groups)); // Set bias tensor descriptor std::vector bias_dims(output_dims.size() + 1, 1); @@ -345,7 +384,7 @@ class base_convolution_layer : public learning_layer { || output.Height() < 1 || output.Width() < 1) { return; } - + // Initialize GPU workspace GPUMat 
workspace; #ifdef HYDROGEN_HAVE_CUB @@ -575,7 +614,7 @@ class base_convolution_layer : public learning_layer { = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; #endif if (using_transposed_convolution) { - #ifndef LBANN_DETERMINISTIC + #ifndef LBANN_DETERMINISTIC CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithm(cudnn::get_handle(), gradient_wrt_output_desc, input_desc, @@ -600,7 +639,7 @@ class base_convolution_layer : public learning_layer { m_kernel_gradient.Buffer())); } else { - #ifndef LBANN_DETERMINISTIC + #ifndef LBANN_DETERMINISTIC CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithm(cudnn::get_handle(), input_desc, gradient_wrt_output_desc, @@ -956,22 +995,27 @@ class base_convolution_layer : public learning_layer { nullptr, &mode, &data_type)); - std::vector pads(num_dims), strides(num_dims), upscales(num_dims); + std::vector pads(num_dims), strides(num_dims), dilations(num_dims); CHECK_CUDNN(cudnnGetConvolutionNdDescriptor(src, num_dims, &num_dims, pads.data(), strides.data(), - upscales.data(), + dilations.data(), &mode, &data_type)); + int num_groups; + CHECK_CUDNN(cudnnGetConvolutionGroupCount(src, + &num_groups)); CHECK_CUDNN(cudnnSetConvolutionNdDescriptor(dst, num_dims, pads.data(), strides.data(), - upscales.data(), + dilations.data(), mode, data_type)); + CHECK_CUDNN(cudnnSetConvolutionGroupCount(dst, + num_groups)); } } diff --git a/include/lbann/layers/learning/convolution.hpp b/include/lbann/layers/learning/convolution.hpp index d833311c506..62d257d89b6 100644 --- a/include/lbann/layers/learning/convolution.hpp +++ b/include/lbann/layers/learning/convolution.hpp @@ -61,6 +61,11 @@ class convolution_layer : public base_convolution_layer { for (size_t h=0; hm_strides.size(); h++) { s << this->m_strides[h] << " "; } + s << " dilation: "; + for (size_t h = 0; h < this->m_dilations.size(); ++h) { + s << this->m_dilations[h] << " "; + } + s << " groups: " << this->m_num_groups; s << " num_output_channels: " << this->get_output_dims()[0] << " has_bias: " << this->m_bias_scaling_factor << " dataLayout: " << this->get_data_layout_string(get_data_layout()) @@ -91,7 +96,7 @@ class convolution_layer : public base_convolution_layer { } } } - return s.str();; + return s.str(); } convolution_layer(lbann_comm *comm, @@ -100,6 +105,8 @@ class convolution_layer : public base_convolution_layer { int conv_dim, int pad, int stride, + int dilation, + int groups, bool has_bias = true) : convolution_layer(comm, num_data_dims, @@ -107,6 +114,8 @@ class convolution_layer : public base_convolution_layer { std::vector(num_data_dims, conv_dim), std::vector(num_data_dims, pad), std::vector(num_data_dims, stride), + std::vector(num_data_dims, dilation), + groups, has_bias) {} convolution_layer(lbann_comm *comm, @@ -115,6 +124,8 @@ class convolution_layer : public base_convolution_layer { std::vector conv_dims, std::vector pads, std::vector strides, + std::vector dilations, + int groups, bool has_bias = true) : base_convolution_layer(comm, num_data_dims, @@ -122,6 +133,8 @@ class convolution_layer : public base_convolution_layer { conv_dims, pads, strides, + dilations, + groups, has_bias) { static_assert(T_layout == data_layout::DATA_PARALLEL, "convolution only supports DATA_PARALLEL"); @@ -145,7 +158,15 @@ class convolution_layer : public base_convolution_layer { auto output_dims = input_dims; // Initialize convolution kernel dimensions - kernel_dims.insert(kernel_dims.begin() + 1, input_dims[0]); + if (input_dims[0] % this->m_num_groups != 0) { + std::stringstream err; + err << this->get_type() << " 
layer \"" << this->get_name() << "\" " + << " has input tensor with channels " << input_dims[0] + << " but groups " << this->m_num_groups + << "; groups must evenly divide input channels"; + LBANN_ERROR(err.str()); + } + kernel_dims.insert(kernel_dims.begin() + 1, input_dims[0] / this->m_num_groups); this->m_kernel_size = std::accumulate(kernel_dims.begin(), kernel_dims.end(), 1, @@ -167,12 +188,21 @@ class convolution_layer : public base_convolution_layer { for (size_t i = 0; i < output_dims.size() - 1; ++i) { const auto& stride = this->m_strides[i]; const auto& pad = this->m_pads[i]; + const auto& dilation = this->m_dilations[i]; const auto& effective_dim = (input_dims[i+1] + 2 * pad - - kernel_dims[i+2] + 1); + - dilation*(kernel_dims[i+2] - 1)); output_dims[i+1] = (effective_dim + stride - 1) / stride; } this->set_output_dims(output_dims); + if (output_dims[0] % this->m_num_groups != 0) { + std::stringstream err; + err << this->get_type() << " layer \"" << this->get_name() << "\" " + << " has output tensor with filters " << output_dims[0] + << " but groups " << this->m_num_groups + << "; groups must evenly divide output filters"; + LBANN_ERROR(err.str()); + } } diff --git a/include/lbann/layers/learning/deconvolution.hpp b/include/lbann/layers/learning/deconvolution.hpp index 5b59fdd423c..4327bdb73aa 100644 --- a/include/lbann/layers/learning/deconvolution.hpp +++ b/include/lbann/layers/learning/deconvolution.hpp @@ -51,6 +51,8 @@ class deconvolution_layer : public base_convolution_layer { int conv_dim, int pad, int stride, + int dilation, + int groups, bool has_bias = true) : deconvolution_layer(comm, num_data_dims, @@ -58,6 +60,8 @@ class deconvolution_layer : public base_convolution_layer { std::vector(num_data_dims, conv_dim), std::vector(num_data_dims, pad), std::vector(num_data_dims, stride), + std::vector(num_data_dims, dilation), + groups, has_bias) {} deconvolution_layer(lbann_comm *comm, @@ -66,14 +70,18 @@ class deconvolution_layer : public base_convolution_layer { std::vector conv_dims, std::vector pads, std::vector strides, + std::vector dilations, + int groups, bool has_bias = true) : base_convolution_layer(comm, - num_data_dims, - num_output_channels, - conv_dims, - pads, - strides, - has_bias) { + num_data_dims, + num_output_channels, + conv_dims, + pads, + strides, + dilations, + groups, + has_bias) { static_assert(T_layout == data_layout::DATA_PARALLEL, "convolution only supports DATA_PARALLEL"); @@ -94,6 +102,11 @@ class deconvolution_layer : public base_convolution_layer { for (size_t h=0; hm_strides.size(); h++) { s << this->m_strides[h] << " "; } + s << " dilation: "; + for (size_t h = 0; h < this->m_dilations.size(); ++h) { + s << this->m_dilations[h] << " "; + } + s << " groups: " << this->m_num_groups; s << " num_output_channels: " << this->get_output_dims()[0] << " has_bias: " << this->m_bias_scaling_factor << " dataLayout: " << this->get_data_layout_string(get_data_layout()) @@ -117,6 +130,25 @@ class deconvolution_layer : public base_convolution_layer { const auto& input_dims = this->get_input_dims(); auto output_dims = input_dims; + bool nonunit_dilation = false; + for (const auto& d : this->m_dilations) { + if (d != 1) { + nonunit_dilation = true; + break; + } + } + if (nonunit_dilation) { + std::stringstream err; + err << this->get_type() << " layer \"" << this->get_name() << "\" " + << " does not support dilated convolutions"; + LBANN_ERROR(err.str()); + } + if (this->m_num_groups != 1) { + std::stringstream err; + err << this->get_type() << " layer \"" 
<< this->get_name() << "\" " + << " does not support grouped convolutions"; + LBANN_ERROR(err.str()); + } // Initialize deconvolution kernel dimensions // Note: Unlike the convolutional kernel, the previous layer's // number of channels is now the leading position -- keep in mind diff --git a/include/lbann/layers/loss/CMakeLists.txt b/include/lbann/layers/loss/CMakeLists.txt index 45ec8805a9e..db762dcaac0 100644 --- a/include/lbann/layers/loss/CMakeLists.txt +++ b/include/lbann/layers/loss/CMakeLists.txt @@ -1,6 +1,7 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS cross_entropy.hpp + l2_norm2.hpp mean_squared_error.hpp top_k_categorical_accuracy.hpp ) diff --git a/include/lbann/layers/loss/categorical_accuracy.hpp b/include/lbann/layers/loss/categorical_accuracy.hpp new file mode 100644 index 00000000000..de1bbb58458 --- /dev/null +++ b/include/lbann/layers/loss/categorical_accuracy.hpp @@ -0,0 +1,98 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_LOSS_CATEGORICAL_ACCURACY_HPP_INCLUDED +#define LBANN_LAYERS_LOSS_CATEGORICAL_ACCURACY_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" + +namespace lbann { + +/** Categorical accuracy layer. + * The two inputs are interpreted as predictions and ground-truth + * labels, respectively. An output is set to one if the top entries + * in both inputs are in the same position and is otherwise + * zero. Ties are broken in favor of entries with smaller indices. 
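+ *
+ * For example (one-hot labels, illustrative values): prediction
+ * (0.1, 0.7, 0.2) against label (0, 1, 0) yields 1, since both
+ * tensors peak at index 1; prediction (0.5, 0.5, 0.0) against the
+ * same label yields 0, since its tie is broken toward index 0.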
+ */ +template +class categorical_accuracy_layer : public Layer { +public: + + categorical_accuracy_layer(lbann_comm *comm) : Layer(comm) { + m_expected_num_parent_layers = 2; + } + + categorical_accuracy_layer* copy() const override { + return new categorical_accuracy_layer(*this); + } + std::string get_type() const override { return "categorical accuracy"; } + data_layout get_data_layout() const override { return T_layout; } + El::Device get_device_allocation() const override { return Dev; } + + void setup_dims() override { + Layer::setup_dims(); + set_output_dims({1}); + + // Check that input dimensions are valid + std::stringstream err; + const auto& parents = get_parent_layers(); + const auto& dims0 = get_input_dims(0); + const auto& dims1 = get_input_dims(1); + if (dims0 != dims1) { + err << get_type() << " layer \"" << get_name() << "\" " + << "expects input tensors with identical dimensions, " + << "but parent layer \"" << parents[0]->get_name() << "\" " + << "outputs a tensor with dimensions "; + for (size_t i = 0; i < dims0.size(); ++i) { + err << (i > 0 ? " x " : "") << dims0[i]; + } + err << " and parent layer \"" << parents[1]->get_name() << "\" " + << "outputs a tensor with dimensions "; + for (size_t i = 0; i < dims1.size(); ++i) { + err << (i > 0 ? " x " : "") << dims1[i]; + } + LBANN_ERROR(err.str()); + } + if (get_input_size() <= 1) { + err << get_type() << " layer \"" << get_name() << "\" " + << "expects input tensors with at least two entries, " + << "but parent layers \"" << parents[0]->get_name() << "\" " + << "and \"" << parents[1]->get_name() << "\" " + << "output tensors with dimensions "; + for (size_t i = 0; i < dims0.size(); ++i) { + err << (i > 0 ? " x " : "") << dims0[i]; + } + LBANN_ERROR(err.str()); + } + } + + void fp_compute() override; + +}; + +} // namespace lbann + +#endif // LBANN_LAYERS_LOSS_CATEGORICAL_ACCURACY_HPP_INCLUDED diff --git a/include/lbann/layers/loss/cross_entropy.hpp b/include/lbann/layers/loss/cross_entropy.hpp index 9782f61ef6d..4d644f63469 100644 --- a/include/lbann/layers/loss/cross_entropy.hpp +++ b/include/lbann/layers/loss/cross_entropy.hpp @@ -31,6 +31,13 @@ namespace lbann { +/** Cross entropy layer. + * Given a predicted distribution \f$y\f$ and ground truth + * distribution \f$\hat{y}\f$, the cross entropy is + * \f[ + * CE(y,\hat{y}) = - \sum\limits_{i} \hat{y}_i \log y_i + * \f] + */ template class cross_entropy_layer : public Layer { public: diff --git a/include/lbann/layers/loss/entrywise.hpp b/include/lbann/layers/loss/entrywise.hpp new file mode 100644 index 00000000000..0a3c65b83b5 --- /dev/null +++ b/include/lbann/layers/loss/entrywise.hpp @@ -0,0 +1,57 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_LOSS_ENTRYWISE_HPP_INCLUDED +#define LBANN_LAYER_LOSS_ENTRYWISE_HPP_INCLUDED + +#include "lbann/layers/math/binary.hpp" + +namespace lbann { + +// Convenience macro to define a binary math layer class +// Note: Implementation of entrywise loss layers is identical to +// binary math layers. +#define LBANN_DEFINE_BINARY_MATH_LAYER(layer_name, layer_string) \ + struct layer_name##_name_struct { \ + inline operator std::string() { return layer_string; } \ + }; \ + template \ + using layer_name \ + = binary_math_layer; + +// Cross entropy loss +LBANN_DEFINE_BINARY_MATH_LAYER(binary_cross_entropy_layer, "binary cross entropy"); +LBANN_DEFINE_BINARY_MATH_LAYER(sigmoid_binary_cross_entropy_layer, "sigmoid binary cross entropy"); + +// Boolean loss functions +LBANN_DEFINE_BINARY_MATH_LAYER(boolean_accuracy_layer, "Boolean accuracy"); +LBANN_DEFINE_BINARY_MATH_LAYER(boolean_false_negative_layer, "Boolean false negative rate"); +LBANN_DEFINE_BINARY_MATH_LAYER(boolean_false_positive_layer, "Boolean false positive rate"); + +} // namespace lbann + +#undef LBANN_DEFINE_BINARY_MATH_LAYER +#endif // LBANN_LAYER_LOSS_ENTRYWISE_HPP_INCLUDED diff --git a/include/lbann/layers/loss/l2_norm2.hpp b/include/lbann/layers/loss/l2_norm2.hpp new file mode 100644 index 00000000000..f91116ff394 --- /dev/null +++ b/include/lbann/layers/loss/l2_norm2.hpp @@ -0,0 +1,130 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_LOSS_L2_NORM2_HPP_INCLUDED +#define LBANN_LAYERS_LOSS_L2_NORM2_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" + +namespace lbann { + +/** L2 norm squared layer. 
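+ * (This is the squared Euclidean norm, accumulated over every entry
+ * of the input tensor; for example, an input (1, -2, 3) yields
+ * 1 + 4 + 9 = 14.)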
+ * Given an input \f$x\f$, this layer outputs
+ * \f[ L2(x)^2 = \sum\limits_{i} x_i^2 \f]
+ */
+template <data_layout T_layout, El::Device Dev>
+class l2_norm2_layer : public Layer {
+public:
+
+  l2_norm2_layer(lbann_comm *comm) : Layer(comm) {
+    set_output_dims({1});
+  }
+
+  l2_norm2_layer(const l2_norm2_layer& other)
+    : Layer(other),
+      m_workspace(other.m_workspace ?
+                  other.m_workspace->Copy() : nullptr) {}
+  l2_norm2_layer& operator=(const l2_norm2_layer& other) {
+    Layer::operator=(other);
+    m_workspace.reset(other.m_workspace ?
+                      other.m_workspace->Copy() : nullptr);
+    return *this;
+  }
+
+  l2_norm2_layer* copy() const override { return new l2_norm2_layer(*this); }
+  std::string get_type() const override { return "L2 norm squared"; }
+  data_layout get_data_layout() const override { return T_layout; }
+  El::Device get_device_allocation() const override { return Dev; }
+
+  void setup_data() override {
+    Layer::setup_data();
+
+    // Initialize workspace
+    auto dist = get_prev_activations().DistData();
+    dist.colDist = El::STAR;
+    m_workspace.reset(AbsDistMat::Instantiate(dist));
+#ifdef HYDROGEN_HAVE_CUB
+    if (m_workspace->GetLocalDevice() == El::Device::GPU) {
+      m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool
+    }
+#endif // HYDROGEN_HAVE_CUB
+
+  }
+
+  void fp_compute() override {
+
+    // Initialize workspace
+    m_workspace->Empty();
+    m_workspace->AlignWith(get_prev_activations());
+    m_workspace->Resize(1, get_prev_activations().Width());
+
+    // Compute local contributions and accumulate
+    /// @todo Consider reduce rather than allreduce
+    local_fp_compute(get_local_prev_activations(),
+                     m_workspace->Matrix());
+    m_comm->allreduce(*m_workspace, m_workspace->RedundantComm());
+    El::Copy(*m_workspace, get_activations());
+
+    // Clean up
+    m_workspace->Empty();
+
+  }
+
+  void bp_compute() override {
+
+    // Initialize workspace
+    m_workspace->Empty();
+    m_workspace->AlignWith(get_prev_activations());
+    El::Copy(get_prev_error_signals(), *m_workspace);
+
+    // Compute local gradients
+    local_bp_compute(get_local_prev_activations(),
+                     m_workspace->LockedMatrix(),
+                     get_local_error_signals());
+
+    // Clean up
+    m_workspace->Empty();
+
+  }
+
+private:
+
+  /** Compute local contributions to L2 norm. */
+  static void local_fp_compute(const AbsMat& local_input,
+                               AbsMat& local_contribution);
+  /** Compute local gradients. */
+  static void local_bp_compute(const AbsMat& local_input,
+                               const AbsMat& local_gradient_wrt_output,
+                               AbsMat& local_gradient_wrt_input);
+
+  /** Workspace matrix. */
+  std::unique_ptr<AbsDistMat> m_workspace;
+
+};
+
+} // namespace lbann
+
+#endif // LBANN_LAYERS_LOSS_L2_NORM2_HPP_INCLUDED
diff --git a/include/lbann/layers/loss/mean_squared_error.hpp b/include/lbann/layers/loss/mean_squared_error.hpp
index 6dfc5cae666..d1776f98b31 100644
--- a/include/lbann/layers/loss/mean_squared_error.hpp
+++ b/include/lbann/layers/loss/mean_squared_error.hpp
@@ -31,6 +31,14 @@ namespace lbann {
+/** Mean squared error layer.
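+ * (For example, with two entries, predictions (1, 2) against ground
+ * truth (0, 0) give (1 + 4) / 2 = 2.5.)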
+ + * Given a prediction \f$y\f$ and ground truth \f$\hat{y}\f$, the + * mean squared error is + * \f[ + * MSE(y,\hat{y}) = \frac{1}{n} \sum\limits_{i}^{n} (y_i - \hat{y}_i)^2 + * \f] + */ template class mean_squared_error_layer : public Layer { public: diff --git a/include/lbann/layers/math/CMakeLists.txt b/include/lbann/layers/math/CMakeLists.txt new file mode 100644 index 00000000000..1e474ed92ca --- /dev/null +++ b/include/lbann/layers/math/CMakeLists.txt @@ -0,0 +1,8 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + unary.hpp + binary.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/layers/math/binary.hpp b/include/lbann/layers/math/binary.hpp new file mode 100644 index 00000000000..8c727458605 --- /dev/null +++ b/include/lbann/layers/math/binary.hpp @@ -0,0 +1,119 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_MATH_BINARY_HPP_INCLUDED +#define LBANN_LAYER_MATH_BINARY_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" + +namespace lbann { + +/** Base class for binary math layers. + * 'Name' should be a type such that Name() returns a human-readable + * layer name, e.g. an empty struct that can be converted to a + * string. + */ +template +class binary_math_layer : public Layer { +public: + binary_math_layer(lbann_comm *comm) : Layer(comm) { + m_expected_num_parent_layers = 2; + } + binary_math_layer* copy() const override { + return new binary_math_layer(*this); + } + std::string get_type() const override { return Name(); } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + +protected: + + void setup_dims() override { + set_output_dims(get_input_dims()); + Layer::setup_dims(); + + // Check that input dimensions match + if (get_input_dims(0) != get_input_dims(1)) { + std::stringstream err; + err << get_type() << " layer \"" << get_name() << "\" " + << "has input tensors with different dimensions ("; + for (int i = 0; i < get_num_parents(); ++i) { + err << (i > 0 ? ", " : "") + << "layer \"" << m_parent_layers[i]->get_name() << "\" " + << "outputs "; + const auto& dims = get_input_dims(i); + for (size_t j = 0; j < dims.size(); ++j) { + err << (j > 0 ? 
" x " : "") << dims[j]; + } + } + err << ")"; + LBANN_ERROR(err.str()); + } + + } + + void fp_compute() override; + void bp_compute() override; + +}; + +// Convenience macro to define a binary math layer class +#define LBANN_DEFINE_BINARY_MATH_LAYER(layer_name, layer_string) \ + struct layer_name##_name_struct { \ + inline operator std::string() { return layer_string; } \ + }; \ + template \ + using layer_name \ + = binary_math_layer; + +// Arithmetic operations +LBANN_DEFINE_BINARY_MATH_LAYER(add_layer, "add"); +LBANN_DEFINE_BINARY_MATH_LAYER(subtract_layer, "subtract"); +LBANN_DEFINE_BINARY_MATH_LAYER(multiply_layer, "multiply"); +LBANN_DEFINE_BINARY_MATH_LAYER(divide_layer, "divide"); +LBANN_DEFINE_BINARY_MATH_LAYER(mod_layer, "modulo"); +LBANN_DEFINE_BINARY_MATH_LAYER(pow_layer, "power"); +LBANN_DEFINE_BINARY_MATH_LAYER(safe_divide_layer, "safe divide"); + +// Comparison operations +LBANN_DEFINE_BINARY_MATH_LAYER(max_layer, "maximum"); +LBANN_DEFINE_BINARY_MATH_LAYER(min_layer, "minimum"); +LBANN_DEFINE_BINARY_MATH_LAYER(equal_layer, "equal"); +LBANN_DEFINE_BINARY_MATH_LAYER(not_equal_layer, "not equal"); +LBANN_DEFINE_BINARY_MATH_LAYER(less_layer, "less than"); +LBANN_DEFINE_BINARY_MATH_LAYER(less_equal_layer, "less than or equal"); +LBANN_DEFINE_BINARY_MATH_LAYER(greater_layer, "greater than"); +LBANN_DEFINE_BINARY_MATH_LAYER(greater_equal_layer, "greater than or equal"); + +// Logical operations +LBANN_DEFINE_BINARY_MATH_LAYER(and_layer, "logical and"); +LBANN_DEFINE_BINARY_MATH_LAYER(or_layer, "logical or"); +LBANN_DEFINE_BINARY_MATH_LAYER(xor_layer, "logical xor"); + +} // namespace lbann + +#undef LBANN_DEFINE_BINARY_MATH_LAYER +#endif // LBANN_LAYER_MATH_BINARY_HPP_INCLUDED diff --git a/include/lbann/layers/math/unary.hpp b/include/lbann/layers/math/unary.hpp new file mode 100644 index 00000000000..04efb0a6878 --- /dev/null +++ b/include/lbann/layers/math/unary.hpp @@ -0,0 +1,112 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_MATH_UNARY_HPP_INCLUDED +#define LBANN_LAYER_MATH_UNARY_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" + +namespace lbann { + +/** Base class for unary math layers. + * 'Name' should be a type such that Name() returns a human-readable + * layer name, e.g. an empty struct that can be converted to a + * string. 
+ */
+template <data_layout Layout, El::Device Device, typename Name>
+class unary_math_layer : public Layer {
+public:
+  unary_math_layer(lbann_comm *comm) : Layer(comm) {}
+  unary_math_layer* copy() const override {
+    return new unary_math_layer(*this);
+  }
+  std::string get_type() const override { return Name(); }
+  data_layout get_data_layout() const override { return Layout; }
+  El::Device get_device_allocation() const override { return Device; }
+protected:
+  void setup_dims() override {
+    set_output_dims(get_input_dims());
+    Layer::setup_dims();
+  }
+  void fp_compute() override;
+  void bp_compute() override;
+};
+
+// Convenience macro to define a unary math layer class
+#define LBANN_DEFINE_UNARY_MATH_LAYER(layer_name, layer_string)  \
+  struct layer_name##_name_struct {                              \
+    inline operator std::string() { return layer_string; }       \
+  };                                                             \
+  template <data_layout Layout, El::Device Device>               \
+  using layer_name                                               \
+    = unary_math_layer<Layout, Device, layer_name##_name_struct>;
+
+// Logical operations
+LBANN_DEFINE_UNARY_MATH_LAYER(not_layer, "logical not");
+
+// Sign operations
+LBANN_DEFINE_UNARY_MATH_LAYER(abs_layer, "absolute value");
+LBANN_DEFINE_UNARY_MATH_LAYER(negative_layer, "negative");
+LBANN_DEFINE_UNARY_MATH_LAYER(sign_layer, "sign");
+
+// Rounding operations
+LBANN_DEFINE_UNARY_MATH_LAYER(round_layer, "round");
+LBANN_DEFINE_UNARY_MATH_LAYER(ceil_layer, "ceil");
+LBANN_DEFINE_UNARY_MATH_LAYER(floor_layer, "floor");
+
+// Power operations
+LBANN_DEFINE_UNARY_MATH_LAYER(reciprocal_layer, "reciprocal");
+LBANN_DEFINE_UNARY_MATH_LAYER(square_layer, "square");
+LBANN_DEFINE_UNARY_MATH_LAYER(sqrt_layer, "square root");
+LBANN_DEFINE_UNARY_MATH_LAYER(rsqrt_layer, "reciprocal square root");
+LBANN_DEFINE_UNARY_MATH_LAYER(safe_reciprocal_layer, "safe reciprocal");
+
+// Exponential and logarithmic operations
+LBANN_DEFINE_UNARY_MATH_LAYER(exp_layer, "exponential");
+LBANN_DEFINE_UNARY_MATH_LAYER(expm1_layer, "expm1");
+LBANN_DEFINE_UNARY_MATH_LAYER(log_layer, "natural logarithm");
+LBANN_DEFINE_UNARY_MATH_LAYER(log1p_layer, "log1p");
+
+// Trigonometric operations
+LBANN_DEFINE_UNARY_MATH_LAYER(cos_layer, "cosine");
+LBANN_DEFINE_UNARY_MATH_LAYER(sin_layer, "sine");
+LBANN_DEFINE_UNARY_MATH_LAYER(tan_layer, "tangent");
+LBANN_DEFINE_UNARY_MATH_LAYER(acos_layer, "arccosine");
+LBANN_DEFINE_UNARY_MATH_LAYER(asin_layer, "arcsine");
+LBANN_DEFINE_UNARY_MATH_LAYER(atan_layer, "arctangent");
+
+// Hyperbolic operations
+LBANN_DEFINE_UNARY_MATH_LAYER(cosh_layer, "hyperbolic cosine");
+LBANN_DEFINE_UNARY_MATH_LAYER(sinh_layer, "hyperbolic sine");
+LBANN_DEFINE_UNARY_MATH_LAYER(tanh_layer, "hyperbolic tangent");
+LBANN_DEFINE_UNARY_MATH_LAYER(acosh_layer, "hyperbolic arccosine");
+LBANN_DEFINE_UNARY_MATH_LAYER(asinh_layer, "hyperbolic arcsine");
+LBANN_DEFINE_UNARY_MATH_LAYER(atanh_layer, "hyperbolic arctangent");
+
+} // namespace lbann
+
+#undef LBANN_DEFINE_UNARY_MATH_LAYER
+#endif // LBANN_LAYER_MATH_UNARY_HPP_INCLUDED
diff --git a/include/lbann/layers/misc/CMakeLists.txt b/include/lbann/layers/misc/CMakeLists.txt
new file mode 100644
index 00000000000..e55fe68b15e
--- /dev/null
+++ b/include/lbann/layers/misc/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Add the headers for this directory
+set_full_path(THIS_DIR_HEADERS
+  variance.hpp
+  )
+
+# Propagate the files up the tree
+set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE)
diff --git a/include/lbann/layers/misc/covariance.hpp b/include/lbann/layers/misc/covariance.hpp
new file mode 100644
index 00000000000..dcfaf59b529
--- /dev/null
+++ b/include/lbann/layers/misc/covariance.hpp
@@ -0,0 +1,127 @@
+//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_MISC_COVARIANCE_HPP_INCLUDED +#define LBANN_LAYERS_MISC_COVARIANCE_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" + +namespace lbann { + +/** Covariance layer. */ +template +class covariance_layer : public Layer { +public: + + covariance_layer(lbann_comm *comm, bool biased) + : Layer(comm), m_biased(biased) { + m_expected_num_parent_layers = 2; + } + covariance_layer(const covariance_layer& other) + : Layer(other), + m_biased(other.m_biased), + m_means(other.m_means ? other.m_means->Copy() : nullptr), + m_workspace(other.m_workspace ? + other.m_workspace->Copy() : nullptr) {} + covariance_layer& operator=(const covariance_layer& other) { + Layer::operator=(other); + m_biased = other.m_biased; + m_means.reset(other.m_means ? other.m_means->Copy() : nullptr); + m_workspace.reset(other.m_workspace ? + other.m_workspace->Copy() : nullptr); + return *this; + } + + covariance_layer* copy() const override { return new covariance_layer(*this); } + std::string get_type() const override { return "covariance"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + +protected: + + void setup_matrices(const El::Grid& grid) override { + Layer::setup_matrices(grid); + auto dist_data = get_prev_activations().DistData(); + dist_data.colDist = El::STAR; + m_means.reset(AbsDistMat::Instantiate(dist_data)); + m_workspace.reset(AbsDistMat::Instantiate(dist_data)); + } + + void setup_dims() override { + Layer::setup_dims(); + set_output_dims({1}); + + // Check that input dimensions are valid + std::stringstream err; + const auto& parents = get_parent_layers(); + const auto& dims0 = get_input_dims(0); + const auto& dims1 = get_input_dims(1); + if (dims0 != dims1) { + err << get_type() << " layer \"" << get_name() << "\" " + << "expects input tensors with identical dimensions, " + << "but parent layer \"" << parents[0]->get_name() << "\" " + << "outputs a tensor with dimensions "; + for (size_t i = 0; i < dims0.size(); ++i) { + err << (i > 0 ? " x " : "") << dims0[i]; + } + err << " and parent layer \"" << parents[1]->get_name() << "\" " + << "outputs a tensor with dimensions "; + for (size_t i = 0; i < dims1.size(); ++i) { + err << (i > 0 ? 
" x " : "") << dims1[i]; + } + LBANN_ERROR(err.str()); + } + if (get_input_size() <= 1) { + err << get_type() << " layer \"" << get_name() << "\" " + << "expects input tensors with at least two entries, " + << "but parent layers \"" << parents[0]->get_name() << "\" " + << "and \"" << parents[1]->get_name() << "\" " + << "output tensors with dimensions "; + for (size_t i = 0; i < dims0.size(); ++i) { + err << (i > 0 ? " x " : "") << dims0[i]; + } + LBANN_ERROR(err.str()); + } + } + + void fp_compute() override; + void bp_compute() override; + +private: + + /** Whether to use biased covariance estimator. */ + bool m_biased; + + /** Means for each mini-batch sample. */ + std::unique_ptr m_means; + /** Workspace. */ + std::unique_ptr m_workspace; + +}; + +} // namespace lbann + +#endif // LBANN_LAYERS_MISC_COVARIANCE_HPP_INCLUDED diff --git a/include/lbann/layers/misc/variance.hpp b/include/lbann/layers/misc/variance.hpp new file mode 100644 index 00000000000..f4f743ec584 --- /dev/null +++ b/include/lbann/layers/misc/variance.hpp @@ -0,0 +1,106 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_MISC_VARIANCE_HPP_INCLUDED +#define LBANN_LAYERS_MISC_VARIANCE_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" + +namespace lbann { + +/** Variance layer. */ +template +class variance_layer : public Layer { +public: + + variance_layer(lbann_comm *comm, bool biased) + : Layer(comm), m_biased(biased) {} + variance_layer(const variance_layer& other) + : Layer(other), + m_biased(other.m_biased), + m_means(other.m_means ? other.m_means->Copy() : nullptr), + m_workspace(other.m_workspace ? + other.m_workspace->Copy() : nullptr) {} + variance_layer& operator=(const variance_layer& other) { + Layer::operator=(other); + m_biased = other.m_biased; + m_means.reset(other.m_means ? other.m_means->Copy() : nullptr); + m_workspace.reset(other.m_workspace ? 
+ other.m_workspace->Copy() : nullptr); + return *this; + } + + variance_layer* copy() const override { return new variance_layer(*this); } + std::string get_type() const override { return "variance"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + +protected: + + void setup_matrices(const El::Grid& grid) override { + Layer::setup_matrices(grid); + auto dist_data = get_prev_activations().DistData(); + dist_data.colDist = El::STAR; + m_means.reset(AbsDistMat::Instantiate(dist_data)); + m_workspace.reset(AbsDistMat::Instantiate(dist_data)); + } + + void setup_dims() override { + Layer::setup_dims(); + set_output_dims({1}); + if (get_input_size() <= 1) { + std::stringstream err; + const auto& parents = get_parent_layers(); + const auto& dims = get_input_dims(); + err << get_type() << " layer \"" << get_name() << "\" " + << "expects an input tensor with at least two entries, " + << "but parent layer \"" << parents[0]->get_name() << "\" " + << "outputs a tensor with dimensions "; + for (size_t i = 0; i < dims.size(); ++i) { + err << (i > 0 ? " x " : "") << dims[i]; + } + LBANN_ERROR(err.str()); + } + } + + void fp_compute() override; + void bp_compute() override; + +private: + + /** Whether to use biased variance estimator. */ + bool m_biased; + + /** Means for each mini-batch sample. */ + std::unique_ptr m_means; + /** Workspace. */ + std::unique_ptr m_workspace; + +}; + +} // namespace lbann + +#endif // LBANN_LAYERS_MISC_VARIANCE_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/batch_normalization.hpp b/include/lbann/layers/regularizers/batch_normalization.hpp index 887c52019b4..46ae4590584 100644 --- a/include/lbann/layers/regularizers/batch_normalization.hpp +++ b/include/lbann/layers/regularizers/batch_normalization.hpp @@ -28,63 +28,10 @@ #define LBANN_LAYER_REGULARIZER_BATCH_NORMALIZATION_HPP_INCLUDED #include "lbann/layers/regularizers/regularizer.hpp" -#include "lbann/utils/cuda.hpp" +#include "lbann/models/model.hpp" namespace lbann { -#ifdef LBANN_HAS_GPU -namespace batch_normalization_cuda { -/** Compute channel sums. - * Sums and squares of sums are used to compute mean and variance. - */ -void channel_sums(int num_channels, - const AbsMat& data, - AbsMat& sums, - AbsMat& sqsums); -/** Compute statistics from sums. - * On input, mean and var are assumed to contain sums and squares - * of sums, respectively. - */ -void compute_statistics(int num_per_sum, - DataType epsilon, - DataType decay, - AbsMat& mean, - AbsMat& var, - AbsMat& running_mean, - AbsMat& running_var); -/** Apply batch normalization. */ -void batch_normalization(const AbsMat& input, - const AbsMat& mean, - const AbsMat& var, - DataType epsilon, - const AbsMat& scale, - const AbsMat& bias, - AbsMat& output); -/** Compute gradients w.r.t. batch norm parameters. */ -void backprop1(const AbsMat& input, - const AbsMat& gradient_wrt_output, - const AbsMat& mean, - const AbsMat& var, - DataType epsilon, - const AbsMat& scale, - AbsMat& dscale, - AbsMat& dbias, - AbsMat& dmean, - AbsMat& dvar); -/** Compute gradients w.r.t. inputs. */ -void backprop2(int global_width, - const AbsMat& input, - const AbsMat& gradient_wrt_output, - const AbsMat& mean, - const AbsMat& var, - DataType epsilon, - const AbsMat& scale, - const AbsMat& dmean, - const AbsMat& dvar, - AbsMat& gradient_wrt_input); -} -#endif // LBANN_HAS_GPU - /** Batch normalization layer. 
* Each input channel is normalized across the mini-batch to have * zero mean and unit standard deviation. Learned scaling factors and @@ -98,7 +45,7 @@ void backprop2(int global_width, * https://cthorey.github.io/backpropagation/ */ template -class batch_normalization : public regularizer_layer { +class batch_normalization_layer : public regularizer_layer { private: @@ -110,17 +57,17 @@ class batch_normalization : public regularizer_layer { bool m_use_global_stats; /** Current minibatch means. */ - AbsDistMat *m_mean; + std::unique_ptr m_mean; /** Current minibatch standard deviations. */ - AbsDistMat *m_var; + std::unique_ptr m_var; /** Gradient w.r.t. means. */ - AbsDistMat *m_mean_gradient; + std::unique_ptr m_mean_gradient; /** Gradient w.r.t. standard deviations. */ - AbsDistMat *m_var_gradient; + std::unique_ptr m_var_gradient; /** Gradient w.r.t. scaling terms. */ - AbsDistMat *m_scale_gradient; + std::unique_ptr m_scale_gradient; /** Gradient w.r.t. bias terms. */ - AbsDistMat *m_bias_gradient; + std::unique_ptr m_bias_gradient; public: /** @@ -131,20 +78,14 @@ class batch_normalization : public regularizer_layer { * @param use_global_stats Whether to use global statistics when * training. */ - batch_normalization(lbann_comm *comm, - DataType decay=0.9, - DataType epsilon=1e-5, - bool use_global_stats = false) + batch_normalization_layer(lbann_comm *comm, + DataType decay=0.9, + DataType epsilon=1e-5, + bool use_global_stats = false) : regularizer_layer(comm), m_decay(decay), m_epsilon(epsilon), - m_use_global_stats(use_global_stats), - m_mean(nullptr), - m_var(nullptr), - m_mean_gradient(nullptr), - m_var_gradient(nullptr), - m_scale_gradient(nullptr), - m_bias_gradient(nullptr) { + m_use_global_stats(use_global_stats) { static_assert(T_layout == data_layout::DATA_PARALLEL, "batch normalization only supports DATA_PARALLEL"); #ifdef LBANN_DETERMINISTIC @@ -153,49 +94,39 @@ class batch_normalization : public regularizer_layer { #endif } - batch_normalization(const batch_normalization& other) : - regularizer_layer(other), - m_decay(other.m_decay), - m_epsilon(other.m_epsilon), - m_use_global_stats(other.m_use_global_stats), - m_mean(other.m_mean), - m_var(other.m_var), - m_mean_gradient(other.m_mean_gradient), - m_var_gradient(other.m_var_gradient), - m_scale_gradient(other.m_scale_gradient), - m_bias_gradient(other.m_bias_gradient) { - - // Deep copy matrices - if (m_mean != nullptr) { m_mean = m_mean->Copy(); } - if (m_var != nullptr) { m_var = m_var->Copy(); } - if (m_mean_gradient != nullptr) { m_mean_gradient = m_mean_gradient->Copy(); } - if (m_var_gradient != nullptr) { m_var_gradient = m_var_gradient->Copy(); } - if (m_scale_gradient != nullptr) { m_scale_gradient = m_scale_gradient->Copy(); } - if (m_bias_gradient != nullptr) { m_bias_gradient = m_bias_gradient->Copy(); } - } - - batch_normalization& operator=(const batch_normalization& other) { + batch_normalization_layer(const batch_normalization_layer& other) + : regularizer_layer(other), + m_decay(other.m_decay), + m_epsilon(other.m_epsilon), + m_use_global_stats(other.m_use_global_stats), + m_mean(other.m_mean ? other.m_mean->Copy() : nullptr), + m_var(other.m_var ? other.m_var->Copy() : nullptr), + m_mean_gradient(other.m_mean_gradient ? + other.m_mean_gradient->Copy() : nullptr), + m_var_gradient(other.m_var_gradient ? + other.m_var_gradient->Copy() : nullptr), + m_scale_gradient(other.m_scale_gradient ? + other.m_scale_gradient->Copy() : nullptr), + m_bias_gradient(other.m_bias_gradient ? 
+ other.m_bias_gradient->Copy() : nullptr) {} + + batch_normalization_layer& operator=(const batch_normalization_layer& other) { regularizer_layer::operator=(other); m_decay = other.m_decay; m_epsilon = other.m_epsilon; m_use_global_stats = other.m_use_global_stats; - // Deallocate matrices - deallocate_matrices(); - // Deep copy matrices - m_mean = other.m_mean; - m_var = other.m_var; - m_mean_gradient = other.m_mean_gradient; - m_var_gradient = other.m_var_gradient; - m_scale_gradient = other.m_scale_gradient; - m_bias_gradient = other.m_bias_gradient; - if (m_mean != nullptr) { m_mean = m_mean->Copy(); } - if (m_var != nullptr) { m_var = m_var->Copy(); } - if (m_mean_gradient != nullptr) { m_mean_gradient = m_mean_gradient->Copy(); } - if (m_var_gradient != nullptr) { m_var_gradient = m_var_gradient->Copy(); } - if (m_scale_gradient != nullptr) { m_scale_gradient = m_scale_gradient->Copy(); } - if (m_bias_gradient != nullptr) { m_bias_gradient = m_bias_gradient->Copy(); } + m_mean.reset(other.m_mean ? other.m_mean->Copy() : nullptr); + m_var.reset(other.m_var ? other.m_var->Copy() : nullptr); + m_mean_gradient.reset(other.m_mean_gradient ? + other.m_mean_gradient->Copy() : nullptr); + m_var_gradient.reset(other.m_var_gradient ? + other.m_var_gradient->Copy() : nullptr); + m_scale_gradient.reset(other.m_scale_gradient ? + other.m_scale_gradient->Copy() : nullptr); + m_bias_gradient.reset(other.m_bias_gradient ? + other.m_bias_gradient->Copy() : nullptr); return *this; } @@ -210,23 +141,18 @@ class batch_normalization : public regularizer_layer { return ss.str(); } - virtual ~batch_normalization() override { - deallocate_matrices(); - } - - batch_normalization* copy() const override { return new batch_normalization(*this); } + batch_normalization_layer* copy() const override { return new batch_normalization_layer(*this); } std::string get_type() const override { return "batch normalization"; } void setup_matrices(const El::Grid& grid) override { regularizer_layer::setup_matrices(grid); - deallocate_matrices(); - m_mean = new StarMat(grid); - m_var = new StarMat(grid); - m_mean_gradient = new StarMat(grid); - m_var_gradient = new StarMat(grid); - m_scale_gradient = new StarMat(grid); - m_bias_gradient = new StarMat(grid); + m_mean.reset(new StarMat(grid)); + m_var.reset(new StarMat(grid)); + m_mean_gradient.reset(new StarMat(grid)); + m_var_gradient.reset(new StarMat(grid)); + m_scale_gradient.reset(new StarMat(grid)); + m_bias_gradient.reset(new StarMat(grid)); } data_layout get_data_layout() const override { return T_layout; } @@ -343,417 +269,8 @@ class batch_normalization : public regularizer_layer { } - void fp_compute() override { - if (this->using_gpus()) { - fp_compute_gpu(); - } else { - fp_compute_cpu(); - } - } - - void bp_compute() override { - if (this->using_gpus()) { - bp_compute_gpu(); - } else { - bp_compute_cpu(); - } - } - - void fp_compute_gpu() { -#ifndef LBANN_HAS_GPU - LBANN_ERROR("CUDA not detected"); -#else - - // Matrices - const auto& input = get_prev_activations(); - const auto& local_input = input.LockedMatrix(); - auto& local_output = get_local_activations(); - - // Compute statistics during training - const bool is_training = this->m_model->get_execution_mode() == execution_mode::training; - if (is_training) { - const auto& output_dims = get_output_dims(); - const int num_channels = output_dims[0]; - const int channel_size = get_output_size() / num_channels; - batch_normalization_cuda::channel_sums(num_channels, - local_input, - m_mean->Matrix(), - 
m_var->Matrix()); - int num_per_sum = channel_size * input.LocalWidth(); - if (m_use_global_stats) { - m_comm->allreduce(*m_mean, m_mean->RedundantComm(), El::mpi::SUM); - m_comm->allreduce(*m_var, m_var->RedundantComm(), El::mpi::SUM); - num_per_sum = channel_size * input.Width(); - } - batch_normalization_cuda::compute_statistics( - num_per_sum, - m_epsilon, - m_decay, - m_mean->Matrix(), - m_var->Matrix(), - m_weights[2]->get_values().Matrix(), - m_weights[3]->get_values().Matrix()); - } - - // Perform batch normalization - const auto& mean = (is_training ? - m_mean->LockedMatrix() : - m_weights[2]->get_values().Matrix()); - const auto& var = (is_training ? - m_var->LockedMatrix() : - m_weights[3]->get_values().Matrix()); - batch_normalization_cuda::batch_normalization( - local_input, - mean, var, m_epsilon, - m_weights[0]->get_values().Matrix(), - m_weights[1]->get_values().Matrix(), - local_output); - -#endif // LBANN_HAS_GPU - } - - void bp_compute_gpu() { -#ifndef LBANN_HAS_GPU - LBANN_ERROR("CUDA not detected"); -#else - const bool is_training = this->m_model->get_execution_mode() == execution_mode::training; - const int effective_mini_batch_size = this->m_model->get_effective_mini_batch_size(); - - // GPU matrices - const auto& input = get_prev_activations(); - const auto& local_input = input.LockedMatrix(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); - const auto& mean = (is_training ? - m_mean->LockedMatrix() : - m_weights[2]->get_values().Matrix()); - const auto& var = (is_training ? - m_var->LockedMatrix() : - m_weights[3]->get_values().Matrix()); - - // Compute gradients w.r.t. batch norm parameters - batch_normalization_cuda::backprop1(local_input, - local_gradient_wrt_output, - mean, var, m_epsilon, - m_weights[0]->get_values().Matrix(), - m_scale_gradient->Matrix(), - m_bias_gradient->Matrix(), - m_mean_gradient->Matrix(), - m_var_gradient->Matrix()); - - // Accumulate gradients - if (is_training) { - if (m_use_global_stats) { - m_comm->allreduce(*m_mean_gradient, - m_mean_gradient->RedundantComm(), - El::mpi::SUM); - m_comm->allreduce(*m_var_gradient, - m_var_gradient->RedundantComm(), - El::mpi::SUM); - } - } else { - El::Zero(*m_mean_gradient); - El::Zero(*m_var_gradient); - } - auto* scale_optimizer = m_weights[0]->get_optimizer(); - if (scale_optimizer != nullptr) { - scale_optimizer->add_to_gradient_staging( - *m_scale_gradient, - DataType(1) / effective_mini_batch_size); - } - auto* bias_optimizer = m_weights[1]->get_optimizer(); - if (bias_optimizer != nullptr) { - bias_optimizer->add_to_gradient_staging( - *m_bias_gradient, - DataType(1) / effective_mini_batch_size); - } - - // Compute gradient w.r.t. input - batch_normalization_cuda::backprop2(m_use_global_stats ? 
input.Width() : input.LocalWidth(), - local_input, - local_gradient_wrt_output, - mean, var, m_epsilon, - m_weights[0]->get_values().Matrix(), - m_mean_gradient->LockedMatrix(), - m_var_gradient->LockedMatrix(), - local_gradient_wrt_input); - -#endif // LBANN_HAS_GPU - } - - void fp_compute_cpu() { - const DataType zero = DataType(0); - const DataType one = DataType(1); - - // Check execution mode - const bool is_training = this->m_model->get_execution_mode() == execution_mode::training; - - // Matrices - const auto& input = get_prev_activations(); - const auto& local_input = input.LockedMatrix(); - auto& local_output = get_local_activations(); - - // Matrix parameters - const int width = input.Width(); - const El::Int local_width = local_input.Width(); - const auto& output_dims = get_output_dims(); - const int num_channels = output_dims[0]; - const int channel_size = get_output_size() / num_channels; - - // Compute statistics - if (is_training) { - - // Local matrices - // Note: local_new_running_mean and local_new_running_var are - // stored in m_mean_gradient and m_var_gradient. - auto& local_mean = m_mean->Matrix(); - auto& local_var = m_var->Matrix(); - const auto& local_running_mean = this->m_weights[2]->get_values().LockedMatrix(); - const auto& local_running_var = this->m_weights[3]->get_values().LockedMatrix(); - auto& local_new_running_mean = m_mean_gradient->Matrix(); - auto& local_new_running_var = m_var_gradient->Matrix(); - - // Compute sums and sums of squares - #pragma omp parallel for - for (int channel = 0; channel < num_channels; ++channel) { - DataType sum = zero; - DataType sqsum = zero; - const El::Int row_start = channel * channel_size; - const El::Int row_end = (channel+1) * channel_size; - for (El::Int col = 0; col < local_width; ++col) { - for (El::Int row = row_start; row < row_end; ++row) { - const DataType x = local_input(row, col); - sum += x; - sqsum += x * x; - } - } - local_mean(channel, 0) = sum; - local_var(channel, 0) = sqsum; - } - DataType num_per_sum; - if (m_use_global_stats) { - m_comm->allreduce(*m_mean, m_mean->RedundantComm(), El::mpi::SUM); - m_comm->allreduce(*m_var, m_var->RedundantComm(), El::mpi::SUM); - num_per_sum = channel_size * width; - } else { - num_per_sum = channel_size * local_width; - } - - // Compute minibatch statistics - // Note: local_new_running_mean and local_new_running_var are - // stored in m_mean_gradient and m_var_gradient. 
- if (num_per_sum <= 1) { - El::Fill(local_var, one); - } else { - #pragma omp parallel for - for (int channel = 0; channel < num_channels; ++channel) { - const DataType mean = local_mean(channel, 0) / num_per_sum; - const DataType sqmean = local_var(channel, 0) / num_per_sum; - const DataType var = num_per_sum / (num_per_sum - one) * std::max(sqmean - mean * mean, m_epsilon); - const DataType old_running_mean = local_running_mean(channel, 0); - const DataType old_running_var = local_running_var(channel, 0); - const DataType new_running_mean = m_decay * old_running_mean + (one - m_decay) * mean; - const DataType new_running_var = m_decay * old_running_var + (one - m_decay) * var; - local_mean(channel, 0) = mean; - local_var(channel, 0) = var; - local_new_running_mean(channel, 0) = new_running_mean; - local_new_running_var(channel, 0) = new_running_var; - } - m_weights[2]->set_values(*m_mean_gradient); - m_weights[3]->set_values(*m_var_gradient); - } - - } - - // Get matrices - const auto& local_scale = this->m_weights[0]->get_values().LockedMatrix(); - const auto& local_bias = this->m_weights[1]->get_values().LockedMatrix(); - const auto& local_mean = (is_training ? - m_mean->LockedMatrix() : - this->m_weights[2]->get_values().LockedMatrix()); - const auto& local_var = (is_training ? - m_var->LockedMatrix() : - this->m_weights[3]->get_values().LockedMatrix()); - - // Iterate through channels - #pragma omp parallel for - for (int channel = 0; channel < num_channels; ++channel) { - - // Get channel parameters - const DataType mean = local_mean(channel, 0); - const DataType var = local_var(channel, 0); - const DataType inv_stdev = 1 / std::sqrt(var + m_epsilon); - const DataType scale = local_scale(channel, 0); - const DataType bias = local_bias(channel, 0); - - // Apply batch normalization to inputs in channel - const El::Int row_start = channel * channel_size; - const El::Int row_end = (channel+1) * channel_size; - for (El::Int col = 0; col < local_width; ++col) { - for (El::Int row = row_start; row < row_end; ++row) { - const DataType x = local_input(row, col); - const DataType xhat = (x - mean) * inv_stdev; - const DataType y = scale * xhat + bias; - local_output(row, col) = y; - } - } - - } - - } - - void bp_compute_cpu() { - - // Check execution mode - const bool is_training = this->m_model->get_execution_mode() == execution_mode::training; - - // Matrices - const auto& local_scale = this->m_weights[0]->get_values().LockedMatrix(); - const auto& local_mean = (is_training ? - m_mean->LockedMatrix() : - this->m_weights[2]->get_values().LockedMatrix()); - const auto& local_var = (is_training ? 
- m_var->LockedMatrix() : - this->m_weights[3]->get_values().LockedMatrix()); - const auto& input = get_prev_activations(); - const auto& local_input = input.LockedMatrix(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); - auto& local_mean_gradient = m_mean_gradient->Matrix(); - auto& local_var_gradient = m_var_gradient->Matrix(); - auto& local_scale_gradient = m_scale_gradient->Matrix(); - auto& local_bias_gradient = m_bias_gradient->Matrix(); - - // Matrix parameters - const int effective_mini_batch_size = this->m_model->get_effective_mini_batch_size(); - const int width = input.Width(); - const El::Int local_width = local_input.Width(); - const auto& output_dims = get_output_dims(); - const int num_channels = output_dims[0]; - const int channel_size = get_output_size() / num_channels; - - // Compute local gradients - #pragma omp parallel for - for (int channel = 0; channel < num_channels; ++channel) { - - // Initialize channel parameters and gradients - const DataType mean = local_mean(channel, 0); - const DataType var = local_var(channel, 0); - const DataType scale = local_scale(channel, 0); - const DataType inv_stdev = 1 / std::sqrt(var + m_epsilon); - const DataType dvar_factor = inv_stdev * inv_stdev * inv_stdev / 2; - DataType dmean = DataType(0); - DataType dvar = DataType(0); - DataType dscale = DataType(0); - DataType dbias = DataType(0); - - // Compute gradient contributions from local entries - const El::Int row_start = channel * channel_size; - const El::Int row_end = (channel+1) * channel_size; - for (El::Int col = 0; col < local_width; ++col) { - for (El::Int row = row_start; row < row_end; ++row) { - const DataType x = local_input(row, col); - const DataType xhat = (x - mean) * inv_stdev; - const DataType dy = local_gradient_wrt_output(row, col); - dscale += dy * xhat; - dbias += dy; - const DataType dxhat = dy * scale; - dmean += - dxhat * inv_stdev; - dvar += - dxhat * (x - mean) * dvar_factor; - } - } - local_mean_gradient(channel, 0) = dmean; - local_var_gradient(channel, 0) = dvar; - local_scale_gradient(channel, 0) = dscale; - local_bias_gradient(channel, 0) = dbias; - - } - - // Accumulate gradients - if (is_training) { - if (m_use_global_stats) { - m_comm->allreduce(*m_mean_gradient, - m_mean_gradient->RedundantComm(), - El::mpi::SUM); - m_comm->allreduce(*m_var_gradient, - m_var_gradient->RedundantComm(), - El::mpi::SUM); - } - } else { - El::Zero(*m_mean_gradient); - El::Zero(*m_var_gradient); - } - optimizer* scale_optimizer = m_weights[0]->get_optimizer(); - if (scale_optimizer != nullptr) { - scale_optimizer->add_to_gradient_staging( - *m_scale_gradient, - DataType(1) / effective_mini_batch_size); - } - optimizer* bias_optimizer = m_weights[1]->get_optimizer(); - if (bias_optimizer != nullptr) { - bias_optimizer->add_to_gradient_staging( - *m_bias_gradient, - DataType(1) / effective_mini_batch_size); - } - - // Compute error signal - const int num_per_sum = (m_use_global_stats ? 
- width * channel_size : - local_width * channel_size); - if (num_per_sum <= 1) { - El::Zero(local_gradient_wrt_input); - } else { - #pragma omp parallel for - for (int channel = 0; channel < num_channels; ++channel) { - - // Initialize channel parameters and gradients - const auto& mean = local_mean(channel, 0); - const auto& var = local_var(channel, 0); - const auto& scale = local_scale(channel, 0); - const auto& dmean = local_mean_gradient(channel, 0); - const auto& dvar = local_var_gradient(channel, 0); - - // Compute useful constants - const DataType inv_stdev = 1 / std::sqrt(var + m_epsilon); - const auto& dmean_term = dmean / num_per_sum; - const auto& dvar_term = dvar * 2 / (num_per_sum - 1); - - // Compute error signal for current channel - const El::Int row_start = channel * channel_size; - const El::Int row_end = (channel+1) * channel_size; - for (El::Int col = 0; col < local_width; ++col) { - for (El::Int row = row_start; row < row_end; ++row) { - const auto& x = local_input(row, col); - const auto& dy = local_gradient_wrt_output(row, col); - const auto& dxhat = dy * scale; - auto dx = dxhat * inv_stdev; - dx += dmean_term; - dx += dvar_term * (x - mean); - local_gradient_wrt_input(row, col) = dx; - } - } - - } - } - - } - - private: - - void deallocate_matrices() { - if (m_mean != nullptr) delete m_mean; - if (m_var != nullptr) delete m_var; - if (m_mean_gradient != nullptr) delete m_mean_gradient; - if (m_var_gradient != nullptr) delete m_var_gradient; - if (m_scale_gradient != nullptr) delete m_scale_gradient; - if (m_bias_gradient != nullptr) delete m_bias_gradient; - m_mean = nullptr; - m_var = nullptr; - m_mean_gradient = nullptr; - m_var_gradient = nullptr; - m_scale_gradient = nullptr; - m_bias_gradient = nullptr; - } + void fp_compute() override; + void bp_compute() override; }; diff --git a/include/lbann/layers/regularizers/dropout.hpp b/include/lbann/layers/regularizers/dropout.hpp index 187a7609cf3..0a564788394 100644 --- a/include/lbann/layers/regularizers/dropout.hpp +++ b/include/lbann/layers/regularizers/dropout.hpp @@ -224,7 +224,7 @@ class dropout : public regularizer_layer { El::Copy(input, output); return; } - if (local_input.Height() < 1 && local_input.Width() < 1) { return; } + if (local_input.Height() < 1 || local_input.Width() < 1) { return; } // Initialize cuDNN objects auto&& input_desc = m_tensors_cudnn_desc.get_prev_activations(); diff --git a/include/lbann/layers/transform/CMakeLists.txt b/include/lbann/layers/transform/CMakeLists.txt index bdd679cdf77..5b355e31215 100644 --- a/include/lbann/layers/transform/CMakeLists.txt +++ b/include/lbann/layers/transform/CMakeLists.txt @@ -3,7 +3,6 @@ set_full_path(THIS_DIR_HEADERS concatenation.hpp pooling.hpp reshape.hpp - safe_inv.hpp slice.hpp split.hpp sum.hpp @@ -22,10 +21,9 @@ set_full_path(THIS_DIR_HEADERS categorical_random.hpp discrete_random.hpp stop_gradient.hpp - max.hpp - min.hpp in_top_k.hpp sort.hpp + weights.hpp ) # Propagate the files up the tree diff --git a/include/lbann/layers/transform/evaluation.hpp b/include/lbann/layers/transform/evaluation.hpp index fe0e7739d12..facbb62abd7 100644 --- a/include/lbann/layers/transform/evaluation.hpp +++ b/include/lbann/layers/transform/evaluation.hpp @@ -53,20 +53,25 @@ class abstract_evaluation_layer : public transform_layer { El::Device device); protected: - abstract_evaluation_layer(lbann_comm *comm); - + void setup_data() override; void fp_compute() override; void bp_compute() override; private: /** Scaling factor to apply to evaluated value. 
*/ - EvalType m_scale; - /** Evaluated value. */ - DataType m_value; + EvalType m_scale = 0; + /** Evaluated value. + * The value may be stored in pinned memory. + */ + CPUMat m_value; /** Non-blocking allreduce request. */ Al::request m_allreduce_req; +#ifdef LBANN_HAS_GPU + /** CUDA event after a non-blocking GPU-CPU memory copy. */ + cuda::event_wrapper m_copy_event; +#endif // LBANN_HAS_GPU }; @@ -76,23 +81,12 @@ class abstract_evaluation_layer : public transform_layer { */ template class evaluation_layer : public abstract_evaluation_layer { - - public: - +public: evaluation_layer(lbann_comm *comm) : abstract_evaluation_layer(comm) {} - evaluation_layer* copy() const override { return new evaluation_layer(*this); } std::string get_type() const override { return "evaluation"; } data_layout get_data_layout() const override { return T_layout; } El::Device get_device_allocation() const override { return Dev; } - - /** Returns description. */ - std::string get_description() const override { - std::stringstream s; - s << "evaluation_layer dataLayout: " << this->get_data_layout_string(get_data_layout()); - return s.str(); - } - }; } // namespace lbann diff --git a/include/lbann/layers/transform/max.hpp b/include/lbann/layers/transform/max.hpp deleted file mode 100644 index a91a61a0995..00000000000 --- a/include/lbann/layers/transform/max.hpp +++ /dev/null @@ -1,212 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_LAYER_MAX_HPP_INCLUDED -#define LBANN_LAYER_MAX_HPP_INCLUDED - -#include "lbann/layers/transform/transform.hpp" - -namespace lbann { - -/** Max layer. - * This layer outputs the entrywise maximum of its input tensors. 
- */ -template -class max_layer : public transform_layer { - - public: - max_layer(lbann_comm *comm) - : transform_layer(comm) { - - /// @todo Implement - static_assert(Dev == El::Device::CPU, - "max layer currently only supports CPU"); - - // Max layer has no limit on parents - m_expected_num_parent_layers = -1; - - } - - max_layer* copy() const override { return new max_layer(*this); } - std::string get_type() const override { return "max"; } - data_layout get_data_layout() const override { return T_layout; } - El::Device get_device_allocation() const override { return Dev; } - - protected: - - void setup_pointers() override { - transform_layer::setup_pointers(); - if (get_num_parents() < 1) { - std::stringstream err; - err << "layer \"" << get_name() << "\" has no parents, " - << "but max layers expect at least one parent"; - LBANN_ERROR(err.str()); - } - } - - void setup_dims() override { - transform_layer::setup_dims(); - const auto& output_dims = get_output_dims(); - for (int i = 0; i < get_num_parents(); ++i) { - const auto& input_dims = get_input_dims(i); - if (input_dims != output_dims) { - std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " - << "expects input tensors with dimensions "; - for (size_t j = 0; j < output_dims.size(); ++j) { - err << (j > 0 ? " x " : "") << output_dims[j]; - } - err << ", but parent layer " - << "\"" << m_parent_layers[i]->get_name() << "\" " - << "outputs with dimensions "; - for (size_t j = 0; j < input_dims.size(); ++j) { - err << (j > 0 ? " x " : "") << input_dims[j]; - } - LBANN_ERROR(err.str()); - } - } - } - - void fp_compute() override { - - // Handle case with one parent - // Note: Case with no parents is handled in setup_pointers - const int num_parents = get_num_parents(); - if (num_parents == 1) { - El::LockedView(get_activations(), get_prev_activations()); - return; - } - - // Local matrices - const auto& local_input0 = get_local_prev_activations(0); - const auto& local_input1 = get_local_prev_activations(1); - auto& local_output = get_local_activations(); - const int local_height = local_output.Height(); - const int local_width = local_output.Width(); - - // Maximum of first two inputs - #pragma omp parallel for collapse(2) - for (int col = 0; col < local_width; ++col) { - for (int row = 0; row < local_height; ++row) { - local_output(row, col) = std::max(local_input0(row, col), - local_input1(row, col)); - } - } - - // Handle case with more than two parents - for (int i = 2; i < num_parents; ++i) { - const auto& local_input = get_local_prev_activations(i); - #pragma omp parallel for collapse(2) - for (int col = 0; col < local_width; ++col) { - for (int row = 0; row < local_height; ++row) { - const auto& x = local_input(row, col); - auto& y = local_output(row, col); - if (x > y) { y = x; } - } - } - } - - } - - void bp_compute() override { - - // Useful constants - const DataType zero = DataType(0); - - // Local matrices - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - const int local_height = local_gradient_wrt_output.Height(); - const int local_width = local_gradient_wrt_output.Width(); - - // Handle cases for different number of parents - // Note: Case with no parents is handled in setup_pointers - const int num_parents = get_num_parents(); - switch (num_parents) { - case 1: - El::LockedView(get_error_signals(), get_prev_error_signals()); - break; - case 2: - { - const auto& local_input0 = get_local_prev_activations(0); - const auto& local_input1 = get_local_prev_activations(1); 
- auto& local_gradient_wrt_input0 = get_local_error_signals(0); - auto& local_gradient_wrt_input1 = get_local_error_signals(1); - #pragma omp parallel for collapse(2) - for (int col = 0; col < local_width; ++col) { - for (int row = 0; row < local_height; ++row) { - const auto& x0 = local_input0(row, col); - const auto& x1 = local_input1(row, col); - const auto& dy = local_gradient_wrt_output(row, col); - auto& dx0 = local_gradient_wrt_input0(row, col); - auto& dx1 = local_gradient_wrt_input1(row, col); - if (x0 > x1) { - dx0 = dy; - dx1 = zero; - } else if (x0 < x1) { - dx0 = zero; - dx1 = dy; - } else { - dx0 = dy / 2; - dx1 = dy / 2; - } - } - } - } - break; - default: - #pragma omp parallel for collapse(2) - for (int col = 0; col < local_width; ++col) { - for (int row = 0; row < local_height; ++row) { - const auto& dy = local_gradient_wrt_output(row, col); - - // Find maximum input - int max_index = 0; - int max_value = get_local_activations(0)(row, col); - for (int i = 1; i < num_parents; ++i) { - const auto& current_value = get_local_activations(i)(row, col); - if (current_value > max_value) { - max_index = i; - max_value = current_value; - } - } - - // Output error signal to maximum input - for (int i = 0; i < num_parents; ++i) { - auto& dx = get_local_error_signals(i)(row, col); - dx = (i == max_index) ? dy : zero; - } - - } - } - } - - } - -}; - -} // namespace lbann - -#endif // LBANN_LAYER_MAX_HPP_INCLUDED diff --git a/include/lbann/layers/transform/min.hpp b/include/lbann/layers/transform/min.hpp deleted file mode 100644 index 8879510452b..00000000000 --- a/include/lbann/layers/transform/min.hpp +++ /dev/null @@ -1,212 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_LAYER_MIN_HPP_INCLUDED -#define LBANN_LAYER_MIN_HPP_INCLUDED - -#include "lbann/layers/transform/transform.hpp" - -namespace lbann { - -/** Min layer. - * This layer outputs the entrywise minimum of its input tensors. 
- */ -template -class min_layer : public transform_layer { - - public: - min_layer(lbann_comm *comm) - : transform_layer(comm) { - - /// @todo Implement - static_assert(Dev == El::Device::CPU, - "min layer currently only supports CPU"); - - // Min layer has no limit on parents - m_expected_num_parent_layers = -1; - - } - - min_layer* copy() const override { return new min_layer(*this); } - std::string get_type() const override { return "min"; } - data_layout get_data_layout() const override { return T_layout; } - El::Device get_device_allocation() const override { return Dev; } - - protected: - - void setup_pointers() override { - transform_layer::setup_pointers(); - if (get_num_parents() < 1) { - std::stringstream err; - err << "layer \"" << get_name() << "\" has no parents, " - << "but min layers expect at least one parent"; - LBANN_ERROR(err.str()); - } - } - - void setup_dims() override { - transform_layer::setup_dims(); - const auto& output_dims = get_output_dims(); - for (int i = 0; i < get_num_parents(); ++i) { - const auto& input_dims = get_input_dims(i); - if (input_dims != output_dims) { - std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " - << "expects input tensors with dimensions "; - for (size_t j = 0; j < output_dims.size(); ++j) { - err << (j > 0 ? " x " : "") << output_dims[j]; - } - err << ", but parent layer " - << "\"" << m_parent_layers[i]->get_name() << "\" " - << "outputs with dimensions "; - for (size_t j = 0; j < input_dims.size(); ++j) { - err << (j > 0 ? " x " : "") << input_dims[j]; - } - LBANN_ERROR(err.str()); - } - } - } - - void fp_compute() override { - - // Handle case with one parent - // Note: Case with no parents is handled in setup_pointers - const int num_parents = get_num_parents(); - if (num_parents == 1) { - El::LockedView(get_activations(), get_prev_activations()); - return; - } - - // Local matrices - const auto& local_input0 = get_local_prev_activations(0); - const auto& local_input1 = get_local_prev_activations(1); - auto& local_output = get_local_activations(); - const int local_height = local_output.Height(); - const int local_width = local_output.Width(); - - // Minimum of first two inputs - #pragma omp parallel for collapse(2) - for (int col = 0; col < local_width; ++col) { - for (int row = 0; row < local_height; ++row) { - local_output(row, col) = std::min(local_input0(row, col), - local_input1(row, col)); - } - } - - // Handle case with more than two parents - for (int i = 2; i < num_parents; ++i) { - const auto& local_input = get_local_prev_activations(i); - #pragma omp parallel for collapse(2) - for (int col = 0; col < local_width; ++col) { - for (int row = 0; row < local_height; ++row) { - const auto& x = local_input(row, col); - auto& y = local_output(row, col); - if (x < y) { y = x; } - } - } - } - - } - - void bp_compute() override { - - // Useful constants - const DataType zero = DataType(0); - - // Local matrices - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - const int local_height = local_gradient_wrt_output.Height(); - const int local_width = local_gradient_wrt_output.Width(); - - // Handle cases for different number of parents - // Note: Case with no parents is handled in setup_pointers - const int num_parents = get_num_parents(); - switch (num_parents) { - case 1: - El::LockedView(get_error_signals(), get_prev_error_signals()); - break; - case 2: - { - const auto& local_input0 = get_local_prev_activations(0); - const auto& local_input1 = get_local_prev_activations(1); 
- auto& local_gradient_wrt_input0 = get_local_error_signals(0); - auto& local_gradient_wrt_input1 = get_local_error_signals(1); - #pragma omp parallel for collapse(2) - for (int col = 0; col < local_width; ++col) { - for (int row = 0; row < local_height; ++row) { - const auto& x0 = local_input0(row, col); - const auto& x1 = local_input1(row, col); - const auto& dy = local_gradient_wrt_output(row, col); - auto& dx0 = local_gradient_wrt_input0(row, col); - auto& dx1 = local_gradient_wrt_input1(row, col); - if (x0 < x1) { - dx0 = dy; - dx1 = zero; - } else if (x0 > x1) { - dx0 = zero; - dx1 = dy; - } else { - dx0 = dy / 2; - dx1 = dy / 2; - } - } - } - } - break; - default: - #pragma omp parallel for collapse(2) - for (int col = 0; col < local_width; ++col) { - for (int row = 0; row < local_height; ++row) { - const auto& dy = local_gradient_wrt_output(row, col); - - // Find minimum input - int min_index = 0; - int min_value = get_local_activations(0)(row, col); - for (int i = 1; i < num_parents; ++i) { - const auto& current_value = get_local_activations(i)(row, col); - if (current_value < min_value) { - min_index = i; - min_value = current_value; - } - } - - // Output error signal to minimum input - for (int i = 0; i < num_parents; ++i) { - auto& dx = get_local_error_signals(i)(row, col); - dx = (i == min_index) ? dy : zero; - } - - } - } - } - - } - -}; - -} // namespace lbann - -#endif // LBANN_LAYER_MIN_HPP_INCLUDED diff --git a/include/lbann/layers/transform/safe_inv.hpp b/include/lbann/layers/transform/safe_inv.hpp deleted file mode 100644 index 4323b7c6765..00000000000 --- a/include/lbann/layers/transform/safe_inv.hpp +++ /dev/null @@ -1,100 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_LAYER_SAFE_INV_HPP_INCLUDED -#define LBANN_LAYER_SAFE_INV_HPP_INCLUDED - -#include "lbann/base.hpp" -#include "lbann/layers/transform/transform.hpp" -#include "lbann/utils/exception.hpp" - -namespace lbann { - -/** Safe entrywise inversion (reciprocal). - * Output is zero if input is zero. See https://arxiv.org.abs/1606.06582 - */ -template -class safe_inv_layer : public transform_layer { - private: - - /** Threshhold for computing inverse. 
*/ - DataType m_threshhold; - - public: - - safe_inv_layer(lbann_comm *comm, - DataType threshhold = DataType(0)) - : transform_layer(comm), m_threshhold(threshhold) {} - - safe_inv_layer* copy() const override { return new safe_inv_layer(*this); } - std::string get_type() const override { return "safe_inv"; } - data_layout get_data_layout() const override { return T_layout; } - El::Device get_device_allocation() const override { return Dev; } - - /** Returns description of ctor params */ - std::string get_description() const override { - std::stringstream s; - s << " dataLayout: " << this->get_data_layout_string(get_data_layout()); - return s.str(); - } - - protected: - - void fp_compute() override { - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); - const int local_height = local_input.Height(); - const int local_width = local_input.Width(); - for (int col = 0; col < local_width; ++col) { - for (int row = 0; row < local_height; ++row) { - const DataType x = local_input(row, col); - DataType& y = local_output(row, col); - y = std::fabs(x) > m_threshhold ? 1 / x : DataType(0); - } - } - } - - void bp_compute() override { - const auto& local_input = get_local_prev_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); - const int local_height = local_input.Height(); - const int local_width = local_input.Width(); - for (int col = 0; col < local_width; ++col) { - for (int row = 0; row < local_height; ++row) { - const DataType x = local_input(row, col); - const DataType dy = local_gradient_wrt_output(row, col); - DataType& dx = local_gradient_wrt_input(row, col); - dx = std::fabs(x) > m_threshhold ? - dy / (x * x) : DataType(0); - } - } - } - -}; - -} // namespace lbann - -#endif // LBANN_LAYER_SAFE_INV_HPP_INCLUDED diff --git a/include/lbann/layers/transform/weights.hpp b/include/lbann/layers/transform/weights.hpp new file mode 100644 index 00000000000..63c981aaabb --- /dev/null +++ b/include/lbann/layers/transform/weights.hpp @@ -0,0 +1,208 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_WEIGHTS_HPP_INCLUDED +#define LBANN_LAYER_WEIGHTS_HPP_INCLUDED + +#include "lbann/layers/transform/transform.hpp" + +namespace lbann { + +/** Weights layer. + * This layer outputs the values from a weights tensor. 
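+ *
+ * Illustrative usage (the surrounding objects are assumed context,
+ * not defined in this header):
+ * @code
+ * // A layer holding a learnable 64 x 32 tensor as its output:
+ * auto* l = new weights_layer<data_layout::DATA_PARALLEL,
+ *                             El::Device::CPU>(comm, {64, 32});
+ * @endcode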
+ */ +template +class weights_layer : public transform_layer { + + public: + weights_layer(lbann_comm *comm, std::vector dims) + : transform_layer(comm) { + std::vector dims_; + for (const auto& d : dims) { dims_.push_back(d); } + set_output_dims(dims_); + m_expected_num_parent_layers = 0; + } + + weights_layer(const weights_layer& other) + : transform_layer(other), + m_gradient(other.m_gradient ? other.m_gradient->Copy() : nullptr) { + if (other.m_workspace) { + switch (other.m_workspace->GetDevice()) { + case El::Device::CPU: m_workspace.reset(new CPUMat()); break; +#ifdef LBANN_HAS_GPU + case El::Device::GPU: m_workspace.reset(new GPUMat()); break; +#endif // LBANN_HAS_GPU + default: LBANN_ERROR("unknown device type"); + } + m_workspace->SetMemoryMode(other.m_workspace->MemoryMode()); + } + } + weights_layer& operator=(const weights_layer& other){ + transform_layer::operator=(other); + m_gradient.reset(other.m_gradient ? other.m_gradient->Copy() : nullptr); + m_workspace.reset(); + if (other.m_workspace) { + switch (other.m_workspace->GetDevice()) { + case El::Device::CPU: m_workspace.reset(new CPUMat()); break; +#ifdef LBANN_HAS_GPU + case El::Device::GPU: m_workspace.reset(new GPUMat()); break; +#endif // LBANN_HAS_GPU + default: LBANN_ERROR("unknown device type"); + } + m_workspace->SetMemoryMode(other.m_workspace->MemoryMode()); + } + return *this; + } + weights_layer* copy() const override { return new weights_layer(*this); } + std::string get_type() const override { return "weights"; } + data_layout get_data_layout() const override { return T_layout; } + El::Device get_device_allocation() const override { return Dev; } + + protected: + + void setup_matrices(const El::Grid& grid) override { + transform_layer::setup_matrices(grid); + + // Initialize weights gradient + auto dist = get_activations().DistData(); + dist.rowDist = El::STAR; + m_gradient.reset(AbsDistMat::Instantiate(dist)); + + // Initialize workspace + switch (Dev) { + case El::Device::CPU: m_workspace.reset(new CPUMat()); break; +#ifdef LBANN_HAS_GPU + case El::Device::GPU: + m_workspace.reset(new GPUMat()); +#ifdef HYDROGEN_HAVE_CUB + m_workspace->SetMemoryMode(1); // Use CUB GPU memory pool if possible +#endif // HYDROGEN_HAVE_CUB + break; +#endif // LBANN_HAS_GPU + default: LBANN_ERROR("unknown device type"); + } + + } + + void setup_data() override { + transform_layer::setup_data(); + + // Initialize default weights if none are provided + if (this->m_weights.size() > 1) { + std::stringstream err; + err << "attempted to setup " + << get_type() << " layer \"" << get_name() << "\" " + << "with an invalid number of weights " + << "(expected at most 1, " + << "but found " << this->m_weights.size() << ")"; + LBANN_ERROR(err.str()); + } + this->m_weights.resize(1, nullptr); + auto& w = this->m_weights[0]; + if (w == nullptr) { + w = new weights(get_comm()); + std::unique_ptr init(new constant_initializer(DataType(0))); + std::unique_ptr opt(m_model->create_optimizer()); + w->set_name(get_name() + "_weights"); + w->set_initializer(init); + w->set_optimizer(opt); + this->m_model->add_weights(w); + } + + // Setup weights and weights gradient + m_gradient->AlignWith(get_activations()); + m_gradient->Resize(get_output_size(), 1); + w->set_dims(get_output_dims()); + w->set_matrix_distribution(m_gradient->DistData()); + + // Initialize freeze state + if (this->m_frozen) { w->freeze(); } + else { w->unfreeze(); } + if (w->is_frozen() != this->m_frozen) { + std::stringstream err; + err << (m_frozen ? 
"" : "un") << "frozen " + << "layer \"" << get_name() << "\" has " + << (w->is_frozen() ? "" : "un") << "frozen " + << "weights \"" << w->get_name() << "\""; + LBANN_ERROR(err.str()); + } + + } + + void fp_compute() override { + + // Matrices + const auto& local_weights = m_weights[0]->get_values().LockedMatrix(); + auto& local_output = get_local_activations(); + m_workspace->Resize(local_output.Width(), 1); + El::Fill(*m_workspace, DataType(1)); + + // Duplicate weights across matrix columns + El::Gemm(El::NORMAL, El::TRANSPOSE, + DataType(1), local_weights, *m_workspace, + DataType(0), local_output); + + // Clean up + m_workspace->Empty(); + + } + + void bp_compute() override { + + // Get optimizer + // Note: Nothing needs to be done if there is no optimizer + auto* opt = this->m_weights[0]->get_optimizer(); + if (opt == nullptr) { return; } + + // Matrices + const auto& local_gradient_wrt_output = get_local_prev_error_signals(); + m_workspace->Resize(local_gradient_wrt_output.Width(), 1); + El::Fill(*m_workspace, DataType(1)); + + // Compute gradient contribution and accumulate + const auto& scale = DataType(1) / this->m_model->get_effective_mini_batch_size(); + El::Gemv(El::NORMAL, + scale, local_gradient_wrt_output, *m_workspace, + DataType(0), m_gradient->Matrix()); + opt->add_to_gradient_staging(*m_gradient); + + // Clean up + m_workspace->Empty(); + + } + + private: + + /** Weights gradient. */ + std::unique_ptr m_gradient; + /** Workspace. */ + std::unique_ptr m_workspace; + +}; + +} // namespace lbann + +#endif // LBANN_LAYER_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 092ddeba5f9..ffd7be7768a 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -22,8 +22,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or // implied. See the License for the specific language governing // permissions and limitations under the license. 
-// -// lbann.hpp - LBANN top level header //////////////////////////////////////////////////////////////////////////////// /** @@ -40,10 +38,8 @@ #include "lbann/models/siamese.hpp" /// Activation Layers -#include "lbann/layers/activations/atan.hpp" #include "lbann/layers/activations/bent_identity.hpp" #include "lbann/layers/activations/elu.hpp" -#include "lbann/layers/activations/exponential.hpp" #include "lbann/layers/activations/identity.hpp" #include "lbann/layers/activations/leaky_relu.hpp" #include "lbann/layers/activations/relu.hpp" @@ -51,14 +47,12 @@ #include "lbann/layers/activations/sigmoid.hpp" #include "lbann/layers/activations/smooth_relu.hpp" #include "lbann/layers/activations/softmax.hpp" +#include "lbann/layers/activations/log_softmax.hpp" #include "lbann/layers/activations/softplus.hpp" #include "lbann/layers/activations/swish.hpp" -#include "lbann/layers/activations/tanh.hpp" -#include "lbann/layers/activations/power.hpp" -#include "lbann/layers/activations/sigmoid_bce_with_logits.hpp" -#include "lbann/layers/activations/abs.hpp" -#include "lbann/layers/activations/l2_loss.hpp" -#include "lbann/layers/activations/log.hpp" + +/// Image Layers +#include "lbann/layers/image/bilinear_resize.hpp" /// Learning Layers #include "lbann/layers/learning/fully_connected.hpp" @@ -66,10 +60,17 @@ #include "lbann/layers/learning/deconvolution.hpp" /// Loss Layers +#include "lbann/layers/loss/categorical_accuracy.hpp" #include "lbann/layers/loss/cross_entropy.hpp" +#include "lbann/layers/loss/entrywise.hpp" +#include "lbann/layers/loss/l2_norm2.hpp" #include "lbann/layers/loss/mean_squared_error.hpp" #include "lbann/layers/loss/top_k_categorical_accuracy.hpp" +/// Math layers +#include "lbann/layers/math/unary.hpp" +#include "lbann/layers/math/binary.hpp" + /// Transform Layers #include "lbann/layers/transform/reshape.hpp" #include "lbann/layers/transform/pooling.hpp" @@ -81,7 +82,6 @@ #include "lbann/layers/transform/concatenation.hpp" #include "lbann/layers/transform/constant.hpp" #include "lbann/layers/transform/dummy.hpp" -#include "lbann/layers/transform/safe_inv.hpp" #include "lbann/layers/transform/hadamard.hpp" #include "lbann/layers/transform/zero.hpp" #include "lbann/layers/transform/reduction.hpp" @@ -93,10 +93,9 @@ #include "lbann/layers/transform/categorical_random.hpp" #include "lbann/layers/transform/discrete_random.hpp" #include "lbann/layers/transform/stop_gradient.hpp" -#include "lbann/layers/transform/max.hpp" -#include "lbann/layers/transform/min.hpp" #include "lbann/layers/transform/in_top_k.hpp" #include "lbann/layers/transform/sort.hpp" +#include "lbann/layers/transform/weights.hpp" /// Regularization layers. 
#include "lbann/layers/regularizers/local_response_normalization.hpp" @@ -111,6 +110,10 @@ /// Reconstruction Layer #include "lbann/layers/io/target/reconstruction.hpp" +/// Miscellaneous Layers +#include "lbann/layers/misc/covariance.hpp" +#include "lbann/layers/misc/variance.hpp" + /// Data Readers #include "lbann/data_readers/data_reader_imagenet.hpp" #include "lbann/data_readers/data_reader_imagenet_patches.hpp" @@ -131,6 +134,7 @@ #include "lbann/data_readers/data_reader_ascii.hpp" #include "lbann/data_readers/data_reader_pilot2_molecular.hpp" #include "lbann/data_readers/data_reader_mesh.hpp" +#include "lbann/data_readers/data_reader_moving_mnist.hpp" /// Data Store #include "lbann/data_store/generic_data_store.hpp" diff --git a/include/lbann/models/directed_acyclic_graph.hpp b/include/lbann/models/directed_acyclic_graph.hpp index d77ca4fabe3..dabd2571b9a 100644 --- a/include/lbann/models/directed_acyclic_graph.hpp +++ b/include/lbann/models/directed_acyclic_graph.hpp @@ -57,7 +57,7 @@ class directed_acyclic_graph_model : public model { directed_acyclic_graph_model* copy() const override { return new directed_acyclic_graph_model(*this); } /** Get model name. */ - std::string name() const override { return "directed_acyclic_graph_model"; } + std::string get_type() const override { return "directed acyclic graph"; } protected: diff --git a/include/lbann/models/greedy_layerwise_autoencoder.hpp b/include/lbann/models/greedy_layerwise_autoencoder.hpp index 27957aea3eb..4abf0e3d0f6 100644 --- a/include/lbann/models/greedy_layerwise_autoencoder.hpp +++ b/include/lbann/models/greedy_layerwise_autoencoder.hpp @@ -54,7 +54,7 @@ class greedy_layerwise_autoencoder : public sequential_model { } /** Get model name. */ - std::string name() const override { return "greedy layerwise autoencoder"; } + std::string get_type() const override { return "greedy layerwise autoencoder"; } /** Train greedy layerwise autoencoder. */ void train(int num_epochs, int num_batches=0) override; diff --git a/include/lbann/models/model.hpp b/include/lbann/models/model.hpp index 20ac4e0fb0c..f9ff87984a1 100644 --- a/include/lbann/models/model.hpp +++ b/include/lbann/models/model.hpp @@ -69,8 +69,23 @@ class model { /** Copy model. */ virtual model* copy() const = 0; - /** Return the model's name. */ - virtual std::string name() const = 0; + /** Return the model's type. */ + virtual std::string get_type() const = 0; + + void set_name(std::string name); + + std::string get_name() const { + return m_name; + } + + /** Return the model's id; this is an arbitrary string + * that may be useful in multi-model scenarios, e.g, + * LTFB, jag + */ + std::string get_model_id() { return m_model_id; } + + /** Set the model's arbitrary identifying string */ + void set_model_id(std::string s) { m_model_id = s; } /** Set up the model. */ virtual void setup(); @@ -226,7 +241,8 @@ class model { /** The objective function used to train the model. */ objective_function *m_objective_function; - + /** Give model a name. */ + std::string m_name; /** The model's current execution mode. */ execution_mode m_execution_mode; /** Flag telling the model to terminate training. 
*/ @@ -419,6 +435,7 @@ class model { */ void add_split_layers(); + std::string m_model_id; }; } // namespace lbann diff --git a/include/lbann/models/sequential.hpp b/include/lbann/models/sequential.hpp index 85cdf23321d..3733bcbb36a 100644 --- a/include/lbann/models/sequential.hpp +++ b/include/lbann/models/sequential.hpp @@ -57,7 +57,7 @@ class sequential_model : public model { sequential_model* copy() const override { return new sequential_model(*this); } /** Get model name. */ - std::string name() const override { return "sequential_model"; } + std::string get_type() const override { return "sequential"; } /** Write model to proto file */ void write_proto(lbann_data::Model* proto) override; diff --git a/include/lbann/models/siamese.hpp b/include/lbann/models/siamese.hpp index 56656a7a4ad..119db58f921 100644 --- a/include/lbann/models/siamese.hpp +++ b/include/lbann/models/siamese.hpp @@ -54,7 +54,7 @@ class siamese_model : public directed_acyclic_graph_model { siamese_model* copy() const override { return new siamese_model(*this); } /** Get model name. */ - std::string name() const override { return "siamese_model"; } + std::string get_type() const override { return "siamese"; } protected: diff --git a/include/lbann/objective_functions/weight_regularization/l2.hpp b/include/lbann/objective_functions/weight_regularization/l2.hpp index 0623e5d34e5..0d94f2c5863 100644 --- a/include/lbann/objective_functions/weight_regularization/l2.hpp +++ b/include/lbann/objective_functions/weight_regularization/l2.hpp @@ -35,7 +35,7 @@ namespace lbann { * Given weights \f$w_1,\cdots,w_n\f$, the L2 weight regularization * term is * \f[ - * L2(w) = \frac{1}{2} \sum\limits_{i} w_i + * L2(w) = \frac{1}{2} \sum\limits_{i} w_i^2 * \f] * Note the \f$1/2\f$ scaling factor. */ @@ -64,13 +64,23 @@ class l2_weight_regularization : public objective_function_term { private: - /** Holds intermediate term for local contributions. */ - EvalType m_sqsum; - /** Aluminum request for local contribution aggregation. */ + /** Contributions to evaluated value. */ + std::map m_contributions; + /** Non-blocking allreduce request. */ Al::request m_allreduce_req; - /** Whether local contribution aggregation has started. */ - bool m_allreduce_started; +#ifdef LBANN_HAS_GPU + /** CUDA event after a non-blocking GPU-CPU memory copy. */ + cuda::event_wrapper m_copy_event; +#endif // LBANN_HAS_GPU + /** Accumulate contribution to L2 regularization term. + * The sum of squares of 'vals' is added to the value in + * 'contribution'. 
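+   * Illustrative semantics (a sketch, not the implementation):
+   * @code
+   * // contribution(0,0) += sum over all entries of vals squared
+   * for (El::Int j = 0; j < vals.Width(); ++j)
+   *   for (El::Int i = 0; i < vals.Height(); ++i)
+   *     contribution(0, 0) += vals(i, j) * vals(i, j);
+   * @endcode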
+ */ + template + static void accumulate_contribution(const DMat& vals, + DMat& contribution); + }; } // namespace lbann diff --git a/include/lbann/proto/proto_common.hpp b/include/lbann/proto/proto_common.hpp index be49b33525c..22c7940c49c 100644 --- a/include/lbann/proto/proto_common.hpp +++ b/include/lbann/proto/proto_common.hpp @@ -17,7 +17,7 @@ void init_data_readers( std::map& data_readers); /// adjusts the number of parallel data readers -void set_num_parallel_readers(lbann::lbann_comm *comm, lbann_data::LbannPB& p); +void set_num_parallel_readers(const lbann::lbann_comm *comm, lbann_data::LbannPB& p); /// adjusts the values in p by querying the options db void get_cmdline_overrides(lbann::lbann_comm *comm, lbann_data::LbannPB& p); diff --git a/include/lbann/utils/CMakeLists.txt b/include/lbann/utils/CMakeLists.txt index a83e55350bf..4ad9afb73bd 100644 --- a/include/lbann/utils/CMakeLists.txt +++ b/include/lbann/utils/CMakeLists.txt @@ -5,6 +5,7 @@ set_full_path(THIS_DIR_HEADERS cuda.hpp cudnn.hpp dataset.hpp + entrywise_operator.hpp exception.hpp glob.hpp im2col.hpp @@ -22,5 +23,8 @@ set_full_path(THIS_DIR_HEADERS timer.hpp ) +# Add the subdirectories +add_subdirectory(impl) + # Propagate the files up the tree set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/utils/cuda.hpp b/include/lbann/utils/cuda.hpp index 70cbdcfb7cc..d65001af879 100644 --- a/include/lbann/utils/cuda.hpp +++ b/include/lbann/utils/cuda.hpp @@ -33,10 +33,10 @@ #include #include +#include #include -#ifdef __CUDACC__ -#include -#endif // __CUDACC__ +#include +#include // ------------------------------------------------------------- // Error utility macros @@ -57,6 +57,19 @@ LBANN_ERROR(err_CUDA_SYNC.str()); \ } \ } while (0) +#define LBANN_CUDA_CHECK_LAST_ERROR(async) \ + do { \ + cudaError_t status = cudaGetLastError(); \ + if (status != cudaSuccess) { \ + cudaDeviceReset(); \ + std::stringstream err_CUDA_CHECK_LAST_ERROR; \ + if (async) { err_CUDA_CHECK_LAST_ERROR << "Asynchronous "; } \ + err_CUDA_CHECK_LAST_ERROR << "CUDA error (" \ + << cudaGetErrorString(status) \ + << ")"; \ + LBANN_ERROR(err_CUDA_CHECK_LAST_ERROR.str()); \ + } \ + } while (0) #define FORCE_CHECK_CUDA(cuda_call) \ do { \ /* Call CUDA API routine, synchronizing before and */ \ @@ -79,133 +92,190 @@ namespace lbann { namespace cuda { -#ifdef __CUDACC__ // ------------------------------------------------------------- // Device functions // ------------------------------------------------------------- +#ifdef __CUDACC__ -// Atomic add functions -#if __CUDA_ARCH__ >= 530 -__device__ __inline__ __half atomic_add(__half* address, __half val) { -#if 0 // TODO: replace this once Nvidia implements atomicAdd for __half - return atomicAdd(address, val); -#else - unsigned int* address_as_uint = (unsigned int*) address; - unsigned int old = *address_as_uint; - __half* old_as_half = (__half*) &old; - unsigned int assumed; - unsigned int updated; - __half* updated_as_half = (__half*) &updated; - do { - assumed = old; - updated = old; - *updated_as_half += val; - old = atomicCAS(address_as_uint, assumed, updated); - } while (assumed != old); - return *old_as_half; -#endif // 0 -} -#endif // __CUDA_ARCH__ >= 530 -__device__ __inline__ float atomic_add(float* address, float val) { - return atomicAdd(address, val); -} -__device__ __inline__ double atomic_add(double* address, double val) { -#if __CUDA_ARCH__ >= 600 - return atomicAdd(address, val); -#else - unsigned long long int* address_as_ull = - (unsigned long long 
int*)address;
-  unsigned long long int old = *address_as_ull, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(val +
-                                         __longlong_as_double(assumed)));
-  } while (assumed != old);
-  return __longlong_as_double(old);
-#endif // __CUDA_ARCH__ < 600
-}
-
-// Min and max
-__device__ __inline__ int min(int x, int y) { return x <= y ? x : y; }
-__device__ __inline__ El::Int min(El::Int x, El::Int y) { return x <= y ? x : y; }
-__device__ __inline__ float min(float x, float y) { return fminf(x, y); }
-__device__ __inline__ double min(double x, double y) { return fmin(x, y); }
-__device__ __inline__ int max(int x, int y) { return x >= y ? x : y; }
-__device__ __inline__ El::Int max(El::Int x, El::Int y) { return x >= y ? x : y; }
-__device__ __inline__ float max(float x, float y) { return fmaxf(x, y); }
-__device__ __inline__ double max(double x, double y) { return fmax(x, y); }
+// Atomic add
+template <typename T> __device__ __forceinline__
+T atomic_add(T* address, T val);
+
+// Unary math functions
+template <typename T> __device__ __forceinline__ T abs(const T& x);
+template <typename T> __device__ __forceinline__ T round(const T& x);
+template <typename T> __device__ __forceinline__ T ceil(const T& x);
+template <typename T> __device__ __forceinline__ T floor(const T& x);
+template <typename T> __device__ __forceinline__ T sqrt(const T& x);
+template <typename T> __device__ __forceinline__ T rsqrt(const T& x);
+template <typename T> __device__ __forceinline__ T exp(const T& x);
+template <typename T> __device__ __forceinline__ T expm1(const T& x);
+template <typename T> __device__ __forceinline__ T log(const T& x);
+template <typename T> __device__ __forceinline__ T log1p(const T& x);
+template <typename T> __device__ __forceinline__ T cos(const T& x);
+template <typename T> __device__ __forceinline__ T sin(const T& x);
+template <typename T> __device__ __forceinline__ T tan(const T& x);
+template <typename T> __device__ __forceinline__ T acos(const T& x);
+template <typename T> __device__ __forceinline__ T asin(const T& x);
+template <typename T> __device__ __forceinline__ T atan(const T& x);
+template <typename T> __device__ __forceinline__ T cosh(const T& x);
+template <typename T> __device__ __forceinline__ T sinh(const T& x);
+template <typename T> __device__ __forceinline__ T tanh(const T& x);
+template <typename T> __device__ __forceinline__ T acosh(const T& x);
+template <typename T> __device__ __forceinline__ T asinh(const T& x);
+template <typename T> __device__ __forceinline__ T atanh(const T& x);
+
+// Binary math functions
+template <typename T> __device__ __forceinline__ T min(const T& x, const T& y);
+template <typename T> __device__ __forceinline__ T max(const T& x, const T& y);
+template <typename T> __device__ __forceinline__ T mod(const T& x, const T& y);
+template <typename T> __device__ __forceinline__ T pow(const T& x, const T& y);
+
+// Numeric limits
+template <typename T> constexpr __device__ __forceinline__ T min();
+template <typename T> constexpr __device__ __forceinline__ T max();
+template <typename T> constexpr __device__ __forceinline__ T epsilon();
+template <typename T> __device__ __forceinline__ T infinity();
 #endif // __CUDACC__
+
+// -------------------------------------------------------------
+// Utilities for CUDA events
+// -------------------------------------------------------------
+
+/** Wrapper class for a CUDA event. */
+class event_wrapper {
+public:
+  event_wrapper();
+  event_wrapper(const event_wrapper& other);
+  event_wrapper& operator=(const event_wrapper& other);
+  ~event_wrapper();
+  /** Enqueue CUDA event on a CUDA stream. */
+  void record(cudaStream_t stream);
+  /** Check whether CUDA event has completed. */
+  bool query() const;
+  /** Wait until CUDA event has completed. */
+  void synchronize();
+  /** Get CUDA event object.
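+   * For reference, typical use of the wrapper looks like this
+   * (illustrative sketch; 'stream' is assumed context):
+   * @code
+   * cuda::event_wrapper ev;
+   * ev.record(stream);  // enqueue marker on 'stream'
+   * if (!ev.query()) {  // not finished yet?
+   *   ev.synchronize(); // block until it is
+   * }
+   * @endcode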
*/ + cudaEvent_t& get_event(); +private: + /** CUDA event object. + * The event object lifetime is managed internally. + */ + cudaEvent_t m_event; + /** CUDA stream object. + * The stream object lifetime is assumed to be managed externally. + */ + cudaStream_t m_stream; +}; + +// ------------------------------------------------------------- +// Helper functions for entrywise operations +// ------------------------------------------------------------- +#ifdef __CUDACC__ + +/** Apply an entry-wise unary operator to GPU data. + * The input and output data must be on GPU and must have the same + * dimensions. + */ +template +void apply_entrywise_unary_operator(const AbsMat& input, + AbsMat& output); + +/** Apply an entry-wise binary operator to GPU data. + * The input and output data must be on GPU and must have the same + * dimensions. + */ +template +void apply_entrywise_binary_operator(const AbsMat& input1, + const AbsMat& input2, + AbsMat& output); + + +/** Apply an entry-wise unary operator to GPU data. + * The input and output data must be on GPU, have the same + * dimensions, and be aligned. + */ +template +void apply_entrywise_unary_operator(const AbsDistMat& input, + AbsDistMat& output); + +/** Apply an entry-wise binary operator to GPU data. + * The input and output data must be on GPU, have the same + * dimensions, and be aligned. + */ +template +void apply_entrywise_binary_operator(const AbsDistMat& input1, + const AbsDistMat& input2, + AbsDistMat& output); +#endif // __CUDACC__ + // ------------------------------------------------------------- // Utilities for Thrust // ------------------------------------------------------------- namespace thrust { +/** Thrust execution policy. */ +using execute_on_stream +#if THRUST_MAJOR_VERSION > 1 || THRUST_MINOR_VERSION >= 9 + = ::thrust::cuda_cub::execute_on_stream; // >= 1.9.1 +#elif THRUST_MAJOR_VERSION == 1 && THRUST_MINOR_VERSION == 8 + = ::thrust::system::cuda::detail::execute_on_stream; +#else + = std::nullptr_t; + static_assert(false, "Thrust 1.8 or newer is required"); +#endif + /** GPU memory allocator that can interact with Thrust. - * Uses Hydrogen's CUB memory pool if available. + * Operations are performed on a provided CUDA stream. Uses + * Hydrogen's CUB memory pool if available. */ template class allocator : public ::thrust::detail::tagged_allocator< - T, - ::thrust::system::cuda::tag, - ::thrust::pointer> { -private: - typedef typename ::thrust::detail::tagged_allocator< - T, - ::thrust::system::cuda::tag, - ::thrust::pointer> parent_class; - - /** Active CUDA stream. */ - cudaStream_t m_stream; - + T, execute_on_stream, + ::thrust::pointer> { public: - typedef typename parent_class::value_type value_type; - typedef typename parent_class::pointer pointer; - typedef typename parent_class::size_type size_type; - - allocator(cudaStream_t stream = El::GPUManager::Stream()) - : m_stream(stream) {} + // Convenient typedefs + typedef ::thrust::detail::tagged_allocator< + T, execute_on_stream, + ::thrust::pointer> parent_class; + typedef typename parent_class::value_type value_type; + typedef typename parent_class::pointer pointer; + typedef typename parent_class::size_type size_type; + typedef typename parent_class::system_type system_type; + /** Default constructor. */ + allocator(cudaStream_t stream = El::GPUManager::Stream()); /** Allocate GPU buffer. 
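+   * Illustrative use of the allocator (DataType and stream are
+   * assumed context):
+   * @code
+   * cuda::thrust::allocator<DataType> alloc(stream);
+   * cuda::thrust::vector<DataType> buf(256); // pooled, stream-aware
+   * @endcode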
*/ - pointer allocate(size_type size) { - value_type* buffer = nullptr; - if (size > 0) { -#ifdef HYDROGEN_HAVE_CUB - auto& memory_pool = El::cub::MemoryPool(); - CHECK_CUDA(memory_pool.DeviceAllocate(reinterpret_cast(&buffer), - size * sizeof(value_type), - m_stream)); -#else - CHECK_CUDA(cudaMalloc(&buffer, size * sizeof(value_type))); -#endif // HYDROGEN_HAVE_CUB - } - return pointer(buffer); - } - + pointer allocate(size_type size); /** Deallocate GPU buffer. * 'size' is unused and maintained for compatibility with Thrust. */ - void deallocate(pointer buffer, size_type size = 0) { - auto&& ptr = buffer.get(); - if (ptr != nullptr) { -#ifdef HYDROGEN_HAVE_CUB - auto& memory_pool = El::cub::MemoryPool(); - CHECK_CUDA(memory_pool.DeviceFree(ptr)); -#else - CHECK_CUDA(cudaFree(ptr)); -#endif // HYDROGEN_HAVE_CUB - } - } + void deallocate(pointer buffer, size_type size = 0); + /** Get Thrust execution policy. */ + system_type& system(); +private: + /** Active CUDA stream. */ + cudaStream_t m_stream; + /** Thrust execution policy. */ + system_type m_system; + }; +/** Thrust device vector. */ +template +using vector = ::thrust::device_vector>; + } // namespace thrust } // namespace cuda } // namespace lbann +// Header implementations +#include "lbann/utils/impl/cuda.hpp" + #endif // LBANN_HAS_GPU #endif // LBANN_UTILS_CUDA_HPP diff --git a/include/lbann/utils/cudnn.hpp b/include/lbann/utils/cudnn.hpp index 1cd9fd91ebf..86043bad275 100644 --- a/include/lbann/utils/cudnn.hpp +++ b/include/lbann/utils/cudnn.hpp @@ -38,11 +38,8 @@ #include // Error utility macros -#define FORCE_CHECK_CUDNN(cudnn_call) \ +#define CHECK_CUDNN_NODEBUG(cudnn_call) \ do { \ - /* Call cuDNN API routine, synchronizing before and */ \ - /* after to check for errors. */ \ - LBANN_CUDA_SYNC(true); \ const cudnnStatus_t status_CHECK_CUDNN = (cudnn_call); \ if (status_CHECK_CUDNN != CUDNN_STATUS_SUCCESS) { \ cudaDeviceReset(); \ @@ -50,12 +47,16 @@ + cudnnGetErrorString(status_CHECK_CUDNN) \ + std::string(")")); \ } \ - LBANN_CUDA_SYNC(false); \ + } while (0) +#define CHECK_CUDNN_DEBUG(cudnn_call) \ + do { \ + LBANN_CUDA_CHECK_LAST_ERROR(true); \ + CHECK_CUDNN_NODEBUG(cudnn_call); \ } while (0) #ifdef LBANN_DEBUG -#define CHECK_CUDNN(cudnn_call) FORCE_CHECK_CUDNN(cudnn_call); +#define CHECK_CUDNN(cudnn_call) CHECK_CUDNN_DEBUG(cudnn_call) #else -#define CHECK_CUDNN(cudnn_call) (cudnn_call) +#define CHECK_CUDNN(cudnn_call) CHECK_CUDNN_NODEBUG(cudnn_call) #endif // #ifdef LBANN_DEBUG namespace lbann { diff --git a/include/lbann/utils/entrywise_operator.hpp b/include/lbann/utils/entrywise_operator.hpp new file mode 100644 index 00000000000..fbd2fcea92a --- /dev/null +++ b/include/lbann/utils/entrywise_operator.hpp @@ -0,0 +1,188 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_UTILS_ENTRYWISE_OPERATOR_HPP
+#define LBANN_UTILS_ENTRYWISE_OPERATOR_HPP
+
+#include "lbann/base.hpp"
+#include "lbann/utils/exception.hpp"
+
+namespace lbann {
+
+/** Apply an entry-wise unary operator to CPU data.
+ * The input and output data must be on CPU and must have the same
+ * dimensions.
+ */
+template <typename UnaryOperator>
+void apply_entrywise_unary_operator(const AbsMat& input,
+                                    AbsMat& output) {
+
+  // Check that input and output are valid
+  std::stringstream err;
+  if (input.GetDevice() != El::Device::CPU) {
+    LBANN_ERROR("input is not on CPU");
+  } else if (output.GetDevice() != El::Device::CPU) {
+    LBANN_ERROR("output is not on CPU");
+  } else if (input.Height() != output.Height()
+             || input.Width() != output.Width()) {
+    err << "input matrix dimensions "
+        << "(" << input.Height() << " x " << input.Width() << ") "
+        << "don't match output matrix dimensions "
+        << "(" << output.Height() << " x " << output.Width() << ")";
+    LBANN_ERROR(err.str());
+  }
+
+  // Apply unary operator
+  if (input.Contiguous() && output.Contiguous()) {
+    const auto* input_buffer = input.LockedBuffer();
+    auto* output_buffer = output.Buffer();
+    const size_t size = input.Height() * input.Width();
+#pragma omp parallel for
+    for (size_t i = 0; i < size; ++i) {
+      UnaryOperator op;
+      output_buffer[i] = op(input_buffer[i]);
+    }
+  } else {
+    auto const width = input.Width();
+    auto const height = input.Height();
+#pragma omp parallel for collapse(2)
+    for (El::Int col = 0; col < width; ++col) {
+      for (El::Int row = 0; row < height; ++row) {
+        UnaryOperator op;
+        output(row, col) = op(input(row, col));
+      }
+    }
+  }
+
+}
+
+/** Apply an entry-wise binary operator to CPU data.
+ * The input and output data must be on CPU and must have the same
+ * dimensions.
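+ *
+ * Illustrative usage (the functor is hypothetical, not part of LBANN):
+ * @code
+ * struct hypot_op {
+ *   DataType operator()(const DataType& x, const DataType& y) const {
+ *     return std::sqrt(x * x + y * y);
+ *   }
+ * };
+ * // a, b, c are CPU matrices with matching dimensions:
+ * apply_entrywise_binary_operator<hypot_op>(a, b, c);
+ * @endcode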
+
+/** Apply an entry-wise binary operator to CPU data.
+ *  The input and output data must be on CPU and must have the same
+ *  dimensions.
+ */
+template <typename BinaryOperator>
+void apply_entrywise_binary_operator(const AbsMat& input1,
+                                     const AbsMat& input2,
+                                     AbsMat& output) {
+
+  // Check that input and output are valid
+  std::stringstream err;
+  if (input1.GetDevice() != El::Device::CPU
+      || input2.GetDevice() != El::Device::CPU) {
+    LBANN_ERROR("input is not on CPU");
+  } else if (output.GetDevice() != El::Device::CPU) {
+    LBANN_ERROR("output is not on CPU");
+  } else if (input1.Height() != input2.Height()
+             || input1.Width() != input2.Width()
+             || input1.Height() != output.Height()
+             || input1.Width() != output.Width()) {
+    err << "input matrix dimensions "
+        << "(" << input1.Height() << " x " << input1.Width() << ", "
+        << input2.Height() << " x " << input2.Width() << ") "
+        << "don't match output matrix dimensions "
+        << "(" << output.Height() << " x " << output.Width() << ")";
+    LBANN_ERROR(err.str());
+  }
+
+  // Apply binary operator
+  if (input1.Contiguous() && input2.Contiguous()
+      && output.Contiguous()) {
+    const auto* input1_buffer = input1.LockedBuffer();
+    const auto* input2_buffer = input2.LockedBuffer();
+    auto* output_buffer = output.Buffer();
+    const size_t size = input1.Height() * input1.Width();
+#pragma omp parallel for
+    for (size_t i = 0; i < size; ++i) {
+      BinaryOperator op;
+      output_buffer[i] = op(input1_buffer[i], input2_buffer[i]);
+    }
+  } else {
+    auto const width = input1.Width();
+    auto const height = input1.Height();
+#pragma omp parallel for collapse(2)
+    for (El::Int col = 0; col < width; ++col) {
+      for (El::Int row = 0; row < height; ++row) {
+        BinaryOperator op;
+        output(row, col) = op(input1(row, col), input2(row, col));
+      }
+    }
+  }
+
+}
+
+/** Apply an entry-wise unary operator to CPU data.
+ *  The input and output data must be on CPU, have the same
+ *  dimensions, and be aligned.
+ */
+template <typename UnaryOperator>
+void apply_entrywise_unary_operator(const AbsDistMat& input,
+                                    AbsDistMat& output) {
+  std::stringstream err;
+  if (input.Height() != output.Height()
+      || input.Width() != output.Width()) {
+    err << "input matrix dimensions "
+        << "(" << input.Height() << " x " << input.Width() << ") "
+        << "don't match output matrix dimensions "
+        << "(" << output.Height() << " x " << output.Width() << ")";
+    LBANN_ERROR(err.str());
+  } else if (input.DistData() != output.DistData()) {
+    LBANN_ERROR("input and output matrix distributions don't match");
+  }
+  apply_entrywise_unary_operator<UnaryOperator>(input.LockedMatrix(),
+                                                output.Matrix());
+}
+
+/** Apply an entry-wise binary operator to CPU data.
+ *  The input and output data must be on CPU, have the same
+ *  dimensions, and be aligned.
+ */
+template <typename BinaryOperator>
+void apply_entrywise_binary_operator(const AbsDistMat& input1,
+                                     const AbsDistMat& input2,
+                                     AbsDistMat& output) {
+  if (input1.Height() != input2.Height()
+      || input1.Width() != input2.Width()
+      || input1.Height() != output.Height()
+      || input1.Width() != output.Width()) {
+    std::stringstream err;
+    err << "input matrix dimensions "
+        << "(" << input1.Height() << " x " << input1.Width() << ", "
+        << input2.Height() << " x " << input2.Width() << ") "
+        << "don't match output matrix dimensions "
+        << "(" << output.Height() << " x " << output.Width() << ")";
+    LBANN_ERROR(err.str());
+  } else if (input1.DistData() != input2.DistData()
+             || input1.DistData() != output.DistData()) {
+    LBANN_ERROR("input and output matrix distributions don't match");
+  }
+  apply_entrywise_binary_operator<BinaryOperator>(input1.LockedMatrix(),
+                                                  input2.LockedMatrix(),
+                                                  output.Matrix());
+}
+
+} // namespace lbann
+
+#endif // LBANN_UTILS_ENTRYWISE_OPERATOR_HPP
diff --git a/include/lbann/utils/impl/CMakeLists.txt b/include/lbann/utils/impl/CMakeLists.txt
new file mode 100644
index 00000000000..9a49a59b125
--- /dev/null
+++ b/include/lbann/utils/impl/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Add the headers for this directory
+set_full_path(THIS_DIR_HEADERS
+  cuda.hpp
+  )
+
+# Propagate the files up the tree
+set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE)
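Splitting the member definitions into `impl/cuda.hpp` (included at the bottom of `cuda.hpp`, as above) keeps the declaration header light while still providing the template definitions to every translation unit. The same pattern in miniature, with hypothetical file names:

```
// widget.hpp -- declarations only
template <typename T>
struct widget {
  T twice(T x);
};
#include "widget_impl.hpp"  // definitions pulled in at the end

// widget_impl.hpp -- out-of-line template definitions
template <typename T>
T widget<T>::twice(T x) { return x + x; }
```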
diff --git a/include/lbann/utils/impl/cuda.hpp b/include/lbann/utils/impl/cuda.hpp
new file mode 100644
index 00000000000..1da8f24a1bc
--- /dev/null
+++ b/include/lbann/utils/impl/cuda.hpp
@@ -0,0 +1,436 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <limits>
+#ifdef __CUDACC__
+#include <cuda_fp16.h>
+#include <math_constants.h>
+#endif // __CUDACC__
+
+namespace lbann {
+namespace cuda {
+
+// -------------------------------------------------------------
+// Device functions
+// -------------------------------------------------------------
+#ifdef __CUDACC__
+
+// Atomic add function
+#if __CUDA_ARCH__ >= 530
+template <> __device__ __forceinline__
+__half atomic_add<__half>(__half* address, __half val) {
+#if 0 // TODO: replace this once Nvidia implements atomicAdd for __half
+  return atomicAdd(address, val);
+#else
+  unsigned int* address_as_uint = (unsigned int*) address;
+  unsigned int old = *address_as_uint;
+  __half* old_as_half = (__half*) &old;
+  unsigned int assumed;
+  unsigned int updated;
+  __half* updated_as_half = (__half*) &updated;
+  do {
+    assumed = old;
+    updated = old;
+    *updated_as_half += val;
+    old = atomicCAS(address_as_uint, assumed, updated);
+  } while (assumed != old);
+  return *old_as_half;
+#endif // 0
+}
+#endif // __CUDA_ARCH__ >= 530
+template <> __device__ __forceinline__
+float atomic_add(float* address, float val) {
+  return atomicAdd(address, val);
+}
+template <> __device__ __forceinline__
+double atomic_add(double* address, double val) {
+#if __CUDA_ARCH__ >= 600
+  return atomicAdd(address, val);
+#else
+  unsigned long long int* address_as_ull =
+    (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val +
+                                         __longlong_as_double(assumed)));
+  } while (assumed != old);
+  return __longlong_as_double(old);
+#endif // __CUDA_ARCH__ < 600
+}
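The pre-sm_60 double path emulates `atomicAdd` with a compare-and-swap loop: read the current value, compute the sum, and publish it only if no other thread changed the location in the meantime. The same idea in portable host C++ (an illustrative analogue, not part of the patch):

```
#include <atomic>

double cas_add(std::atomic<double>& x, double val) {
  double old = x.load();
  // On failure, compare_exchange_weak refreshes `old` with the current
  // value, so the loop retries with fresh data until it wins the race.
  while (!x.compare_exchange_weak(old, old + val)) {}
  return old;  // value before the add, matching atomicAdd semantics
}
```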
+
+// Unary math functions
+#define WRAP_UNARY_CUDA_MATH_FUNCTION(func)                     \
+  template <> __device__ __forceinline__                        \
+  float func(const float& x) { return ::func##f(x); }           \
+  template <> __device__ __forceinline__                        \
+  double func(const double& x) { return ::func(x); }
+template <typename T> __device__ __forceinline__
+T abs(const T& x) { return x >= static_cast<T>(0) ? x : -x; }
+template <> __device__ __forceinline__
+float abs(const float& x) { return ::fabsf(x); }
+template <> __device__ __forceinline__
+double abs(const double& x) { return ::fabs(x); }
+WRAP_UNARY_CUDA_MATH_FUNCTION(round)
+WRAP_UNARY_CUDA_MATH_FUNCTION(ceil)
+WRAP_UNARY_CUDA_MATH_FUNCTION(floor)
+WRAP_UNARY_CUDA_MATH_FUNCTION(sqrt)
+WRAP_UNARY_CUDA_MATH_FUNCTION(rsqrt)
+WRAP_UNARY_CUDA_MATH_FUNCTION(exp)
+WRAP_UNARY_CUDA_MATH_FUNCTION(expm1)
+WRAP_UNARY_CUDA_MATH_FUNCTION(log)
+WRAP_UNARY_CUDA_MATH_FUNCTION(log1p)
+WRAP_UNARY_CUDA_MATH_FUNCTION(cos)
+WRAP_UNARY_CUDA_MATH_FUNCTION(sin)
+WRAP_UNARY_CUDA_MATH_FUNCTION(tan)
+WRAP_UNARY_CUDA_MATH_FUNCTION(acos)
+WRAP_UNARY_CUDA_MATH_FUNCTION(asin)
+WRAP_UNARY_CUDA_MATH_FUNCTION(atan)
+WRAP_UNARY_CUDA_MATH_FUNCTION(cosh)
+WRAP_UNARY_CUDA_MATH_FUNCTION(sinh)
+WRAP_UNARY_CUDA_MATH_FUNCTION(tanh)
+WRAP_UNARY_CUDA_MATH_FUNCTION(acosh)
+WRAP_UNARY_CUDA_MATH_FUNCTION(asinh)
+WRAP_UNARY_CUDA_MATH_FUNCTION(atanh)
+#undef WRAP_UNARY_CUDA_MATH_FUNCTION
+
+// Binary math functions
+#define WRAP_BINARY_CUDA_MATH_FUNCTION(func)                    \
+  template <> __device__ __forceinline__                        \
+  float func(const float& x, const float& y) {                  \
+    return ::func##f(x,y);                                      \
+  }                                                             \
+  template <> __device__ __forceinline__                        \
+  double func(const double& x, const double& y) {               \
+    return ::func(x,y);                                         \
+  }
+template <typename T> __device__ __forceinline__
+T min(const T& x, const T& y) { return y < x ? y : x; }
+template <> __device__ __forceinline__
+float min(const float& x, const float& y) { return ::fminf(x,y); }
+template <> __device__ __forceinline__
+double min(const double& x, const double& y) { return ::fmin(x,y); }
+template <typename T> __device__ __forceinline__
+T max(const T& x, const T& y) { return y > x ? y : x; }
+template <> __device__ __forceinline__
+float max(const float& x, const float& y) { return ::fmaxf(x,y); }
+template <> __device__ __forceinline__
+double max(const double& x, const double& y) { return ::fmax(x,y); }
+template <typename T> __device__ __forceinline__
+T mod(const T& x, const T& y) { return x % y; }
+template <> __device__ __forceinline__
+float mod(const float& x, const float& y) { return ::fmodf(x,y); }
+template <> __device__ __forceinline__
+double mod(const double& x, const double& y) { return ::fmod(x,y); }
+WRAP_BINARY_CUDA_MATH_FUNCTION(pow)
+#undef WRAP_BINARY_CUDA_MATH_FUNCTION
+
+// Numeric limits
+#ifdef __CUDACC_RELAXED_CONSTEXPR__
+template <typename T> constexpr __device__ __forceinline__ T min() {
+  return std::numeric_limits<T>::min();
+}
+template <typename T> constexpr __device__ __forceinline__ T max() {
+  return std::numeric_limits<T>::max();
+}
+template <typename T> constexpr __device__ __forceinline__ T epsilon() {
+  return std::numeric_limits<T>::epsilon();
+}
+template <typename T> __device__ __forceinline__ T infinity() {
+  return std::numeric_limits<T>::infinity();
+}
+#else // __CUDACC_RELAXED_CONSTEXPR__
+#define SPECIFIERS template <> __device__ __forceinline__
+SPECIFIERS constexpr float min<float>()                 { return FLT_MIN;   }
+SPECIFIERS constexpr double min<double>()               { return DBL_MIN;   }
+SPECIFIERS constexpr int min<int>()                     { return INT_MIN;   }
+SPECIFIERS constexpr long int min<long int>()           { return LONG_MIN;  }
+SPECIFIERS constexpr long long int min<long long int>() { return LLONG_MIN; }
+SPECIFIERS constexpr float max<float>()                 { return FLT_MAX;   }
+SPECIFIERS constexpr double max<double>()               { return DBL_MAX;   }
+SPECIFIERS constexpr int max<int>()                     { return INT_MAX;   }
+SPECIFIERS constexpr long int max<long int>()           { return LONG_MAX;  }
+SPECIFIERS constexpr long long int max<long long int>() { return LLONG_MAX; }
+SPECIFIERS constexpr float epsilon<float>()             { return FLT_EPSILON; }
+SPECIFIERS constexpr double epsilon<double>()           { return DBL_EPSILON; }
+SPECIFIERS float infinity<float>()                      { return CUDART_INF_F; }
+SPECIFIERS double infinity<double>()                    { return CUDART_INF;   }
+#undef SPECIFIERS
+#endif // __CUDACC_RELAXED_CONSTEXPR__
+
+#endif // __CUDACC__
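Together these wrappers let device code stay type-generic across float and double. A toy kernel (illustrative only, not part of the patch) that accumulates exp(x) across threads with the type-dispatched atomic:

```
template <typename T>
__global__ void sum_exp_kernel(const T* __restrict__ x, int n, T* sum) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) {
    // exp() and atomic_add() resolve to the right precision automatically.
    lbann::cuda::atomic_add(sum, lbann::cuda::exp(x[i]));
  }
}
```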
+
+// -------------------------------------------------------------
+// Helper functions for entrywise operations
+// -------------------------------------------------------------
+#ifdef __CUDACC__
+
+/** CUDA kernel to apply an entry-wise unary operator. */
+template <typename UnaryOperator>
+__global__
+void entrywise_unary_operator_kernel(El::Int height, El::Int width,
+                                     const DataType* __restrict__ input,
+                                     El::Int input_ldim,
+                                     DataType* __restrict__ output,
+                                     El::Int output_ldim) {
+  const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x;
+  const El::Int size = height * width;
+  const El::Int num_threads = blockDim.x * gridDim.x;
+  UnaryOperator op;
+  for (El::Int pos = gid; pos < size; pos += num_threads) {
+    const auto& row = pos % height;
+    const auto& col = pos / height;
+    const auto& x = input[row + col * input_ldim];
+    auto& y = output[row + col * output_ldim];
+    y = op(x);
+  }
+}
+
+/** CUDA kernel to apply an entry-wise binary operator. */
+template <typename BinaryOperator>
+__global__
+void entrywise_binary_operator_kernel(El::Int height, El::Int width,
+                                      const DataType* __restrict__ input1,
+                                      El::Int input1_ldim,
+                                      const DataType* __restrict__ input2,
+                                      El::Int input2_ldim,
+                                      DataType* __restrict__ output,
+                                      El::Int output_ldim) {
+  const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x;
+  const El::Int size = height * width;
+  const El::Int num_threads = blockDim.x * gridDim.x;
+  BinaryOperator op;
+  for (El::Int pos = gid; pos < size; pos += num_threads) {
+    const auto& row = pos % height;
+    const auto& col = pos / height;
+    const auto& x1 = input1[row + col * input1_ldim];
+    const auto& x2 = input2[row + col * input2_ldim];
+    auto& y = output[row + col * output_ldim];
+    y = op(x1, x2);
+  }
+}
+
+/** Apply an entry-wise unary operator to GPU data.
+ *  The input and output data must be on GPU and must have the same
+ *  dimensions.
+ */
+template <typename UnaryOperator>
+void apply_entrywise_unary_operator(const AbsMat& input,
+                                    AbsMat& output) {
+
+  // Check that input and output are valid
+  std::stringstream err;
+  if (input.GetDevice() != El::Device::GPU) {
+    LBANN_ERROR("input is not on GPU");
+  } else if (output.GetDevice() != El::Device::GPU) {
+    LBANN_ERROR("output is not on GPU");
+  } else if (input.Height() != output.Height()
+             || input.Width() != output.Width()) {
+    err << "input matrix dimensions "
+        << "(" << input.Height() << " x " << input.Width() << ") "
+        << "don't match output matrix dimensions "
+        << "(" << output.Height() << " x " << output.Width() << ")";
+    LBANN_ERROR(err.str());
+  }
+
+  // Get CUDA grid dimensions
+  // Note: Maximum CUDA grid dimension is 2^32-1
+  // (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications).
+  const El::Int height = input.Height();
+  const El::Int width = input.Width();
+  const El::Int block_dim = 256;
+  El::Int grid_dim = (height * width + block_dim - 1) / block_dim;
+  if (sizeof(El::Int) > sizeof(unsigned int)
+      && grid_dim > std::numeric_limits<uint32_t>::max()) {
+    grid_dim = std::numeric_limits<uint32_t>::max();
+  }
+
+  // Launch CUDA kernel
+  if (grid_dim > 0) {
+    CHECK_CUDA(cudaSetDevice(El::GPUManager::Device()));
+    entrywise_unary_operator_kernel<UnaryOperator>
+      <<<grid_dim, block_dim, 0, El::GPUManager::Stream()>>>(
+        height, width, input.LockedBuffer(), input.LDim(),
+        output.Buffer(), output.LDim());
+  }
+
+}
+
+/** Apply an entry-wise binary operator to GPU data.
+ *  The input and output data must be on GPU and must have the same
+ *  dimensions.
+ */
+template <typename BinaryOperator>
+void apply_entrywise_binary_operator(const AbsMat& input1,
+                                     const AbsMat& input2,
+                                     AbsMat& output) {
+
+  // Check that input and output are valid
+  std::stringstream err;
+  if (input1.GetDevice() != El::Device::GPU
+      || input2.GetDevice() != El::Device::GPU) {
+    LBANN_ERROR("input is not on GPU");
+  } else if (output.GetDevice() != El::Device::GPU) {
+    LBANN_ERROR("output is not on GPU");
+  } else if (input1.Height() != input2.Height()
+             || input1.Width() != input2.Width()
+             || input1.Height() != output.Height()
+             || input1.Width() != output.Width()) {
+    err << "input matrix dimensions "
+        << "(" << input1.Height() << " x " << input1.Width() << ", "
+        << input2.Height() << " x " << input2.Width() << ") "
+        << "don't match output matrix dimensions "
+        << "(" << output.Height() << " x " << output.Width() << ")";
+    LBANN_ERROR(err.str());
+  }
+
+  // Get CUDA grid dimensions
+  // Note: Maximum CUDA grid dimension is 2^32-1
+  // (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications).
+  const El::Int height = input1.Height();
+  const El::Int width = input1.Width();
+  const El::Int block_dim = 256;
+  El::Int grid_dim = (height * width + block_dim - 1) / block_dim;
+  if (sizeof(El::Int) > sizeof(unsigned int)
+      && grid_dim > std::numeric_limits<uint32_t>::max()) {
+    grid_dim = std::numeric_limits<uint32_t>::max();
+  }
+
+  // Launch CUDA kernel
+  if (grid_dim > 0) {
+    CHECK_CUDA(cudaSetDevice(El::GPUManager::Device()));
+    entrywise_binary_operator_kernel<BinaryOperator>
+      <<<grid_dim, block_dim, 0, El::GPUManager::Stream()>>>(
+        height, width,
+        input1.LockedBuffer(), input1.LDim(),
+        input2.LockedBuffer(), input2.LDim(),
+        output.Buffer(), output.LDim());
+  }
+
+}
+
+/** Apply an entry-wise unary operator to GPU data.
+ *  The input and output data must be on GPU, have the same
+ *  dimensions, and be aligned.
+ */
+template <typename UnaryOperator>
+void apply_entrywise_unary_operator(const AbsDistMat& input,
+                                    AbsDistMat& output) {
+  std::stringstream err;
+  if (input.Height() != output.Height()
+      || input.Width() != output.Width()) {
+    err << "input matrix dimensions "
+        << "(" << input.Height() << " x " << input.Width() << ") "
+        << "don't match output matrix dimensions "
+        << "(" << output.Height() << " x " << output.Width() << ")";
+    LBANN_ERROR(err.str());
+  } else if (input.DistData() != output.DistData()) {
+    LBANN_ERROR("input and output matrix distributions don't match");
+  }
+  apply_entrywise_unary_operator<UnaryOperator>(input.LockedMatrix(),
+                                                output.Matrix());
+}
+
+/** Apply an entry-wise binary operator to GPU data.
+ *  The input and output data must be on GPU, have the same
+ *  dimensions, and be aligned.
+ */
+template <typename BinaryOperator>
+void apply_entrywise_binary_operator(const AbsDistMat& input1,
+                                     const AbsDistMat& input2,
+                                     AbsDistMat& output) {
+  if (input1.Height() != input2.Height()
+      || input1.Width() != input2.Width()
+      || input1.Height() != output.Height()
+      || input1.Width() != output.Width()) {
+    std::stringstream err;
+    err << "input matrix dimensions "
+        << "(" << input1.Height() << " x " << input1.Width() << ", "
+        << input2.Height() << " x " << input2.Width() << ") "
+        << "don't match output matrix dimensions "
+        << "(" << output.Height() << " x " << output.Width() << ")";
+    LBANN_ERROR(err.str());
+  } else if (input1.DistData() != input2.DistData()
+             || input1.DistData() != output.DistData()) {
+    LBANN_ERROR("input and output matrix distributions don't match");
+  }
+  apply_entrywise_binary_operator<BinaryOperator>(input1.LockedMatrix(),
+                                                  input2.LockedMatrix(),
+                                                  output.Matrix());
+}
+
+#endif // __CUDACC__
+
+// -------------------------------------------------------------
+// Utilities for Thrust
+// -------------------------------------------------------------
+namespace thrust {
+
+template <typename T>
+allocator<T>::allocator(cudaStream_t stream)
+  : m_stream(stream),
+    m_system(stream) {}
+
+template <typename T>
+typename allocator<T>::pointer allocator<T>::allocate(
+  typename allocator<T>::size_type size) {
+  value_type* buffer = nullptr;
+  if (size > 0) {
+#ifdef HYDROGEN_HAVE_CUB
+    auto& memory_pool = El::cub::MemoryPool();
+    CHECK_CUDA(memory_pool.DeviceAllocate(reinterpret_cast<void**>(&buffer),
+                                          size * sizeof(value_type),
+                                          m_stream));
+#else
+    CHECK_CUDA(cudaMalloc(&buffer, size * sizeof(value_type)));
+#endif // HYDROGEN_HAVE_CUB
+  }
+  return pointer(buffer);
+}
+
+template <typename T>
+void allocator<T>::deallocate(typename allocator<T>::pointer buffer,
+                              typename allocator<T>::size_type size) {
+  auto&& ptr = buffer.get();
+  if (ptr != nullptr) {
+#ifdef HYDROGEN_HAVE_CUB
+    auto& memory_pool = El::cub::MemoryPool();
+    CHECK_CUDA(memory_pool.DeviceFree(ptr));
+#else
+    CHECK_CUDA(cudaFree(ptr));
+#endif // HYDROGEN_HAVE_CUB
+  }
+}
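With allocation routed through Hydrogen's CUB memory pool when available, the allocator can also be used directly to manage stream-associated device buffers; a minimal usage sketch (assumes `stream` is a valid `cudaStream_t`):

```
lbann::cuda::thrust::allocator<float> alloc(stream);
auto buf = alloc.allocate(1024);   // Thrust pointer; buf.get() is the raw pointer
// ... enqueue kernels on `stream` that read/write buf.get() ...
alloc.deallocate(buf);
```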
+
+template <typename T>
+typename allocator<T>::system_type& allocator<T>::system() {
+  return m_system;
+}
+
+} // namespace thrust
+
+} // namespace cuda
+} // namespace lbann
diff --git a/include/lbann/layers/activations/atan.hpp b/include/lbann/utils/omp_pragma.hpp
similarity index 58%
rename from include/lbann/layers/activations/atan.hpp
rename to include/lbann/utils/omp_pragma.hpp
index 0c0c803ae13..efb1f27ae1a 100644
--- a/include/lbann/layers/activations/atan.hpp
+++ b/include/lbann/utils/omp_pragma.hpp
@@ -24,32 +24,24 @@
 // permissions and limitations under the license.
 ////////////////////////////////////////////////////////////////////////////////

-#ifndef ATAN_HPP_INCLUDED
-#define ATAN_HPP_INCLUDED
+#ifndef LBANN_OMP_PRAGMA_HPP
+#define LBANN_OMP_PRAGMA_HPP

-#include "lbann/layers/activations/activation.hpp"
+#include "lbann_config.hpp"
+#include <omp.h>

-namespace lbann {
+#define OMP_PARALLEL _Pragma("omp parallel for")
+#define OMP_CRITICAL _Pragma("omp critical")

-/** Arctangent activation function. */
-template <data_layout T_layout, El::Device Dev>
-class atan_layer : public entrywise_activation_layer {
- public:
-  atan_layer(lbann_comm *comm) : entrywise_activation_layer(comm) {}
-  atan_layer* copy() const override { return new atan_layer(*this); }
-  std::string get_type() const override { return "atan"; }
-  data_layout get_data_layout() const override { return T_layout; }
-  El::Device get_device_allocation() const override { return Dev; }
+#if defined(LBANN_NO_OMP_FOR_DATA_READERS)
+  #pragma message "Disable OpenMP parallelism for data fetch loops"
+  #define LBANN_DATA_FETCH_OMP_FOR for
+  #define LBANN_OMP_THREAD_NUM 0
+  #define LBANN_DATA_FETCH_OMP_CRITICAL
+#else
+  #define LBANN_DATA_FETCH_OMP_FOR OMP_PARALLEL for
+  #define LBANN_OMP_THREAD_NUM omp_get_thread_num()
+  #define LBANN_DATA_FETCH_OMP_CRITICAL OMP_CRITICAL
+#endif

- protected:
-  DataType activation(DataType x) const override {
-    return std::atan(x);
-  }
-  DataType activation_derivative(DataType x) const override {
-    return 1 / (DataType(1) + x * x);
-  }
-};
-
-} // namespace lbann
-
-#endif // ATAN_HPP_INCLUDED
+#endif // LBANN_OMP_PRAGMA_HPP
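The data-fetch macros turn OpenMP parallelism in reader loops into a configure-time switch (`LBANN_NO_OMP_FOR_DATA_READERS`); a sketch of the intended call site (the loop body is illustrative):

```
#include "lbann/utils/omp_pragma.hpp"

void fetch_minibatch(int num_samples) {
  // Expands to `#pragma omp parallel for` + `for` in the default build,
  // or to a plain serial `for` when OpenMP data fetch is disabled.
  LBANN_DATA_FETCH_OMP_FOR (int i = 0; i < num_samples; ++i) {
    const int tid = LBANN_OMP_THREAD_NUM;  // 0 in the serial build
    // ... fetch sample i into the thread-local buffer for `tid` ...
    (void) tid;
  }
}
```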
diff --git a/include/lbann/weights/initializer.hpp b/include/lbann/weights/initializer.hpp
index 6269de5a1cb..00c19c25da8 100644
--- a/include/lbann/weights/initializer.hpp
+++ b/include/lbann/weights/initializer.hpp
@@ -45,12 +45,13 @@ class weights_initializer {

 };

-/** Constant weights initializer. */
+/** Constant weights initializer.
+ *  All weight values are set equal to a user-provided value.
+ */
 class constant_initializer : public weights_initializer {
 public:
   constant_initializer(DataType value)
     : weights_initializer(), m_value(value) {}
-  ~constant_initializer() override = default;
   constant_initializer* copy() const override {
     return new constant_initializer(*this);
   }
@@ -62,13 +63,33 @@ class constant_initializer : public weights_initializer {

 };

-/** Uniform random weights initializer. */
+/** Weights initializer by value.
+ *  Weight values are set equal to a user-provided list of values. The
+ *  number of weight entries must match the number of provided values.
+ */
+class value_initializer : public weights_initializer {
+public:
+  value_initializer(std::vector<DataType> values)
+    : weights_initializer(), m_values(std::move(values)) {}
+  value_initializer* copy() const override {
+    return new value_initializer(*this);
+  }
+  void fill(AbsDistMat& matrix) override;
+
+private:
+  /** Initializer values. */
+  std::vector<DataType> m_values;
+
+};
+
+/** Uniform random weights initializer.
+ *  Weight values are drawn i.i.d. from a uniform distribution.
+ */
 class uniform_initializer : public weights_initializer {
 public:
   uniform_initializer(DataType min = DataType(0),
                       DataType max = DataType(1))
     : weights_initializer(), m_min(min), m_max(max) {}
-  ~uniform_initializer() override = default;
   uniform_initializer* copy() const override {
     return new uniform_initializer(*this);
   }
@@ -82,7 +103,9 @@ class uniform_initializer : public weights_initializer {

 };

-/** Normal random weights initializer. */
+/** Normal random weights initializer.
+ *  Weight values are drawn i.i.d. from a normal distribution.
+ */
 class normal_initializer : public weights_initializer {
 public:
   normal_initializer(DataType mean = DataType(0),
@@ -90,7 +113,6 @@ class normal_initializer : public weights_initializer {
     : weights_initializer(),
       m_mean(mean),
       m_standard_deviation(standard_deviation) {}
-  ~normal_initializer() override = default;
   normal_initializer* copy() const override {
     return new normal_initializer(*this);
   }
diff --git a/model_zoo/data_readers/data_reader_candle_pilot1_combo.prototext b/model_zoo/data_readers/data_reader_candle_pilot1_combo.prototext
index af38fe66ed5..2e87db717f8 100644
--- a/model_zoo/data_readers/data_reader_candle_pilot1_combo.prototext
+++ b/model_zoo/data_readers/data_reader_candle_pilot1_combo.prototext
@@ -4,9 +4,9 @@ data_reader {
     format: "csv"
     role: "train"
     shuffle: true
-    data_file_pattern: "/p/lscratchf/brainusr/datasets/cancer/pilot1/lm_genes/combo_x_train_lmg.*.txt"
-    label_filename: "/p/lscratchf/brainusr/datasets/cancer/pilot1/lm_genes/combo_y_train_lmg.txt"
-    validation_percent: 0
+    data_file_pattern: "/p/lscratchh/brainusr/datasets/cancer/pilot1/lm_genes/combo_x_train_lmg.*.txt"
+    label_filename: "/p/lscratchh/brainusr/datasets/cancer/pilot1/lm_genes/combo_y_train_lmg.txt"
+    validation_percent: 0.2
     absolute_sample_count: 0
     percent_of_data_to_use: 1.0
     separator: ','
@@ -18,8 +18,8 @@ data_reader {
     format: "csv"
     role: "test"
     shuffle: true
-    data_file_pattern: "/p/lscratchf/brainusr/datasets/cancer/pilot1/lm_genes/combo_x_val_lmg.*.txt"
-    label_filename: "/p/lscratchf/brainusr/datasets/cancer/pilot1/lm_genes/combo_y_val_lmg.txt"
+    data_file_pattern: "/p/lscratchh/brainusr/datasets/cancer/pilot1/lm_genes/combo_x_val_lmg.*.txt"
+    label_filename: "/p/lscratchh/brainusr/datasets/cancer/pilot1/lm_genes/combo_y_val_lmg.txt"
     absolute_sample_count: 0
     percent_of_data_to_use: 1.0
     separator: ','
diff --git a/model_zoo/data_readers/data_reader_candle_pilot1_gdc.prototext b/model_zoo/data_readers/data_reader_candle_pilot1_gdc.prototext
index ff586a2815a..5ed4a330c01 100644
--- a/model_zoo/data_readers/data_reader_candle_pilot1_gdc.prototext
+++ b/model_zoo/data_readers/data_reader_candle_pilot1_gdc.prototext
@@ -3,8 +3,8 @@ data_reader {
     name: "numpy"
     role: "train"
     shuffle: true
-    data_filename: "/p/lscratchf/brainusr/datasets/cancer/pilot1/GDCdata/X_train_norm.npy"
-    validation_percent: 0.1
+    data_filename: "/p/lscratchh/brainusr/datasets/cancer/pilot1/GDCdata/X_traindata_norm.npy"
+    validation_percent: 0
     absolute_sample_count: 0
     percent_of_data_to_use: 1.0
     disable_labels: true
@@ -12,4 +12,16 @@ data_reader {
   }
+  reader {
+    name: "numpy"
+    role: "test"
+    shuffle: true
+    data_filename: "/p/lscratchh/brainusr/datasets/cancer/pilot1/GDCdata/X_valdata_norm.npy"
+    validation_percent: 0
+    absolute_sample_count: 0
+    percent_of_data_to_use: 1.0
+    disable_labels: true
+    disable_responses: true
+
+  }
 }
diff --git a/model_zoo/data_readers/data_reader_moving_mnist.prototext b/model_zoo/data_readers/data_reader_moving_mnist.prototext
new file mode 100644
index 00000000000..91badb57727
--- /dev/null
+++ b/model_zoo/data_readers/data_reader_moving_mnist.prototext
@@ -0,0 +1,23 @@
+data_reader {
+  reader {
+    name: "moving_mnist"
+    role: "train"
+    shuffle: true
+    data_filedir: "/p/lscratchf/brainusr/datasets/MNIST"
+    data_filename: "train-images-idx3-ubyte"
+    label_filename: "train-labels-idx1-ubyte"
+    validation_percent: 0.1
+    absolute_sample_count: 0
+    percent_of_data_to_use: 1.0
+  }
+  reader {
+    name: "moving_mnist"
+    role: "test"
+    shuffle: true
+    data_filedir: "/p/lscratchf/brainusr/datasets/MNIST"
+    data_filename: "t10k-images-idx3-ubyte"
+    label_filename: "t10k-labels-idx1-ubyte"
+    absolute_sample_count: 0
+    percent_of_data_to_use: 1.0
+  }
+}
diff --git a/model_zoo/data_readers/data_reader_synthetic.prototext b/model_zoo/data_readers/data_reader_synthetic.prototext
index 537da9182cd..84da7ebad1e 100644
--- a/model_zoo/data_readers/data_reader_synthetic.prototext
+++ b/model_zoo/data_readers/data_reader_synthetic.prototext
@@ -3,10 +3,10 @@ data_reader {
     name: "synthetic"
     role: "train"
     shuffle: true
-    num_samples: 240000
-    synth_dimensions: "1024"
-    num_labels: 0
-    validation_percent: 0.1
+    num_samples: 4
+    synth_dimensions: "1"
+    num_labels: 1
+    validation_percent: 0.5
     absolute_sample_count: 0
     percent_of_data_to_use: 1.0
   }
@@ -14,9 +14,9 @@ data_reader {
     name: "synthetic"
     role: "test"
     shuffle: true
-    num_samples: 32000
-    synth_dimensions: "1024"
-    num_labels: 0
+    num_samples: 32
+    synth_dimensions: "1"
+    num_labels: 1
     absolute_sample_count: 0
     percent_of_data_to_use: 1.0
   }
diff --git a/model_zoo/lbann.cpp b/model_zoo/lbann.cpp
index 77847ffc887..edb887f7cd1 100644
--- a/model_zoo/lbann.cpp
+++ b/model_zoo/lbann.cpp
@@ -331,11 +331,16 @@ int main(int argc, char *argv[]) {
       e.print_report(fs);
     }
     El::ReportException(e);
+    finalize(comm);
+    return EXIT_FAILURE;
   } catch (std::exception& e) {
     El::ReportException(e);
+    finalize(comm);
+    return EXIT_FAILURE;
   }

-  // free all resources by El and MPI
+  // Clean up
   finalize(comm);
-  return 0;
+  return EXIT_SUCCESS;
+
 }
diff --git a/model_zoo/lbann2.cpp b/model_zoo/lbann2.cpp
index ccb9a33a832..4cd49265799 100644
--- a/model_zoo/lbann2.cpp
+++ b/model_zoo/lbann2.cpp
@@ -119,11 +119,14 @@ int main(int argc, char *argv[]) {
   } catch (std::exception& e) {
     El::ReportException(e);
+    finalize(comm);
+    return EXIT_FAILURE;
   }

-  // free all resources by El and MPI
+  // Clean up
   finalize(comm);
-  return 0;
+  return EXIT_SUCCESS;
+
 }

 model * build_model_from_prototext(int argc, char **argv,
@@ -361,7 +364,7 @@ bool load_model_weights(std::string ckpt_dir, model * m){
   closeread(fd, latest);
   if(temp_comm->am_model_master())
     sprintf(latest, "%s/shared.model.%d.epoch.%d.step.%d/", ckpt_dir.c_str(), temp_comm->get_model_rank(), epochLast, stepLast);
-  temp_comm->model_broadcast(0, &(latest[0]), sizeof(latest));
+  temp_comm->model_broadcast(0, &(latest[0]), sizeof(latest), El::SyncInfo<El::Device::CPU>{});
 }

 DIR *weight_dir;
diff --git a/model_zoo/lbann_cycgan.cpp b/model_zoo/lbann_cycgan.cpp
index 6bdb2403395..929ae6f470f 100644
--- a/model_zoo/lbann_cycgan.cpp
+++ b/model_zoo/lbann_cycgan.cpp
@@ -67,6 +67,8 @@ int main(int argc, char *argv[]) {
   model *model_1 = build_model_from_prototext(argc, argv, *(pbs[0]), comm, true); //D1 solver
+  // Hack: override the model name to make reporting easy. What can break?
+ model_1->set_name("dis_model"); model *model_2 = nullptr; //G1 solver model *model_3 = nullptr; //G2 solver @@ -77,21 +79,25 @@ int main(int argc, char *argv[]) { if (pbs.size() > 1) { model_2 = build_model_from_prototext(argc, argv, *(pbs[1]), comm, false); + model_2->set_name("fw_model"); } if (pbs.size() > 2) { model_3 = build_model_from_prototext(argc, argv, *(pbs[2]), comm, false); + model_3->set_name("inv_model"); } if (pbs.size() > 3) { ae_model = build_model_from_prototext(argc, argv, *(pbs[3]), comm, false); + ae_model->set_name("ae_model"); } if (pbs.size() > 4) { ae_cycgan_model = build_model_from_prototext(argc, argv, *(pbs[4]), comm, false); + ae_cycgan_model->set_name("ae_cycgan_model"); } const lbann_data::Model pb_model = pbs[0]->model(); @@ -174,11 +180,14 @@ int main(int argc, char *argv[]) { } catch (std::exception& e) { El::ReportException(e); + finalize(comm); + return EXIT_FAILURE; } - // free all resources by El and MPI + // Clean up finalize(comm); - return 0; + return EXIT_SUCCESS; + } model * build_model_from_prototext(int argc, char **argv, diff --git a/model_zoo/lbann_gan.cpp b/model_zoo/lbann_gan.cpp index fd1763cb228..254c7fe78a7 100644 --- a/model_zoo/lbann_gan.cpp +++ b/model_zoo/lbann_gan.cpp @@ -116,11 +116,14 @@ int main(int argc, char *argv[]) { } catch (std::exception& e) { El::ReportException(e); + finalize(comm); + return EXIT_FAILURE; } - // free all resources by El and MPI + // Clean up finalize(comm); - return 0; + return EXIT_SUCCESS; + } model * build_model_from_prototext(int argc, char **argv, lbann_data::LbannPB &pb) { diff --git a/model_zoo/models/alexnet/data.prototext b/model_zoo/models/alexnet/data.prototext deleted file mode 100644 index 4bb6459f922..00000000000 --- a/model_zoo/models/alexnet/data.prototext +++ /dev/null @@ -1,357 +0,0 @@ -# cmd line for original experiment: -# $ /usr/workspace/wsb/hysom/TEST10/lbann/model_zoo/models/alexnet/../../../build/catalyst.llnl.gov/model_zoo/lbann --model=model_alexnet.prototext --reader=../../data_readers/data_reader_imagenet.prototext --optimizer=../../optimizers/opt_adagrad.prototext -# -# Experiment conducted at: Tue Aug 29 08:34:08 2017 -# -# -# Experiment was run with lbann version: v0.93-rc0-906-gd84a104-dirty -# -# -# To rerun the experiment: -# $ srun -n12 /usr/workspace/wsb/hysom/TEST10/lbann/model_zoo/models/alexnet/../../../build/catalyst.llnl.gov/model_zoo/lbann --loadme=data.prototext -# -# -# Selected SLURM Environment Variables: -# HOST=catalyst321 -# SLURM_NODELIST=catalyst321 -# SLURM_NNODES=1 -# SLURM_NTASKS=12 -# SLURM_TASKS_PER_NODE=12 - -# -# -data_reader { - reader { - name: "imagenet" - role: "train" - shuffle: true - data_filedir: "/p/lscratchf/brainusr/datasets/ILSVRC2012/resized_256x256/train/" - data_filename: "/p/lscratchf/brainusr/datasets/ILSVRC2012/labels/train_c0-9.txt" - validation_percent: 0.1 - image_preprocessor { - scale: true - subtract_mean: true - unit_variance: true - } - } - reader { - name: "imagenet" - role: "test" - shuffle: true - data_filedir: "/p/lscratchf/brainusr/datasets/ILSVRC2012/resized_256x256/val/" - data_filename: "/p/lscratchf/brainusr/datasets/ILSVRC2012/labels/val_c0-9.txt" - validation_percent: 1 - image_preprocessor { - scale: true - subtract_mean: true - unit_variance: true - } - } -} -model { - name: "dnn" - objective_function: "cross_entropy" - num_epochs: 20 - metric { - categorical_accuracy { - } - } - metric { - top_k_categorical_accuracy { - top_k: 5 - } - } - data_layout: "data_parallel" - layer { - 
input_partitioned_minibatch { - } - index: 1 - parent: 1 - data_layout: "data_parallel" - } - layer { - convolution { - num_dims: 2 - has_vectors: true - num_output_channels: 96 - conv_dims: "11 11" - conv_pads: "0 0" - conv_strides: "4 4" - weight_initialization: "he_normal" - has_bias: true - l2_regularization_factor: 0.0005 - } - index: 2 - parent: 1 - data_layout: "data_parallel" - } - layer { - relu { - } - index: 3 - parent: 2 - data_layout: "data_parallel" - } - layer { - local_response_normalization { - window_width: 5 - lrn_alpha: 0.0001 - lrn_beta: 0.75 - lrn_k: 2 - } - index: 4 - parent: 3 - data_layout: "data_parallel" - } - layer { - pooling { - num_dims: 2 - has_vectors: true - pool_dims: "3 3" - pool_pads: "0 0" - pool_strides: "2 2" - pool_mode: "max" - } - index: 5 - parent: 4 - data_layout: "data_parallel" - } - layer { - convolution { - num_dims: 2 - has_vectors: true - num_output_channels: 256 - conv_dims: "5 5" - conv_pads: "2 2" - conv_strides: "1 1" - weight_initialization: "he_normal" - has_bias: true - l2_regularization_factor: 0.0005 - } - index: 6 - parent: 5 - data_layout: "data_parallel" - } - layer { - relu { - } - index: 7 - parent: 6 - data_layout: "data_parallel" - } - layer { - local_response_normalization { - window_width: 5 - lrn_alpha: 0.0001 - lrn_beta: 0.75 - lrn_k: 2 - } - index: 8 - parent: 7 - data_layout: "data_parallel" - } - layer { - pooling { - num_dims: 2 - has_vectors: true - pool_dims: "3 3" - pool_pads: "0 0" - pool_strides: "2 2" - pool_mode: "max" - } - index: 9 - parent: 8 - data_layout: "data_parallel" - } - layer { - convolution { - num_dims: 2 - has_vectors: true - num_output_channels: 384 - conv_dims: "3 3" - conv_pads: "1 1" - conv_strides: "1 1" - weight_initialization: "he_normal" - has_bias: true - l2_regularization_factor: 0.0005 - } - index: 10 - parent: 9 - data_layout: "data_parallel" - } - layer { - relu { - } - index: 11 - parent: 10 - data_layout: "data_parallel" - } - layer { - convolution { - num_dims: 2 - has_vectors: true - num_output_channels: 384 - conv_dims: "3 3" - conv_pads: "1 1" - conv_strides: "1 1" - weight_initialization: "he_normal" - has_bias: true - l2_regularization_factor: 0.0005 - } - index: 12 - parent: 11 - data_layout: "data_parallel" - } - layer { - relu { - } - index: 13 - parent: 12 - data_layout: "data_parallel" - } - layer { - convolution { - num_dims: 2 - has_vectors: true - num_output_channels: 256 - conv_dims: "3 3" - conv_pads: "1 1" - conv_strides: "1 1" - weight_initialization: "he_normal" - has_bias: true - l2_regularization_factor: 0.0005 - } - index: 14 - parent: 13 - data_layout: "data_parallel" - } - layer { - relu { - } - index: 15 - parent: 14 - data_layout: "data_parallel" - } - layer { - pooling { - num_dims: 2 - has_vectors: true - pool_dims: "3 3" - pool_pads: "0 0" - pool_strides: "2 2" - pool_mode: "max" - } - index: 16 - parent: 15 - data_layout: "data_parallel" - } - layer { - fully_connected { - num_neurons: 4096 - weight_initialization: "he_normal" - has_bias: true - l2_regularization_factor: 0.0005 - } - index: 17 - parent: 16 - data_layout: "model_parallel" - } - layer { - relu { - } - index: 18 - parent: 17 - data_layout: "model_parallel" - } - layer { - dropout { - keep_prob: 0.5 - } - index: 19 - parent: 18 - data_layout: "model_parallel" - } - layer { - fully_connected { - num_neurons: 4096 - weight_initialization: "he_normal" - has_bias: true - l2_regularization_factor: 0.0005 - } - index: 20 - parent: 19 - data_layout: "model_parallel" - } - layer { - relu { - } - 
index: 21 - parent: 20 - data_layout: "model_parallel" - } - layer { - dropout { - keep_prob: 0.5 - } - index: 22 - parent: 21 - data_layout: "model_parallel" - } - layer { - fully_connected { - num_neurons: 1000 - weight_initialization: "he_normal" - l2_regularization_factor: 0.0005 - } - index: 23 - parent: 22 - data_layout: "model_parallel" - } - layer { - index: 24 - parent: 23 - data_layout: "model_parallel" - softmax { - } - } - layer { - index: 25 - parent: 24 - data_layout: "data_parallel" - target_partitioned_minibatch { - shared_data_reader: true - } - } - mini_batch_size: 256 - callback { - imcomm { - intermodel_comm_method: "normal" - layers: "2 6 10 12 14 17 20 23" - summary_dir: "." - } - } - callback { - print { - interval: 1 - } - } - callback { - timer { - dir: "none" - } - } - callback { - summary { - dir: "." - interval: 1 - } - } - block_size: 256 - num_parallel_readers: 12 -} -optimizer { - adagrad { - learn_rate: 0.01 - eps: 1e-08 - } -} diff --git a/model_zoo/models/alexnet/model_alexnet.prototext b/model_zoo/models/alexnet/model_alexnet.prototext index 1eb1a75e1a5..ab2f6036e83 100644 --- a/model_zoo/models/alexnet/model_alexnet.prototext +++ b/model_zoo/models/alexnet/model_alexnet.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" data_layout: "data_parallel" mini_batch_size: 256 block_size: 256 @@ -321,14 +320,14 @@ model { cross_entropy {} } -layer { + layer { name: "top1_accuracy" parents: "prob labels" data_layout: "data_parallel" - top_k_categorical_accuracy { k: 1 } + categorical_accuracy {} } -layer { + layer { name: "top5_accuracy" parents: "prob labels" data_layout: "data_parallel" diff --git a/model_zoo/models/autoencoder_candle_pilot1/README.md b/model_zoo/models/autoencoder_candle_pilot1/README.md index 71c8a85427d..8c5223ea3f9 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/README.md +++ b/model_zoo/models/autoencoder_candle_pilot1/README.md @@ -6,22 +6,21 @@ CANcer Distributed Learning Enviroment ([CANDLE](http://candle.cels.anl.gov/)) i [Autoencoder](https://en.wikipedia.org/wiki/Autoencoder) is one of deep learning techniques being explored in the **CANDLE** team. In this blog, I will explain how to build autoencoder of interest to **CANDLE** project within LBANN framework. Examples in this blog were taken from Tensorflow version of similar deep learning network architecture provided by the **CANDLE** research team. ## Autoencoder in LBANN -A network architecture in LBANN is a collection of layers as a sequential list or graph. To build an autoencoder model in LBANN, the user simply describe how the layers are connected in a [model prototext file](https://github.com/LLNL/lbann/tree/develop/model_zoo/models/autoencoder_candle_pilot1), provide training optimization paratemers in the [optimizer prototext file](https://github.com/LLNL/lbann/tree/develop/model_zoo/optimizers), and input data (and labels in case of classification) in the [data reader prototext file ](https://github.com/LLNL/lbann/tree/develop/model_zoo/data_readers). The prototext files provide the flexibility for users to change a number of network and optimization hyperparameters at run time. For example, an LBANN fully connected (also known as linear or inner product in other deep learning toolkits) layer can be described as shown: +A network architecture in LBANN is a collection of layers as a directed acyclic graph. 
To build an autoencoder model in LBANN, the user simply describes how the layers are connected in a [model prototext file](https://github.com/LLNL/lbann/tree/develop/model_zoo/models/autoencoder_candle_pilot1), provides training optimization parameters in the [optimizer prototext file](https://github.com/LLNL/lbann/tree/develop/model_zoo/optimizers), and supplies input data (and labels in the case of classification) in the [data reader prototext file](https://github.com/LLNL/lbann/tree/develop/model_zoo/data_readers). The prototext files provide the flexibility for users to change a number of network and optimization hyperparameters at run time. For example, an LBANN fully connected (also known as linear or inner product in other deep learning toolkits) layer can be described as shown:

 ```
 layer {
-  index: 8
-  parent: 7
+  name: "8"
+  parents: "7"
   data_layout: "data_parallel"
   fully_connected {
     num_neurons: 5000
-    weight_initialization: "glorot_uniform"
     has_bias: true
   }
 }
 ```

-Most of the attributes are self descriptive and some of them can be changed "on-the-fly". For instance, the glorot_uniform weight initialization scheme can be replaced with other schemes such as uniform, normal, he_normal, he_uniform, glorot_normal and so on.
+Most of the attributes are self-descriptive and some of them can be changed "on-the-fly".
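Layers in the new format are wired together by name rather than by index; for instance, an activation consuming the layer above could be declared as follows (layer names are illustrative, not from this patch):

```
layer {
  name: "relu8"
  parents: "8"
  data_layout: "data_parallel"
  relu {}
}
```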

 ## Execute LBANN Autoencoder Example on LC
diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext
index 930746a5de9..5d130a230eb 100644
--- a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext
+++ b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext
@@ -2,7 +2,6 @@ model {
 ### Model description and network architecture taken from:
 ### https://lc.llnl.gov/bitbucket/projects/BIOM/repos/molresp/browse/tf_model.py?at=TensorFlow_chemClass
 ### This network description is anologous to AutoEncoder_Chem_ECFP
-  name: "sequential_model"
   data_layout: "model_parallel"
   mini_batch_size: 128
   block_size: 256
@@ -15,7 +14,7 @@ model {
 ###################################################

   objective_function {
-    mean_squared_error {}
+    layer_term { layer: "mean_squared_error" }
   }

 ###################################################
@@ -23,7 +22,10 @@ model {
 ###################################################

   metric {
-    pearson_correlation {}
+    layer_metric {
+      name: "Pearson correlation"
+      layer: "pearson_r"
+    }
   }

 ###################################################
@@ -38,18 +40,6 @@ model {
     timer {
     }
   }
-  # callback {
-  #   summary {
-  #     dir: "."
-  #     batch_interval: 1
-  #     mat_interval: 25
-  #   }
-  # }
-# callback {
-#   debug {
-#     phase: "train"
-#   }
-# }

 ###################################################
 # start of layers
 ###################################################

 #######
 # INPUT
 #######
 layer {
-  name: "data"
-  children: "encode1 reconstruction"
+  name: "input"
+  children: "data dummy"
   data_layout: "model_parallel"
   input {
     io_buffer: "distributed"
     target_mode: "reconstruction"
   }
 }
+  layer {
+    parents: "input"
+    name: "data"
+    data_layout: "model_parallel"
+    split {}
+  }
+  layer {
+    parents: "input"
+    name: "dummy"
+    data_layout: "model_parallel"
+    dummy {}
+  }

 #################
 # FULLY_CONNECTED encode1
 #################
 layer {
+  parents: "data"
   name: "encode1"
   data_layout: "model_parallel"
   fully_connected {
     num_neurons: 2000
-    weight_initialization: "glorot_uniform"
     has_bias: true
   }
 }
@@ -85,21 +87,21 @@ model {
 # RELU relu1
 ######
 layer {
+  parents: "encode1"
   name: "relu1"
   data_layout: "model_parallel"
-  relu {
-  }
+  relu {}
 }

 #################
 # FULLY_CONNECTED encode2
 #################
 layer {
+  parents: "relu1"
   name: "encode2"
   data_layout: "model_parallel"
   fully_connected {
     num_neurons: 1000
-    weight_initialization: "glorot_uniform"
     has_bias: true
   }
 }
@@ -108,21 +110,21 @@ model {
 # RELU relu2
 #######
 layer {
+  parents: "encode2"
   name: "relu2"
   data_layout: "model_parallel"
-  relu {
-  }
+  relu {}
 }

 #################
 # FULLY_CONNECTED encode3
 #################
 layer {
+  parents: "relu2"
   name: "encode3"
   data_layout: "model_parallel"
   fully_connected {
     num_neurons: 500
-    weight_initialization: "glorot_uniform"
     has_bias: true
   }
 }
@@ -131,21 +133,21 @@ model {
 # RELU relu3
 #######
 layer {
+  parents: "encode3"
   name: "relu3"
   data_layout: "model_parallel"
-  relu {
-  }
+  relu {}
 }

 #################
 # FULLY_CONNECTED encode4
 #################
 layer {
+  parents: "relu3"
   name: "encode4"
   data_layout: "model_parallel"
   fully_connected {
     num_neurons: 250
-    weight_initialization: "glorot_uniform"
     has_bias: true
   }
 }
@@ -154,21 +156,21 @@ model {
 # RELU relu4
 #######
 layer {
+  parents: "encode4"
   name: "relu4"
   data_layout: "model_parallel"
-  relu {
-  }
+  relu {}
 }

 #################
 # FULLY_CONNECTED encode5
 #################
 layer {
+  parents: "relu4"
   name: "encode5"
   data_layout: "model_parallel"
   fully_connected {
     num_neurons: 100
-    weight_initialization: "glorot_uniform"
     has_bias: true
   }
 }
@@ -177,16 +179,17 @@ model {
 # RELU relu5
 #######
 layer {
+  parents: "encode5"
   name: "relu5"
   data_layout: "model_parallel"
-  relu {
-  }
+  relu {}
 }

 #################
 # FULLY_CONNECTED decode5
 #################
 layer {
+  parents: "relu5"
   name: "decode5"
   data_layout: "model_parallel"
   fully_connected {
@@ -200,21 +203,21 @@ model {
 # RELU 6
 #######
 layer {
+  parents: "decode5"
   name: "relu6"
   data_layout: "model_parallel"
-  relu {
-  }
+  relu {}
 }

 #################
 # FULLY_CONNECTED decode4
 #################
 layer {
+  parents: "relu6"
   name: "decode4"
   data_layout: "model_parallel"
   fully_connected {
     num_neurons: 500
-    weight_initialization: "glorot_uniform"
     has_bias: true
   }
 }
@@ -224,21 +227,21 @@ model {
 # RELU relu7
 #######
 layer {
+  parents: "decode4"
   name: "relu7"
   data_layout: "model_parallel"
-  relu {
-  }
+  relu {}
 }

 #################
 # FULLY_CONNECTED decode3
 #################
 layer {
+  parents: "relu7"
   name: "decode3"
   data_layout: "model_parallel"
   fully_connected {
     num_neurons: 1000
-    weight_initialization: "glorot_uniform"
     has_bias: true
   }
 }
@@ -247,21 +250,21 @@ model {
 # RELU relu8
 #######
 layer {
+  parents: "decode3"
   name: "relu8"
   data_layout: "model_parallel"
-  relu {
-  }
+  relu
{} } ################# # FULLY_CONNECTED decode2 ################# layer { + parents: "relu8" name: "decode2" data_layout: "model_parallel" fully_connected { num_neurons: 2000 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -270,21 +273,21 @@ model { # RELU relu9 ####### layer { + parents: "decode2" name: "relu9" data_layout: "model_parallel" - relu { - } + relu {} } ################# # FULLY_CONNECTED decode1 ################# layer { + parents: "relu9" name: "decode1" data_layout: "model_parallel" num_neurons_from_data_reader: true fully_connected { - weight_initialization: "glorot_uniform" has_bias: true } } @@ -293,10 +296,10 @@ model { # RELU relu10 ####### layer { + parents: "decode1" name: "relu10" data_layout: "model_parallel" - relu { - } + relu {} } @@ -304,10 +307,57 @@ model { # RECONSTRUCTION ################# layer { + parents: "relu10" name: "reconstruction" - parents: "relu10 data" data_layout: "model_parallel" - reconstruction {} + split {} + } + layer { + parents: "reconstruction data" + name: "mean_squared_error" + data_layout: "model_parallel" + mean_squared_error {} + } + + ##################### + # PEARSON CORRELATION + ##################### + # rho(x,y) = covariance(x,y) / sqrt( variance(x) * variance(y) ) + layer { + parents: "reconstruction data" + name: "pearson_r_cov" + data_layout: "model_parallel" + covariance {} + } + layer { + parents: "data" + name: "pearson_r_var1" + data_layout: "model_parallel" + variance {} + } + layer { + parents: "reconstruction" + name: "pearson_r_var2" + data_layout: "model_parallel" + variance {} + } + layer { + parents: "pearson_r_var1 pearson_r_var2" + name: "pearson_r_mult" + data_layout: "model_parallel" + multiply {} + } + layer { + parents: "pearson_r_mult" + name: "pearson_r_sqrt" + data_layout: "model_parallel" + sqrt {} + } + layer { + parents: "pearson_r_cov pearson_r_sqrt" + name: "pearson_r" + data_layout: "model_parallel" + divide {} } ################################################### diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_200x150x100x100x100.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_200x150x100x100x100.prototext index fb004a8fff3..22747322326 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_200x150x100x100x100.prototext +++ b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_200x150x100x100x100.prototext @@ -2,7 +2,6 @@ model { ### Model description and network architecture taken from: ### https://lc.llnl.gov/bitbucket/projects/BIOM/repos/molresp/browse/tf_model.py?at=TensorFlow_chemClass ### This network description is anologous to AutoEncoder_Chem_ECFP - name: "sequential_model" data_layout: "model_parallel" mini_batch_size: 1024 block_size: 256 @@ -15,7 +14,7 @@ model { ################################################### objective_function { - mean_squared_error {} + layer_term { layer: "mean_squared_error" } } ################################################### @@ -23,7 +22,10 @@ model { ################################################### metric { - pearson_correlation {} + layer_metric { + name: "Pearson correlation" + layer: "pearson_r" + } } ################################################### @@ -38,18 +40,6 @@ model { timer { } } - # callback { - # summary { - # dir: "." 
- # batch_interval: 1 - # mat_interval: 25 - # } - # } -# callback { -# debug { -# phase: "train" -# } -# } ################################################### # start of layers @@ -59,24 +49,36 @@ model { # INPUT ####### layer { - name: "data" - children: "encode1 reconstruction" + name: "input" + children: "data dummy" data_layout: "model_parallel" input { io_buffer: "distributed" target_mode: "reconstruction" } } + layer { + parents: "input" + name: "data" + data_layout: "model_parallel" + split {} + } + layer { + parents: "input" + name: "dummy" + data_layout: "model_parallel" + dummy {} + } ################# # FULLY_CONNECTED encode1 ################# layer { + parents: "data" name: "encode1" data_layout: "model_parallel" fully_connected { num_neurons: 200 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -85,6 +87,7 @@ model { # RELU relu1 ###### layer { + parents: "encode1" name: "relu1" data_layout: "model_parallel" relu { @@ -95,11 +98,11 @@ model { # FULLY_CONNECTED encode2 ################# layer { + parents: "relu1" name: "encode2" data_layout: "model_parallel" fully_connected { num_neurons: 150 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -108,6 +111,7 @@ model { # RELU relu2 ####### layer { + parents: "encode2" name: "relu2" data_layout: "model_parallel" relu { @@ -118,11 +122,11 @@ model { # FULLY_CONNECTED encode3 ################# layer { + parents: "relu2" name: "encode3" data_layout: "model_parallel" fully_connected { num_neurons: 100 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -131,6 +135,7 @@ model { # RELU relu3 ####### layer { + parents: "encode3" name: "relu3" data_layout: "model_parallel" relu { @@ -141,11 +146,11 @@ model { # FULLY_CONNECTED encode4 ################# layer { + parents: "relu3" name: "encode4" data_layout: "model_parallel" fully_connected { num_neurons: 100 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -154,6 +159,7 @@ model { # RELU relu4 ####### layer { + parents: "encode4" name: "relu4" data_layout: "model_parallel" relu { @@ -164,11 +170,11 @@ model { # FULLY_CONNECTED encode5 ################# layer { + parents: "relu4" name: "encode5" data_layout: "model_parallel" fully_connected { num_neurons: 100 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -177,6 +183,7 @@ model { # RELU relu5 ####### layer { + parents: "encode5" name: "relu5" data_layout: "model_parallel" relu { @@ -187,11 +194,11 @@ model { # FULLY_CONNECTED decode5 ################# layer { + parents: "relu5" name: "decode5" data_layout: "model_parallel" fully_connected { num_neurons: 100 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -200,6 +207,7 @@ model { # RELU 6 ####### layer { + parents: "decode5" name: "relu6" data_layout: "model_parallel" relu { @@ -210,11 +218,11 @@ model { # FULLY_CONNECTED decode4 ################# layer { + parents: "relu6" name: "decode4" data_layout: "model_parallel" fully_connected { num_neurons: 100 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -224,6 +232,7 @@ model { # RELU relu7 ####### layer { + parents: "decode4" name: "relu7" data_layout: "model_parallel" relu { @@ -234,11 +243,11 @@ model { # FULLY_CONNECTED decode3 ################# layer { + parents: "relu7" name: "decode3" data_layout: "model_parallel" fully_connected { num_neurons: 150 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -247,6 +256,7 @@ model { # RELU relu8 ####### layer { + parents: "decode3" name: "relu8" data_layout: "model_parallel" relu 
{ @@ -257,11 +267,11 @@ model { # FULLY_CONNECTED decode2 ################# layer { + parents: "relu8" name: "decode2" data_layout: "model_parallel" fully_connected { num_neurons: 200 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -270,6 +280,7 @@ model { # RELU relu9 ####### layer { + parents: "decode2" name: "relu9" data_layout: "model_parallel" relu { @@ -280,11 +291,11 @@ model { # FULLY_CONNECTED decode1 ################# layer { + parents: "relu9" name: "decode1" data_layout: "model_parallel" num_neurons_from_data_reader: true fully_connected { - weight_initialization: "glorot_uniform" has_bias: true } } @@ -293,6 +304,7 @@ model { # RELU relu10 ####### layer { + parents: "decode1" name: "relu10" data_layout: "model_parallel" relu { @@ -304,10 +316,57 @@ model { # RECONSTRUCTION ################# layer { + parents: "relu10" name: "reconstruction" - parents: "relu10 data" data_layout: "model_parallel" - reconstruction {} + split {} + } + layer { + parents: "reconstruction data" + name: "mean_squared_error" + data_layout: "model_parallel" + mean_squared_error {} + } + + ##################### + # PEARSON CORRELATION + ##################### + # rho(x,y) = covariance(x,y) / sqrt( variance(x) * variance(y) ) + layer { + parents: "reconstruction data" + name: "pearson_r_cov" + data_layout: "model_parallel" + covariance {} + } + layer { + parents: "data" + name: "pearson_r_var1" + data_layout: "model_parallel" + variance {} + } + layer { + parents: "reconstruction" + name: "pearson_r_var2" + data_layout: "model_parallel" + variance {} + } + layer { + parents: "pearson_r_var1 pearson_r_var2" + name: "pearson_r_mult" + data_layout: "model_parallel" + multiply {} + } + layer { + parents: "pearson_r_mult" + name: "pearson_r_sqrt" + data_layout: "model_parallel" + sqrt {} + } + layer { + parents: "pearson_r_cov pearson_r_sqrt" + name: "pearson_r" + data_layout: "model_parallel" + divide {} } ################################################### diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_500x250x100.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_500x250x100.prototext index 0136e9570b9..986c833693a 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_500x250x100.prototext +++ b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_500x250x100.prototext @@ -2,7 +2,6 @@ model { ### Model description and network architecture taken from: ### https://lc.llnl.gov/bitbucket/projects/BIOM/repos/molresp/browse/tf_model.py?at=TensorFlow_chemClass ### This network description is anologous to AutoEncoder_Chem_ECFP - name: "sequential_model" data_layout: "model_parallel" mini_batch_size: 1024 block_size: 256 @@ -15,7 +14,7 @@ model { ################################################### objective_function { - mean_squared_error {} + layer_term { layer: "mean_squared_error" } } ################################################### @@ -23,7 +22,10 @@ model { ################################################### metric { - pearson_correlation {} + layer_metric { + name: "Pearson correlation" + layer: "pearson_r" + } } ################################################### @@ -38,18 +40,6 @@ model { timer { } } - # callback { - # summary { - # dir: "." 
-  #     batch_interval: 1
-  #     mat_interval: 25
-  #   }
-  # }
-# callback {
-#   debug {
-#     phase: "train"
-#   }
-# }

 ###################################################
 # start of layers
 ###################################################

 #######
 # INPUT
 #######
 layer {
-  name: "data"
+  name: "input"
+  children: "data dummy"
   data_layout: "model_parallel"
   input {
     io_buffer: "distributed"
   }
 }
+  layer {
+    parents: "input"
+    name: "data"
+    data_layout: "model_parallel"
+    split {}
+  }
+  layer {
+    parents: "input"
+    name: "dummy"
+    data_layout: "model_parallel"
+    dummy {}
+  }

 #################
 # FULLY_CONNECTED encode1
 #################
 layer {
+  parents: "data"
   name: "encode1"
   data_layout: "model_parallel"
   fully_connected {
     num_neurons: 500
-    weight_initialization: "glorot_uniform"
     has_bias: true
   }
 }
@@ -83,6 +86,7 @@ model {
 # SELU selu1
 ######
 layer {
+  parents: "encode1"
   name: "selu1"
   data_layout: "model_parallel"
   selu {
@@ -93,11 +97,11 @@ model {
 # FULLY_CONNECTED encode2
 #################
 layer {
+  parents: "selu1"
   name: "encode2"
   data_layout: "model_parallel"
   fully_connected {
     num_neurons: 250
-    weight_initialization: "glorot_uniform"
     has_bias: true
   }
 }
@@ -106,6 +110,7 @@ model {
 # SELU selu2
 #######
 layer {
+  parents: "encode2"
   name: "selu2"
   data_layout: "model_parallel"
   selu {
@@ -116,11 +121,11 @@ model {
 # FULLY_CONNECTED encode3
 #################
 layer {
+  parents: "selu2"
   name: "encode3"
   data_layout: "model_parallel"
   fully_connected {
     num_neurons: 100
-    weight_initialization: "glorot_uniform"
     has_bias: true
   }
 }
@@ -129,6 +134,7 @@ model {
 # SELU selu3
 #######
 layer {
+  parents: "encode3"
   name: "selu3"
   data_layout: "model_parallel"
   selu {
@@ -140,11 +146,11 @@ model {
 # FULLY_CONNECTED decode3
 #################
 layer {
+  parents: "selu3"
   name: "decode3"
   data_layout: "model_parallel"
   fully_connected {
     num_neurons: 250
-    weight_initialization: "glorot_uniform"
     has_bias: true
   }
 }
@@ -153,6 +159,7 @@ model {
 # SELU selu8
 #######
 layer {
+  parents: "decode3"
   name: "selu8"
   data_layout: "model_parallel"
   selu {
@@ -163,11 +170,11 @@ model {
 # FULLY_CONNECTED decode2
 #################
 layer {
+  parents: "selu8"
   name: "decode2"
   data_layout: "model_parallel"
   fully_connected {
     num_neurons: 500
-    weight_initialization: "glorot_uniform"
     has_bias: true
   }
 }
@@ -176,6 +183,7 @@ model {
 # SELU selu9
 #######
 layer {
+  parents: "decode2"
   name: "selu9"
   data_layout: "model_parallel"
   selu {
@@ -186,11 +194,11 @@ model {
 # FULLY_CONNECTED decode1
 #################
 layer {
+  parents: "selu9"
   name: "decode1"
   data_layout: "model_parallel"
   num_neurons_from_data_reader: true
   fully_connected {
-    weight_initialization: "glorot_uniform"
     has_bias: true
   }
 }
@@ -199,6 +207,7 @@ model {
 # SELU selu10
 #######
 layer {
+  parents: "decode1"
   name: "selu10"
   data_layout: "model_parallel"
   #selu {
@@ -211,11 +220,57 @@ model {
 # RECONSTRUCTION
 #################
 layer {
+  parents: "selu10"
   name: "reconstruction"
   data_layout: "model_parallel"
-  reconstruction {
-    original_layer: "data"
-  }
+  split {}
+}
+layer {
+  parents: "reconstruction data"
+  name: "mean_squared_error"
+  data_layout: "model_parallel"
+  mean_squared_error {}
+}
+
+#####################
+# PEARSON CORRELATION
+#####################
+# rho(x,y) = covariance(x,y) / sqrt( variance(x) * variance(y) )
+layer {
+  parents: "reconstruction data"
+  name: "pearson_r_cov"
+  data_layout: "model_parallel"
+  covariance {}
+}
+layer {
+  parents: "data"
+  name: "pearson_r_var1"
+  data_layout: "model_parallel"
+  variance {}
+}
+layer {
+  parents: "reconstruction"
+  name: "pearson_r_var2"
+  data_layout:
"model_parallel" + variance {} + } + layer { + parents: "pearson_r_var1 pearson_r_var2" + name: "pearson_r_mult" + data_layout: "model_parallel" + multiply {} + } + layer { + parents: "pearson_r_mult" + name: "pearson_r_sqrt" + data_layout: "model_parallel" + sqrt {} + } + layer { + parents: "pearson_r_cov pearson_r_sqrt" + name: "pearson_r" + data_layout: "model_parallel" + divide {} } ################################################### diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext index c0f94db8b96..60566f2f084 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext +++ b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext @@ -2,7 +2,6 @@ model { ### Model description and network architecture taken from: ### https://lc.llnl.gov/bitbucket/projects/BIOM/repos/molresp/browse/tf_model.py?at=TensorFlow_chemClass ### This network description is anologous to AutoEncoder_Chem_Sigmoid - name: "sequential_model" data_layout: "model_parallel" mini_batch_size: 128 block_size: 256 @@ -15,7 +14,7 @@ model { ################################################### objective_function { - mean_squared_error {} + layer_term { layer: "mean_squared_error" } } ################################################### @@ -37,11 +36,6 @@ model { mat_interval: 25 } } -# callback { -# debug { -# phase: "train" -# } -# } ################################################### # start of layers @@ -51,24 +45,36 @@ model { # INPUT ####### layer { - name: "data" - children: "encode1 reconstruction" + name: "input" + children: "data dummy" data_layout: "model_parallel" input { io_buffer: "distributed" target_mode: "reconstruction" } } + layer { + parents: "input" + name: "data" + data_layout: "model_parallel" + split {} + } + layer { + parents: "input" + name: "dummy" + data_layout: "model_parallel" + dummy {} + } ################# # FULLY_CONNECTED encode1 ################# layer { + parents: "data" name: "encode1" data_layout: "model_parallel" fully_connected { num_neurons: 2000 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -77,6 +83,7 @@ model { # SIGMOID sigmoid1 ###### layer { + parents: "encode1" name: "sigmoid1" data_layout: "model_parallel" sigmoid { @@ -87,11 +94,11 @@ model { # FULLY_CONNECTED encode2 ################# layer { + parents: "sigmoid1" name: "encode2" data_layout: "model_parallel" fully_connected { num_neurons: 500 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -100,6 +107,7 @@ model { # SIGMOID sigmoid2 ####### layer { + parents: "encode2" name: "sigmoid2" data_layout: "model_parallel" sigmoid { @@ -110,11 +118,11 @@ model { # FULLY_CONNECTED encode3 ################# layer { + parents: "sigmoid2" name: "encode3" data_layout: "model_parallel" fully_connected { num_neurons: 100 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -123,6 +131,7 @@ model { # SIGMOID sigmoid3 ####### layer { + parents: "encode3" name: "sigmoid3" data_layout: "model_parallel" sigmoid { @@ -134,11 +143,11 @@ model { # FULLY_CONNECTED decode3 ################# layer { + parents: "sigmoid3" name: "decode3" data_layout: "model_parallel" fully_connected { num_neurons: 500 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -147,6 +156,7 @@ model { # SIGMOID sigmoid4 ####### layer { + parents: "decode3" name: "sigmoid4" data_layout: "model_parallel" sigmoid { @@ -157,11 
+167,11 @@ model { # FULLY_CONNECTED decode2 ################# layer { + parents: "sigmoid4" name: "decode2" data_layout: "model_parallel" fully_connected { num_neurons: 2000 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -171,6 +181,7 @@ model { # SIGMOID sigmoid5 ####### layer { + parents: "decode2" name: "sigmoid5" data_layout: "model_parallel" sigmoid { @@ -181,11 +192,11 @@ model { # FULLY_CONNECTED decode1 ################# layer { + parents: "sigmoid5" name: "decode1" data_layout: "model_parallel" num_neurons_from_data_reader: true fully_connected { - weight_initialization: "glorot_uniform" has_bias: true } } @@ -194,6 +205,7 @@ model { # SIGMOID sigmoid6 ####### layer { + parents: "decode1" name: "sigmoid6" data_layout: "model_parallel" sigmoid { @@ -205,10 +217,16 @@ model { # RECONSTRUCTION ################# layer { + parents: "sigmoid6" name: "reconstruction" - parents: "sigmoid6 data" data_layout: "model_parallel" - reconstruction {} + split {} + } + layer { + parents: "reconstruction data" + name: "mean_squared_error" + data_layout: "model_parallel" + mean_squared_error {} } ################################################### diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_dnn_chem_ecfp.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_dnn_chem_ecfp.prototext index fd2a4e3d606..397111b4899 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_dnn_chem_ecfp.prototext +++ b/model_zoo/models/autoencoder_candle_pilot1/model_dnn_chem_ecfp.prototext @@ -2,7 +2,6 @@ model { ### Model description and network architecture taken from: ### https://lc.llnl.gov/bitbucket/projects/BIOM/repos/molresp/browse/tf_model.py?at=TensorFlow_chemClass ### This network description is analogous to AutoEncoder_Chem_ECFP - name: "sequential_model" data_layout: "model_parallel" mini_batch_size: 128 block_size: 256 @@ -15,7 +14,7 @@ model { ################################################### objective_function { - cross_entropy {} + layer_term { layer: "cross_entropy" } l2_weight_regularization { scale_factor: 1e-4 } @@ -26,7 +25,10 @@ model { ################################################### metric { - categorical_accuracy {} + layer_metric { + name: "accuracy" + layer: "categorical_accuracy" + } } @@ -42,18 +44,6 @@ model { timer { } } - # callback { - # summary { - # dir: "." 
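# The classification variant of the same pattern, sketched with the layer
# names defined below in this file:
#   layer { parents: "prob label" name: "cross_entropy" cross_entropy {} }
#   layer { parents: "prob label" name: "categorical_accuracy" categorical_accuracy {} }
# The objective's layer_term and the metric's layer_metric then refer to
# these layers by name.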
- # batch_interval: 1 - # mat_interval: 25 - # } - # } -# callback { -# debug { -# phase: "train" -# } -# } ################################################### # start of layers @@ -63,23 +53,35 @@ model { # INPUT ####### layer { - name: "finetunedata" - children: "encode target" + name: "data" + children: "finetunedata label" data_layout: "model_parallel" input { io_buffer: "distributed" } } + layer { + parents: "data" + name: "finetunedata" + data_layout: "model_parallel" + split {} + } + layer { + parents: "data" + name: "label" + data_layout: "model_parallel" + split {} + } ################# # FULLY_CONNECTED encode1 ################# layer { + parents: "finetunedata" name: "encode1" data_layout: "model_parallel" fully_connected { num_neurons: 2000 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -88,6 +90,7 @@ model { # RELU relu1 ###### layer { + parents: "encode1" name: "relu1" data_layout: "model_parallel" relu { @@ -98,11 +101,11 @@ model { # FULLY_CONNECTED encode2 ################# layer { + parents: "relu1" name: "encode2" data_layout: "model_parallel" fully_connected { num_neurons: 1000 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -111,6 +114,7 @@ model { # RELU relu2 ####### layer { + parents: "encode2" name: "relu2" data_layout: "model_parallel" relu { @@ -121,11 +125,11 @@ model { # FULLY_CONNECTED encode3 ################# layer { + parents: "relu2" name: "encode3" data_layout: "model_parallel" fully_connected { num_neurons: 500 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -134,6 +138,7 @@ model { # RELU relu3 ####### layer { + parents: "encode3" name: "relu3" data_layout: "model_parallel" relu { @@ -144,11 +149,11 @@ model { # FULLY_CONNECTED encode4 ################# layer { + parents: "relu3" name: "encode4" data_layout: "model_parallel" fully_connected { num_neurons: 250 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -157,6 +162,7 @@ model { # RELU relu4 ####### layer { + parents: "encode4" name: "relu4" data_layout: "model_parallel" relu { @@ -167,11 +173,11 @@ model { # FULLY_CONNECTED encode5 ################# layer { + parents: "relu4" name: "encode5" data_layout: "model_parallel" fully_connected { num_neurons: 100 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -180,6 +186,7 @@ model { # RELU relu5 ####### layer { + parents: "encode5" name: "relu5" data_layout: "model_parallel" relu { @@ -188,6 +195,7 @@ model { layer { + parents: "relu5" name: "ip2" data_layout: "model_parallel" fully_connected { @@ -197,17 +205,26 @@ model { } layer { + parents: "ip2" name: "prob" data_layout: "model_parallel" softmax {} } layer { - name: "target" - parents: "prob finetunedata" + parents: "prob label" + name: "cross_entropy" + data_layout: "model_parallel" + cross_entropy {} + } + + layer { + parents: "prob label" + name: "categorical_accuracy" data_layout: "model_parallel" - target {} + categorical_accuracy {} } + ################################################### # end of layers ################################################### diff --git a/model_zoo/models/autoencoder_cifar10/model_autoencoder_cifar10.prototext b/model_zoo/models/autoencoder_cifar10/model_autoencoder_cifar10.prototext index 316b10565b9..78207785cf4 100644 --- a/model_zoo/models/autoencoder_cifar10/model_autoencoder_cifar10.prototext +++ b/model_zoo/models/autoencoder_cifar10/model_autoencoder_cifar10.prototext @@ -1,5 +1,4 @@ model { - name: "sequential_model" data_layout: "model_parallel" mini_batch_size: 32 block_size: 
256 @@ -12,7 +11,7 @@ model { ################################################### objective_function { - mean_squared_error {} + layer_term { layer: "mean_squared_error" } } ################################################### @@ -20,7 +19,10 @@ model { ################################################### metric { - mean_squared_error {} + layer_metric { + name: "mean squared error" + layer: "mean_squared_error" + } } ################################################### @@ -31,12 +33,6 @@ model { interval: 1 } } - #callback { - # save_images { - # image_dir: "images_" - # extension: "pgm" - # } - #} ################################################### # start of layers @@ -47,18 +43,30 @@ model { ####### layer { name: "data" - children: "encode1 reconstruction" + children: "image dummy" data_layout: "model_parallel" input { io_buffer: "distributed" - target_mode: "reconstruction" } } + layer { + parents: "data" + name: "image" + data_layout: "model_parallel" + split {} + } + layer { + parents: "data" + name: "dummy" + data_layout: "model_parallel" + dummy {} + } ################# # FULLY_CONNECTED encode1 ################# layer { + parents: "image" name: "encode1" data_layout: "model_parallel" fully_connected { @@ -71,6 +79,7 @@ model { # RELU ###### layer { + parents: "encode1" name: "relu1" data_layout: "model_parallel" relu { @@ -80,6 +89,7 @@ model { # DROPOUT ######### layer { + parents: "relu1" name: "dropout1" data_layout: "model_parallel" dropout { @@ -92,6 +102,7 @@ model { # FULLY_CONNECTED decode1 ################# layer { + parents: "dropout1" name: "decode1" data_layout: "model_parallel" num_neurons_from_data_reader: true @@ -104,6 +115,7 @@ model { # SIGMOID ######### layer { + parents: "decode1" name: "sigmoid1" data_layout: "model_parallel" sigmoid { @@ -113,6 +125,7 @@ model { # DROPOUT ######### layer { + parents: "sigmoid1" name: "dropout2" data_layout: "model_parallel" dropout { @@ -124,10 +137,10 @@ model { # RECONSTRUCTION ################# layer { - name: "reconstruction" - parents: "dropout2 data" + parents: "dropout2 image" + name: "mean_squared_error" data_layout: "model_parallel" - reconstruction {} + mean_squared_error {} } ################################################### diff --git a/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext b/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext index b23a2c53b11..8a686b6a479 100644 --- a/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext +++ b/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext @@ -1,5 +1,4 @@ model { - name: "sequential_model" data_layout: "data_parallel" mini_batch_size: 128 block_size: 256 @@ -14,7 +13,7 @@ model { ################################################### objective_function { - mean_squared_error {} + layer_term { layer: "mean_squared_error" } l2_weight_regularization { scale_factor: 0.0005 } @@ -25,7 +24,10 @@ model { ################################################### metric { - mean_squared_error {} + layer_metric { + name: "mean squared error" + layer: "mean_squared_error" + } } ################################################### @@ -40,12 +42,6 @@ model { timer { } } - # callback { - # save_images { - # image_dir: "images_" - # extension: "pgm" - # } - # } ################################################### # start of layers @@ -56,18 +52,30 @@ model { ####### layer { name: "data" - children: "conv1 reconstruction" + children: "image dummy" data_layout: "data_parallel" input { 
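# With target_mode gone, the input layer feeds two children: "image" is a
# split layer that forwards the sample to both the encoder and the
# mean_squared_error layer, while "dummy" absorbs the second, unused input
# output so the layer graph stays well-formed.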
io_buffer: "distributed" - target_mode: "reconstruction" } } + layer { + parents: "data" + name: "image" + data_layout: "data_parallel" + split {} + } + layer { + parents: "data" + name: "dummy" + data_layout: "data_parallel" + dummy {} + } ############# # CONVOLUTION 1 ############# layer { + parents: "image" name: "conv1" data_layout: "data_parallel" convolution { @@ -85,6 +93,7 @@ model { # RELU 1 ###### layer { + image: "conv1" name: "relu1" data_layout: "data_parallel" relu { @@ -95,6 +104,7 @@ model { # POOLING 1 ######### layer { + parents: "relu1" name: "pool1" data_layout: "data_parallel" pooling { @@ -111,6 +121,7 @@ model { # CONVOLUTION 2 ############# layer { + parents: "pool1" name: "conv2" data_layout: "data_parallel" convolution { @@ -128,6 +139,7 @@ model { # RELU 2 ###### layer { + parents: "conv2" name: "relu2" data_layout: "data_parallel" relu { @@ -138,6 +150,7 @@ model { # POOLING 2 ######### layer { + parents: "relu2" name: "pool2" data_layout: "data_parallel" pooling { @@ -154,6 +167,7 @@ model { # CONVOLUTION 3 ############# layer { + parents: "pool2" name: "conv3" data_layout: "data_parallel" convolution { @@ -171,6 +185,7 @@ model { # RELU 3 ###### layer { + parents: "conv3" name: "relu3" data_layout: "data_parallel" relu { @@ -181,6 +196,7 @@ model { # POOLING 3 ######### layer { + parents: "relu3" name: "pool3" data_layout: "data_parallel" pooling { @@ -197,6 +213,7 @@ model { # UNPOOLING 3 ######### layer { + parents: "pool3" name: "unpool3" data_layout: "data_parallel" unpooling { @@ -209,6 +226,7 @@ model { # DECONVOLUTION 3 ############# layer { + parents: "unpool3" name: "deconv3" data_layout: "data_parallel" deconvolution { @@ -226,6 +244,7 @@ model { # RELU 4 ###### layer { + parents: "deconv3" name: "relu4" data_layout: "data_parallel" relu { @@ -236,6 +255,7 @@ model { # UNPOOLING 2 ######### layer { + parents: "relu4" name: "unpool2" data_layout: "data_parallel" unpooling { @@ -248,6 +268,7 @@ model { # DECONVOLUTION 2 ############# layer { + parents: "unpool2" name: "deconv2" data_layout: "data_parallel" deconvolution { @@ -265,6 +286,7 @@ model { # RELU 5 ###### layer { + parents: "deconv2" name: "relu5" data_layout: "data_parallel" relu { @@ -275,6 +297,7 @@ model { # UNPOOLING 1 ######### layer { + parents: "relu5" name: "unpool1" data_layout: "data_parallel" unpooling { @@ -287,6 +310,7 @@ model { # DECONVOLUTION 1 ############# layer { + parents: "unpool1" name: "deconv1" data_layout: "data_parallel" deconvolution { @@ -304,6 +328,7 @@ model { # RELU 6 ###### layer { + parents: "deconv1" name: "relu6" data_layout: "data_parallel" relu { @@ -315,6 +340,7 @@ model { # FULLY_CONNECTED decode1 ################# layer { + parents: "relu6" name: "decode1" data_layout: "data_parallel" num_neurons_from_data_reader: true @@ -328,7 +354,8 @@ model { # SIGMOID sigmoid ####### layer { - name: "sigmoid" + parents: "decode1" + name: "reconstruction" data_layout: "data_parallel" sigmoid { } @@ -339,10 +366,10 @@ model { # RECONSTRUCTION ################# layer { - name: "reconstruction" - parents: "sigmoid data" + parents: "reconstruction image" + name: "mean_squared_error" data_layout: "data_parallel" - reconstruction {} + mean_squared_error {} } ################################################### diff --git a/model_zoo/models/autoencoder_imagenet/model_conv_autoencoder_imagenet.prototext b/model_zoo/models/autoencoder_imagenet/model_conv_autoencoder_imagenet.prototext index dba3ded4663..b05e80ae336 100644 --- 
a/model_zoo/models/autoencoder_imagenet/model_conv_autoencoder_imagenet.prototext +++ b/model_zoo/models/autoencoder_imagenet/model_conv_autoencoder_imagenet.prototext @@ -1,5 +1,4 @@ model { - name: "sequential_model" data_layout: "data_parallel" mini_batch_size: 128 block_size: 256 @@ -12,7 +11,7 @@ model { ################################################### objective_function { - mean_squared_error {} + layer_term { layer: "mean_squared_error" } l2_weight_regularization { scale_factor: 0.0005 } @@ -32,8 +31,8 @@ model { } callback { save_images { - image_dir: "images_" - extension: "pgm" + layers: "image reconstruction" + image_format: "pgm" } } @@ -46,18 +45,30 @@ model { ####### layer { name: "data" - children: "conv1 reconstruction" + children: "image dummy" data_layout: "data_parallel" input { io_buffer: "partitioned" - target_mode: "reconstruction" } } + layer { + parents: "data" + name: "image" + data_layout: "data_parallel" + split {} + } + layer { + parents: "data" + name: "dummy" + data_layout: "data_parallel" + dummy {} + } ############# # CONVOLUTION 1 ############# layer { + parents: "image" name: "conv1" data_layout: "data_parallel" convolution { @@ -75,6 +86,7 @@ model { # RELU 1 ###### layer { + parents: "conv1" name: "relu1" data_layout: "data_parallel" relu { @@ -85,6 +97,7 @@ model { # POOLING 1 ######### layer { + parents: "relu1" name: "pool1" data_layout: "data_parallel" pooling { @@ -103,6 +116,7 @@ model { # UNPOOLING 1 ######### layer { + parents: "pool1" name: "unpool1" data_layout: "data_parallel" unpooling { @@ -115,6 +129,7 @@ model { # DECONVOLUTION 1 ############# layer { + parents: "unpool1" name: "deconv1" data_layout: "data_parallel" deconvolution { @@ -132,7 +147,8 @@ model { # SIGMOID sigmoid ####### layer { - name: "sigmoid" + parents: "deconv1" + name: "reconstruction" data_layout: "data_parallel" sigmoid { } @@ -143,10 +159,10 @@ model { # RECONSTRUCTION ################# layer { - name: "reconstruction" - parents: "sigmoid data" + parents: "reconstruction image" + name: "mean_squared_error" data_layout: "data_parallel" - reconstruction {} + mean_squared_error {} } ################################################### diff --git a/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext b/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext index 53e8a96d340..e4c570d6215 100644 --- a/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext +++ b/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext @@ -1,5 +1,4 @@ model { - name: "sequential_model" data_layout: "model_parallel" mini_batch_size: 10 block_size: 256 @@ -12,7 +11,7 @@ model { ################################################### objective_function { - mean_squared_error {} + layer_term { layer: "mean_squared_error" } } ################################################### @@ -23,16 +22,6 @@ model { interval: 1 } } -# callback { -# timer { -# } -# } -# callback { -# save_images { -# image_dir: "images_" -# extension: "pgm" -# } -# } ################################################### # start of layers @@ -43,17 +32,29 @@ model { ####### layer { name: "data" - children: "encode1 reconstruction" + children: "image dummy" data_layout: "model_parallel" input { io_buffer: "distributed" - target_mode: "reconstruction" } } + layer { + parents: "data" + name: "image" + data_layout: "model_parallel" + split {} + } + layer { + parents: "data" + name: "dummy" + data_layout: "model_parallel" + dummy {} + } # FULLY_CONNECTED encode1 ################# 
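# The decoder's last fully-connected layer below sets
# num_neurons_from_data_reader: true, so its width is taken from the data
# reader's flattened sample size rather than hard-coded; the "reconstruction"
# sigmoid output can then be compared elementwise with "image" by the
# mean_squared_error layer.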
layer { + parents: "image" name: "encode1" data_layout: "model_parallel" fully_connected { @@ -66,6 +67,7 @@ model { # RELU relu1 ###### layer { + parents: "encode1" name: "relu1" data_layout: "model_parallel" relu { @@ -75,6 +77,7 @@ model { # FULLY_CONNECTED encode2 ################# layer { + parents: "relu1" name: "encode2" data_layout: "model_parallel" fully_connected { @@ -87,6 +90,7 @@ model { # RELU relu2 ####### layer { + parents: "encode2" name: "relu2" data_layout: "model_parallel" relu { @@ -96,6 +100,7 @@ model { # FULLY_CONNECTED encode3 ################# layer { + parents: "relu2" name: "encode3" data_layout: "model_parallel" fully_connected { @@ -109,6 +114,7 @@ model { # RELU relu3 ####### layer { + parents: "encode3" name: "relu3" data_layout: "model_parallel" relu { @@ -118,6 +124,7 @@ model { # FULLY_CONNECTED encode4 ################# layer { + parents: "relu3" name: "encode4" data_layout: "model_parallel" fully_connected { @@ -130,6 +137,7 @@ model { # FULLY_CONNECTED decode4 ################# layer { + parents: "encode4" name: "decode4" data_layout: "model_parallel" fully_connected { @@ -142,6 +150,7 @@ model { # RELU 4 ####### layer { + parents: "decode4" name: "relu4" data_layout: "model_parallel" relu { @@ -151,6 +160,7 @@ model { # FULLY_CONNECTED decode3 ################# layer { + parents: "relu4" name: "decode3" data_layout: "model_parallel" fully_connected { @@ -164,6 +174,7 @@ model { # RELU relu5 ####### layer { + parents: "decode3" name: "relu5" data_layout: "model_parallel" relu { @@ -173,6 +184,7 @@ model { # FULLY_CONNECTED decode2 ################# layer { + parents: "relu5" name: "decode2" data_layout: "model_parallel" fully_connected { @@ -185,6 +197,7 @@ model { # RELU relu6 ####### layer { + parents: "decode2" name: "relu6" data_layout: "model_parallel" relu { @@ -194,6 +207,7 @@ model { # FULLY_CONNECTED decode1 ################# layer { + parents: "relu6" name: "decode1" data_layout: "model_parallel" num_neurons_from_data_reader: true @@ -207,7 +221,8 @@ model { # SIGMOID sigmoid ####### layer { - name: "sigmoid" + parents: "decode1" + name: "reconstruction" data_layout: "model_parallel" sigmoid { } @@ -218,10 +233,10 @@ model { # RECONSTRUCTION ################# layer { - name: "reconstruction" - parents: "sigmoid data" + parents: "reconstruction image" + name: "mean_squared_error" data_layout: "model_parallel" - reconstruction {} + mean_squared_error {} } ################################################### diff --git a/model_zoo/models/autoencoder_mnist/model_conv_autoencoder_mnist.prototext b/model_zoo/models/autoencoder_mnist/model_conv_autoencoder_mnist.prototext index 2bf6a347dda..1a54f23311f 100644 --- a/model_zoo/models/autoencoder_mnist/model_conv_autoencoder_mnist.prototext +++ b/model_zoo/models/autoencoder_mnist/model_conv_autoencoder_mnist.prototext @@ -1,5 +1,4 @@ model { - name: "sequential_model" data_layout: "data_parallel" mini_batch_size: 128 block_size: 256 @@ -12,7 +11,7 @@ model { ################################################### objective_function { - mean_squared_error {} + layer_term { layer: "mean_squared_error" } l2_weight_regularization { scale_factor: 0.0005 } @@ -32,8 +31,8 @@ model { } callback { save_images { - image_dir: "images_" - extension: "pgm" + layers: "image reconstruction" + image_format: "pgm" } } @@ -46,18 +45,30 @@ model { ####### layer { name: "data" - children: "conv1 reconstruction" + children: "image dummy" data_layout: "data_parallel" input { io_buffer: "distributed" - target_mode: 
"reconstruction" } } + layer { + parents: "data" + name: "image" + data_layout: "data_parallel" + split {} + } + layer { + parents: "data" + name: "dummy" + data_layout: "data_parallel" + dummy {} + } ############# # CONVOLUTION 1 ############# layer { + parents: "image" name: "conv1" data_layout: "data_parallel" convolution { @@ -75,6 +86,7 @@ model { # RELU 1 ###### layer { + parents: "conv1" name: "relu1" data_layout: "data_parallel" relu { @@ -85,6 +97,7 @@ model { # POOLING 1 ######### layer { + parents: "relu1" name: "pool1" data_layout: "data_parallel" pooling { @@ -101,6 +114,7 @@ model { # CONVOLUTION 2 ############# layer { + parents: "pool1" name: "conv2" data_layout: "data_parallel" convolution { @@ -118,6 +132,7 @@ model { # RELU 2 ###### layer { + parents: "conv2" name: "relu2" data_layout: "data_parallel" relu { @@ -128,6 +143,7 @@ model { # POOLING 2 ######### layer { + parents: "relu2" name: "pool2" data_layout: "data_parallel" pooling { @@ -144,6 +160,7 @@ model { # CONVOLUTION 3 ############# layer { + parents: "pool2" name: "conv3" data_layout: "data_parallel" convolution { @@ -161,6 +178,7 @@ model { # RELU 3 ###### layer { + parents: "conv3" name: "relu3" data_layout: "data_parallel" relu { @@ -171,6 +189,7 @@ model { # POOLING 3 ######### layer { + parents: "relu3" name: "pool3" data_layout: "data_parallel" pooling { @@ -187,6 +206,7 @@ model { # UNPOOLING 3 ######### layer { + parents: "pool3" name: "unpool3" data_layout: "data_parallel" unpooling { @@ -199,6 +219,7 @@ model { # DECONVOLUTION 3 ############# layer { + parents: "unpool3" name: "deconv3" data_layout: "data_parallel" deconvolution { @@ -216,6 +237,7 @@ model { # RELU 4 ###### layer { + parents: "deconv3" name: "relu4" data_layout: "data_parallel" relu { @@ -226,6 +248,7 @@ model { # UNPOOLING 2 ######### layer { + parents: "relu4" name: "unpool2" data_layout: "data_parallel" unpooling { @@ -238,6 +261,7 @@ model { # DECONVOLUTION 2 ############# layer { + parents: "unpool2" name: "deconv2" data_layout: "data_parallel" deconvolution { @@ -255,6 +279,7 @@ model { # RELU 5 ###### layer { + parents: "deconv2" name: "relu5" data_layout: "data_parallel" relu { @@ -265,6 +290,7 @@ model { # UNPOOLING 1 ######### layer { + parents: "relu5" name: "unpool1" data_layout: "data_parallel" unpooling { @@ -277,6 +303,7 @@ model { # DECONVOLUTION 1 ############# layer { + parents: "unpool1" name: "deconv1" data_layout: "data_parallel" deconvolution { @@ -294,6 +321,7 @@ model { # RELU 6 ###### layer { + parents: "deconv1" name: "relu6" data_layout: "data_parallel" relu { @@ -305,6 +333,7 @@ model { # FULLY_CONNECTED decode1 ################# layer { + parents: "relu6" name: "decode1" data_layout: "data_parallel" num_neurons_from_data_reader: true @@ -317,7 +346,8 @@ model { # SIGMOID sigmoid ####### layer { - name: "sigmoid" + parents: "decode1" + name: "reconstruction" data_layout: "data_parallel" sigmoid { } @@ -328,10 +358,10 @@ model { # RECONSTRUCTION ################# layer { - name: "reconstruction" - parents: "sigmoid data" + parents: "reconstruction image" + name: "mean_squared_error" data_layout: "data_parallel" - reconstruction {} + mean_squared_error {} } ################################################### diff --git a/model_zoo/models/autoencoder_mnist/vae_mnist.prototext b/model_zoo/models/autoencoder_mnist/vae_mnist.prototext index 83280c30f0a..059ef41643b 100644 --- a/model_zoo/models/autoencoder_mnist/vae_mnist.prototext +++ b/model_zoo/models/autoencoder_mnist/vae_mnist.prototext @@ -1,7 +1,6 @@ # 
LBANN implementation of MNIST VAE in Doersch's autoencoder tutorial # See https://github.com/cdoersch/vae_tutorial/blob/master/mnist_vae.prototxt model { - name: "directed_acyclic_graph_model" data_layout: "data_parallel" mini_batch_size: 100 block_size: 256 @@ -14,11 +13,8 @@ model { ################################################### objective_function { - binary_cross_entropy {} - layer_term { - scale_factor: 1.0 - layer: "klloss" - } + layer_term { layer: "binary_cross_entropy" } + layer_term { layer: "kldiv" } l2_weight_regularization { scale_factor: 0.0005 } @@ -28,7 +24,12 @@ model { # Metrics ################################################### - metric { mean_squared_error {} } + metric { + layer_metric { + name: "mean squared error" + layer: "mean_squared_error" + } + } ################################################### # Callbacks @@ -47,8 +48,8 @@ model { } callback { save_images { - image_dir: "images_" - extension: "png" + layers: "image reconstruction" + image_format: "pgm" } } @@ -61,20 +62,31 @@ model { ###################### layer { name: "data" - children: "encode1 reconstruction" + children: "image dummy" data_layout: "data_parallel" input { io_buffer: "partitioned" - target_mode: "reconstruction" } } + layer { + parents: "data" + name: "image" + data_layout: "data_parallel" + split {} + } + layer { + parents: "data" + name: "dummy" + data_layout: "data_parallel" + dummy {} + } ###################### # Encoder ###################### layer { - parents: "data" + parents: "image" name: "encode1" data_layout: "data_parallel" fully_connected { @@ -150,23 +162,19 @@ model { parents: "logsd" name: "sd" data_layout: "data_parallel" - exponential {} + exp {} } layer { parents: "sd" name: "var" data_layout: "data_parallel" - power { - exponent: 2 - } + square {} } layer { parents: "mu" name: "meansq" data_layout: "data_parallel" - power { - exponent: 2 - } + square {} } layer { parents: "meansq var logsd" @@ -180,9 +188,7 @@ model { parents: "kldiv_plus_half" name: "kldiv_full" data_layout: "data_parallel" - power { - exponent: -0.5 - } + rsqrt {} } layer { parents: "kldiv_full" @@ -192,12 +198,6 @@ model { mode: "sum" } } - layer { - parents: "kldiv" - name: "klloss" - data_layout: "data_parallel" - evaluation {} - } ###################### # Generate sample @@ -290,16 +290,21 @@ model { layer { parents: "decode1" - name: "loss_sigmoid" + name: "reconstruction" data_layout: "data_parallel" sigmoid {} } layer { - parents: "loss_sigmoid" - parents: "decode1 data" - name: "reconstruction" + parents: "reconstruction image" + name: "binary_cross_entropy" + data_layout: "data_parallel" + binary_cross_entropy {} + } + layer { + parents: "reconstruction image" + name: "mean_squared_error" data_layout: "data_parallel" - reconstruction {} + mean_squared_error {} } ################################################### diff --git a/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext b/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext index 7c2a0d77735..c863285dccc 100644 --- a/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext +++ b/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext @@ -1,5 +1,4 @@ model { - name: "sequential_model" data_layout: "model_parallel" mini_batch_size: 50 block_size: 256 @@ -12,15 +11,16 @@ model { ################################################### objective_function { - mean_squared_error {} + layer_term { layer: "mse" } } ################################################### # Metrics 
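# The Pearson metric below is assembled from evaluation layers using unbiased
# (sample) estimators:
#   pearson_r = unbiased_covariance / sqrt( pred_variance * data_variance )
# which matches rho(x,y) = cov(x,y) / sqrt( var(x) * var(y) ) for
# x = sigmoid2 (the reconstruction) and y = recon_data.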
################################################### metric { - pearson_correlation {} + layer_metric { + layer: "pearson_r" + } } ################################################### @@ -47,14 +47,21 @@ model { ####### layer { name: "data" - children: "encode1 reconstruction" + #children: "encode1 recon_data" data_layout: "model_parallel" input { io_buffer: "distributed" - target_mode: "reconstruction" + target_mode: "N/A" } } + layer { + name: "recon_data" + parents: "data" + data_layout: "model_parallel" + identity { } + } + ################# # FULLY_CONNECTED encode1 ################# @@ -66,6 +73,7 @@ model { layer { name: "encode1" + parents: "recon_data" data_layout: "model_parallel" weights: "w1" fully_connected { @@ -78,6 +86,7 @@ model { # SIGMOID ###### layer { + parents: "encode1" name: "sigmoid1" data_layout: "model_parallel" sigmoid { @@ -89,6 +98,7 @@ model { # FULLY_CONNECTED decode1 ################# layer { + parents: "sigmoid1" name: "decode1" data_layout: "model_parallel" weights: "w1" @@ -102,20 +112,66 @@ model { # SIGMOID ######### layer { + parents: "decode1" name: "sigmoid2" data_layout: "model_parallel" sigmoid { } } - ################# - # RECONSTRUCTION - ################# + #pearson_r: A Tensor representing the current Pearson product-moment correlation coefficient, + #the value of cov(predictions, labels) / sqrt(var(predictions) * var(labels)) + layer { + parents: "sigmoid2 recon_data" + children: "pearson_r" + name: "unbiased_covariance" + covariance { biased: false } + data_layout: "model_parallel" + } + + layer { + parents: "sigmoid2" + children: "mult" + name: "pred_variance" + variance { biased: false } + data_layout: "model_parallel" + } + + layer { + parents: "recon_data" + children: "mult" + name: "data_variance" + variance { biased: false } + data_layout: "model_parallel" + } + + layer { + parents: "pred_variance data_variance" + name: "mult" + multiply { } + data_layout: "model_parallel" + } + layer { - name: "reconstruction" - parents: "sigmoid2 data" + parents: "mult" + name: "sqrt" + sqrt { } + data_layout: "model_parallel" + } + + + layer { + parents: "unbiased_covariance sqrt" + name: "pearson_r" + divide { } + data_layout: "model_parallel" + } + + layer { + parents: "recon_data sigmoid2" + name: "mse" + mean_squared_error {} data_layout: "model_parallel" - reconstruction {} } ################################################### diff --git a/model_zoo/models/candle/pilot1/combo.prototext b/model_zoo/models/candle/pilot1/combo.prototext index 79279d21f8c..4d472a3fec9 100644 --- a/model_zoo/models/candle/pilot1/combo.prototext +++ b/model_zoo/models/candle/pilot1/combo.prototext @@ -1,7 +1,6 @@ #Example taken from:https://github.com/ECP-CANDLE/Benchmarks/tree/frameworks/Pilot1/Combo #Timestamp 03/07/2018 8:30PM model { - name: "directed_acyclic_graph_model" data_layout: "model_parallel" mini_batch_size: 256 block_size: 256 @@ -14,7 +13,7 @@ model { ################################################### objective_function { - mean_squared_error {} + layer_term { layer: "mean_squared_error" } } ################################################### @@ -22,10 +21,16 @@ model { metric { - mean_squared_error {} + layer_metric { + name: "mean squared error" + layer: "mean_squared_error" + } } metric { - r2{} + layer_metric { + name: "R2" + layer: "r2" + } } ################################################### @@ -44,14 +49,26 @@ model { # INPUT (Merged Features) layer { - name: "data" - children: "slice_data 
target" + name: "input" + children: "data response" data_layout: "model_parallel" input { io_buffer: "distributed" target_mode: "regression" } } + layer { + parents: "input" + name: "data" + data_layout: "model_parallel" + split {} + } + layer { + parents: "input" + name: "response" + data_layout: "model_parallel" + split {} + } # SLICE layer { @@ -480,12 +497,42 @@ model { } } - #TARGET + #MEAN_SQUARED_ERROR + layer { + parents: "fc response" + name: "mean_squared_error" + data_layout: "model_parallel" + mean_squared_error {} + } + + # R2(x,y) = 1 - sum( (x-y)^2 ) / sum( (x-mean(x))^2 ) + layer { + parents: "fc" + name: "r2_var" + data_layout: "model_parallel" + variance { + biased: true + } + } + layer { + parents: "mean_squared_error r2_var" + name: "r2_div" + data_layout: "model_parallel" + divide {} + } + layer { + name: "r2_one" + data_layout: "model_parallel" + constant { + value: 1 + num_neurons: "1" + } + } layer { - parents: "fc data" - name: "target" - target {} + parents: "r2_one r2_div" + name: "r2" data_layout: "model_parallel" + subtract {} } ################################################### diff --git a/model_zoo/models/char_rnn/char_lstm.prototext b/model_zoo/models/char_rnn/char_lstm.prototext deleted file mode 100644 index a75aad69cb5..00000000000 --- a/model_zoo/models/char_rnn/char_lstm.prototext +++ /dev/null @@ -1,247 +0,0 @@ -model { - name: "recurrent_model" - data_layout: "data_parallel" - mini_batch_size: 256 - block_size: 256 - num_epochs: 20 - num_parallel_readers: 0 - procs_per_model: 0 - recurrent { - unroll_depth : 5 - } - - ################################################### - # Objective function - ################################################### - - objective_function { - mean_squared_error {} - l2_weight_regularization { - scale_factor: 0.0005 - } - } - - ################################################### - # Callbacks - ################################################### - callback { print {} } - callback { timer {} } - - ################################################### - # Layers - ################################################### - - # Data - layer { - name: "data" - children: "lstm1_forgetgate_input eval" - input { - io_buffer: "partitioned" - } - data_layout: "data_parallel" - } - - # lstm1 forget gate - layer { - parents: "data" - name: "lstm1_forgetgate_input" - fully_connected { - num_neurons: 128 - has_bias: true - } - data_layout: "data_parallel" - } - layer { - parents: "lstm1_output" - name: "lstm1_forgetgate_output" - fully_connected { - num_neurons: 128 - has_bias: false - } - data_layout: "data_parallel" - } - layer { - parents: "lstm1_forgetgate_input lstm1_forgetgate_output" - name: "lstm1_forgetgate_sum" - sum {} - data_layout: "data_parallel" - } - layer { - parents: "lstm1_forgetgate_sum" - name: "lstm1_forgetgate" - sigmoid {} - data_layout: "data_parallel" - } - - # lstm1 input gate - layer { - parents: "data" - name: "lstm1_inputgate_input" - fully_connected { - num_neurons: 128 - has_bias: true - } - data_layout: "data_parallel" - } - layer { - parents: "lstm1_output" - name: "lstm1_inputgate_output" - fully_connected { - num_neurons: 128 - has_bias: false - } - data_layout: "data_parallel" - } - layer { - parents: "lstm1_inputgate_input lstm1_inputgate_output" - name: "lstm1_inputgate_sum" - sum {} - data_layout: "data_parallel" - } - layer { - parents: "lstm1_inputgate_sum" - name: "lstm1_inputgate" - sigmoid {} - data_layout: "data_parallel" - } - - # lstm1 output gate - layer { - parents: "data" - name: 
"lstm1_outputgate_input" - fully_connected { - num_neurons: 128 - has_bias: true - } - data_layout: "data_parallel" - } - layer { - parents: "lstm1_output" - name: "lstm1_outputgate_output" - fully_connected { - num_neurons: 128 - has_bias: false - } - data_layout: "data_parallel" - } - layer { - parents: "lstm1_outputgate_input lstm1_outputgate_output" - name: "lstm1_outputgate_sum" - sum {} - data_layout: "data_parallel" - } - layer { - parents: "lstm1_outputgate_sum" - name: "lstm1_outputgate" - sigmoid {} - data_layout: "data_parallel" - } - - # lstm1 cell update - layer { - parents: "data" - name: "lstm1_cellupdate_input" - fully_connected { - num_neurons: 128 - has_bias: true - } - data_layout: "data_parallel" - } - layer { - parents: "lstm1_output" - name: "lstm1_cellupdate_history" - fully_connected { - num_neurons: 128 - has_bias: false - } - data_layout: "data_parallel" - } - layer { - parents: "lstm1_cellupdate_input lstm1_cellupdate_history" - name: "lstm1_cellupdate_sum" - sum {} - data_layout: "data_parallel" - } - layer { - parents: "lstm1_cellupdate_sum" - name: "lstm1_cellupdate_tanh" - tanh {} - data_layout: "data_parallel" - } - - # lstm1 cell state - layer { - parents: "lstm1_forgetgate lstm1_cell" - name: "lstm1_cell_history" - hadamard {} - data_layout: "data_parallel" - } - layer { - parents: "lstm1_inputgate lstm1_cellupdate_tanh" - name: "lstm1_cell_update" - hadamard {} - data_layout: "data_parallel" - } - layer { - parents: "lstm1_cell_history lstm1_cell_update" - name: "lstm1_cell_sum" - sum {} - data_layout: "data_parallel" - } - layer { - parents: "lstm1_cell_sum" - name: "lstm1_cell" - reshape { - num_dims: 1 - dims: "128" - } - data_layout: "data_parallel" - } - - # lstm1 output - layer { - parents: "lstm1_cell" - name: "lstm1_output_tanh" - tanh {} - data_layout: "data_parallel" - } - layer { - parents: "lstm1_outputgate lstm1_output_tanh" - name: "lstm1_output_hadamard" - hadamard {} - data_layout: "data_parallel" - } - layer { - parents: "lstm1_output_hadamard" - name: "lstm1_output" - reshape { - num_dims: 1 - dims: "128" - } - data_layout: "data_parallel" - } - - # prediction - layer { - parents: "lstm1_output" - name: "fc" - fully_connected { - num_neurons: 128 - has_bias: false - } - data_layout: "data_parallel" - } - layer { - parents: "fc" - name: "prob" - softmax {} - data_layout: "data_parallel" - } - layer { - parents: "prob data" - name: "eval" - target {} - data_layout: "data_parallel" - } - -} diff --git a/model_zoo/models/char_rnn/char_rnn.prototext b/model_zoo/models/char_rnn/char_rnn.prototext deleted file mode 100644 index 82b7c989745..00000000000 --- a/model_zoo/models/char_rnn/char_rnn.prototext +++ /dev/null @@ -1,116 +0,0 @@ -model { - name: "recurrent_model" - data_layout: "data_parallel" - mini_batch_size: 256 - block_size: 256 - num_epochs: 20 - num_parallel_readers: 0 - procs_per_model: 0 - recurrent { - unroll_depth : 5 - } - - ################################################### - # Objective function - ################################################### - - objective_function { - mean_squared_error {} - l2_weight_regularization { - scale_factor: 0.0005 - } - } - - ################################################### - # Callbacks - ################################################### - callback { - print { - interval: 1 - } - } - callback { - timer { - } - } - - ################################################### - # Layers - ################################################### - - # Data - layer { - name: "data" - children: 
"rnn1_input eval" - input { - io_buffer: "partitioned" - } - data_layout: "data_parallel" - } - - # rnn1 - layer { - parents: "data" - name: "rnn1_input" - fully_connected { - num_neurons: 256 - has_bias: true - } - data_layout: "model_parallel" - } - layer { - parents: "rnn1" - name: "rnn1_context" - fully_connected { - num_neurons: 256 - has_bias: true - } - data_layout: "model_parallel" - } - layer { - parents: "rnn1_input rnn1_context" - name: "rnn1_sum" - sum {} - data_layout: "model_parallel" - } - layer { - parents: "rnn1_sum" - name: "rnn1_act" - tanh {} - data_layout: "model_parallel" - } - layer { - # This reshape layer is a hack to get around ambiguous neuron dimensions - parents: "rnn1_act" - name: "rnn1" - reshape { - num_dims: 1 - dims: "256" - } - data_layout: "model_parallel" - } - - # Decode - layer { - parents: "rnn1" - name: "decode" - fully_connected { - num_neurons: 128 - has_bias: false - } - data_layout: "model_parallel" - } - layer { - parents: "decode" - name: "prob" - softmax {} - data_layout: "model_parallel" - } - layer { - parents: "prob data" - name: "eval" - target {} - data_layout: "data_parallel" - } - -} diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext index 53aaae51f86..bcada3863b5 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" objective_function { l2_weight_regularization { scale_factor: 0.0001 @@ -31,6 +30,22 @@ model { data_layout: "data_parallel" parents: " " } + layer { + name: "zero" + data_layout: "data_parallel" + constant { + value: 0.0 + num_neurons: "1" + } + } + layer { + name: "one" + data_layout: "data_parallel" + constant { + value: 1.0 + num_neurons: "1" + } + } layer { name: "slice_data" data_layout: "data_parallel" @@ -392,9 +407,8 @@ model { layer { name: "disc1_real_bce" data_layout: "data_parallel" - parents: "disc1fc3_real" - bce_with_logits { - true_label: 1 + parents: "disc1fc3_real one" + sigmoid_binary_cross_entropy { } } layer { @@ -407,9 +421,8 @@ model { layer { name: "disc1_fake_bce" data_layout: "data_parallel" - parents: "disc1fc3_fake" - bce_with_logits { - true_label: 0 + parents: "disc1fc3_fake zero" + sigmoid_binary_cross_entropy { } } layer { @@ -422,9 +435,8 @@ model { layer { name: "disc2_real_bce" data_layout: "data_parallel" - parents: "disc2fc3_real" - bce_with_logits { - true_label: 1 + parents: "disc2fc3_real one" + sigmoid_binary_cross_entropy { } } layer { @@ -437,9 +449,8 @@ model { layer { name: "disc2_fake_bce" data_layout: "data_parallel" - parents: "disc2fc3_fake" - bce_with_logits { - true_label: 0 + parents: "disc2fc3_fake zero" + sigmoid_binary_cross_entropy { } } layer { diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m1_template.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m1_template.prototext index 182adde9c6d..8c1673f8dff 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m1_template.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m1_template.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext index bc41dfec585..d4a68df8d48 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext +++ 
b/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" objective_function { l2_weight_regularization { scale_factor: 0.0001 @@ -17,8 +16,8 @@ model { # layer: "L_cyc_x_eval" #} layer_term { - scale_factor: 0.05 - layer: "l_l2_y_eval" + scale_factor: 0.025 + layer: "l_l2_y" } } num_epochs: 1 @@ -177,12 +176,19 @@ model { weights: "disc1fc3linearity" parents: "disc1relu2_real" } + layer { + name: "one" + data_layout: "data_parallel" + constant { + value: 1.0 + num_neurons: "1" + } + } layer { name: "g_adv1_bce" data_layout: "data_parallel" - parents: "disc1fc3_real" - bce_with_logits { - true_label: 1 + parents: "disc1fc3_real one" + sigmoid_binary_cross_entropy { } } layer { @@ -444,19 +450,12 @@ model { } } layer { - l2_loss { + l2_norm2 { } name: "l_l2_y" data_layout: "data_parallel" parents: "gsample_minus_y" } - layer { - name: "l_l2_y_eval" - data_layout: "data_parallel" - parents: "l_l2_y" - evaluation { - } - } weights { name: "gen1fc1linearity" he_normal_initializer { diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m2_template.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m2_template.prototext index 762ad29b798..590e9d64644 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m2_template.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m2_template.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 @@ -27,8 +26,8 @@ model { # layer: "L_cyc_x_eval" # } layer_term { - scale_factor: 0.05 - layer: "l_l2_y_eval" + scale_factor: 0.025 + layer: "l_l2_y" } l2_weight_regularization { scale_factor: 1e-4 diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext index f18f53ed959..414f9495b2e 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" objective_function { l2_weight_regularization { scale_factor: 0.0001 @@ -17,8 +16,8 @@ model { # layer: "L_cyc_x_eval" #} layer_term { - scale_factor: 0.05 - layer: "l_l2_x_eval" + scale_factor: 0.025 + layer: "l_l2_x" } } num_epochs: 1 @@ -169,12 +168,19 @@ model { weights: "disc2fc3linearity" parents: "disc2relu2_real" } + layer { + name: "one" + data_layout: "data_parallel" + constant { + value: 1.0 + num_neurons: "1" + } + } layer { name: "g_adv2_bce" data_layout: "data_parallel" - parents: "disc2fc3_real" - bce_with_logits { - true_label: 1 + parents: "disc2fc3_real one" + sigmoid_binary_cross_entropy { } } layer { @@ -505,19 +511,12 @@ model { } } layer { - l2_loss { + l2_norm2 { } name: "l_l2_x" data_layout: "data_parallel" parents: "gsample2_minus_x" } - layer { - name: "l_l2_x_eval" - data_layout: "data_parallel" - parents: "l_l2_x" - evaluation { - } - } weights { name: "gen2fc1linearity" he_normal_initializer { diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m3_template.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m3_template.prototext index 00ac1bc3236..eb7a3a698af 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m3_template.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m3_template.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 @@ -27,8 +26,8 @@ model { # layer: "L_cyc_x_eval" #} layer_term { - scale_factor: 0.05 - layer: 
"l_l2_x_eval" + scale_factor: 0.025 + layer: "l_l2_x" } l2_weight_regularization { scale_factor: 1e-4 diff --git a/model_zoo/models/gan/jags/cycle_gan/generate_cycgan_m1.py b/model_zoo/models/gan/jags/cycle_gan/generate_cycgan_m1.py index 966306e5f21..d03c92ebd04 100644 --- a/model_zoo/models/gan/jags/cycle_gan/generate_cycgan_m1.py +++ b/model_zoo/models/gan/jags/cycle_gan/generate_cycgan_m1.py @@ -154,6 +154,14 @@ def configure_model(model): l.children = 'image_data_dummy param_data_id' l.slice.slice_points = str_list(slice_points) + #Useful constants + zero = new_layer(model,'zero','','constant') + zero.constant.value = 0.0 + zero.constant.num_neurons = '1' + one = new_layer(model,'one','','constant') + one.constant.value = 1.0 + one.constant.num_neurons = '1' + #ID Image (Y) data l = new_layer(model,'image_data_dummy','slice_data','identity') @@ -194,20 +202,16 @@ def configure_model(model): D_fake2 = add_discriminator(model,'concat_gsample2_n_img','disc2', False, False, '_fake') #Objective and evaluation layers here - l = new_layer(model, 'disc1_real_bce', D_real, 'bce_with_logits') - l.bce_with_logits.true_label = 1 + l = new_layer(model, 'disc1_real_bce', [D_real, one.name], 'sigmoid_binary_cross_entropy') l = new_layer(model, 'disc1_real_eval','disc1_real_bce', 'evaluation') - l = new_layer(model, 'disc1_fake_bce', D_fake, 'bce_with_logits') - l.bce_with_logits.true_label = 0 + l = new_layer(model, 'disc1_fake_bce', [D_fake, zero.name], 'sigmoid_binary_cross_entropy') l = new_layer(model, 'disc1_fake_eval','disc1_fake_bce', 'evaluation') - l = new_layer(model, 'disc2_real_bce', D_real2, 'bce_with_logits') - l.bce_with_logits.true_label = 1 + l = new_layer(model, 'disc2_real_bce', [D_real2, one.name], 'sigmoid_binary_cross_entropy') l = new_layer(model, 'disc2_real_eval','disc2_real_bce', 'evaluation') - l = new_layer(model, 'disc2_fake_bce', D_fake2, 'bce_with_logits') - l.bce_with_logits.true_label = 0 + l = new_layer(model, 'disc2_fake_bce', [D_fake2, zero.name], 'sigmoid_binary_cross_entropy') l = new_layer(model, 'disc2_fake_eval','disc2_fake_bce', 'evaluation') diff --git a/model_zoo/models/gan/jags/cycle_gan/generate_cycgan_m2.py b/model_zoo/models/gan/jags/cycle_gan/generate_cycgan_m2.py index fa6bc24d693..6c28fe41b41 100644 --- a/model_zoo/models/gan/jags/cycle_gan/generate_cycgan_m2.py +++ b/model_zoo/models/gan/jags/cycle_gan/generate_cycgan_m2.py @@ -177,8 +177,10 @@ def configure_model(model): #freeze discriminator, fake it as real D_real = add_discriminator(model,'concat_gsample_n_param','disc1',True, True, '_real') #objective function - l = new_layer(model, 'g_adv1_bce', D_real, 'bce_with_logits') - l.bce_with_logits.true_label = 1 + one = new_layer(model,'one','','constant') + one.constant.value = 1.0 + one.constant.num_neurons = '1' + l = new_layer(model, 'g_adv1_bce', [D_real, one.name], 'sigmoid_binary_cross_entropy') l = new_layer(model, 'g_adv1_eval','g_adv1_bce', 'evaluation') #************************************************ @@ -208,8 +210,7 @@ def configure_model(model): l = new_layer(model, 'gsample_minus_y', g_sample+' image_data_dummy','weighted_sum') l.weighted_sum.scaling_factors = '1 -1' - l = new_layer(model, 'l_l2_y', 'gsample_minus_y', 'l2_loss') - l = new_layer(model, 'l_l2_y_eval','l_l2_y', 'evaluation') + l = new_layer(model, 'l_l2_y', 'gsample_minus_y', 'l2_norm2') if __name__ == "__main__": diff --git a/model_zoo/models/gan/jags/cycle_gan/generate_cycgan_m3.py b/model_zoo/models/gan/jags/cycle_gan/generate_cycgan_m3.py index 
4f59cbd9c55..361e939edf8 100644 --- a/model_zoo/models/gan/jags/cycle_gan/generate_cycgan_m3.py +++ b/model_zoo/models/gan/jags/cycle_gan/generate_cycgan_m3.py @@ -176,8 +176,10 @@ def configure_model(model): #freeze discriminator, fake it as real D_real = add_discriminator(model,'concat_gsample2_n_img','disc2',True, True, '_real') #objective function - l = new_layer(model, 'g_adv2_bce', D_real, 'bce_with_logits') - l.bce_with_logits.true_label = 1 + one = new_layer(model,'one','','constant') + one.constant.value = 1.0 + one.constant.num_neurons = '1' + l = new_layer(model, 'g_adv2_bce', [D_real, one.name], 'sigmoid_binary_cross_entropy') l = new_layer(model, 'g_adv2_eval','g_adv2_bce', 'evaluation') #************************************************ @@ -209,8 +211,7 @@ def configure_model(model): l = new_layer(model, 'gsample2_minus_x', g_sample2+' param_data_id','weighted_sum') l.weighted_sum.scaling_factors = '1 -1' - l = new_layer(model, 'l_l2_x', 'gsample2_minus_x', 'l2_loss') - l = new_layer(model, 'l_l2_x_eval','l_l2_x', 'evaluation') + l = new_layer(model, 'l_l2_x', 'gsample2_minus_x', 'l2_norm2') if __name__ == "__main__": diff --git a/model_zoo/models/gan/mnist/adversarial_model.prototext b/model_zoo/models/gan/mnist/adversarial_model.prototext index 37267b751d4..6a7d8c8e234 100644 --- a/model_zoo/models/gan/mnist/adversarial_model.prototext +++ b/model_zoo/models/gan/mnist/adversarial_model.prototext @@ -1,6 +1,5 @@ #Adversarial Model model { - name: "directed_acyclic_graph_model" data_layout: "model_parallel" mini_batch_size: 32 block_size: 256 @@ -14,7 +13,7 @@ model { ################################################### objective_function { - binary_cross_entropy {} + layer_term { layer: "binary_cross_entropy" } l2_weight_regularization { scale_factor: 1e-4 } @@ -25,7 +24,11 @@ model { ################################################### metric { - categorical_accuracy {} + layer_metric { + name: "categorical accuracy" + layer: "categorical_accuracy" + unit: "%" + } } ################################################### @@ -48,9 +51,9 @@ model { } callback { save_images { - image_dir: "/usr/workspace/wsa/jacobs32/github.saj.lbann/fcgan_dump_images_32mb/" - layer_names: "fc4_tanh sum" - extension: "png" + image_prefix: "/usr/workspace/wsa/jacobs32/github.saj.lbann/fcgan_dump_images_32mb/" + layers: "fc4_tanh sum" + image_format: "png" } } @@ -61,13 +64,25 @@ model { # INPUT real data layer { - name: "data" - children: "zero_data target" + name: "input" + children: "data label" data_layout: "data_parallel" input { io_buffer: "partitioned" } } + layer { + parents: "input" + name: "data" + data_layout: "data_parallel" + split {} + } + layer { + parents: "input" + name: "label" + data_layout: "data_parallel" + split {} + } #ZERO layer { @@ -109,7 +124,6 @@ model { #weights: "gen_fc_weights" fully_connected { num_neurons: 256 - weight_initialization: "glorot_normal" has_bias: true } } @@ -142,7 +156,6 @@ model { #weights: "gen_fc_weights" fully_connected { num_neurons: 512 - weight_initialization: "glorot_normal" has_bias: true } } @@ -175,7 +188,6 @@ model { #weights: "gen_fc_weights" fully_connected { num_neurons: 1024 - weight_initialization: "glorot_normal" has_bias: true } } @@ -208,7 +220,6 @@ model { #weights: "gen_fc_weights" fully_connected { num_neurons: 784 - weight_initialization: "glorot_normal" has_bias: true } } @@ -270,7 +281,6 @@ model { weights: "dis_flatten_weights" fully_connected { num_neurons: 784 - #weight_initialization: "he_normal" has_bias: true } } @@ -287,7 
+297,6 @@ model { weights: "dis_fc1_weights" fully_connected { num_neurons: 512 - #weight_initialization: "glorot_normal" has_bias: true } } @@ -312,7 +321,6 @@ model { weights: "dis_fc2_weights" fully_connected { num_neurons: 256 - #weight_initialization: "glorot_normal" has_bias: true } } @@ -338,7 +346,6 @@ model { weights: "dis_fc3_weights" fully_connected { num_neurons: 2 - #weight_initialization: "he_normal" has_bias: true } } @@ -360,10 +367,16 @@ model { } layer { - parents: "prob data" - name: "target" + parents: "prob label" + name: "binary_cross_entropy" + data_layout: "data_parallel" + binary_cross_entropy {} + } + layer { + parents: "prob label" + name: "categorical_accuracy" data_layout: "data_parallel" - target {} + categorical_accuracy {} } } diff --git a/model_zoo/models/gan/mnist/discriminator_model.prototext b/model_zoo/models/gan/mnist/discriminator_model.prototext index 9a3111bc2ce..b373f33b785 100644 --- a/model_zoo/models/gan/mnist/discriminator_model.prototext +++ b/model_zoo/models/gan/mnist/discriminator_model.prototext @@ -1,6 +1,5 @@ #Discriminator Model model { - name: "directed_acyclic_graph_model" data_layout: "model_parallel" mini_batch_size: 32 block_size: 256 @@ -14,7 +13,7 @@ model { ################################################### objective_function { - binary_cross_entropy {} + layer_term { layer: "binary_cross_entropy" } l2_weight_regularization { scale_factor: 1e-4 } @@ -25,7 +24,11 @@ model { ################################################### metric { - categorical_accuracy {} + layer_metric { + name: "categorical accuracy" + layer: "categorical_accuracy" + unit: "%" + } } ################################################### @@ -54,14 +57,25 @@ model { # INPUT real data layer { - name: "data" - children: "zero_data target" + name: "input" + children: "data label" data_layout: "data_parallel" input { io_buffer: "partitioned" } } - + layer { + parents: "input" + name: "data" + data_layout: "data_parallel" + split {} + } + layer { + parents: "input" + name: "label" + data_layout: "data_parallel" + split {} + } #ZERO layer { @@ -102,7 +116,6 @@ model { weights: "gen_fc1_weights" fully_connected { num_neurons: 256 - # weight_initialization: "glorot_normal" has_bias: true } } @@ -141,7 +154,6 @@ model { weights: "gen_fc2_weights" fully_connected { num_neurons: 512 - #weight_initialization: "glorot_normal" has_bias: true } } @@ -180,7 +192,6 @@ model { weights: "gen_fc3_weights" fully_connected { num_neurons: 1024 - # weight_initialization: "glorot_normal" has_bias: true } } @@ -219,7 +230,6 @@ model { weights: "gen_fc4_weights" fully_connected { num_neurons: 784 - #weight_initialization: "glorot_normal" has_bias: true } } @@ -275,7 +285,6 @@ model { data_layout: "data_parallel" fully_connected { num_neurons: 784 - weight_initialization: "he_normal" has_bias: true } } @@ -287,7 +296,6 @@ model { #weights: "gen_fc_weights" fully_connected { num_neurons: 512 - weight_initialization: "glorot_normal" has_bias: true } } @@ -307,7 +315,6 @@ model { #weights: "gen_fc_weights" fully_connected { num_neurons: 256 - weight_initialization: "glorot_normal" has_bias: true } } @@ -327,7 +334,6 @@ model { data_layout: "data_parallel" fully_connected { num_neurons: 2 - weight_initialization: "he_normal" has_bias: true } } @@ -349,10 +355,16 @@ model { } layer { - parents: "prob data" - name: "target" + parents: "prob label" + name: "binary_cross_entropy" + data_layout: "data_parallel" + binary_cross_entropy {} + } + layer { + parents: "prob label" + name: 
"categorical_accuracy" data_layout: "data_parallel" - target {} + categorical_accuracy {} } } diff --git a/model_zoo/models/greedy_layerwise_autoencoder_mnist/model_greedy_layerwise_autoencoder_mnist.prototext b/model_zoo/models/greedy_layerwise_autoencoder_mnist/model_greedy_layerwise_autoencoder_mnist.prototext deleted file mode 100644 index 51e1664204d..00000000000 --- a/model_zoo/models/greedy_layerwise_autoencoder_mnist/model_greedy_layerwise_autoencoder_mnist.prototext +++ /dev/null @@ -1,125 +0,0 @@ -model { - name: "greedy_layerwise_autoencoder" - data_layout: "model_parallel" - mini_batch_size: 192 - block_size: 256 - num_epochs: 10 - num_parallel_readers: 0 - procs_per_model: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - mean_squared_error {} - } - - ################################################### - # Metrics - ################################################### - - metric { mean_squared_error {} } - - ################################################### - # Callbacks - ################################################### - callback { - print { - interval: 1 - } - } -# callback { -# timer { -# } -# } -# callback { -# save_images { -# image_dir: "images_" -# extension: "pgm" -# } -# } - - ################################################### - # start of layers - ################################################### - - # data - layer { - name: "data" - children: "fc1 reconstruction" - data_layout: "model_parallel" - input { - io_buffer: "distributed" - target_mode: "reconstruction" - } - } - - # encode1 - layer { - name: "fc1: - data_layout: "model_parallel" - fully_connected { - num_neurons: 100 - has_bias: true - } - } - layer { - data_layout: "model_parallel" - sigmoid {} - } - - # encode2 - layer { - data_layout: "model_parallel" - fully_connected { - num_neurons: 50 - has_bias: true - } - } - layer { - data_layout: "model_parallel" - sigmoid {} - } - - # decode2 - layer { - data_layout: "model_parallel" - fully_connected { - num_neurons: 100 - has_bias: true - } - } - layer { - data_layout: "model_parallel" - sigmoid {} - } - - # decode1 - layer { - data_layout: "model_parallel" - fully_connected { - num_neurons: 784 - has_bias: true - } - } - layer { - name: "sigmoid" - data_layout: "model_parallel" - sigmoid {} - } - - # reconstruction - layer { - name: "reconstruction" - parents: "sigmoid data" - data_layout: "model_parallel" - reconstruction { - original_layer: "data" - } - } - - ################################################### - # end of layers - ################################################### -} diff --git a/model_zoo/models/greedy_layerwise_autoencoder_mnist/runme.py b/model_zoo/models/greedy_layerwise_autoencoder_mnist/runme.py deleted file mode 100755 index 41e35338f4a..00000000000 --- a/model_zoo/models/greedy_layerwise_autoencoder_mnist/runme.py +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/python - -import sys -sys.path.insert(0, '../python') -import common - -common.build_and_submit_slurm_script( - 'model_greedy_layerwise_autoencoder_mnist.prototext', - '../../data_readers/data_reader_mnist.prototext', - '../../optimizers/opt_adagrad.prototext' ) diff --git a/model_zoo/models/imagenet/model_imagenet.prototext b/model_zoo/models/imagenet/model_imagenet.prototext index 4fb0c840bf2..73f851bf4a5 100644 --- a/model_zoo/models/imagenet/model_imagenet.prototext +++ b/model_zoo/models/imagenet/model_imagenet.prototext @@ -1,5 +1,4 @@ model { - name: 
"sequential_model" num_epochs: 20 data_layout: "data_parallel" @@ -8,7 +7,7 @@ model { ################################################### objective_function { - cross_entropy {} + layer_term { layer: "cross_entropy" } l2_weight_regularization { scale_factor: 0.0005 } @@ -18,21 +17,41 @@ model { # Metrics ################################################### - metric { categorical_accuracy {} } metric { - top_k_categorical_accuracy { - top_k: 5 + layer_metric { + name: "categorical accuracy" + layer: "top1_accuracy" + unit: "%" + } + } + metric { + layer_metric { + name: "top-5 categorical accuracy" + layer: "top5_accuracy" + unit: "%" } } + layer { + name: "1" + children: "1a 1b" + data_layout: "data_parallel" input { io_buffer: "partitioned" } - name: "1" + } + layer { + name: "1a" parents: "1" - children: "2 7" data_layout: "data_parallel" + split {} + } + layer { + name: "1b" + parents: "1" + data_layout: "data_parallel" + split {} } ############################################# layer { @@ -41,7 +60,7 @@ model { has_bias: true } name: "2" - parents: "1" + parents: "1a" children: "" data_layout: "model_parallel" } @@ -76,17 +95,28 @@ model { name: "6" parents: "5" children: "" - data_layout: "model_parallel" + data_layout: "data_parallel" softmax { } } ############################################# layer { - name: "7" - parents: "6 1" - children: "" + name: "cross_entropy" + parents: "6 1b" + data_layout: "data_parallel" + cross_entropy {} + } + layer { + name: "top1_accuracy" + parents: "6 1b" + data_layout: "data_parallel" + categorical_accuracy {} + } + layer { + name: "top5_accuracy" + parents: "6 1b" data_layout: "data_parallel" - target {} + top_k_categorical_accuracy { k: 5 } } ############################################# mini_batch_size: 256 diff --git a/model_zoo/models/jag/ae_cycle_gan/cycgan_m1.prototext b/model_zoo/models/jag/ae_cycle_gan/cycgan_m1.prototext index 711501845c2..4b951148694 100644 --- a/model_zoo/models/jag/ae_cycle_gan/cycgan_m1.prototext +++ b/model_zoo/models/jag/ae_cycle_gan/cycgan_m1.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" objective_function { l2_weight_regularization { scale_factor: 0.0001 @@ -33,6 +32,22 @@ model { data_layout: "data_parallel" parents: " " } + layer { + name: "zero" + data_layout: "model_parallel" + constant { + value: 0.0 + num_neurons: "1" + } + } + layer { + name: "one" + data_layout: "model_parallel" + constant { + value: 1.0 + num_neurons: "1" + } + } layer { name: "slice_data" data_layout: "model_parallel" @@ -70,7 +85,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -98,7 +112,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -125,7 +138,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -155,7 +167,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons:20 - weight_initialization: "glorot_normal" has_bias: true } } @@ -166,7 +177,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons:20 - weight_initialization: "glorot_normal" has_bias: true } } @@ -187,15 +197,13 @@ model { parents: "z_mean" name: "kl_z_mean2" data_layout: "model_parallel" - power { - exponent: 2.0 - } + square {} } layer { parents: "z_log_sigma" name: "kl_exp" data_layout: "model_parallel" - exponential {} + exp {} } layer { parents: 
"kl_one z_log_sigma kl_z_mean2 kl_exp" @@ -236,7 +244,7 @@ model { parents: "sample_half" name: "sample_exp" data_layout: "model_parallel" - exponential {} + exp {} } layer { name: "sample_noise" @@ -613,10 +621,8 @@ model { layer { name: "disc1_real_bce" data_layout: "model_parallel" - parents: "disc1fc3_real" - device_allocation: "cpu" - bce_with_logits { - true_label: 1 + parents: "disc1fc3_real one" + sigmoid_binary_cross_entropy { } } layer { @@ -629,10 +635,8 @@ model { layer { name: "disc1_fake_bce" data_layout: "model_parallel" - parents: "disc1fc3_fake" - device_allocation: "cpu" - bce_with_logits { - true_label: 0 + parents: "disc1fc3_fake zero" + sigmoid_binary_cross_entropy { } } layer { @@ -645,10 +649,8 @@ model { layer { name: "disc2_real_bce" data_layout: "model_parallel" - parents: "disc2fc3_real" - device_allocation: "cpu" - bce_with_logits { - true_label: 1 + parents: "disc2fc3_real one" + sigmoid_binary_cross_entropy { } } layer { @@ -661,10 +663,8 @@ model { layer { name: "disc2_fake_bce" data_layout: "model_parallel" - parents: "disc2fc3_fake" - device_allocation: "cpu" - bce_with_logits { - true_label: 0 + parents: "disc2fc3_fake zero" + sigmoid_binary_cross_entropy { } } layer { diff --git a/model_zoo/models/jag/ae_cycle_gan/cycgan_m2.prototext b/model_zoo/models/jag/ae_cycle_gan/cycgan_m2.prototext index cf919e68e78..9b640a2acb2 100644 --- a/model_zoo/models/jag/ae_cycle_gan/cycgan_m2.prototext +++ b/model_zoo/models/jag/ae_cycle_gan/cycgan_m2.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" objective_function { l2_weight_regularization { scale_factor: 0.0001 @@ -9,8 +8,8 @@ model { layer: "g_adv1_eval" } layer_term { - scale_factor: 0.05 - layer: "l_l2_y_eval" + scale_factor: 0.025 + layer: "l_l2_y" } } num_epochs: 1 @@ -61,7 +60,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -88,7 +86,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -115,7 +112,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -145,7 +141,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons:20 - weight_initialization: "glorot_normal" has_bias: true } } @@ -156,7 +151,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons:20 - weight_initialization: "glorot_normal" has_bias: true } } @@ -177,15 +171,13 @@ model { parents: "z_mean" name: "kl_z_mean2" data_layout: "model_parallel" - power { - exponent: 2.0 - } + square {} } layer { parents: "z_log_sigma" name: "kl_exp" data_layout: "model_parallel" - exponential {} + exp {} } layer { parents: "kl_one z_log_sigma kl_z_mean2 kl_exp" @@ -226,7 +218,7 @@ model { parents: "sample_half" name: "sample_exp" data_layout: "model_parallel" - exponential {} + exp {} } layer { name: "sample_noise" @@ -378,13 +370,19 @@ model { weights: "disc1fc3linearity" parents: "disc1relu2_real" } + layer { + name: "one" + data_layout: "model_parallel" + constant { + value: 1.0 + num_neurons: "1" + } + } layer { name: "g_adv1_bce" data_layout: "model_parallel" - device_allocation: "cpu" - parents: "disc1fc3_real" - bce_with_logits { - true_label: 1 + parents: "disc1fc3_real one" + sigmoid_binary_cross_entropy { } } layer { @@ -650,20 +648,13 @@ model { } } layer { - l2_loss { + l2_norm2 { } name: "l_l2_y" device_allocation: "cpu" data_layout: 
"model_parallel" parents: "gsample_minus_y" } - layer { - name: "l_l2_y_eval" - data_layout: "model_parallel" - parents: "l_l2_y" - evaluation { - } - } weights { name: "gen1fc1linearity" he_normal_initializer { diff --git a/model_zoo/models/jag/ae_cycle_gan/cycgan_m3.prototext b/model_zoo/models/jag/ae_cycle_gan/cycgan_m3.prototext index 8060319c5e5..bbc627758ba 100644 --- a/model_zoo/models/jag/ae_cycle_gan/cycgan_m3.prototext +++ b/model_zoo/models/jag/ae_cycle_gan/cycgan_m3.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" objective_function { l2_weight_regularization { scale_factor: 0.0001 @@ -9,8 +8,8 @@ model { layer: "g_adv2_eval" } layer_term { - scale_factor: 0.05 - layer: "l_l2_x_eval" + scale_factor: 0.025 + layer: "l_l2_x" } } num_epochs: 1 @@ -61,7 +60,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -88,7 +86,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -115,7 +112,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -145,7 +141,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons:20 - weight_initialization: "glorot_normal" has_bias: true } } @@ -156,7 +151,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons:20 - weight_initialization: "glorot_normal" has_bias: true } } @@ -177,15 +171,13 @@ model { parents: "z_mean" name: "kl_z_mean2" data_layout: "model_parallel" - power { - exponent: 2.0 - } + square {} } layer { parents: "z_log_sigma" name: "kl_exp" data_layout: "model_parallel" - exponential {} + exp {} } layer { parents: "kl_one z_log_sigma kl_z_mean2 kl_exp" @@ -226,7 +218,7 @@ model { parents: "sample_half" name: "sample_exp" data_layout: "model_parallel" - exponential {} + exp {} } layer { name: "sample_noise" @@ -367,13 +359,19 @@ model { weights: "disc2fc3linearity" parents: "disc2relu2_real" } + layer { + name: "one" + data_layout: "model_parallel" + constant { + value: 1.0 + num_neurons: "1" + } + } layer { name: "g_adv2_bce" data_layout: "model_parallel" - device_allocation: "cpu" - parents: "disc2fc3_real" - bce_with_logits { - true_label: 1 + parents: "disc2fc3_real one" + sigmoid_binary_cross_entropy { } } layer { @@ -710,20 +708,13 @@ model { } } layer { - l2_loss { + l2_norm2 { } name: "l_l2_x" data_layout: "model_parallel" device_allocation: "cpu" parents: "gsample2_minus_x" } - layer { - name: "l_l2_x_eval" - data_layout: "model_parallel" - parents: "l_l2_x" - evaluation { - } - } weights { name: "gen2fc1linearity" he_normal_initializer { diff --git a/model_zoo/models/jag/ae_cycle_gan/vae1.prototext b/model_zoo/models/jag/ae_cycle_gan/vae1.prototext index 7af600bf172..d3f107edf7b 100644 --- a/model_zoo/models/jag/ae_cycle_gan/vae1.prototext +++ b/model_zoo/models/jag/ae_cycle_gan/vae1.prototext @@ -2,7 +2,6 @@ #https://lc.llnl.gov/bitbucket/users/jjayaram/repos/deep-latent-spaces/browse/codes/dev/VAE-FCN/run_vae.py #Timestamp 02/26/2018 8:45AM model { - name: "directed_acyclic_graph_model" data_layout: "model_parallel" #mini_batch_size: 128 mini_batch_size: 100 #more last minibatch images to save @@ -16,12 +15,8 @@ model { ################################################### objective_function { - binary_cross_entropy {} - #mean_squared_error {} - layer_term { - scale_factor: 1.0 - layer: "kl_divergence" - } + layer_term { layer: 
"binary_cross_entropy" } + layer_term { layer: "kl_divergence" } l2_weight_regularization { scale_factor: 1e-4 } @@ -31,7 +26,12 @@ model { # Metrics ################################################### - metric { mean_squared_error {} } + metric { + layer_metric { + name: "mean squared error" + layer: "mean_squared_error" + } + } ################################################### # Callbacks @@ -51,8 +51,8 @@ model { #} #callback { # save_images { - # image_dir: "vae_fcn_images_" - # extension: "jpg" + # image_prefix: "vae_fcn_images_" + # image_format: "jpg" # } #} @@ -78,9 +78,21 @@ model { io_buffer: "partitioned" target_mode: "N/A" } + name: "input" + data_layout: "data_parallel" + children: "data dummy" + } + layer { + parents: "input" name: "data" data_layout: "data_parallel" - parents: " " + split {} + } + layer { + parents: "input" + name: "dummy" + data_layout: "data_parallel" + dummy {} } layer { name: "slice_data" @@ -117,7 +129,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -144,7 +155,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -170,7 +180,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -199,7 +208,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons:20 - weight_initialization: "glorot_normal" has_bias: true } } @@ -209,7 +217,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons:20 - weight_initialization: "glorot_normal" has_bias: true } } @@ -231,16 +238,14 @@ model { name: "kl_z_mean2" device_allocation: "cpu" data_layout: "model_parallel" - power { - exponent: 2.0 - } + square {} } layer { parents: "z_log_sigma" name: "kl_exp" data_layout: "model_parallel" device_allocation: "cpu" - exponential {} + exp {} } layer { parents: "kl_one z_log_sigma kl_z_mean2 kl_exp" @@ -282,7 +287,7 @@ model { name: "sample_exp" data_layout: "model_parallel" device_allocation: "cpu" - exponential {} + exp {} } layer { name: "sample_noise" @@ -317,7 +322,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -343,7 +347,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -369,7 +372,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -397,7 +399,6 @@ model { data_layout: "model_parallel" #num_neurons_from_data_reader: true fully_connected { - weight_initialization: "glorot_normal" num_neurons: 16384 has_bias: true } @@ -414,11 +415,22 @@ model { ###################### layer { - #parents: "sigmoid data" - parents: "sigmoid image_data_dummy" + parents: "sigmoid" name: "reconstruction" data_layout: "model_parallel" - reconstruction {} + split {} + } + layer { + parents: "reconstruction image_data_dummy" + name: "binary_cross_entropy" + data_layout: "model_parallel" + binary_cross_entropy {} + } + layer { + parents: "reconstruction image_data_dummy" + name: "mean_squared_error" + data_layout: "model_parallel" + mean_squared_error {} } ################################################### diff --git a/model_zoo/models/jag/ae_cycle_gan/vae_cyc.prototext b/model_zoo/models/jag/ae_cycle_gan/vae_cyc.prototext index 05ba774c633..024c3e318e1 100644 --- 
a/model_zoo/models/jag/ae_cycle_gan/vae_cyc.prototext +++ b/model_zoo/models/jag/ae_cycle_gan/vae_cyc.prototext @@ -2,7 +2,6 @@ #https://lc.llnl.gov/bitbucket/users/jjayaram/repos/deep-latent-spaces/browse/codes/dev/VAE-FCN/run_vae.py #Timestamp 02/26/2018 8:45AM model { - name: "directed_acyclic_graph_model" data_layout: "model_parallel" #mini_batch_size: 128 mini_batch_size: 100 #more last minibatch images to save @@ -16,12 +15,8 @@ model { ################################################### objective_function { - binary_cross_entropy {} - #mean_squared_error {} - layer_term { - scale_factor: 1.0 - layer: "kl_divergence" - } + layer_term { layer: "binary_cross_entropy" } + layer_term { layer: "kl_divergence" } l2_weight_regularization { scale_factor: 1e-4 } @@ -31,7 +26,12 @@ model { # Metrics ################################################### - metric { mean_squared_error {} } + metric { + layer_metric { + name: "mean squared error" + layer: "mean_squared_error" + } + } ################################################### # Callbacks @@ -44,8 +44,8 @@ model { callback { timer {} } # callback { # save_images { - # image_dir: "vae_fcn_images_" - # extension: "jpg" + # image_prefix: "vae_fcn_images_" + # image_format: "jpg" # } # } @@ -62,9 +62,21 @@ model { io_buffer: "partitioned" target_mode: "N/A" } + name: "input" + data_layout: "data_parallel" + children: "data dummy" + } + layer { + parents: "input" name: "data" data_layout: "data_parallel" - parents: " " + split {} + } + layer { + parents: "input" + name: "dummy" + data_layout: "data_parallel" + dummy {} } layer { name: "slice_data" @@ -193,7 +205,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -220,7 +231,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -246,36 +256,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "encode3" - name: "encode3_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "encode3_tanh" - name: "encode3_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - ###################### - # Latent space - ###################### - - layer { - parents: "encode3_dropout" - name: "z_mean" - data_layout: "model_parallel" - fully_connected { - num_neurons:20 - weight_initialization: "glorot_normal" has_bias: true } } @@ -285,7 +265,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons:20 - weight_initialization: "glorot_normal" has_bias: true } } @@ -307,16 +286,14 @@ model { name: "kl_z_mean2" data_layout: "model_parallel" device_allocation: "cpu" - power { - exponent: 2.0 - } + square {} } layer { parents: "z_log_sigma" name: "kl_exp" data_layout: "model_parallel" device_allocation: "cpu" - exponential {} + exp {} } layer { parents: "kl_one z_log_sigma kl_z_mean2 kl_exp" @@ -358,7 +335,7 @@ model { name: "sample_exp" data_layout: "model_parallel" device_allocation: "cpu" - exponential {} + exp {} } layer { name: "sample_noise" @@ -395,7 +372,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -421,7 +397,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -447,7 +422,6 @@ model { data_layout: 
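Both VAE models replace the monolithic reconstruction {} layer with a split {} whose output feeds explicit binary_cross_entropy and mean_squared_error layers against the image data; the first becomes an objective term, the second a metric. A numpy sketch of the two quantities, up to LBANN's reduction conventions, which the prototext does not spell out:

import numpy as np

def binary_cross_entropy(recon, target, eps=1e-7):
    r = np.clip(recon, eps, 1.0 - eps)
    return float(-np.mean(target * np.log(r) + (1.0 - target) * np.log(1.0 - r)))

def mean_squared_error(recon, target):
    return float(np.mean(np.square(recon - target)))

recon = np.array([0.9, 0.1, 0.8])          # "reconstruction" (sigmoid output)
image = np.array([1.0, 0.0, 1.0])          # "image_data_dummy" / "image_data_id"
bce = binary_cross_entropy(recon, image)   # objective layer_term
mse = mean_squared_error(recon, image)     # layer_metric "mean squared error"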
"model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -475,7 +449,6 @@ model { data_layout: "model_parallel" #num_neurons_from_data_reader: true fully_connected { - weight_initialization: "glorot_normal" num_neurons: 16384 has_bias: true } @@ -492,12 +465,22 @@ model { ###################### layer { - #parents: "sigmoid data" - #parents: "sigmoid gen1fc4_1" - parents: "sigmoid image_data_id" + parents: "sigmoid" name: "reconstruction" data_layout: "model_parallel" - reconstruction {} + split {} + } + layer { + parents: "reconstruction image_data_id" + name: "binary_cross_entropy" + data_layout: "model_parallel" + binary_cross_entropy {} + } + layer { + parents: "reconstruction image_data_id" + name: "mean_squared_error" + data_layout: "model_parallel" + mean_squared_error {} } ################################################### diff --git a/model_zoo/models/jag/cycle_gan/cycgan_m1.prototext b/model_zoo/models/jag/cycle_gan/cycgan_m1.prototext index 529a4fc706e..7c0fb4c7ef4 100644 --- a/model_zoo/models/jag/cycle_gan/cycgan_m1.prototext +++ b/model_zoo/models/jag/cycle_gan/cycgan_m1.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" objective_function { l2_weight_regularization { scale_factor: 0.0001 @@ -32,13 +31,30 @@ model { data_layout: "data_parallel" parents: " " } + layer { + name: "zero" + data_layout: "data_parallel" + constant { + value: 0.0 + num_neurons: "1" + } + } + layer { + name: "one" + data_layout: "data_parallel" + constant { + value: 1.0 + num_neurons: "1" + } + } layer { name: "slice_data" data_layout: "data_parallel" parents: "data" children: "image_data_dummy param_data_id" slice { - slice_points: "0 2500 2511" + # slice_points: "0 49174 49179" + get_slice_points_from_reader: "independent" } } layer { @@ -120,7 +136,9 @@ model { layer { freeze: true fully_connected { - num_neurons: 2500 + # num_neurons: 49174 + get_slice_points_from_reader: "independent" + get_num_neurons_of_slice_from_reader: [ 1 ] has_bias: true } name: "gen1fc4" @@ -185,7 +203,9 @@ model { layer { freeze: true fully_connected { - num_neurons: 11 + # num_neurons: 5 + get_slice_points_from_reader: "independent" + get_num_neurons_of_slice_from_reader: [ 2 ] has_bias: true } name: "gen2fc4" @@ -393,10 +413,8 @@ model { layer { name: "disc1_real_bce" data_layout: "data_parallel" - parents: "disc1fc3_real" - device_allocation: "cpu" - bce_with_logits { - true_label: 1 + parents: "disc1fc3_real one" + sigmoid_binary_cross_entropy { } } layer { @@ -409,10 +427,8 @@ model { layer { name: "disc1_fake_bce" data_layout: "data_parallel" - parents: "disc1fc3_fake" - device_allocation: "cpu" - bce_with_logits { - true_label: 0 + parents: "disc1fc3_fake zero" + sigmoid_binary_cross_entropy { } } layer { @@ -425,10 +441,8 @@ model { layer { name: "disc2_real_bce" data_layout: "data_parallel" - parents: "disc2fc3_real" - device_allocation: "cpu" - bce_with_logits { - true_label: 1 + parents: "disc2fc3_real one" + sigmoid_binary_cross_entropy { } } layer { @@ -441,10 +455,8 @@ model { layer { name: "disc2_fake_bce" data_layout: "data_parallel" - parents: "disc2fc3_fake" - device_allocation: "cpu" - bce_with_logits { - true_label: 0 + parents: "disc2fc3_fake zero" + sigmoid_binary_cross_entropy { } } layer { diff --git a/model_zoo/models/jag/cycle_gan/cycgan_m1_template.prototext b/model_zoo/models/jag/cycle_gan/cycgan_m1_template.prototext index 182adde9c6d..8c1673f8dff 100644 --- 
a/model_zoo/models/jag/cycle_gan/cycgan_m1_template.prototext +++ b/model_zoo/models/jag/cycle_gan/cycgan_m1_template.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 diff --git a/model_zoo/models/jag/cycle_gan/cycgan_m2.prototext b/model_zoo/models/jag/cycle_gan/cycgan_m2.prototext index 0d5b1f1e433..e6723e92568 100644 --- a/model_zoo/models/jag/cycle_gan/cycgan_m2.prototext +++ b/model_zoo/models/jag/cycle_gan/cycgan_m2.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" objective_function { l2_weight_regularization { scale_factor: 0.0001 @@ -9,8 +8,8 @@ model { layer: "g_adv1_eval" } layer_term { - scale_factor: 0.05 - layer: "l_l2_y_eval" + scale_factor: 0.025 + layer: "l_l2_y" } } num_epochs: 1 @@ -30,7 +29,8 @@ model { parents: "data" children: "image_data_dummy param_data_id" slice { - slice_points: "0 2500 2511" + # slice_points: "0 49174 49179" + get_slice_points_from_reader: "independent" } } layer { @@ -108,7 +108,9 @@ model { } layer { fully_connected { - num_neurons: 2500 + # num_neurons: 49174 + get_slice_points_from_reader: "independent" + get_num_neurons_of_slice_from_reader: [ 1 ] has_bias: true } name: "gen1fc4_1" @@ -170,13 +172,19 @@ model { weights: "disc1fc3linearity" parents: "disc1relu2_real" } + layer { + name: "one" + data_layout: "data_parallel" + constant { + value: 1.0 + num_neurons: "1" + } + } layer { name: "g_adv1_bce" data_layout: "data_parallel" - device_allocation: "cpu" - parents: "disc1fc3_real" - bce_with_logits { - true_label: 1 + parents: "disc1fc3_real one" + sigmoid_binary_cross_entropy { } } layer { @@ -243,7 +251,9 @@ model { layer { freeze: true fully_connected { - num_neurons: 11 + # num_neurons: 5 + get_slice_points_from_reader: "independent" + get_num_neurons_of_slice_from_reader: [ 2 ] has_bias: true } name: "gen2fc4_y" @@ -312,7 +322,9 @@ model { } layer { fully_connected { - num_neurons: 2500 + # num_neurons: 49174 + get_slice_points_from_reader: "independent" + get_num_neurons_of_slice_from_reader: [ 1 ] has_bias: true } name: "gen1fc4_2" @@ -400,7 +412,9 @@ model { layer { freeze: true fully_connected { - num_neurons: 11 + # num_neurons: 5 + get_slice_points_from_reader: "independent" + get_num_neurons_of_slice_from_reader: [ 2 ] has_bias: true } name: "gen2fc4_gsample" @@ -440,20 +454,13 @@ model { } } layer { - l2_loss { + l2_norm2 { } name: "l_l2_y" device_allocation: "cpu" data_layout: "data_parallel" parents: "gsample_minus_y" } - layer { - name: "l_l2_y_eval" - data_layout: "data_parallel" - parents: "l_l2_y" - evaluation { - } - } weights { name: "gen1fc1linearity" he_normal_initializer { diff --git a/model_zoo/models/jag/cycle_gan/cycgan_m2_template.prototext b/model_zoo/models/jag/cycle_gan/cycgan_m2_template.prototext index aaf337e9118..23b504800cd 100644 --- a/model_zoo/models/jag/cycle_gan/cycgan_m2_template.prototext +++ b/model_zoo/models/jag/cycle_gan/cycgan_m2_template.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 @@ -19,8 +18,8 @@ model { layer: "g_adv1_eval" } layer_term { - scale_factor: 0.05 - layer: "l_l2_y_eval" + scale_factor: 0.025 + layer: "l_l2_y" } l2_weight_regularization { scale_factor: 1e-4 diff --git a/model_zoo/models/jag/cycle_gan/cycgan_m3.prototext b/model_zoo/models/jag/cycle_gan/cycgan_m3.prototext index c905ffbaf40..c50df8e7b4c 100644 --- a/model_zoo/models/jag/cycle_gan/cycgan_m3.prototext +++ 
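With l2_loss renamed to l2_norm2 (the squared L2 norm) and the intermediate evaluation layer dropped, the generator objective above scales the cycle-consistency term by 0.025 instead of 0.05; whether that halving compensates for a factor inside the new layer or is simply retuning is not stated in the diff. A sketch of how the prototext composes the total, with stand-in numbers:

import numpy as np

def l2_norm2(v):
    # squared L2 norm, as the "l2_norm2" layer applied to gsample - y
    return float(np.dot(v, v))

g_adv1_bce = 0.7                                 # adversarial term value
gsample_minus_y = np.array([0.1, -0.2, 0.05])    # weighted_sum, factors "1 -1"
total = 1.0 * g_adv1_bce + 0.025 * l2_norm2(gsample_minus_y)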
b/model_zoo/models/jag/cycle_gan/cycgan_m3.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" objective_function { l2_weight_regularization { scale_factor: 0.0001 @@ -9,8 +8,8 @@ model { layer: "g_adv2_eval" } layer_term { - scale_factor: 0.05 - layer: "l_l2_x_eval" + scale_factor: 0.025 + layer: "l_l2_x" } } num_epochs: 1 @@ -30,7 +29,8 @@ model { parents: "data" children: "image_data_dummy param_data_id" slice { - slice_points: "0 2500 2511" + # slice_points: "0 49174 49179" + get_slice_points_from_reader: "independent" } } layer { @@ -100,7 +100,9 @@ model { } layer { fully_connected { - num_neurons: 11 + # num_neurons: 5 + get_slice_points_from_reader: "independent" + get_num_neurons_of_slice_from_reader: [ 2 ] has_bias: true } name: "gen2fc4_1" @@ -162,13 +164,19 @@ model { weights: "disc2fc3linearity" parents: "disc2relu2_real" } + layer { + name: "one" + data_layout: "data_parallel" + constant { + value: 1.0 + num_neurons: "1" + } + } layer { name: "g_adv2_bce" data_layout: "data_parallel" - device_allocation: "cpu" - parents: "disc2fc3_real" - bce_with_logits { - true_label: 1 + parents: "disc2fc3_real one" + sigmoid_binary_cross_entropy { } } layer { @@ -231,7 +239,9 @@ model { } layer { fully_connected { - num_neurons: 11 + # num_neurons: 5 + get_slice_points_from_reader: "independent" + get_num_neurons_of_slice_from_reader: [ 2 ] has_bias: true } name: "gen2fc4_y" @@ -304,7 +314,9 @@ model { layer { freeze: true fully_connected { - num_neurons: 2500 + # num_neurons: 49174 + get_slice_points_from_reader: "independent" + get_num_neurons_of_slice_from_reader: [ 1 ] has_bias: true } name: "gen1fc4_2" @@ -400,7 +412,9 @@ model { layer { freeze: true fully_connected { - num_neurons: 2500 + # num_neurons: 49174 + get_slice_points_from_reader: "independent" + get_num_neurons_of_slice_from_reader: [ 1 ] has_bias: true } name: "gen1fc4_1" @@ -461,7 +475,9 @@ model { } layer { fully_connected { - num_neurons: 11 + # num_neurons: 5 + get_slice_points_from_reader: "independent" + get_num_neurons_of_slice_from_reader: [ 2 ] has_bias: true } name: "gen2fc4_gsample" @@ -501,20 +517,13 @@ model { } } layer { - l2_loss { + l2_norm2 { } name: "l_l2_x" data_layout: "data_parallel" device_allocation: "cpu" parents: "gsample2_minus_x" } - layer { - name: "l_l2_x_eval" - data_layout: "data_parallel" - parents: "l_l2_x" - evaluation { - } - } weights { name: "gen2fc1linearity" he_normal_initializer { diff --git a/model_zoo/models/jag/cycle_gan/cycgan_m3_template.prototext b/model_zoo/models/jag/cycle_gan/cycgan_m3_template.prototext index 43d7fbca7cf..f5d6baf7cde 100644 --- a/model_zoo/models/jag/cycle_gan/cycgan_m3_template.prototext +++ b/model_zoo/models/jag/cycle_gan/cycgan_m3_template.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 @@ -19,8 +18,8 @@ model { layer: "g_adv2_eval" } layer_term { - scale_factor: 0.05 - layer: "l_l2_x_eval" + scale_factor: 0.025 + layer: "l_l2_x" } l2_weight_regularization { scale_factor: 1e-4 diff --git a/model_zoo/models/jag/cycle_gan/generate_cycgan_m1.py b/model_zoo/models/jag/cycle_gan/generate_cycgan_m1.py index d19b8d45ce3..2abc985a995 100644 --- a/model_zoo/models/jag/cycle_gan/generate_cycgan_m1.py +++ b/model_zoo/models/jag/cycle_gan/generate_cycgan_m1.py @@ -155,6 +155,14 @@ def configure_model(model): l.children = 'image_data_dummy param_data_id' l.slice.slice_points = str_list(slice_points) + #Useful constants + zero = 
new_layer(model,'zero','','constant') + zero.constant.value = 0.0 + zero.constant.num_neurons = '1' + one = new_layer(model,'one','','constant') + one.constant.value = 1.0 + one.constant.num_neurons = '1' + #ID Image (Y) data l = new_layer(model,'image_data_dummy','slice_data','identity') @@ -198,20 +206,16 @@ def configure_model(model): D_fake2 = add_discriminator(model,'concat_gsample2_n_img','disc2', False, False, '_fake') #Objective and evaluation layers here - l = new_layer(model, 'disc1_real_bce', D_real, 'bce_with_logits') - l.bce_with_logits.true_label = 1 + l = new_layer(model, 'disc1_real_bce', [D_real, one.name], 'sigmoid_binary_cross_entropy') l = new_layer(model, 'disc1_real_eval','disc1_real_bce', 'evaluation') - l = new_layer(model, 'disc1_fake_bce', D_fake, 'bce_with_logits') - l.bce_with_logits.true_label = int(0) + l = new_layer(model, 'disc1_fake_bce', [D_fake, zero.name], 'sigmoid_binary_cross_entropy') l = new_layer(model, 'disc1_fake_eval','disc1_fake_bce', 'evaluation') - l = new_layer(model, 'disc2_real_bce', D_real2, 'bce_with_logits') - l.bce_with_logits.true_label = 1 + l = new_layer(model, 'disc2_real_bce', [D_real2, one.name], 'sigmoid_binary_cross_entropy') l = new_layer(model, 'disc2_real_eval','disc2_real_bce', 'evaluation') - l = new_layer(model, 'disc2_fake_bce', D_fake2, 'bce_with_logits') - l.bce_with_logits.true_label = int(0) + l = new_layer(model, 'disc2_fake_bce', [D_fake2, zero.name], 'sigmoid_binary_cross_entropy') l = new_layer(model, 'disc2_fake_eval','disc2_fake_bce', 'evaluation') diff --git a/model_zoo/models/jag/cycle_gan/generate_cycgan_m2.py b/model_zoo/models/jag/cycle_gan/generate_cycgan_m2.py index 779e66edae0..80447d1329d 100644 --- a/model_zoo/models/jag/cycle_gan/generate_cycgan_m2.py +++ b/model_zoo/models/jag/cycle_gan/generate_cycgan_m2.py @@ -180,8 +180,10 @@ def configure_model(model): #freeze discriminator, fake it as real D_real = add_discriminator(model,'concat_gsample_n_param','disc1',True, True, '_real') #objective function - l = new_layer(model, 'g_adv1_bce', D_real, 'bce_with_logits') - l.bce_with_logits.true_label = 1 + one = new_layer(model,'one','','constant') + one.constant.value = 1.0 + one.constant.num_neurons = '1' + l = new_layer(model, 'g_adv1_bce', [D_real, one.name], 'sigmoid_binary_cross_entropy') l = new_layer(model, 'g_adv1_eval','g_adv1_bce', 'evaluation') #************************************************ @@ -217,8 +219,7 @@ def configure_model(model): l = new_layer(model, 'gsample_minus_y', g_sample+' image_data_dummy','weighted_sum') l.weighted_sum.scaling_factors = '1 -1' - l = new_layer(model, 'l_l2_y', 'gsample_minus_y', 'l2_loss') - l = new_layer(model, 'l_l2_y_eval','l_l2_y', 'evaluation') + l = new_layer(model, 'l_l2_y', 'gsample_minus_y', 'l2_norm2') if __name__ == "__main__": diff --git a/model_zoo/models/jag/cycle_gan/generate_cycgan_m3.py b/model_zoo/models/jag/cycle_gan/generate_cycgan_m3.py index 182909f20fb..ee0a9c7b198 100644 --- a/model_zoo/models/jag/cycle_gan/generate_cycgan_m3.py +++ b/model_zoo/models/jag/cycle_gan/generate_cycgan_m3.py @@ -180,8 +180,10 @@ def configure_model(model): #freeze discriminator, fake it as real D_real = add_discriminator(model,'concat_gsample2_n_img','disc2',True, True, '_real') #objective function - l = new_layer(model, 'g_adv2_bce', D_real, 'bce_with_logits') - l.bce_with_logits.true_label = 1 + one = new_layer(model,'one','','constant') + one.constant.value = 1.0 + one.constant.num_neurons = '1' + l = new_layer(model, 'g_adv2_bce', [D_real, one.name], 
'sigmoid_binary_cross_entropy') l = new_layer(model, 'g_adv2_eval','g_adv2_bce', 'evaluation') #************************************************ @@ -219,8 +221,7 @@ def configure_model(model): l = new_layer(model, 'gsample2_minus_x', g_sample2+' param_data_id','weighted_sum') l.weighted_sum.scaling_factors = '1 -1' - l = new_layer(model, 'l_l2_x', 'gsample2_minus_x', 'l2_loss') - l = new_layer(model, 'l_l2_x_eval','l_l2_x', 'evaluation') + l = new_layer(model, 'l_l2_x', 'gsample2_minus_x', 'l2_norm2') if __name__ == "__main__": diff --git a/model_zoo/models/jag/data_reader_jag_conduit.prototext b/model_zoo/models/jag/data_reader_jag_conduit.prototext index bb0f4ccb367..b115e4ad590 100644 --- a/model_zoo/models/jag/data_reader_jag_conduit.prototext +++ b/model_zoo/models/jag/data_reader_jag_conduit.prototext @@ -3,41 +3,49 @@ data_reader { name: "jag_conduit" role: "train" shuffle: true - data_filedir: "/usr/workspace/wsb/icfsi/lbann_datasets/1BJAGS_10k_1/0/0/" - data_filename: "samples0-25.bundle" + # change to a lustre path + data_filedir: "/p/lscratchh/brainusr/datasets/1MJAG/1MJAG-A/0/" + data_filename: "*/*/*.bundle" + validation_percent: 0.01 absolute_sample_count: 0 percent_of_data_to_use: 1.0 + disable_responses: true + disable_labels: true + + split_jag_image_channels: true + + # JAG_Image, JAG_Scalar, JAG_Input + independent: [ { pieces: [ JAG_Image, JAG_Scalar ] }, { pieces: [ JAG_Input ] } ] + dependent: [ { pieces: [ JAG_Input ] } ] - # 1: JAG_Image, 2: JAG_Scalar, 3: JAG_Input - independent: [1, 2] - dependent: [3] + jag_image_keys: ["(0.0, 0.0)/0.0", "(90.0, 0.0)/0.0", "(90.0, 78.0)/0.0"] # An empty list indicates to use all # The commented out variables are not on the Jim's original list but used in the numpy-based format jag_scalar_keys: [ "BWx", "BT", -# "tMAXt", + "tMAXt", # absent in Jim's list "BWn", "MAXpressure", "BAte", "MAXtion", "tMAXpressure", -# "BAt", + "BAt", # absent in Jim's list "Yn", "Ye", "Yx", -# "tMAXte", + "tMAXte", # absent in Jim's list "BAtion", "MAXte", -# "tMAXtion", + "tMAXtion", # absent in Jim's list "BTx", -# "MAXt", + "MAXt", # absent in Jim's list "BTn", "BApressure", - "tMINradius" -# "MINradius" + "tMINradius", + "MINradius" # absent in Jim's list ] # When using all the keys without explicit selection, key filters can be used @@ -60,94 +68,54 @@ data_reader { num_labels: 5 - image_preprocessor { - # assume fixed size of input images if cropper is not used - raw_width: 64 - raw_height: 64 - - normalizer { - disable: false - scale: false - subtract_mean: false - unit_variance: false - z_score: true - } - - subtractor { - disable: true - } - - cropper { - disable: true - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - } - } - - reader { - name: "jag_conduit" - role: "test" - shuffle: true - data_filedir: "/usr/workspace/wsb/icfsi/lbann_datasets/1BJAGS_10k_1/0/0/" - data_filename: "samples25-50.bundle" - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - - # 1: JAG_Image, 2: JAG_Scalar, 3: JAG_Input - independent: [1, 2] - dependent: [3] - - # An empty list indicates to use all - jag_scalar_keys: - [ "BWx", - "BT", -# "tMAXt", - "BWn", - "MAXpressure", - "BAte", - "MAXtion", - "tMAXpressure", -# "BAt", - "Yn", - "Ye", - "Yx", -# "tMAXte", - "BAtion", - "MAXte", -# "tMAXtion", - "BTx", -# "MAXt", - "BTn", - "BApressure", - "tMINradius" -# "MINradius" - ] - - jag_scalar_prefix_filters: [ { key_prefix: "image_(" min_len: 26} ] - jag_scalar_filters: [ "iBT" ] - - jag_input_keys: ["shape_model_initial_modes:(4,3)", 
- "betti_prl15_trans_u", - "betti_prl15_trans_v", - "shape_model_initial_modes:(2,1)", - "shape_model_initial_modes:(1,0)"]; - - num_labels: 5 + jag_image_normalization_params: [ + # TODO: temporarily reusing the parameters computed for the first view here. need to obtain the parameters for the other views + { scale: 28.128928461 bias: 0.0 }, { scale: 817.362315273 bias: 0.0 }, { scale: 93066.843470244 bias: 0.0 }, { scale: 4360735.362407147 bias: 0.0 }, + { scale: 28.128928461 bias: 0.0 }, { scale: 817.362315273 bias: 0.0 }, { scale: 93066.843470244 bias: 0.0 }, { scale: 4360735.362407147 bias: 0.0 }, + { scale: 28.128928461 bias: 0.0 }, { scale: 817.362315273 bias: 0.0 }, { scale: 93066.843470244 bias: 0.0 }, { scale: 4360735.362407147 bias: 0.0 } + ] + + jag_scalar_normalization_params: [ + { scale: 1.660399380e+01 bias: -8.914478521e-01}, # BWx + { scale: 1.499062171e+00 bias: -3.529513015e+00}, # BT + { scale: 1.530702521e+00 bias: -3.599429878e+00}, # tMAXt + { scale: 4.644040100e+01 bias: -1.703187287e+00}, # BWn + { scale: 1.795164343e-06 bias: -5.849243445e-01}, # MAXpressure + { scale: 2.807222136e-01 bias: -1.042499360e+00}, # BAte + { scale: 2.571981124e-01 bias: -1.050431705e+00}, # MAXtion + { scale: 1.468048973e+00 bias: -3.447884539e+00}, # tMAXpressure + { scale: 2.807222136e-01 bias: -1.042499360e+00}, # BAt + { scale: 8.210767783e-18 bias: -2.182660862e-02}, # Yn + { scale: 3.634574711e-03 bias: -2.182660596e-02}, # Ye + { scale: 2.242376030e-02 bias: -3.376249820e-01}, # Yx + { scale: 1.530702521e+00 bias: -3.599429878e+00}, # tMAXte + { scale: 2.807222136e-01 bias: -1.042499360e+00}, # BAtion + { scale: 2.571981124e-01 bias: -1.050431705e+00}, # MAXte + { scale: 1.530702521e+00 bias: -3.599429878e+00}, # tMAXtion + { scale: 1.461374463e+00 bias: -3.414620490e+00}, # BTx + { scale: 2.571981124e-01 bias: -1.050431705e+00}, # MAXt + { scale: 1.499062171e+00 bias: -3.529513015e+00}, # BTn + { scale: 2.240009139e-06 bias: -5.837354616e-01}, # BApressure + { scale: 1.427286973e+00 bias: -3.328267524e+00}, # tMINradius + { scale: 6.404465614e-02 bias: -1.418863592e+00} # MINradius + ] + + jag_input_normalization_params: [ + { scale: 1.667587753e+00 bias: 4.997824968e-01}, # shape_model_initial_modes:(4,3) + { scale: 1.000245480e+00 bias: -8.438836401e-05}, # betti_prl15_trans_u + { scale: 1.000870539e+00 bias: -7.346414236e-04}, # betti_prl15_trans_v + { scale: 1.668835219e+00 bias: 4.997744013e-01}, # shape_model_initial_modes:(2,1) + { scale: 1.667992865e+00 bias: 4.999102733e-01} # shape_model_initial_modes:(1,0) + ] image_preprocessor { # assume fixed size of input images if cropper is not used raw_width: 64 raw_height: 64 + raw_num_channels: 4 normalizer { - disable: false + disable: true scale: false subtract_mean: false unit_variance: false diff --git a/model_zoo/models/jag/gan/cyclic/.gitignore b/model_zoo/models/jag/gan/cyclic/.gitignore new file mode 100644 index 00000000000..d4a466c3a66 --- /dev/null +++ b/model_zoo/models/jag/gan/cyclic/.gitignore @@ -0,0 +1,2 @@ +lbann_pb2.py +lbann_pb2.pyc diff --git a/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext b/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext new file mode 100644 index 00000000000..4c1817c2d8a --- /dev/null +++ b/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext @@ -0,0 +1,719 @@ +model { + procs_per_model:0 + objective_function { + l2_weight_regularization { + scale_factor: 0.0001 + } + layer_term { + scale_factor: 1.0 + layer: "disc1_real_bce" + } + layer_term { + 
scale_factor: 1.0 + layer: "disc1_fake_bce" + } + layer_term { + scale_factor: 0.05 + layer: "g_adv1_bce" + } + layer_term { + scale_factor: 0.025 + layer: "l_l2_y" + } + layer_term { + scale_factor: 1.0 + layer: "disc1_inv_real_bce" + } + layer_term { + scale_factor: 1.0 + layer: "disc1_inv_fake_bce" + } + layer_term { + scale_factor: 0.05 + layer: "g_inv_adv1_bce" + } + layer_term { + scale_factor: 0.025 + layer: "l_l2_x" + } + } + num_epochs: 100 + metric { + layer_metric { + layer: "l_l2_y" + } + } + data_layout: "data_parallel" + layer { + input { + io_buffer: "partitioned" + data_set_per_model: true + target_mode: "N/A" + } + name: "data" + data_layout: "data_parallel" + parents: " " + } + layer { + name: "zero" + data_layout: "data_parallel" + constant { + value: 0.0 + num_neurons: "1" + } + } + layer { + name: "one" + data_layout: "data_parallel" + constant { + value: 1.0 + num_neurons: "1" + } + } + layer { + name: "slice_data" + data_layout: "data_parallel" + parents: "data" + children: "image_data_dummy param_data_id" + slice { + slice_points: "0 2500 2511" + } + } + layer { + identity { + } + name: "image_data_dummy" + data_layout: "data_parallel" + parents: "slice_data" + } + layer { + identity { + } + name: "param_data_id" + data_layout: "data_parallel" + parents: "slice_data" + } + layer { + fully_connected { + num_neurons: 16 + has_bias: true + } + name: "gen1fc1" + data_layout: "data_parallel" + #weights: "gen1fc1linearity" + parents: "param_data_id" + } + layer { + relu { + } + name: "gen1relu1" + data_layout: "data_parallel" + parents: "gen1fc1" + } + layer { + fully_connected { + num_neurons: 128 + has_bias: true + } + name: "gen1fc2" + data_layout: "data_parallel" + #weights: "gen1fc2linearity" + parents: "gen1relu1" + } + layer { + relu { + } + name: "gen1relu2" + data_layout: "data_parallel" + parents: "gen1fc2" + } + layer { + dropout { + keep_prob: 0.8 + } + name: "gen1dropout1" + data_layout: "data_parallel" + parents: "gen1relu2" + } + layer { + fully_connected { + num_neurons: 512 + has_bias: true + } + name: "gen1fc3" + data_layout: "data_parallel" + #weights: "gen1fc3linearity" + parents: "gen1dropout1" + } + layer { + relu { + } + name: "gen1relu3" + data_layout: "data_parallel" + parents: "gen1fc3" + } + layer { + fully_connected { + num_neurons: 2500 + has_bias: true + } + name: "gen1fc4" + data_layout: "data_parallel" + #weights: "gen1fc4linearity" + parents: "gen1relu3" + } + layer { + fully_connected { + num_neurons: 128 + has_bias: true + } + name: "d1fc1_real" + data_layout: "data_parallel" + weights: "d1fc1linearity d1fc1bias" + parents: "data" + } + layer { + relu { + } + name: "d1relu1_real" + data_layout: "data_parallel" + parents: "d1fc1_real" + } + layer { + fully_connected { + num_neurons: 16 + has_bias: true + } + name: "d1fc2_real" + data_layout: "data_parallel" + weights: "d1fc2linearity d1fc2bias" + parents: "d1relu1_real" + } + layer { + relu { + } + name: "d1relu2_real" + data_layout: "data_parallel" + parents: "d1fc2_real" + } + layer { + fully_connected { + num_neurons: 1 + has_bias: true + } + name: "d1fc3_real" + data_layout: "data_parallel" + weights: "d1fc3linearity d1fc3bias" + parents: "d1relu2_real" + } + layer { + name: "concat_gsample_n_param" + data_layout: "data_parallel" + parents: "gen1fc4 param_data_id" + children: "d1_stop_gradient d2_dummy" + concatenation { + } + } + layer { + name: "d1_stop_gradient" + data_layout: "data_parallel" + parents: "concat_gsample_n_param" + stop_gradient { + } + } + layer { + fully_connected 
{ + num_neurons: 128 + has_bias: true + } + name: "d1fc1_fake" + data_layout: "data_parallel" + weights: "d1fc1linearity d1fc1bias" + parents: "d1_stop_gradient" + } + layer { + relu { + } + name: "d1relu1_fake" + data_layout: "data_parallel" + parents: "d1fc1_fake" + } + layer { + fully_connected { + num_neurons: 16 + has_bias: true + } + name: "d1fc2_fake" + data_layout: "data_parallel" + weights: "d1fc2linearity d1fc2bias" + parents: "d1relu1_fake" + } + layer { + relu { + } + name: "d1relu2_fake" + data_layout: "data_parallel" + parents: "d1fc2_fake" + } + layer { + fully_connected { + num_neurons: 1 + has_bias: true + } + name: "d1fc3_fake" + data_layout: "data_parallel" + weights: "d1fc3linearity d1fc3bias" + parents: "d1relu2_fake" + } + layer { + sigmoid_binary_cross_entropy { + } + name: "disc1_real_bce" + data_layout: "data_parallel" + parents: "d1fc3_real one" + } + layer { + sigmoid_binary_cross_entropy { + } + name: "disc1_fake_bce" + data_layout: "data_parallel" + parents: "d1fc3_fake zero" + } + layer { + identity { + } + name: "d2_dummy" + data_layout: "data_parallel" + parents: "concat_gsample_n_param" + } + layer { + freeze: true + fully_connected { + num_neurons: 128 + has_bias: true + } + name: "d2fc1" + data_layout: "data_parallel" + parents: "d2_dummy" + } + layer { + relu { + } + name: "d2relu1" + data_layout: "data_parallel" + parents: "d2fc1" + } + layer { + freeze: true + fully_connected { + num_neurons: 16 + has_bias: true + } + name: "d2fc2" + data_layout: "data_parallel" + parents: "d2relu1" + } + layer { + relu { + } + name: "d2relu2" + data_layout: "data_parallel" + parents: "d2fc2" + } + layer { + freeze: true + fully_connected { + num_neurons: 1 + has_bias: true + } + name: "d2fc3" + data_layout: "data_parallel" + parents: "d2relu2" + } + layer { + sigmoid_binary_cross_entropy { + } + name: "g_adv1_bce" + data_layout: "data_parallel" + parents: "d2fc3 one" + } + layer { + name: "gsample_minus_y" + data_layout: "data_parallel" + parents: "gen1fc4 image_data_dummy" + weighted_sum { + scaling_factors: "1 -1" + } + } + layer { + name: "l_l2_y" + data_layout: "data_parallel" + l2_norm2 { + } + parents: "gsample_minus_y" + } + layer { + fully_connected { + num_neurons: 16 + has_bias: true + } + name: "gen2fc1" + data_layout: "data_parallel" + #weights: "gen2fc1linearity" + parents: "image_data_dummy" + } + layer { + relu { + } + name: "gen2relu1" + data_layout: "data_parallel" + parents: "gen2fc1" + } + layer { + fully_connected { + num_neurons: 128 + has_bias: true + } + name: "gen2fc2" + data_layout: "data_parallel" + #weights: "gen2fc2linearity" + parents: "gen2relu1" + } + layer { + relu { + } + name: "gen2relu2" + data_layout: "data_parallel" + parents: "gen2fc2" + } + layer { + fully_connected { + num_neurons: 512 + has_bias: true + } + name: "gen2fc3" + data_layout: "data_parallel" + #weights: "gen2fc3linearity" + parents: "gen2relu2" + } + layer { + relu { + } + name: "gen2relu3" + data_layout: "data_parallel" + parents: "gen2fc3" + } + layer { + fully_connected { + num_neurons: 11 + has_bias: true + } + name: "gen2fc4" + data_layout: "data_parallel" + #weights: "gen2fc4linearity" + parents: "gen2relu3" + } + layer { + name: "concat_param_n_img" + data_layout: "data_parallel" + parents: "param_data_id image_data_dummy" + concatenation { + } + } + layer { + fully_connected { + num_neurons: 128 + has_bias: true + } + name: "d1_invfc1_real" + data_layout: "data_parallel" + weights: "d1_invfc1linearity d1_invfc1bias" + parents: "concat_param_n_img" + } + 
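In the cyclic GAN prototext the real and fake discriminator paths reference the same named weights (d1fc*linearity / d1fc*bias), the fake path sits behind stop_gradient so its loss does not backpropagate into the generator, and the frozen d2* / d2_inv* copies are refreshed from the trained discriminators each batch by the replace_weights callback defined below. A conceptual numpy sketch of the stop-gradient semantics only; there is no real autograd here:

import numpy as np

W = np.ones((1, 3))                  # stands in for shared "d1fc1linearity"

def stop_gradient(x):
    # identity on the forward pass; a framework treats the result as a
    # constant in backward, so no gradient flows to the layers producing x
    return np.array(x, copy=True)

real = np.array([0.5, -1.0, 2.0])
fake = np.array([0.1, 0.3, -0.2])    # generator output
d_real = W @ real                    # gradients reach W and the data path
d_fake = W @ stop_gradient(fake)     # gradients reach W, not the generator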
layer { + relu { + } + name: "d1_invrelu1_real" + data_layout: "data_parallel" + parents: "d1_invfc1_real" + } + layer { + fully_connected { + num_neurons: 16 + has_bias: true + } + name: "d1_invfc2_real" + data_layout: "data_parallel" + weights: "d1_invfc2linearity d1_invfc2bias" + parents: "d1_invrelu1_real" + } + layer { + relu { + } + name: "d1_invrelu2_real" + data_layout: "data_parallel" + parents: "d1_invfc2_real" + } + layer { + fully_connected { + num_neurons: 1 + has_bias: true + } + name: "d1_invfc3_real" + data_layout: "data_parallel" + weights: "d1_invfc3linearity d1_invfc3bias" + parents: "d1_invrelu2_real" + } + layer { + name: "concat_gsample2_n_img" + data_layout: "data_parallel" + parents: "gen2fc4 image_data_dummy" + children: "d1_inv_stop_gradient d2_inv_dummy" + concatenation { + } + } + layer { + name: "d1_inv_stop_gradient" + data_layout: "data_parallel" + parents: "concat_gsample2_n_img" + stop_gradient { + } + } + layer { + fully_connected { + num_neurons: 128 + has_bias: true + } + name: "d1_invfc1_fake" + data_layout: "data_parallel" + weights: "d1_invfc1linearity d1_invfc1bias" + parents: "d1_inv_stop_gradient" + } + layer { + relu { + } + name: "d1_invrelu1_fake" + data_layout: "data_parallel" + parents: "d1_invfc1_fake" + } + layer { + fully_connected { + num_neurons: 16 + has_bias: true + } + name: "d1_invfc2_fake" + data_layout: "data_parallel" + weights: "d1_invfc2linearity d1_invfc2bias" + parents: "d1_invrelu1_fake" + } + layer { + relu { + } + name: "d1_invrelu2_fake" + data_layout: "data_parallel" + parents: "d1_invfc2_fake" + } + layer { + fully_connected { + num_neurons: 1 + has_bias: true + } + name: "d1_invfc3_fake" + data_layout: "data_parallel" + weights: "d1_invfc3linearity d1_invfc3bias" + parents: "d1_invrelu2_fake" + } + layer { + sigmoid_binary_cross_entropy { + } + name: "disc1_inv_real_bce" + data_layout: "data_parallel" + parents: "d1_invfc3_real one" + } + layer { + sigmoid_binary_cross_entropy { + } + name: "disc1_inv_fake_bce" + data_layout: "data_parallel" + parents: "d1_invfc3_fake zero" + } + layer { + identity { + } + name: "d2_inv_dummy" + data_layout: "data_parallel" + parents: "concat_gsample2_n_img" + } + layer { + freeze: true + fully_connected { + num_neurons: 128 + has_bias: true + } + name: "d2_invfc1" + data_layout: "data_parallel" + parents: "d2_inv_dummy" + } + layer { + relu { + } + name: "d2_invrelu1" + data_layout: "data_parallel" + parents: "d2_invfc1" + } + layer { + freeze: true + fully_connected { + num_neurons: 16 + has_bias: true + } + name: "d2_invfc2" + data_layout: "data_parallel" + parents: "d2_invrelu1" + } + layer { + relu { + } + name: "d2_invrelu2" + data_layout: "data_parallel" + parents: "d2_invfc2" + } + layer { + freeze: true + fully_connected { + num_neurons: 1 + has_bias: true + } + name: "d2_invfc3" + data_layout: "data_parallel" + parents: "d2_invrelu2" + } + layer { + sigmoid_binary_cross_entropy { + } + name: "g_inv_adv1_bce" + data_layout: "data_parallel" + parents: "d2_invfc3 one" + } + layer { + name: "gsample2_minus_x" + data_layout: "data_parallel" + parents: "gen2fc4 param_data_id" + weighted_sum { + scaling_factors: "1 -1" + } + } + layer { + name: "l_l2_x" + data_layout: "data_parallel" + l2_norm2 { + } + parents: "gsample2_minus_x" + } + weights { + name: "gen1fc1linearity" + he_normal_initializer { + } + } + weights { + name: "gen1fc2linearity" + he_normal_initializer { + } + } + weights { + name: "gen1fc3linearity" + he_normal_initializer { + } + } + weights { + name: 
"gen1fc4linearity" + he_normal_initializer { + } + } + weights { + name: "d1fc1linearity" + he_normal_initializer { + } + } + weights { + name: "d1fc1bias" + } + weights { + name: "d1fc2linearity" + he_normal_initializer { + } + } + weights { + name: "d1fc2bias" + } + weights { + name: "d1fc3linearity" + he_normal_initializer { + } + } + weights { + name: "d1fc3bias" + } + weights { + name: "gen2fc1linearity" + he_normal_initializer { + } + } + weights { + name: "gen2fc2linearity" + he_normal_initializer { + } + } + weights { + name: "gen2fc3linearity" + he_normal_initializer { + } + } + weights { + name: "gen2fc4linearity" + he_normal_initializer { + } + } + weights { + name: "d1_invfc1linearity" + he_normal_initializer { + } + } + weights { + name: "d1_invfc1bias" + } + weights { + name: "d1_invfc2linearity" + he_normal_initializer { + } + } + weights { + name: "d1_invfc2bias" + } + weights { + name: "d1_invfc3linearity" + he_normal_initializer { + } + } + weights { + name: "d1_invfc3bias" + } + mini_batch_size: 64 + callback { + print { + interval: 10 + } + } + callback { + timer { + } + } + callback { + replace_weights { + source_layers: "d1fc1_real d1fc2_real d1fc3_real d1_invfc1_real d1_invfc2_real d1_invfc3_real" + destination_layers: "d2fc1 d2fc2 d2fc3 d2_invfc1 d2_invfc2 d2_invfc3" + batch_interval: 1 + } + } + block_size: 256 +} diff --git a/model_zoo/models/jag/gan/cyclic/generate_model.py b/model_zoo/models/jag/gan/cyclic/generate_model.py new file mode 100644 index 00000000000..5c1e22c9c9d --- /dev/null +++ b/model_zoo/models/jag/gan/cyclic/generate_model.py @@ -0,0 +1,299 @@ +import sys +import os +import subprocess +import functools + +# Parameters +lbann_dir = subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).strip() +lbann_proto_dir = lbann_dir + '/src/proto/' +work_dir = lbann_dir + '/model_zoo/models/jag/gan/cyclic' +template_proto = lbann_dir + '/model_zoo/models/jag/gan/cyclic/model_template.prototext' +output_proto = lbann_dir + '/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext' + +# Convert a list into a space-separated string +def str_list(l): + if isinstance(l, list): + return ' '.join(str(i) for i in l) + elif isinstance(l, str): + return l + else: + raise TypeError('str_list expects a list or a string') + +# Construct a new layer and add it to the model +def new_layer(model, name, parents, layer_type, layout = 'data_parallel'): + l = model.layer.add() + l.name = name + l.data_layout = layout + l.parents = str_list(parents) + #l.device_allocation = device + exec('l.' + layer_type + '.SetInParent()') + return l + +# Construct a new set of weights and add it to the model +def new_weights(model, name, initializer = 'constant_initializer'): + w = model.weights.add() + w.name = name + exec('w.' + initializer + '.SetInParent()') + return w + +# Discriminator +#@todo: clean up, tag may not be needed +#Weight sharing on the same branch (D1) or (D2) +def add_discriminator(model,disc_input, prefix, freeze=False, add_weight=True, tag=''): + #Shared weights for same path (e.g. 
D1 fake and D1 real) + #@todo add bias ---difficult to debug problem without bias + w1 = prefix+'fc1' + w2 = prefix+'fc2' + w3 = prefix+'fc3' + + fc1 = w1+tag + fc2 = w2+tag + fc3 = w3+tag + + + relu1 = prefix+'relu1'+tag + relu2 = prefix+'relu2'+tag + + l = new_layer(model, fc1, disc_input,'fully_connected') + l.fully_connected.num_neurons = 128 + l.fully_connected.has_bias = True + l.freeze = freeze + if(add_weight) : + w = new_weights(model, w1 + 'linearity', 'he_normal_initializer') + l.weights = w1 + 'linearity' + + l = new_layer(model, relu1, fc1,'relu') + + + l = new_layer(model, fc2, relu1,'fully_connected') + l.fully_connected.num_neurons = 16 + l.fully_connected.has_bias = True + l.freeze = freeze + #@todo/bug: fix, will still add weights to layer even though it is not suppose to + if(add_weight) : + w = new_weights(model, w2 + 'linearity', 'he_normal_initializer') + l.weights = w2 + 'linearity' + + l = new_layer(model, relu2, fc2,'relu') + + l = new_layer(model, fc3, relu2, 'fully_connected') + l.fully_connected.num_neurons = 1 + l.fully_connected.has_bias = True + l.freeze = freeze + if(add_weight) : + w = new_weights(model, w3 + 'linearity', 'he_normal_initializer') + l.weights = w3 + 'linearity' + return fc3 + + +#Generator +#Weight frozen, no weight sharing +#todo, handle weight sharing +#@todo, use default weight/bias, adding weights cause bad thing to happen with LTFB except you add/transfer both w and b +#@todo, generally automate manual editing made in debugging process +def add_generator(model, gen_input, prefix, output_dim, freeze=False, add_dropout=True, tag=''): + #different weights + fc1 = prefix+'fc1'+tag + fc2 = prefix+'fc2'+tag + fc3 = prefix+'fc3'+tag + fc4 = prefix+'fc4'+tag + + relu1 = prefix+'relu1'+tag + relu2 = prefix+'relu2'+tag + relu3 = prefix+'relu3'+tag + + dropout1 = prefix+'dropout1'+tag + + l = new_layer(model, fc1, gen_input,'fully_connected') + l.fully_connected.num_neurons = 16 + l.fully_connected.has_bias = True + l.freeze = freeze + w = new_weights(model, fc1 + 'linearity', 'he_normal_initializer') + l.weights = fc1 + 'linearity' + + l = new_layer(model, relu1, fc1,'relu') + + l = new_layer(model, fc2, relu1,'fully_connected') + l.fully_connected.num_neurons = 128 + l.fully_connected.has_bias = True + l.freeze = freeze + w = new_weights(model, fc2 + 'linearity', 'he_normal_initializer') + l.weights = fc2 + 'linearity' + + l = new_layer(model, relu2, fc2,'relu') + next_parent = relu2 + if(add_dropout): + l = new_layer(model,dropout1,next_parent, 'dropout') + l.dropout.keep_prob = 0.8 + next_parent=dropout1 + + l = new_layer(model, fc3, next_parent, 'fully_connected') + l.fully_connected.num_neurons = 512 + l.fully_connected.has_bias = True + l.freeze = freeze + w = new_weights(model, fc3 + 'linearity', 'he_normal_initializer') + l.weights = fc3 + 'linearity' + + l = new_layer(model, relu3, fc3, 'relu') + + l = new_layer(model, fc4, relu3, 'fully_connected') + l.fully_connected.num_neurons = output_dim + l.fully_connected.has_bias = True + l.freeze = freeze + w = new_weights(model, fc4 + 'linearity', 'he_normal_initializer') + l.weights = fc4 + 'linearity' + + return fc4 + + +# Configure a prototext model (e.g. 
+def configure_model(model):
+
+    #####INPUT DATA (including Slices)
+    ### Input data comes from merge features of image (Y) and param (X)
+    l = new_layer(model,'data',' ', 'input')
+    l.input.io_buffer = 'partitioned'
+
+    slice_points = [0,2500,2511]
+    l = new_layer(model, 'slice_data','data', 'slice')
+    l.children = 'image_data_dummy param_data_id'
+    l.slice.slice_points = str_list(slice_points)
+
+    #Useful constants
+    zero = new_layer(model,'zero','','constant')
+    zero.constant.value = 0.0
+    zero.constant.num_neurons = '1'
+    one = new_layer(model,'one','','constant')
+    one.constant.value = 1.0
+    one.constant.num_neurons = '1'
+
+    #ID Image (Y) data
+    l = new_layer(model,'image_data_dummy','slice_data','identity')
+
+    #ID parameter data (X)
+    l = new_layer(model,'param_data_id','slice_data','identity')
+
+    # Forward Model
+    #D_Loss1 branch
+    #Fake path
+    #def add_generator(model, gen_input, prefix, output_dim, freeze=False, add_dropout=True, tag=''):
+    #freeze generator = False
+    #forward generator x->y'
+    #g_sample=generator1(x)
+    g_sample = add_generator(model, 'param_data_id','gen1', 2500, False,True)
+
+    #True path (share weights with the fake-path discriminator)
+    #discriminator(y,x)
+    #data = y + x
+    #def add_discriminator(model,disc_input, prefix, freeze=False, add_weight=True, tag=''):
+    #forward_model
+    D_real = add_discriminator(model, 'data','d1',False, True, '_real')
+
+    #CONCAT
+    # Gsample + x
+    #
+    l = new_layer(model, 'concat_gsample_n_param','','concatenation')
+    l.parents = g_sample+' param_data_id'
+    l.children = 'd1_stop_gradient d2_dummy'
+    #discriminator false (fake) path
+    #question: how to deal with d1 weight sharing? //D_real and D_fake weights are shared,
+    #and copied to the discriminator (d2) on the adversarial path at every iteration
+    #discriminator(g_sample,x)
+    #add a stop_gradient so the gradient doesn't go to the generator on the D_fake path
+    l = new_layer(model, 'd1_stop_gradient','concat_gsample_n_param', 'stop_gradient')
+    #D_fake = add_discriminator(model,'concat_gsample_n_param','disc1',False, False, '_fake')
+    D_fake = add_discriminator(model,'d1_stop_gradient','d1',False, False, '_fake')
+
+    #Objective term (and metric) layers here
+    l = new_layer(model, 'disc1_real_bce', [D_real, one.name], 'sigmoid_binary_cross_entropy')
+    l = new_layer(model, 'disc1_fake_bce', [D_fake, zero.name], 'sigmoid_binary_cross_entropy')
+
+    #Adversarial part
+    #replicate the discriminator (frozen); its weights are copied in through the replace_weights callback; fake it as real
+    #add an identity/dummy layer that is a copy of the concat
+    l = new_layer(model, 'd2_dummy','concat_gsample_n_param', 'identity')
+    #def add_discriminator(model,disc_input, prefix, freeze=False, add_weight=True, tag=''):
+    D_adv = add_discriminator(model,'d2_dummy','d2',True, False)
+    #objective function
+    #fake as real
+    l = new_layer(model, 'g_adv1_bce', [D_adv, one.name], 'sigmoid_binary_cross_entropy')
+
+    #Add L2 loss
+    l = new_layer(model, 'gsample_minus_y', ' ', 'weighted_sum')
+    l.parents = g_sample+' image_data_dummy'
+    l.weighted_sum.scaling_factors = '1 -1'
+
+    l = new_layer(model,'l_l2_y', 'gsample_minus_y','l2_norm2')
+
+    #####Inverse Model
+
+    #inverse generator y->x'
+    #g_sample2=generator2(y)
+    g_sample2 = add_generator(model, 'image_data_dummy','gen2', 11, False,False)
+
+    #inverse-model real part; we need a concat of param and image
+    l = new_layer(model, 'concat_param_n_img','','concatenation')
+    l.parents = 'param_data_id image_data_dummy'
+    #l.children = ' '
+    D_inv_real = add_discriminator(model, 'concat_param_n_img','d1_inv',False, True, '_real')
+    #CONCAT
+    # Gsample2 (that is x') + y
+    #
+    l = new_layer(model, 'concat_gsample2_n_img','','concatenation')
+    l.parents = g_sample2+' image_data_dummy'
+    l.children = 'd1_inv_stop_gradient d2_inv_dummy'
+    #discriminator(g_sample2,y)
+    #add a stop_gradient so the gradient doesn't go to the generator on this path
+    l = new_layer(model, 'd1_inv_stop_gradient','concat_gsample2_n_img', 'stop_gradient')
+    D_inv_fake = add_discriminator(model,'d1_inv_stop_gradient','d1_inv',False, False, '_fake')
+    #Objective term (and metric) layers here
+    l = new_layer(model, 'disc1_inv_real_bce', [D_inv_real, one.name], 'sigmoid_binary_cross_entropy')
+    l = new_layer(model, 'disc1_inv_fake_bce', [D_inv_fake, zero.name], 'sigmoid_binary_cross_entropy')
+    #Adversarial part
+    l = new_layer(model, 'd2_inv_dummy','concat_gsample2_n_img', 'identity')
+    D_inv_adv = add_discriminator(model,'d2_inv_dummy','d2_inv',True, False)
+    #objective function
+    #fake as real
+    l = new_layer(model, 'g_inv_adv1_bce', [D_inv_adv, one.name], 'sigmoid_binary_cross_entropy')
+
+    #Add L2 loss
+    l = new_layer(model, 'gsample2_minus_x', ' ', 'weighted_sum')
+    l.parents = g_sample2+' param_data_id'
+    l.weighted_sum.scaling_factors = '1 -1'
+
+    l = new_layer(model,'l_l2_x', 'gsample2_minus_x','l2_norm2')
+
+if __name__ == "__main__":
+
+    # Make sure the protobuf Python implementation is built
+    host = subprocess.check_output('hostname').strip('\n1234567890')
+    protoc = lbann_dir + '/build/gnu.Release.' + host + '.llnl.gov/install/bin/protoc'
+    proto_python_dir = lbann_dir + '/build/gnu.Release.' + host + '.llnl.gov/protobuf/src/python'
+    os.putenv('PROTOC', protoc)
+    subprocess.call('cd ' + proto_python_dir + '; '
+                    + sys.executable + ' '
+                    + proto_python_dir + '/setup.py build',
+                    shell=True)
+    sys.path.append(proto_python_dir)
+    import google.protobuf.text_format as txtf
+
+    # Compile the LBANN protobuf
+    subprocess.call([protoc,
+                     '-I=' + lbann_proto_dir,
+                     '--python_out=' + work_dir,
+                     lbann_proto_dir + '/lbann.proto'])
+    sys.path.append(work_dir)
+    global lbann_pb2
+    import lbann_pb2
+
+    # Load the template prototext
+    with open(template_proto, 'r') as f:
+        pb = txtf.Merge(f.read(), lbann_pb2.LbannPB())
+
+    # Configure the prototext model
+    configure_model(pb.model)
+
+    # Export the prototext
+    with open(output_proto, 'w') as f:
+        f.write(txtf.MessageToString(pb))
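As a quick orientation to the script above: new_layer and new_weights select the protobuf sub-message by name via exec() and SetInParent(), so a model can be assembled field by field. A minimal sketch of driving the same helper API to emit a two-layer model, assuming generate_model.py is importable and the generated lbann_pb2 module is on sys.path (as __main__ arranges):

    import google.protobuf.text_format as txtf
    from generate_model import new_layer, new_weights  # the helpers defined above
    import lbann_pb2  # generated from lbann.proto by protoc, as in __main__

    pb = lbann_pb2.LbannPB()
    # 'input' and 'fully_connected' name the layer sub-messages selected via exec()
    l = new_layer(pb.model, 'data', ' ', 'input')
    l.input.io_buffer = 'partitioned'
    fc = new_layer(pb.model, 'fc1', 'data', 'fully_connected')
    fc.fully_connected.num_neurons = 16
    w = new_weights(pb.model, 'fc1linearity', 'he_normal_initializer')
    fc.weights = 'fc1linearity'
    print(txtf.MessageToString(pb))  # prototext for the two layers plus weights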
diff --git a/model_zoo/models/jag/gan/cyclic/jag_data.prototext b/model_zoo/models/jag/gan/cyclic/jag_data.prototext
new file mode 100644
index 00000000000..39f39b95804
--- /dev/null
+++ b/model_zoo/models/jag/gan/cyclic/jag_data.prototext
@@ -0,0 +1,26 @@
+data_reader {
+  reader {
+    name: "merge_features"
+    format: "numpy"
+    role: "train"
+    shuffle: true
+    data_file_pattern: "/p/lscratchf/brainusr/datasets/jag/jag_train_*.npy"
+    validation_percent: 0.1
+    percent_of_data_to_use: 1.0
+    disable_responses: true
+    disable_labels: true
+  }
+  reader {
+    name: "merge_features"
+    format: "numpy"
+    role: "test"
+    shuffle: false
+    data_file_pattern: "/p/lscratchf/brainusr/datasets/jag/jag_test_*.npy"
+    validation_percent: 0
+    #test first 16 samples only to match TF version
+    absolute_sample_count: 100
+    #percent_of_data_to_use: 1.0
+    disable_responses: true
+    disable_labels: true
+  }
+}
diff --git a/model_zoo/models/jag/gan/cyclic/model_template.prototext b/model_zoo/models/jag/gan/cyclic/model_template.prototext
new file mode 100644
index 00000000000..a6f9af560e9
--- /dev/null
+++ b/model_zoo/models/jag/gan/cyclic/model_template.prototext
@@ -0,0 +1,109 @@
+model {
+  data_layout: "data_parallel"
+  mini_batch_size: 64
+  block_size: 256
+  num_epochs: 10
+  num_parallel_readers: 0
+  procs_per_model: 0
+
+  ###################################################
+  # Objective function
+  ###################################################
+
+  objective_function {
+    layer_term {
+      scale_factor: 1.0
+      layer: "disc1_real_bce"
+    }
+    layer_term {
+      scale_factor: 1.0
+      layer: "disc1_fake_bce"
+    }
+    layer_term {
+      scale_factor: 0.05
+      layer: "g_adv1_bce"
+    }
+    layer_term {
+      scale_factor: 0.025
+      layer: "l_l2_y"
+    }
+
+    layer_term {
+      scale_factor: 1.0
+      layer: "disc1_inv_real_bce"
+    }
+    layer_term {
+      scale_factor: 1.0
+      layer: "disc1_inv_fake_bce"
+    }
+    layer_term {
+      scale_factor: 0.05
+      layer: "g_inv_adv1_bce"
+    }
+    layer_term {
+      scale_factor: 0.025
+      layer: "l_l2_x"
+    }
+    l2_weight_regularization {
+      scale_factor: 1e-4
+    }
+  }
+
+  ###################################################
+  # Metrics
+  ###################################################
+
+  metric {
+    layer_metric {
+      layer: "l_l2_x"
+    }
+  }
+  metric {
+    layer_metric {
+      layer: "l_l2_y"
+    }
+  }
+
+  ###################################################
+  # Callbacks
+  ###################################################
+  callback {
+    print {
+      interval: 1
+    }
+  }
+  callback { timer {} }
+
+  #Add callback for replace_weights
+  callback {
+    replace_weights {
+      source_layers: "d1fc1_real d1fc2_real d1fc3_real d1_invfc1_real d1_invfc2_real d1_invfc3_real"
+      destination_layers: "d2fc1 d2fc2 d2fc3 d2_invfc1 d2_invfc2 d2_invfc3"
+    }
+  }
+
+  #Optional callbacks
+  callback {
+    dump_activations {
+      basename: "/usr/workspace/wsa/jacobs32/github.saj.lbann/jag_imgs/cyclic_gan/"
+      interval: 100
+      layer_names: "image_data_dummy gen1fc4 param_data_id gen2fc4"
+    }
+  }
+  callback {
+    ltfb {
+      round_size: 100
+      increasing_metric_mode: false
+      eval_metrics: "l_l2_y_eval"
+      #weights_tosend: "gen1fc1linearity gen1fc1bias gen1fc2linearity gen1fc2bias gen1fc3linearity gen1fc3bias gen1fc4linearity gen1fc4bias"
+      weights_tosend: "gen1fc1_linearity_weights gen1fc1_bias_weights gen1fc2_linearity_weights gen1fc2_bias_weights gen1fc3_linearity_weights gen1fc3_bias_weights gen1fc4_linearity_weights gen1fc4_bias_weights"
+    }
+
+  }
+
+
+  ###################################################
+  # start of layers
+  ###################################################
+
+}
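Collecting the layer_term entries of the template above into one expression (my notation, not spelled out in the source): with x the parameters (param_data_id), y the image (image_data_dummy), G the forward generator gen1, G' the inverse generator gen2, D and D' the trainable discriminators d1/d1_inv, and D-hat their frozen d2/d2_inv copies, the objective being minimized is roughly

\[
\begin{aligned}
\mathcal{L} ={}& \mathrm{BCE}(D(y,x),1) + \mathrm{BCE}(D(G(x),x),0) + 0.05\,\mathrm{BCE}(\hat D(G(x),x),1) + 0.025\,\lVert G(x)-y\rVert_2^2 \\
{}+{}& \mathrm{BCE}(D'(x,y),1) + \mathrm{BCE}(D'(G'(y),y),0) + 0.05\,\mathrm{BCE}(\hat D'(G'(y),y),1) + 0.025\,\lVert G'(y)-x\rVert_2^2 \\
{}+{}& 10^{-4}\,\lVert W\rVert_2^2,
\end{aligned}
\]

where each BCE term is the sigmoid binary cross entropy against the constant one/zero label layers.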
"l_l2_y_eval" + scale_factor: 0.025 + #layer: "l_l2_y_eval" + layer: "l_l2_y" } } # metric { @@ -41,7 +44,8 @@ model { # } metric { layer_metric { - layer: "l_l2_y_eval" + #layer: "l_l2_y_eval" + layer: "l_l2_y" } } num_epochs: 100 @@ -55,7 +59,24 @@ model { name: "data1" data_layout: "data_parallel" } - + + layer { + name: "zero" + data_layout: "data_parallel" + constant { + value: 0.0 + num_neurons: "1" + } + } + layer { + name: "one" + data_layout: "data_parallel" + constant { + value: 1.0 + num_neurons: "1" + } + } + layer { name: "slice_data" data_layout: "data_parallel" @@ -252,37 +273,33 @@ model { parents: "d1relu2_fake" } layer { - bce_with_logits { - true_label: 1 + sigmoid_binary_cross_entropy { } name: "disc1_real_bce" data_layout: "data_parallel" - device_allocation: "cpu" - parents: "d1fc3_real" - } - layer { - name: "disc1_real_eval" - data_layout: "data_parallel" - parents: "disc1_real_bce" - evaluation { - } + parents: "d1fc3_real one" } + #layer { + # name: "disc1_real_eval" + # data_layout: "data_parallel" + # parents: "disc1_real_bce" + # evaluation { + # } + #} layer { - bce_with_logits { - true_label: 0 + sigmoid_binary_cross_entropy { } name: "disc1_fake_bce" data_layout: "data_parallel" - device_allocation: "cpu" - parents: "d1fc3_fake" - } - layer { - name: "disc1_fake_eval" - data_layout: "data_parallel" - parents: "disc1_fake_bce" - evaluation { - } + parents: "d1fc3_fake zero" } + #layer { + # name: "disc1_fake_eval" + # data_layout: "data_parallel" + # parents: "disc1_fake_bce" + # evaluation { + # } + #} layer { identity { } @@ -335,21 +352,19 @@ model { parents: "d2relu2" } layer { - bce_with_logits { - true_label: 1 + sigmoid_binary_cross_entropy { } name: "g_adv1_bce" data_layout: "data_parallel" - device_allocation: "cpu" - parents: "d2fc3" - } - layer { - name: "g_adv1_eval" - data_layout: "data_parallel" - parents: "g_adv1_bce" - evaluation { - } + parents: "d2fc3 one" } + #layer { + # name: "g_adv1_eval" + # data_layout: "data_parallel" + # parents: "g_adv1_bce" + # evaluation { + # } + #} #L2loss layer { name: "gsample_minus_y" @@ -360,21 +375,21 @@ model { } } layer { - l2_loss { + l2_norm2 { } name: "l_l2_y" data_layout: "data_parallel" - device_allocation: "cpu" + #device_allocation: "cpu" parents: "gsample_minus_y" } - layer { - name: "l_l2_y_eval" - data_layout: "data_parallel" - parents: "l_l2_y" - evaluation { - } - } + #layer { + # name: "l_l2_y_eval" + # data_layout: "data_parallel" + # parents: "l_l2_y" + # evaluation { + # } + #} weights { name: "gen1fc1linearity" he_normal_initializer { diff --git a/model_zoo/models/jag/gan/gan_template.prototext b/model_zoo/models/jag/gan/vanilla/gan_template.prototext similarity index 97% rename from model_zoo/models/jag/gan/gan_template.prototext rename to model_zoo/models/jag/gan/vanilla/gan_template.prototext index 834ad56212a..2d0021aeae8 100644 --- a/model_zoo/models/jag/gan/gan_template.prototext +++ b/model_zoo/models/jag/gan/vanilla/gan_template.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 diff --git a/model_zoo/models/jag/gan/generate_gan.py b/model_zoo/models/jag/gan/vanilla/generate_gan.py similarity index 93% rename from model_zoo/models/jag/gan/generate_gan.py rename to model_zoo/models/jag/gan/vanilla/generate_gan.py index 0331940764a..277f9063fe9 100644 --- a/model_zoo/models/jag/gan/generate_gan.py +++ b/model_zoo/models/jag/gan/vanilla/generate_gan.py @@ -155,6 +155,14 @@ def 
     l.children = 'image_data_dummy param_data_id'
     l.slice.slice_points = str_list(slice_points)
 
+    #Useful constants
+    zero = new_layer(model,'zero','','constant')
+    zero.constant.value = 0.0
+    zero.constant.num_neurons = '1'
+    one = new_layer(model,'one','','constant')
+    one.constant.value = 1.0
+    one.constant.num_neurons = '1'
+
     #ID Image (Y) data
     l = new_layer(model,'image_data_dummy','slice_data','identity')
 
@@ -188,12 +196,10 @@ def configure_model(model):
     D_fake = add_discriminator(model,'d1_stop_gradient','d1',False, False, '_fake')
 
     #Objective and evaluation layers here
-    l = new_layer(model, 'disc1_real_bce', D_real, 'bce_with_logits')
-    l.bce_with_logits.true_label = 1
+    l = new_layer(model, 'disc1_real_bce', [D_real, one.name], 'sigmoid_binary_cross_entropy')
     l = new_layer(model, 'disc1_real_eval','disc1_real_bce', 'evaluation')
 
-    l = new_layer(model, 'disc1_fake_bce', D_fake, 'bce_with_logits')
-    l.bce_with_logits.true_label = int(0)
+    l = new_layer(model, 'disc1_fake_bce', [D_fake, zero.name], 'sigmoid_binary_cross_entropy')
     l = new_layer(model, 'disc1_fake_eval','disc1_fake_bce', 'evaluation')
 
     #Adversarial part
@@ -203,9 +209,8 @@ def configure_model(model):
     D_real2 = add_discriminator(model,'d2_dummy','d2',True, False)
     #objective function
-    #fake as real
-    l = new_layer(model, 'g_adv1_bce', D_real2, 'bce_with_logits')
-    l.bce_with_logits.true_label = 1
+    #fake as real
+    l = new_layer(model, 'g_adv1_bce', [D_real2, one.name], 'sigmoid_binary_cross_entropy')
     l = new_layer(model, 'g_adv1_eval','g_adv1_bce', 'evaluation')
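(Note: the added disc1_fake_bce line originally passed D_real; it should take D_fake, matching the removed code, and is corrected above.) The hunk shows the same API migration applied throughout this PR: the removed bce_with_logits layer carried its target as a true_label field, while the new sigmoid_binary_cross_entropy layer takes the label as a second parent, which is why the constant zero/one layers are created once and referenced by name. Schematically, quoting the hunk's own lines:

    # Before: label baked into the layer as a field.
    l = new_layer(model, 'disc1_real_bce', D_real, 'bce_with_logits')
    l.bce_with_logits.true_label = 1

    # After: label supplied by a constant parent layer ('one' or 'zero').
    l = new_layer(model, 'disc1_real_bce', [D_real, one.name], 'sigmoid_binary_cross_entropy')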
diff --git a/model_zoo/models/jag/input.minmax.1M.txt b/model_zoo/models/jag/input.minmax.1M.txt
new file mode 100644
index 00000000000..f63bdb63d3e
--- /dev/null
+++ b/model_zoo/models/jag/input.minmax.1M.txt
@@ -0,0 +1,5 @@
+ { scale: 1.666721654e+01  bias: 5.000145788e+00},   # shape_model_initial_modes:(4,3)
+ { scale: 1.000025133e+01  bias: -1.603520955e-06},  # betti_prl15_trans_u
+ { scale: 1.000001645e+00  bias: -1.406676728e-06},  # betti_prl15_trans_v
+ { scale: 1.666672975e+00  bias: 4.999989818e-01},   # shape_model_initial_modes:(2,1)
+ { scale: 1.666668753e+00  bias: 5.000004967e-01}    # shape_model_initial_modes:(1,0)
diff --git a/model_zoo/models/jag/scalar.minmax.1M.txt b/model_zoo/models/jag/scalar.minmax.1M.txt
new file mode 100644
index 00000000000..e260dd86dea
--- /dev/null
+++ b/model_zoo/models/jag/scalar.minmax.1M.txt
@@ -0,0 +1,22 @@
+ { scale: 1.5420795e+01   bias: -8.313582e-01  },  # BWx
+ { scale: 1.4593427e+00   bias: -3.426026e+00  },  # BT
+ { scale: 1.4901131e+00   bias: -3.493689e+00  },  # tMAXt
+ { scale: 4.4250137e+01   bias: -1.623055e+00  },  # BWn
+ { scale: 2.4432852e-06   bias: -7.724349e-01  },  # MAXpressure
+ { scale: 2.6368040e-01   bias: -9.765773e-01  },  # BAte
+ { scale: 2.4198603e-01   bias: -9.856284e-01  },  # MAXtion
+ { scale: 1.4302059e+00   bias: -3.349900e+00  },  # tMAXpressure
+ { scale: 2.6368040e-01   bias: -9.765773e-01  },  # BAt
+ { scale: 7.1544386e-18   bias: -1.869906e-02  },  # Yn
+ { scale: 3.1669860e-03   bias: -1.869906e-02  },  # Ye
+ { scale: 2.1041247e-02   bias: -3.084058e-01  },  # Yx
+ { scale: 1.4901131e+00   bias: -3.493689e+00  },  # tMAXte
+ { scale: 2.6368040e-01   bias: -9.765773e-01  },  # BAtion
+ { scale: 2.4198603e-01   bias: -9.856284e-01  },  # MAXte
+ { scale: 1.4901131e+00   bias: -3.493689e+00  },  # tMAXtion
+ { scale: 1.3456596e+00   bias: -3.116023e+00  },  # BTx
+ { scale: 2.4198603e-01   bias: -9.856284e-01  },  # MAXt
+ { scale: 1.4593427e+00   bias: -3.426026e+00  },  # BTn
+ { scale: 3.0520000e-06   bias: -7.714907e-01  },  # BApressure
+ { scale: 1.3925443e+00   bias: -3.239921e+00  },  # tMINradius
+ { scale: 1.0023756e-01   bias: -2.815272e+00  }   # MINradius
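For orientation on the vae_fcn.prototext changes that follow: the kl_* layers assemble the standard Gaussian-VAE KL term and the sample_* layers implement the reparameterization trick. With mu = z_mean and log sigma^2 = z_log_sigma, the layer graph computes (standard formulas, not spelled out in the source)

\[
\mathrm{KL}\big(\mathcal{N}(\mu,\sigma^2)\,\Vert\,\mathcal{N}(0,I)\big)
= -\tfrac{1}{2}\sum_i \left(1 + \log\sigma_i^2 - \mu_i^2 - \sigma_i^2\right),
\qquad
z = \mu + e^{\frac{1}{2}\log\sigma^2} \odot \varepsilon,\quad \varepsilon \sim \mathcal{N}(0,I),
\]

where kl_z_mean2 (the new square layer) supplies mu^2, kl_exp (the new exp layer) supplies sigma^2, and on the sampling path sample_exp = exp(sample_half) gives sigma, which is multiplied into the Gaussian sample_noise and added to z_mean.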
parents: "sample_exp sample_noise" name: "sample_exp_noise" data_layout: "model_parallel" - hadamard {} + multiply {} } layer { parents: "z_mean sample_exp_noise" name: "sample" data_layout: "model_parallel" - sum {} + add {} } ###################### @@ -279,7 +278,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -305,7 +303,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -331,7 +328,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 256 - weight_initialization: "he_normal" has_bias: true } } @@ -358,7 +354,6 @@ model { data_layout: "model_parallel" num_neurons_from_data_reader: true fully_connected { - weight_initialization: "glorot_normal" has_bias: true } } @@ -374,10 +369,22 @@ model { ###################### layer { - parents: "sigmoid data" + parents: "sigmoid" name: "reconstruction" data_layout: "model_parallel" - reconstruction {} + split {} + } + layer { + parents: "reconstruction data" + name: "mean_squared_error" + data_layout: "model_parallel" + mean_squared_error {} + } + layer { + parents: "reconstruction data" + name: "binary_cross_entropy" + data_layout: "model_parallel" + binary_cross_entropy {} } ################################################### diff --git a/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext b/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext index a255cf7f940..0c6bc3af663 100644 --- a/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext +++ b/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext @@ -1,5 +1,4 @@ model { - name: "sequential_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 @@ -12,7 +11,7 @@ model { ################################################### objective_function { - cross_entropy {} + layer_term { layer: "cross_entropy" } l2_weight_regularization { scale_factor: 1e-4 } @@ -23,7 +22,11 @@ model { ################################################### metric { - categorical_accuracy {} + layer_metric { + name: "categorical accuracy" + layer: "top1_accuracy" + unit: "%" + } } ################################################### @@ -93,14 +96,27 @@ model { layer { name: "data" - children: "conv1 target" + children: "images labels" data_layout: "data_parallel" input { io_buffer: "partitioned" } } + layer { + name: "images" + parents: "data" + data_layout: "data_parallel" + split {} + } + layer { + name: "labels" + parents: "data" + data_layout: "data_parallel" + split {} + } layer { + parents: "images" name: "conv1" data_layout: "data_parallel" convolution { @@ -114,6 +130,7 @@ model { } layer { + parents: "conv1" name: "pool1" data_layout: "data_parallel" pooling { @@ -126,6 +143,7 @@ model { } layer { + parents: "pool1" name: "conv2" data_layout: "data_parallel" convolution { @@ -139,6 +157,7 @@ model { } layer { + parents: "conv2" name: "pool2" data_layout: "data_parallel" pooling { @@ -151,6 +170,7 @@ model { } layer { + parents: "pool2" name: "ip1" data_layout: "model_parallel" fully_connected { @@ -160,12 +180,14 @@ model { } layer { + parents: "ip1" name: "relu1" data_layout: "model_parallel" relu {} } layer { + parents: "relu1" name: "ip2" data_layout: "model_parallel" fully_connected { @@ -175,16 +197,24 @@ model { } layer { + parents: "ip2" name: "prob" - data_layout: "model_parallel" + data_layout: "data_parallel" softmax {} } layer { - parents: "prob data" - name: "target" + name: 
"cross_entropy" + parents: "prob labels" + data_layout: "data_parallel" + cross_entropy {} + } + + layer { + name: "top1_accuracy" + parents: "prob labels" data_layout: "data_parallel" - target {} + categorical_accuracy {} } } diff --git a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext index 6d03c9668ed..b1c55e8c8f6 100644 --- a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext +++ b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext @@ -35,8 +35,8 @@ model { } # callback { # save_images { - # image_dir: "images_" - # extension: "pgm" + # image_prefix: "images_" + # image_format: "pgm" # } # } diff --git a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext index 12c6c9893d4..70ce51e4fb8 100644 --- a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext +++ b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext @@ -32,8 +32,8 @@ model { } # callback { # save_images { - # image_dir: "images_" - # extension: "pgm" + # image_prefix: "images_" + # image_format: "pgm" # } # } diff --git a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext index 7c3d257c32c..a4d041d412e 100644 --- a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext +++ b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext @@ -32,8 +32,8 @@ model { } # callback { # save_images { - # image_dir: "images_" - # extension: "pgm" + # image_prefix: "images_" + # image_format: "pgm" # } # } ################################################### diff --git a/model_zoo/models/resnet50/model_resnet50.prototext b/model_zoo/models/resnet50/model_resnet50.prototext index 5dd08522f92..e0c83d90a00 100644 --- a/model_zoo/models/resnet50/model_resnet50.prototext +++ b/model_zoo/models/resnet50/model_resnet50.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" data_layout: "data_parallel" mini_batch_size: 256 block_size: 256 @@ -1893,7 +1892,7 @@ model { name: "top1_accuracy" parents: "prob labels" data_layout: "data_parallel" - top_k_categorical_accuracy { k: 1 } + categorical_accuracy {} } layer { name: "top5_accuracy" diff --git a/model_zoo/models/resnet50/model_resnet50_motif.prototext b/model_zoo/models/resnet50/model_resnet50_motif.prototext deleted file mode 100644 index 36d451ae468..00000000000 --- a/model_zoo/models/resnet50/model_resnet50_motif.prototext +++ /dev/null @@ -1,548 +0,0 @@ -motif_definitions { - #-------------------------------------------------------------- - # primary resnet motif; this is repeated 16 times for resnet 50 - #-------------------------------------------------------------- - motif { - name: "motif_resnet" - - #---------------------------------------------------------- - # input layer: split - #---------------------------------------------------------- - layer { - parents: "" - children: "branch1 branch2a" - name: "split" - 
data_layout: "data_parallel" - split { } - } - - #---------------------------------------------------------- - # final layers: sum -> relu - #---------------------------------------------------------- - layer { - name: "sum" - parents: "branch2c_bn branch1_bn" - children: "relu" - data_layout: "data_parallel" - sum { } - } - layer { - name: "relu" - parents: "sum" - children: "" - data_layout: "data_parallel" - relu { } - } - - #---------------------------------------------------------- - # branch1: conv -> batch_norm -> [sum] - #---------------------------------------------------------- - layer { - name: "branch1" - parents: "split" - children: "branch1_bn" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: -1 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 2 - has_bias: false - } - } - layer { - name: "branch1_bn" - parents: "branch1" - children: "sum" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - - #---------------------------------------------------------- - # branch2: conv -> batch_norm -> relu -> - # conv -> batch_norm -> relu -> - # conv -> batch_norm -> [sum] - #---------------------------------------------------------- - layer { - name: "branch2a" - parents: "split" - children: "branch2a_bn" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: -1 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "branch2a_bn" - parents: "branch2a" - children: "branch2a_relu" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "branch2a_relu" - parents: "branch2a_bn" - children: "branch2b" - data_layout: "data_parallel" - relu { } - } - - layer { - name: "branch2b" - parents: "branch2a_relu" - children: "branch2b_bn" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: -1 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "branch2b_bn" - parents: "branch2b" - children: "branch2b_relu" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "branch2b_relu" - parents: "branch2b_bn" - children: "branch2c" - data_layout: "data_parallel" - relu { } - } - layer { - name: "branch2c" - parents: "branch2b_relu" - children: "branch2c_bn" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: -1 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "branch2c_bn" - parents: "branch2c" - children: "sum" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - - } # motif_resnet - -} # motif_definitions - - -model { - name: "directed_acyclic_graph_model" - data_layout: "data_parallel" - mini_batch_size: 256 - block_size: 256 - num_epochs: 10 - num_parallel_readers: 0 - procs_per_model: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - cross_entropy {} - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - metric { categorical_accuracy {} } - metric { - top_k_categorical_accuracy { - 
top_k: 5 - } - } - - ################################################### - # Layers - ################################################### - - # conv1 - layer { - parents: "" - name: "data" - children: "conv1 target" - data_layout: "data_parallel" - input { - io_buffer: "partitioned" - } - } - layer { - parents: "data" - name: "conv1" - children: "bn_conv1" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 64 - conv_dims_i: 7 - conv_pads_i: 3 - conv_strides_i: 2 - has_bias: false - } - } - layer { - parents: "conv1" - name: "bn_conv1" - children: "conv1_relu" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "conv1_relu" - parents: "bn_conv1" - children: "pool1" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "pool1" - parents: "conv1_relu" - children: "res2a" - data_layout: "data_parallel" - pooling { - num_dims: 2 - pool_dims_i: 3 - pool_pads_i: 1 - pool_strides_i: 2 - pool_mode: "max" - } - } - - # res2a - layer { - name: "res2a" - parents: "pool1" - children: "res2b" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "branch1 num_output_channels 256" - variable: "branch1 conv_strides_i 1" - variable: "branch2a num_output_channels 64" - variable: "branch2b num_output_channels 64" - variable: "branch2c num_output_channels 256" - } - } - # res2b - layer { - name: "res2b" - parents: "res2a" - children: "res2c" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "do_not_use branch1" - variable: "do_not_use branch1_bn" - variable: "branch2a num_output_channels 64" - variable: "branch2b num_output_channels 64" - variable: "branch2c num_output_channels 256" - } - } - # res2c - layer { - name: "res2c" - parents: "res2b" - children: "res3a" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "do_not_use branch1" - variable: "do_not_use branch1_bn" - variable: "branch2a num_output_channels 64" - variable: "branch2b num_output_channels 64" - variable: "branch2c num_output_channels 256" - } - } - #================================================= - # res3a - layer { - name: "res3a" - parents: "res2c" - children: "res3b" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "branch1 num_output_channels 512" - variable: "branch2a num_output_channels 128" - variable: "branch2a conv_strides_i 2" - variable: "branch2b num_output_channels 128" - variable: "branch2c num_output_channels 512" - } - } - # res3b - layer { - name: "res3b" - parents: "res3a" - children: "res3c" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "do_not_use branch1" - variable: "do_not_use branch1_bn" - variable: "branch2a num_output_channels 128" - variable: "branch2b num_output_channels 128" - variable: "branch2c num_output_channels 512" - } - } - # res3c - layer { - name: "res3c" - parents: "res3b" - children: "res3d" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "do_not_use branch1" - variable: "do_not_use branch1_bn" - variable: "branch2a num_output_channels 128" - variable: "branch2b num_output_channels 128" - variable: "branch2c num_output_channels 512" - } - } - # res3d - layer { - name: "res3d" - parents: "res3c" - children: "res4a" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "do_not_use branch1" - variable: "do_not_use 
branch1_bn" - variable: "branch2a num_output_channels 128" - variable: "branch2b num_output_channels 128" - variable: "branch2c num_output_channels 512" - } - } - #========================================================== - # res4a - layer { - name: "res4a" - parents: "res3d" - children: "res4b" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "branch1 num_output_channels 1024" - variable: "branch2a num_output_channels 256" - variable: "branch2a conv_strides_i 2" - variable: "branch2b num_output_channels 256" - variable: "branch2c num_output_channels 1024" - } - } - # res4b - layer { - name: "res4b" - parents: "res4a" - children: "res4c" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "do_not_use branch1" - variable: "do_not_use branch1_bn" - variable: "branch2a num_output_channels 256" - variable: "branch2b num_output_channels 256" - variable: "branch2c num_output_channels 1024" - } - } - # res4c - layer { - name: "res4c" - parents: "res4b" - children: "res4d" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "do_not_use branch1" - variable: "do_not_use branch1_bn" - variable: "branch2a num_output_channels 256" - variable: "branch2b num_output_channels 256" - variable: "branch2c num_output_channels 1024" - } - } - # res4d - layer { - name: "res4d" - parents: "res4c" - children: "res4e" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "do_not_use branch1" - variable: "do_not_use branch1_bn" - variable: "branch2a num_output_channels 256" - variable: "branch2b num_output_channels 256" - variable: "branch2c num_output_channels 1024" - } - } - # res4e - layer { - name: "res4e" - parents: "res4d" - children: "res4f" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "do_not_use branch1" - variable: "do_not_use branch1_bn" - variable: "branch2a num_output_channels 256" - variable: "branch2b num_output_channels 256" - variable: "branch2c num_output_channels 1024" - } - } - # res4f - layer { - name: "res4f" - parents: "res4e" - children: "res5a" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "do_not_use branch1" - variable: "do_not_use branch1_bn" - variable: "branch2a num_output_channels 256" - variable: "branch2b num_output_channels 256" - variable: "branch2c num_output_channels 1024" - } - } - #====================================================================== - # res5a - layer { - name: "res5a" - parents: "res4f" - children: "res5b" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "branch1 num_output_channels 2048" - variable: "branch2a num_output_channels 512" - variable: "branch2a conv_strides_i 2" - variable: "branch2b num_output_channels 512" - variable: "branch2c num_output_channels 2048" - } - } - # res5b - layer { - name: "res5b" - parents: "res5a" - children: "res5c" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "do_not_use branch1" - variable: "do_not_use branch1_bn" - variable: "branch2a num_output_channels 512" - variable: "branch2b num_output_channels 512" - variable: "branch2c num_output_channels 2048" - } - } - # res5c - layer { - name: "res5c" - parents: "res5b" - children: "pool5" - data_layout: "data_parallel" - motif_layer { - motif_id: "motif_resnet" - variable: "do_not_use branch1" - variable: "do_not_use branch1_bn" - variable: "branch2a num_output_channels 512" - variable: 
"branch2b num_output_channels 512" - variable: "branch2c num_output_channels 2048" - } - } - - #================================================================ - # Inference - layer { - name: "pool5" - parents: "res5c" - children: "fc1000" - data_layout: "data_parallel" - pooling { - num_dims: 2 - pool_dims_i: 7 - pool_pads_i: 0 - pool_strides_i: 1 - pool_mode: "average" - } - } - layer { - parents: "pool5" - name: "fc1000" - children: "prob" - data_layout: "model_parallel" - fully_connected { - num_neurons: 1000 - has_bias: false - } - } - layer { - parents: "fc1000" - name: "prob" - children: "target" - data_layout: "model_parallel" - softmax { - } - } - layer { - parents: "prob data" - name: "target" - children: "" - data_layout: "data_parallel" - target {} - } -} diff --git a/model_zoo/models/resnet50/model_resnet50_sequential.prototext b/model_zoo/models/resnet50/model_resnet50_sequential.prototext deleted file mode 100644 index 2a8d316ecbf..00000000000 --- a/model_zoo/models/resnet50/model_resnet50_sequential.prototext +++ /dev/null @@ -1,2053 +0,0 @@ -model { - name: "sequential_model" - data_layout: "data_parallel" - mini_batch_size: 256 - block_size: 256 - num_epochs: 10 - num_parallel_readers: 0 - procs_per_model: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - cross_entropy {} - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - metric { categorical_accuracy {} } - metric { - top_k_categorical_accuracy { - top_k: 5 - } - } - - ################################################### - # Callbacks - ################################################### - callback { - print { - interval: 1 - } - } - callback { - timer { - } - } - callback { - summary { - dir: "." 
- batch_interval: 1 - mat_interval: 25 - } - } - callback { - imcomm { - intermodel_comm_method: "normal" - all_optimizers: true - } - } - # callback { gradient_check {} } - - ################################################### - # Layers - ################################################### - - layer { - name: "0" - parents: "0" - children: "1 184" - data_layout: "data_parallel" - input { - io_buffer: "partitioned" - } - } - layer { - name: "1" - parents: "0" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 64 - conv_dims_i: 7 - conv_pads_i: 3 - conv_strides_i: 2 - has_bias: false - } - } - layer { - name: "2" - parents: "1" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "3" - parents: "2" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "4" - parents: "3" - children: "" - data_layout: "data_parallel" - pooling { - num_dims: 2 - pool_dims_i: 3 - pool_pads_i: 1 - pool_strides_i: 2 - pool_mode: "max" - } - } - layer { - name: "5" - parents: "4" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 320 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "6" - parents: "5" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "7" - parents: "6" - children: "14 8" - data_layout: "data_parallel" - slice { - slice_axis: 0 - slice_points: "0 256 320" - } - } - layer { - name: "8" - parents: "7" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "9" - parents: "8" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 64 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "10" - parents: "9" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "11" - parents: "10" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "12" - parents: "11" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 256 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "13" - parents: "12" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "14" - parents: "13 7" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "15" - parents: "14" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "16" - parents: "15" - children: "17 25" - data_layout: "data_parallel" - split { - } - } - layer { - name: "17" - parents: "16" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 64 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "18" - parents: "17" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "19" - parents: "18" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "20" - parents: "19" - children: "" - data_layout: 
"data_parallel" - convolution { - num_dims: 2 - num_output_channels: 64 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "21" - parents: "20" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "22" - parents: "21" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "23" - parents: "22" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 256 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "24" - parents: "23" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "25" - parents: "24 16" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "26" - parents: "25" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "27" - parents: "26" - children: "28 36" - data_layout: "data_parallel" - split { - } - } - layer { - name: "28" - parents: "27" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 64 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "29" - parents: "28" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "30" - parents: "29" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "31" - parents: "30" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 64 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "32" - parents: "31" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "33" - parents: "32" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "34" - parents: "33" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 256 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "35" - parents: "34" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "36" - parents: "35 27" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "37" - parents: "36" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "38" - parents: "37" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 640 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 2 - has_bias: false - } - } - layer { - name: "39" - parents: "38" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "40" - parents: "39" - children: "47 41" - data_layout: "data_parallel" - slice { - slice_axis: 0 - slice_points: "0 512 640" - } - } - layer { - name: "41" - parents: "40" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "42" - parents: "41" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - 
num_output_channels: 128 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "43" - parents: "42" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "44" - parents: "43" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "45" - parents: "44" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 512 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "46" - parents: "45" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "47" - parents: "46 40" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "48" - parents: "47" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "49" - parents: "48" - children: "50 58" - data_layout: "data_parallel" - split { - } - } - layer { - name: "50" - parents: "49" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 128 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "51" - parents: "50" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "52" - parents: "51" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "53" - parents: "52" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 128 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "54" - parents: "53" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "55" - parents: "54" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "56" - parents: "55" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 512 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "57" - parents: "56" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "58" - parents: "57 49" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "59" - parents: "58" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "60" - parents: "59" - children: "61 69" - data_layout: "data_parallel" - split { - } - } - layer { - name: "61" - parents: "60" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 128 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "62" - parents: "61" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "63" - parents: "62" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "64" - parents: "63" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 128 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } 
- } - layer { - name: "65" - parents: "64" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "66" - parents: "65" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "67" - parents: "66" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 512 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "68" - parents: "67" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "69" - parents: "68 60" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "70" - parents: "69" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "71" - parents: "70" - children: "72 80" - data_layout: "data_parallel" - split { - } - } - layer { - name: "72" - parents: "71" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 128 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "73" - parents: "72" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "74" - parents: "73" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "75" - parents: "74" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 128 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "76" - parents: "75" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "77" - parents: "76" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "78" - parents: "77" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 512 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "79" - parents: "78" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "80" - parents: "79 71" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "81" - parents: "80" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "82" - parents: "81" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 1280 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 2 - has_bias: false - } - } - layer { - name: "83" - parents: "82" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "84" - parents: "83" - children: "91 85" - data_layout: "data_parallel" - slice { - slice_axis: 0 - slice_points: "0 1024 1280" - } - } - layer { - name: "85" - parents: "84" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "86" - parents: "85" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 256 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "87" - parents: "86" - 
children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "88" - parents: "87" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "89" - parents: "88" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 1024 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "90" - parents: "89" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "91" - parents: "90 84" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "92" - parents: "91" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "93" - parents: "92" - children: " 94 102" - data_layout: "data_parallel" - split { - } - } - layer { - name: "94" - parents: "93" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 256 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "95" - parents: "94" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "96" - parents: "95" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "97" - parents: "96" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 256 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "98" - parents: "97" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "99" - parents: "98" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "100" - parents: "99" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 1024 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "101" - parents: "100" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "102" - parents: "101 93" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "103" - parents: "102" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "104" - parents: "103" - children: "105 113" - data_layout: "data_parallel" - split { - } - } - layer { - name: "105" - parents: "104" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 256 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "106" - parents: "105" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "107" - parents: "106" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "108" - parents: "107" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 256 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "109" - parents: "108" - children: "" - data_layout: "data_parallel" - batch_normalization { - 
decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "110" - parents: "109" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "111" - parents: "110" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 1024 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "112" - parents: "111" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "113" - parents: "112 104" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "114" - parents: "113" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "115" - parents: "114" - children: "116 124" - data_layout: "data_parallel" - split { - } - } - layer { - name: "116" - parents: "115" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 256 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "117" - parents: "116" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "118" - parents: "117" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "119" - parents: "118" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 256 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "120" - parents: "119" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "121" - parents: "120" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "122" - parents: "121" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 1024 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "123" - parents: "122" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "124" - parents: "123 115" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "125" - parents: "124" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "126" - parents: "125" - children: "127 135" - data_layout: "data_parallel" - split { - } - } - layer { - name: "127" - parents: "126" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 256 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "128" - parents: "127" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "129" - parents: "128" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "130" - parents: "129" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 256 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "131" - parents: "130" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - 
epsilon: 1e-5 - } - } - layer { - name: "132" - parents: "131" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "133" - parents: "132" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 1024 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "134" - parents: "133" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "135" - parents: "134 126" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "136" - parents: "135" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "137" - parents: "136" - children: "138 146" - data_layout: "data_parallel" - split { - } - } - layer { - name: "138" - parents: "137" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 256 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "139" - parents: "138" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "140" - parents: "139" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "141" - parents: "140" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 256 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "142" - parents: "141" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "143" - parents: "142" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "144" - parents: "143" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 1024 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "145" - parents: "144" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "146" - parents: "145 137" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "147" - parents: "146" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "148" - parents: "147" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 2560 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 2 - has_bias: false - } - } - layer { - name: "149" - parents: "148" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "150" - parents: "149" - children: "157 151" - data_layout: "data_parallel" - slice { - slice_axis: 0 - slice_points: "0 2048 2560" - } - } - layer { - name: "151" - parents: "150" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "152" - parents: "151" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 512 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "153" - parents: "152" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - 
epsilon: 1e-5 - } - } - layer { - name: "154" - parents: "153" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "155" - parents: "154" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 2048 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "156" - parents: "155" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "157" - parents: "156 150" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "158" - parents: "157" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "159" - parents: "158" - children: "160 168" - data_layout: "data_parallel" - split { - } - } - layer { - name: "160" - parents: "159" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 512 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "161" - parents: "160" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "162" - parents: "161" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "163" - parents: "162" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 512 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "164" - parents: "163" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "165" - parents: "164" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "166" - parents: "165" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 2048 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "167" - parents: "166" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "168" - parents: "167 159" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "169" - parents: "168" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "170" - parents: "169" - children: "171 179" - data_layout: "data_parallel" - split { - } - } - layer { - name: "171" - parents: "170" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 512 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "172" - parents: "171" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "173" - parents: "172" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "174" - parents: "173" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 512 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "175" - parents: "174" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "176" - 
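The ResNet prototext being deleted above repeats a single bottleneck pattern: a 1x1 convolution, batch norm, ReLU, a 3x3 convolution, batch norm, ReLU, a 1x1 convolution, batch norm, then a "sum" layer that merges the branch with the skip connection before a final ReLU. As a minimal sketch (helper name and numeric layer ids are illustrative, matching the deleted file's naming scheme), the block can be generated rather than hand-written:

    # Sketch: emit one bottleneck block in the style of the deleted ResNet
    # prototext (1x1 -> 3x3 -> 1x1 convolutions with batch norm, identity
    # skip merged by a "sum" layer). Layer names here are invented numeric
    # ids for illustration only.
    def bottleneck(first_id, skip_parent, mid_channels, out_channels):
        blocks = []
        def layer(name, parents, body):
            blocks.append(
                'layer {\n  name: "%s"\n  parents: "%s"\n'
                '  data_layout: "data_parallel"\n  %s\n}' % (name, parents, body))
        conv = ('convolution { num_dims: 2 num_output_channels: %d '
                'conv_dims_i: %d conv_pads_i: %d conv_strides_i: 1 has_bias: false }')
        bn = ('batch_normalization { decay: 0.9 scale_init: 1.0 '
              'bias_init: 0.0 epsilon: 1e-5 }')
        i = first_id
        layer(str(i),     skip_parent, conv % (mid_channels, 1, 0))
        layer(str(i + 1), str(i),      bn)
        layer(str(i + 2), str(i + 1),  'relu { }')
        layer(str(i + 3), str(i + 2),  conv % (mid_channels, 3, 1))
        layer(str(i + 4), str(i + 3),  bn)
        layer(str(i + 5), str(i + 4),  'relu { }')
        layer(str(i + 6), str(i + 5),  conv % (out_channels, 1, 0))
        layer(str(i + 7), str(i + 6),  bn)
        # The sum layer has two parents: the conv branch and the skip path.
        layer(str(i + 8), '%d %s' % (i + 7, skip_parent), 'sum { }')
        layer(str(i + 9), str(i + 8), 'relu { }')
        return '\n'.join(blocks)

    # Reproduces the deleted layers 94-103 (sum parents "101 93"):
    print(bottleneck(94, '93', 256, 1024))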
parents: "175" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "177" - parents: "176" - children: "" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 2048 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - } - layer { - name: "178" - parents: "177" - children: "" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 - scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - name: "179" - parents: "178 170" - children: "" - data_layout: "data_parallel" - sum { - } - } - layer { - name: "180" - parents: "179" - children: "" - data_layout: "data_parallel" - relu { - } - } - layer { - name: "181" - parents: "180" - children: "" - data_layout: "data_parallel" - pooling { - num_dims: 2 - pool_dims_i: 7 - pool_pads_i: 0 - pool_strides_i: 1 - pool_mode: "average" - } - } - layer { - name: "182" - parents: "181" - children: "" - data_layout: "model_parallel" - fully_connected { - num_neurons: 1000 - has_bias: false - } - } - layer { - name: "183" - parents: "182" - children: "" - data_layout: "model_parallel" - softmax { - } - } - layer { - name: "184" - parents: "183 0" - children: "" - data_layout: "data_parallel" - target {} - } -} diff --git a/model_zoo/models/siamese/finetune-cub/data_reader_cub.prototext b/model_zoo/models/siamese/finetune-cub/data_reader_cub.prototext index 202009f060c..038f2dadc39 100644 --- a/model_zoo/models/siamese/finetune-cub/data_reader_cub.prototext +++ b/model_zoo/models/siamese/finetune-cub/data_reader_cub.prototext @@ -14,6 +14,7 @@ data_reader { image_preprocessor { raw_width: 256 raw_height: 256 + raw_num_channels: 3 cropper { disable: false @@ -57,6 +58,7 @@ data_reader { image_preprocessor { raw_width: 256 raw_height: 256 + raw_num_channels: 3 cropper { disable: false diff --git a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext new file mode 100644 index 00000000000..d7e39ab2fcd --- /dev/null +++ b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext @@ -0,0 +1,964 @@ +model { + name: "directed_acyclic_graph_model" + data_layout: "data_parallel" + mini_batch_size: 64 + block_size: 256 + num_epochs: 50 + num_parallel_readers: 0 + procs_per_model: 0 + + ################################################### + # Objective function + ################################################### + + objective_function { + cross_entropy {} + l2_weight_regularization { + scale_factor: 0.0005 + } + } + + ################################################### + # Metrics + ################################################### + + metric { categorical_accuracy {} } + metric { + top_k_categorical_accuracy { + top_k: 5 + } + } + + ################################################### + # Callbacks + ################################################### + callback { + imcomm { + intermodel_comm_method: "normal" + all_optimizers: true + } + } + callback { print {} } + callback { timer {} } + callback { + poly_learning_rate { + power: 0.5 + } + } + + ################################################### + # start of weights + ################################################### + + # The weights of the layers conv1_head0, conv2_head0, conv3_head0, conv4_head0, and conv5_head0 + # will be initialized as described here but overwritten with pretrained ones. 
+ # The optimizer states may not be transferred if lbann2 is used. + # The weights of the rest learning layers will be initialized as described below and trained fresh. + + weights { + name: "conv1_kernel" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.01 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "conv1_bias" + constant_initializer { + value: 0.0 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "conv2_kernel" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.01 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "conv2_bias" + constant_initializer { + value: 0.1 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "conv3_kernel" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.01 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "conv3_bias" + constant_initializer { + value: 0.0 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "conv4_kernel" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.01 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "conv4_bias" + constant_initializer { + value: 0.1 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "conv5_kernel" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.01 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "conv5_bias" + constant_initializer { + value: 0.1 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "conv6_new_kernel" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.01 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "conv6_new_bias" + constant_initializer { + value: 0.1 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "conv6b_new_kernel" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.01 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "conv6b_new_bias" + constant_initializer { + value: 0.1 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "fc7_new_linearity" + normal_initializer { + mean: 0.0 + standard_deviation: 0.005 + } + optimizer { + sgd { + learn_rate: 0.01 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "fc7_new_bias" + constant_initializer { + value: 0.1 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "fc8_new_linearity" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.01 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + 
name: "fc8_new_bias" + constant_initializer { + value: 0.0 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + ################################################################### + # weights of batch normalization layers shared among Siamese heads + ################################################################### + + weights { + name: "bn_conv1_scale" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv1_bias" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv1_running_mean" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv1_running_variance" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv2_scale" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv2_bias" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv2_running_mean" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv2_running_variance" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv3_scale" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv3_bias" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv3_running_mean" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv3_running_variance" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv4_scale" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv4_bias" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv4_running_mean" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv4_running_variance" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv5_scale" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv5_bias" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv5_running_mean" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv5_running_variance" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + ################################################### + # start of replicate layers + ################################################### + + layer { + name: "data_new" + children: "conv1_head0 target_new" + data_layout: "data_parallel" + input { + io_buffer: "partitioned" + } + } + + layer { + parents: "data_new" + name: "conv1_head0" + children: "bn_conv1_head0" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 96 + conv_dims: "11 11" + conv_pads: "5 5" + conv_strides: "4 4" + has_bias: true + has_vectors: true + } + weights: "conv1_kernel conv1_bias" + } + + layer { + parents: "conv1_head0" + name: "bn_conv1_head0" + children: "relu1_new" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv1_scale bn_conv1_bias bn_conv1_running_mean bn_conv1_running_variance" + } + + layer { + parents: "bn_conv1_head0" + name: "relu1_new" + children: "pool1_new" + data_layout: 
"data_parallel" + relu {} + } + + layer { + parents: "relu1_new" + name: "pool1_new" + children: "conv2_head0" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims: "3 3" + pool_pads: "0 0" + pool_strides: "2 2" + pool_mode: "max" + has_vectors: true + } + } + + layer { + parents: "pool1_new" + name: "conv2_head0" + children: "bn_conv2_head0" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 256 + conv_dims: "5 5" + conv_pads: "2 2" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv2_kernel conv2_bias" + } + + layer { + parents: "conv2_head0" + name: "bn_conv2_head0" + children: "relu2_new" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv2_scale bn_conv2_bias bn_conv2_running_mean bn_conv2_running_variance" + } + + layer { + parents: "bn_conv2_head0" + name: "relu2_new" + children: "pool2_new" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu2_new" + name: "pool2_new" + children: "conv3_head0" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims: "3 3" + pool_pads: "0 0" + pool_strides: "2 2" + pool_mode: "max" + has_vectors: true + } + } + + layer { + parents: "pool2_new" + name: "conv3_head0" + children: "bn_conv3_head0" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 384 + conv_dims: "3 3" + conv_pads: "1 1" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv3_kernel conv3_bias" + } + + layer { + parents: "conv3_head0" + name: "bn_conv3_head0" + children: "relu3_new" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv3_scale bn_conv3_bias bn_conv3_running_mean bn_conv3_running_variance" + } + + layer { + parents: "bn_conv3_head0" + name: "relu3_new" + children: "conv4_head0" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu3_new" + name: "conv4_head0" + children: "bn_conv4_head0" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 384 + conv_dims: "3 3" + conv_pads: "1 1" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv4_kernel conv4_bias" + } + + layer { + parents: "conv4_head0" + name: "bn_conv4_head0" + children: "relu4_new" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv4_scale bn_conv4_bias bn_conv4_running_mean bn_conv4_running_variance" + } + + layer { + parents: "bn_conv4_head0" + name: "relu4_new" + children: "conv5_head0" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu4_new" + name: "conv5_head0" + children: "bn_conv5_head0" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 256 + conv_dims: "3 3" + conv_pads: "1 1" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv5_kernel conv5_bias" + } + + layer { + parents: "conv5_head0" + name: "bn_conv5_head0" + children: "relu5_new" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv5_scale bn_conv5_bias bn_conv5_running_mean bn_conv5_running_variance" + } + + layer { + parents: 
"bn_conv5_head0" + name: "relu5_new" + children: "pool5_new" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu5_new" + name: "pool5_new" + children: "conv6_new" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims: "3 3" + pool_pads: "0 0" + pool_strides: "2 2" + pool_mode: "max" + has_vectors: true + } + } + + ################################################### + # end of replicate layers + ################################################### + ###################################### + # Start of Doersch Layer 6 + ###################################### + + layer { + parents: "pool5_new" + name: "conv6_new" + children: "bn_conv6_new" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 4096 + conv_dims: "3 3" + conv_pads: "1 1" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv6_new_kernel conv6_new_bias" + } + + layer { + parents: "conv6_new" + name: "bn_conv6_new" + children: "relu6_new" + data_layout: "data_parallel" + freeze: false + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + } + + layer { + parents: "bn_conv6_new" + name: "relu6_new" + children: "conv6b_new" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu6_new" + name: "conv6b_new" + children: "bn_conv6b_new" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 1024 + conv_dims: "1 1" + conv_pads: "1 1" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv6b_new_kernel conv6b_new_bias" + } + + layer { + parents: "conv6b_new" + name: "bn_conv6b_new" + children: "relu6b_new" + data_layout: "data_parallel" + freeze: false + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + } + + layer { + parents: "bn_conv6b_new" + name: "relu6b_new" + children: "pool6_new" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu6b_new" + name: "pool6_new" + children: "fc7_new" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims: "3 3" + pool_pads: "0 0" + pool_strides: "2 2" + pool_mode: "max" + has_vectors: true + } + } + + ###################################### + # End of Doersch Layer 6 + ###################################### + + layer { + parents: "pool6_new" + name: "fc7_new" + children: "bn_fc7_new" + data_layout: "data_parallel" + fully_connected { + num_neurons: 4096 + has_bias: true + } + weights: "fc7_new_linearity fc7_new_bias" + } + + layer { + parents: "fc7_new" + name: "bn_fc7_new" + children: "relu7_new" + data_layout: "data_parallel" + freeze: false + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + } + + layer { + parents: "bn_fc7_new" + name: "relu7_new" + children: "drop7_new" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu7_new" + name: "drop7_new" + children: "fc8_new" + data_layout: "data_parallel" + dropout { + keep_prob: 0.9 + } + } + + layer { + parents: "drop7_new" + name: "fc8_new" + children: "prob_new" + data_layout: "data_parallel" + fully_connected { + # The number of outputs specific to the dataset used. + # E.g., 200 for CUB, and 431 for CompCars. 
+ num_neurons_is_num_labels: true + has_bias: false + } + weights: "fc8_new_linearity fc8_new_bias" + } + + layer { + parents: "fc8_new" + name: "prob_new" + children: "target_new" + data_layout: "data_parallel" + softmax {} + } + + layer { + parents: "prob_new data_new" + name: "target_new" + data_layout: "data_parallel" + target {} + } + +} diff --git a/model_zoo/models/siamese/triplet/data_reader_triplet.prototext b/model_zoo/models/siamese/triplet/data_reader_triplet.prototext index 0d609866940..baa90c196bf 100644 --- a/model_zoo/models/siamese/triplet/data_reader_triplet.prototext +++ b/model_zoo/models/siamese/triplet/data_reader_triplet.prototext @@ -14,6 +14,7 @@ data_reader { image_preprocessor { raw_width: 110 raw_height: 110 + raw_num_channels: 3 cropper { disable: false @@ -61,6 +62,7 @@ data_reader { image_preprocessor { raw_width: 110 raw_height: 110 + raw_num_channels: 3 cropper { disable: false diff --git a/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext b/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext new file mode 100644 index 00000000000..7be81a09ea6 --- /dev/null +++ b/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext @@ -0,0 +1,1558 @@ +model { + name: "directed_acyclic_graph_model" + data_layout: "data_parallel" + mini_batch_size: 128 + block_size: 256 + num_epochs: 1 + num_parallel_readers: 0 + procs_per_model: 0 + + ################################################### + # Objective function + ################################################### + + objective_function { + cross_entropy {} + l2_weight_regularization { + scale_factor: 0.0005 + } + } + + ################################################### + # Metrics + ################################################### + + metric { categorical_accuracy {} } + metric { + top_k_categorical_accuracy { + top_k: 2 + } + } + + ################################################### + # Callbacks + ################################################### + callback { + imcomm { + intermodel_comm_method: "normal" + all_optimizers: true + } + } + callback { print {} } + callback { timer {} } + callback { + poly_learning_rate { + power: 0.5 + } + } + callback { + checkpoint { + checkpoint_dir: "pretrain-stage12" + checkpoint_epochs: 1 + checkpoint_steps: 5000 + } + } + + ################################################### + # start of weights + ################################################### + # In general it is not necessary to explicitly describe weights whether they are shared or not. + # Here, we do so to apply a specific initialization method for each weight. 
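Because each explicit `weights` block that follows differs only in its name, initializer, and SGD learn rate, the boilerplate can be stamped out mechanically. A minimal sketch, assuming a generator script in the style of model_zoo/models/vram/generate_dram.py (the rate and stddev values simply mirror the blocks below; note the fully connected weights use a `_linearity` suffix and fc6 uses stddev 0.005):

    # Sketch: generate the repetitive kernel/linearity weights blocks that
    # follow. Only the name, learn rate, and initializer stddev vary.
    TEMPLATE = '''weights {
      name: "%(name)s"
      normal_initializer {
        mean: 0.0
        standard_deviation: %(stddev)s
      }
      optimizer {
        sgd {
          learn_rate: %(lr)s
          momentum: 0.9
          decay_rate: 0.0002
          nesterov: false
        }
      }
    }'''

    def weights_block(name, lr, stddev=0.01):
        return TEMPLATE % {'name': name, 'lr': lr, 'stddev': stddev}

    print(weights_block('conv1_kernel', 0.007))
    print(weights_block('fc6_linearity', 0.008, stddev=0.005))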
+ + weights { + name: "conv1_kernel" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.007 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "conv1_bias" + constant_initializer { + value: 0.0 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "conv2_kernel" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.008 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "conv2_bias" + constant_initializer { + value: 0.1 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "conv3_kernel" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.009 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "conv3_bias" + constant_initializer { + value: 0.0 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "conv4_kernel" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.01 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "conv4_bias" + constant_initializer { + value: 0.1 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "conv5_kernel" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.009 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "conv5_bias" + constant_initializer { + value: 0.1 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "fc6_linearity" + normal_initializer { + mean: 0.0 + standard_deviation: 0.005 + } + optimizer { + sgd { + learn_rate: 0.008 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "fc6_bias" + constant_initializer { + value: 0.1 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "fc7_linearity" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.007 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "fc7_bias" + constant_initializer { + value: 0.0 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "fc8_linearity" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.006 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "fc8_bias" + constant_initializer { + value: 0.0 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + weights { + name: "fc9_linearity" + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } + optimizer { + sgd { + learn_rate: 0.005 + momentum: 0.9 + decay_rate: 0.0002 + nesterov: false + } + } + } + + weights { + name: "fc9_bias" + constant_initializer { + value: 0.0 + } + optimizer { + sgd { + learn_rate: 0.02 + momentum: 0.9 + decay_rate: 0 + nesterov: false + } + } + } + + 
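The `poly_learning_rate` callback earlier in this model uses `power: 0.5`. Assuming the usual Caffe-style polynomial decay, lr(t) = base_lr * (1 - t/T)^power (LBANN's exact parameterization of the end step T may differ), the schedule looks like this:

    # Polynomial learning-rate decay as commonly defined; assumption: the
    # poly_learning_rate callback follows this Caffe-style form, with T the
    # total number of steps.
    def poly_lr(base_lr, step, max_steps, power=0.5):
        return base_lr * (1.0 - float(step) / max_steps) ** power

    # e.g. the conv1 kernel rate above, halfway through training:
    print(poly_lr(0.007, step=5000, max_steps=10000))  # ~0.00495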
################################################################### + # weights of batch normalization layers shared among Siamese heads + ################################################################### + + weights { + name: "bn_conv1_scale" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv1_bias" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv1_running_mean" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv1_running_variance" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv2_scale" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv2_bias" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv2_running_mean" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv2_running_variance" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv3_scale" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv3_bias" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv3_running_mean" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv3_running_variance" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv4_scale" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv4_bias" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv4_running_mean" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv4_running_variance" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv5_scale" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_conv5_bias" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv5_running_mean" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_conv5_running_variance" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_fc6_scale" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + + weights { + name: "bn_fc6_bias" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_fc6_running_mean" + constant_initializer { + value: 0.0 + } + optimizer {} + } + + + weights { + name: "bn_fc6_running_variance" + constant_initializer { + value: 1.0 + } + optimizer {} + } + + ################################################### + # start of layers + ################################################### + + layer { + name: "data" + children: "slice target" + data_layout: "data_parallel" + input { + io_buffer: "partitioned" + } + } + + layer { + parents: "data" + name: "slice" + children: "conv1_head0 conv1_head1 conv1_head2" + data_layout: "data_parallel" + slice { + slice_axis: 0 + slice_points: "0 3 6 9" + } + } + + #### Siamese head 0 begins #### + + layer { + parents: "slice" + name: "conv1_head0" + children: "bn_conv1_head0" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 96 + conv_dims: "11 11" + conv_pads: "0 0" + conv_strides: "4 4" + has_bias: true + has_vectors: true + } + weights: "conv1_kernel conv1_bias" + } + + layer { 
+ parents: "conv1_head0" + name: "bn_conv1_head0" + children: "relu1_head0" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv1_scale bn_conv1_bias bn_conv1_running_mean bn_conv1_running_variance" + } + + layer { + parents: "bn_conv1_head0" + name: "relu1_head0" + children: "pool1_head0" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu1_head0" + name: "pool1_head0" + children: "conv2_head0" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims: "3 3" + pool_pads: "0 0" + pool_strides: "2 2" + pool_mode: "max" + has_vectors: true + } + } + + layer { + parents: "pool1_head0" + name: "conv2_head0" + children: "bn_conv2_head0" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 256 + conv_dims: "5 5" + conv_pads: "2 2" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv2_kernel conv2_bias" + } + + layer { + parents: "conv2_head0" + name: "bn_conv2_head0" + children: "relu2_head0" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv2_scale bn_conv2_bias bn_conv2_running_mean bn_conv2_running_variance" + } + + layer { + parents: "bn_conv2_head0" + name: "relu2_head0" + children: "pool2_head0" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu2_head0" + name: "pool2_head0" + children: "conv3_head0" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims: "3 3" + pool_pads: "0 0" + pool_strides: "2 2" + pool_mode: "max" + has_vectors: true + } + } + + layer { + parents: "pool2_head0" + name: "conv3_head0" + children: "bn_conv3_head0" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 384 + conv_dims: "3 3" + conv_pads: "1 1" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv3_kernel conv3_bias" + } + + layer { + parents: "conv3_head0" + name: "bn_conv3_head0" + children: "relu3_head0" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv3_scale bn_conv3_bias bn_conv3_running_mean bn_conv3_running_variance" + } + + layer { + parents: "bn_conv3_head0" + name: "relu3_head0" + children: "conv4_head0" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu3_head0" + name: "conv4_head0" + children: "bn_conv4_head0" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 384 + conv_dims: "3 3" + conv_pads: "1 1" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv4_kernel conv4_bias" + } + + layer { + parents: "conv4_head0" + name: "bn_conv4_head0" + children: "relu4_head0" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv4_scale bn_conv4_bias bn_conv4_running_mean bn_conv4_running_variance" + } + + layer { + parents: "bn_conv4_head0" + name: "relu4_head0" + children: "conv5_head0" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu4_head0" + name: "conv5_head0" + children: "bn_conv5_head0" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 256 + conv_dims: "3 3" + conv_pads: "1 1" + 
conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv5_kernel conv5_bias" + } + + layer { + parents: "conv5_head0" + name: "bn_conv5_head0" + children: "relu5_head0" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv5_scale bn_conv5_bias bn_conv5_running_mean bn_conv5_running_variance" + } + + layer { + parents: "bn_conv5_head0" + name: "relu5_head0" + children: "pool3_head0" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu5_head0" + name: "pool3_head0" + children: "fc6_head0" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims: "3 3" + pool_pads: "0 0" + pool_strides: "2 2" + pool_mode: "max" + has_vectors: true + } + } + + layer { + parents: "pool3_head0" + name: "fc6_head0" + children: "bn_fc6_head0" + data_layout: "data_parallel" + fully_connected { + num_neurons: 4096 + has_bias: true + } + weights: "fc6_linearity fc6_bias" + } + + layer { + parents: "fc6_head0" + name: "bn_fc6_head0" + children: "relu6_head0" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_fc6_scale bn_fc6_bias bn_fc6_running_mean bn_fc6_running_variance" + } + + layer { + parents: "bn_fc6_head0" + name: "relu6_head0" + children: "concatenation" + data_layout: "data_parallel" + relu {} + } + + #### Siamese head 0 ends #### + + #### Siamese head 1 begins #### + + layer { + parents: "slice" + name: "conv1_head1" + children: "bn_conv1_head1" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 96 + conv_dims: "11 11" + conv_pads: "0 0" + conv_strides: "4 4" + has_bias: true + has_vectors: true + } + weights: "conv1_kernel conv1_bias" + } + + layer { + parents: "conv1_head1" + name: "bn_conv1_head1" + children: "relu1_head1" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv1_scale bn_conv1_bias bn_conv1_running_mean bn_conv1_running_variance" + } + + layer { + parents: "bn_conv1_head1" + name: "relu1_head1" + children: "pool1_head1" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu1_head1" + name: "pool1_head1" + children: "conv2_head1" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims: "3 3" + pool_pads: "0 0" + pool_strides: "2 2" + pool_mode: "max" + has_vectors: true + } + } + + layer { + parents: "pool1_head1" + name: "conv2_head1" + children: "bn_conv2_head1" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 256 + conv_dims: "5 5" + conv_pads: "2 2" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv2_kernel conv2_bias" + } + + layer { + parents: "conv2_head1" + name: "bn_conv2_head1" + children: "relu2_head1" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv2_scale bn_conv2_bias bn_conv2_running_mean bn_conv2_running_variance" + } + + layer { + parents: "bn_conv2_head1" + name: "relu2_head1" + children: "pool2_head1" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu2_head1" + name: "pool2_head1" + children: "conv3_head1" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims: 
"3 3" + pool_pads: "0 0" + pool_strides: "2 2" + pool_mode: "max" + has_vectors: true + } + } + + layer { + parents: "pool2_head1" + name: "conv3_head1" + children: "bn_conv3_head1" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 384 + conv_dims: "3 3" + conv_pads: "1 1" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv3_kernel conv3_bias" + } + + layer { + parents: "conv3_head1" + name: "bn_conv3_head1" + children: "relu3_head1" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv3_scale bn_conv3_bias bn_conv3_running_mean bn_conv3_running_variance" + } + + layer { + parents: "bn_conv3_head1" + name: "relu3_head1" + children: "conv4_head1" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu3_head1" + name: "conv4_head1" + children: "bn_conv4_head1" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 384 + conv_dims: "3 3" + conv_pads: "1 1" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv4_kernel conv4_bias" + } + + layer { + parents: "conv4_head1" + name: "bn_conv4_head1" + children: "relu4_head1" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv4_scale bn_conv4_bias bn_conv4_running_mean bn_conv4_running_variance" + } + + layer { + parents: "bn_conv4_head1" + name: "relu4_head1" + children: "conv5_head1" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu4_head1" + name: "conv5_head1" + children: "bn_conv5_head1" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 256 + conv_dims: "3 3" + conv_pads: "1 1" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv5_kernel conv5_bias" + } + + layer { + parents: "conv5_head1" + name: "bn_conv5_head1" + children: "relu5_head1" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv5_scale bn_conv5_bias bn_conv5_running_mean bn_conv5_running_variance" + } + + layer { + parents: "bn_conv5_head1" + name: "relu5_head1" + children: "pool3_head1" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu5_head1" + name: "pool3_head1" + children: "fc6_head1" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims: "3 3" + pool_pads: "0 0" + pool_strides: "2 2" + pool_mode: "max" + has_vectors: true + } + } + + layer { + parents: "pool3_head1" + name: "fc6_head1" + children: "bn_fc6_head1" + data_layout: "data_parallel" + fully_connected { + num_neurons: 4096 + has_bias: true + } + weights: "fc6_linearity fc6_bias" + } + + layer { + parents: "fc6_head1" + name: "bn_fc6_head1" + children: "relu6_head1" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_fc6_scale bn_fc6_bias bn_fc6_running_mean bn_fc6_running_variance" + } + + layer { + parents: "bn_fc6_head1" + name: "relu6_head1" + children: "concatenation" + data_layout: "data_parallel" + relu {} + } + + #### Siamese head 1 ends #### + + #### Siamese head 2 begins #### + + layer { + parents: "slice" + name: "conv1_head2" + children: "bn_conv1_head2" + 
data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 96 + conv_dims: "11 11" + conv_pads: "0 0" + conv_strides: "4 4" + has_bias: true + has_vectors: true + } + weights: "conv1_kernel conv1_bias" + } + + layer { + parents: "conv1_head2" + name: "bn_conv1_head2" + children: "relu1_head2" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv1_scale bn_conv1_bias bn_conv1_running_mean bn_conv1_running_variance" + } + + layer { + parents: "bn_conv1_head2" + name: "relu1_head2" + children: "pool1_head2" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu1_head2" + name: "pool1_head2" + children: "conv2_head2" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims: "3 3" + pool_pads: "0 0" + pool_strides: "2 2" + pool_mode: "max" + has_vectors: true + } + } + + layer { + parents: "pool1_head2" + name: "conv2_head2" + children: "bn_conv2_head2" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 256 + conv_dims: "5 5" + conv_pads: "2 2" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv2_kernel conv2_bias" + } + + layer { + parents: "conv2_head2" + name: "bn_conv2_head2" + children: "relu2_head2" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv2_scale bn_conv2_bias bn_conv2_running_mean bn_conv2_running_variance" + } + + layer { + parents: "bn_conv2_head2" + name: "relu2_head2" + children: "pool2_head2" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu2_head2" + name: "pool2_head2" + children: "conv3_head2" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims: "3 3" + pool_pads: "0 0" + pool_strides: "2 2" + pool_mode: "max" + has_vectors: true + } + } + + layer { + parents: "pool2_head2" + name: "conv3_head2" + children: "bn_conv3_head2" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 384 + conv_dims: "3 3" + conv_pads: "1 1" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv3_kernel conv3_bias" + } + + layer { + parents: "conv3_head2" + name: "bn_conv3_head2" + children: "relu3_head2" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv3_scale bn_conv3_bias bn_conv3_running_mean bn_conv3_running_variance" + } + + layer { + parents: "bn_conv3_head2" + name: "relu3_head2" + children: "conv4_head2" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu3_head2" + name: "conv4_head2" + children: "bn_conv4_head2" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 384 + conv_dims: "3 3" + conv_pads: "1 1" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv4_kernel conv4_bias" + } + + layer { + parents: "conv4_head2" + name: "bn_conv4_head2" + children: "relu4_head2" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv4_scale bn_conv4_bias bn_conv4_running_mean bn_conv4_running_variance" + } + + layer { + parents: "bn_conv4_head2" + name: "relu4_head2" + children: "conv5_head2" + data_layout: 
"data_parallel" + relu {} + } + + layer { + parents: "relu4_head2" + name: "conv5_head2" + children: "bn_conv5_head2" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 256 + conv_dims: "3 3" + conv_pads: "1 1" + conv_strides: "1 1" + has_bias: true + has_vectors: true + } + weights: "conv5_kernel conv5_bias" + } + + layer { + parents: "conv5_head2" + name: "bn_conv5_head2" + children: "relu5_head2" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_conv5_scale bn_conv5_bias bn_conv5_running_mean bn_conv5_running_variance" + } + + layer { + parents: "bn_conv5_head2" + name: "relu5_head2" + children: "pool3_head2" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu5_head2" + name: "pool3_head2" + children: "fc6_head2" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims: "3 3" + pool_pads: "0 0" + pool_strides: "2 2" + pool_mode: "max" + has_vectors: true + } + } + + layer { + parents: "pool3_head2" + name: "fc6_head2" + children: "bn_fc6_head2" + data_layout: "data_parallel" + fully_connected { + num_neurons: 4096 + has_bias: true + } + weights: "fc6_linearity fc6_bias" + } + + layer { + parents: "fc6_head2" + name: "bn_fc6_head2" + children: "relu6_head2" + data_layout: "data_parallel" + freeze: true + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + weights: "bn_fc6_scale bn_fc6_bias bn_fc6_running_mean bn_fc6_running_variance" + } + + layer { + parents: "bn_fc6_head2" + name: "relu6_head2" + children: "concatenation" + data_layout: "data_parallel" + relu {} + } + + #### Siamese head 2 ends #### + + layer { + parents: "relu6_head0 relu6_head1 relu6_head2" + name: "concatenation" + children: "fc7" + data_layout: "data_parallel" + concatenation {} + } + + layer { + parents: "concatenation" + name: "fc7" + children: "bn_fc7" + data_layout: "data_parallel" + fully_connected { + num_neurons: 4096 + has_bias: true + } + weights: "fc7_linearity fc7_bias" + } + + layer { + parents: "fc7" + name: "bn_fc7" + children: "relu7" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + } + + layer { + parents: "bn_fc7" + name: "relu7" + children: "fc8" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu7" + name: "fc8" + children: "bn_fc8" + data_layout: "data_parallel" + fully_connected { + num_neurons: 4096 + has_bias: true + } + weights: "fc8_linearity fc8_bias" + } + + layer { + parents: "fc8" + name: "bn_fc8" + children: "relu8" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + bias_init: 0.0 + epsilon: 1e-5 + global_stats: true + } + } + + layer { + parents: "bn_fc8" + name: "relu8" + children: "fc9" + data_layout: "data_parallel" + relu {} + } + + layer { + parents: "relu8" + name: "fc9" + children: "prob" + data_layout: "data_parallel" + fully_connected { + num_neurons_is_num_labels: true + has_bias: false + } + weights: "fc9_linearity fc9_bias" + } + + layer { + parents: "fc9" + name: "prob" + children: "target" + data_layout: "data_parallel" + softmax {} + } + + layer { + parents: "prob data" + name: "target" + children: "" + data_layout: "data_parallel" + target {} + } + +} diff --git a/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext 
b/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext index 90b52bfdb01..aecff85da34 100644 --- a/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext +++ b/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext @@ -1,5 +1,4 @@ model { - name: "sequential_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 @@ -12,7 +11,7 @@ model { ################################################### objective_function { - cross_entropy {} + layer_term { layer: "cross_entropy" } l2_weight_regularization { scale_factor: 1e-4 } @@ -23,7 +22,11 @@ model { ################################################### metric { - categorical_accuracy {} + layer_metric { + name: "categorical accuracy" + layer: "accuracy" + unit: "%" + } } ################################################### @@ -51,13 +54,27 @@ model { layer { name: "data" + children: "image label" data_layout: "data_parallel" input { io_buffer: "partitioned" } } + layer { + parents: "data" + name: "image" + data_layout: "data_parallel" + split {} + } + layer { + parents: "data" + name: "label" + data_layout: "data_parallel" + split {} + } layer { + parents: "image" name: "ip1" data_layout: "model_parallel" fully_connected { @@ -67,12 +84,14 @@ model { } layer { + parents: "ip1" name: "relu1" data_layout: "model_parallel" relu {} } layer { + parents: "relu1" name: "ip2" data_layout: "model_parallel" fully_connected { @@ -82,16 +101,24 @@ model { } layer { + parents: "ip2" name: "prob" - data_layout: "model_parallel" + data_layout: "data_parallel" softmax {} } layer { - parents: "prob data" - name: "target" + parents: "prob label" + name: "cross_entropy" + data_layout: "data_parallel" + cross_entropy {} + } + + layer { + parents: "prob label" + name: "accuracy" data_layout: "data_parallel" - target {} + categorical_accuracy {} } } diff --git a/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext b/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext index f74e68f6449..87a04ad3e45 100644 --- a/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext +++ b/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext @@ -1,5 +1,4 @@ model { - name: "sequential_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 @@ -12,7 +11,7 @@ model { ################################################### objective_function { - cross_entropy {} + layer_term { layer: "cross_entropy" } l2_weight_regularization { scale_factor: 1e-4 } @@ -23,7 +22,11 @@ model { ################################################### metric { - categorical_accuracy {} + layer_metric { + name: "categorical accuracy" + layer: "accuracy" + unit: "%" + } } ################################################### @@ -51,13 +54,27 @@ model { layer { name: "data" + children: "image label" data_layout: "data_parallel" input { io_buffer: "partitioned" } } + layer { + parents: "data" + name: "image" + data_layout: "data_parallel" + split {} + } + layer { + parents: "data" + name: "label" + data_layout: "data_parallel" + split {} + } layer { + parents: "image" name: "ip1" data_layout: "model_parallel" fully_connected { @@ -67,12 +84,14 @@ model { } layer { + parents: "ip1" name: "relu1" data_layout: "model_parallel" relu {} } layer { + parents: "relu1" name: "ip2" data_layout: "model_parallel" fully_connected { @@ -82,11 +101,13 @@ model { } layer { + parents: "ip2" name: "relu2" data_layout: "model_parallel" relu {} } layer { + parents: "relu2" name: "ip3" data_layout: "model_parallel" fully_connected { @@ -96,16 +117,24 @@ model { } layer 
{ + parents: "ip3" name: "prob" - data_layout: "model_parallel" + data_layout: "data_parallel" softmax {} } layer { - parents: "prob data" - name: "target" + parents: "prob label" + name: "cross_entropy" + data_layout: "data_parallel" + cross_entropy {} + } + + layer { + parents: "prob label" + name: "accuracy" data_layout: "data_parallel" - target {} + categorical_accuracy {} } } diff --git a/model_zoo/models/vram/dram_template.prototext b/model_zoo/models/vram/dram_template.prototext index 258a9f3224f..8271cdc623e 100644 --- a/model_zoo/models/vram/dram_template.prototext +++ b/model_zoo/models/vram/dram_template.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" data_layout: "data_parallel" mini_batch_size: 256 block_size: 256 diff --git a/model_zoo/models/vram/generate_dram.py b/model_zoo/models/vram/generate_dram.py index a55a7a2b871..d0778b0846d 100755 --- a/model_zoo/models/vram/generate_dram.py +++ b/model_zoo/models/vram/generate_dram.py @@ -354,8 +354,7 @@ def configure_model(model): # Categorical accuracy acc1 = new_layer(model, "top1_accuracy_step%d" % step, - [class_prob, label], "top_k_categorical_accuracy") - acc1.top_k_categorical_accuracy.k = 1 + [class_prob, label], "categorical_accuracy") acc5 = new_layer(model, "top5_accuracy_step%d" % step, [class_prob, label], "top_k_categorical_accuracy") acc5.top_k_categorical_accuracy.k = 5 diff --git a/model_zoo/tests/comm_test.cpp b/model_zoo/tests/comm_test.cpp index 0eb373542f2..b30f4021b38 100644 --- a/model_zoo/tests/comm_test.cpp +++ b/model_zoo/tests/comm_test.cpp @@ -133,19 +133,20 @@ void test_send_recv_blob() { LBANN_COMM_TEST_NUM_MODELS; int send_data = 42; int recv_data = 0; + El::SyncInfo<El::Device::CPU> syncInfoCPU; // Test sends/recvs with full model/rank spec. - comm->send(&send_data, 1, send_model, comm->get_rank_in_model()); - comm->recv(&recv_data, 1, recv_model, comm->get_rank_in_model()); + comm->send(&send_data, 1, send_model, comm->get_rank_in_model(), syncInfoCPU); + comm->recv(&recv_data, 1, recv_model, comm->get_rank_in_model(), syncInfoCPU); ASSERT_EQ(send_data, recv_data); // Test sends/recvs with only the model. recv_data = 0; - comm->send(&send_data, 1, send_model); - comm->recv(&recv_data, 1, recv_model); + comm->send(&send_data, 1, send_model, syncInfoCPU); + comm->recv(&recv_data, 1, recv_model, syncInfoCPU); ASSERT_EQ(send_data, recv_data); // Test with receiving from anywhere.
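  // (As in the hunk above, the trailing El::SyncInfo argument tells Hydrogen which device/stream the buffers live on; a default-constructed El::SyncInfo<El::Device::CPU> is assumed sufficient for plain host memory like these ints.)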
recv_data = 0; - comm->send(&send_data, 1, send_model, comm->get_rank_in_model()); - comm->recv(&recv_data, 1); + comm->send(&send_data, 1, send_model, comm->get_rank_in_model(), syncInfoCPU); + comm->recv(&recv_data, 1, syncInfoCPU); ASSERT_EQ(send_data, recv_data); fini_comm(comm); } diff --git a/model_zoo/tests/layer_tests/model_covariance.prototext b/model_zoo/tests/layer_tests/model_covariance.prototext new file mode 100644 index 00000000000..15aba72183c --- /dev/null +++ b/model_zoo/tests/layer_tests/model_covariance.prototext @@ -0,0 +1,109 @@ +model { + data_layout: "data_parallel" + mini_batch_size: 11 + block_size: 256 + num_epochs: 0 + num_parallel_readers: 0 + procs_per_model: 0 + + ################################################### + # Objective function + ################################################### + + objective_function { + layer_term { layer: "l2" } + } + + ################################################### + # Callbacks + ################################################### + callback { print {} } + callback { timer {} } + callback { + gradient_check { + verbose: false + fail_on_error: true + } + } + + ################################################### + # Layers + ################################################### + + layer { + name: "data" + data_layout: "data_parallel" + input { + io_buffer: "partitioned" + } + } + + # Input data + layer { + name: "x0" + weights_layer { + dims: "5" + } + data_layout: "model_parallel" + weights: "x0_vals" + } + weights { + name: "x0_vals" + value_initializer { + values: "1 -0.5 0.25 -0.125 0.0675" + } + } + layer { + name: "x1" + weights_layer { + dims: "5" + } + data_layout: "model_parallel" + weights: "x1_vals" + } + weights { + name: "x1_vals" + value_initializer { + values: "0.1 0.2 0.4 0.8 1.6" + } + } + + # Variations of covariance layer + layer { + parents: "x0 x1" + name: "unbiased_covariance_model_parallel" + covariance { biased: false } + data_layout: "model_parallel" + } + layer { + parents: "x0 x1" + name: "biased_covariance_model_parallel" + covariance { biased: true } + data_layout: "model_parallel" + } + layer { + parents: "x0 x1" + name: "unbiased_covariance_data_parallel" + covariance { biased: false } + data_layout: "data_parallel" + } + layer { + parents: "x0 x1" + name: "biased_covariance_data_parallel" + covariance { biased: true } + data_layout: "data_parallel" + } + + # Combine into objective function + layer { + parents: "unbiased_covariance_model_parallel biased_covariance_model_parallel unbiased_covariance_data_parallel biased_covariance_data_parallel" + name: "sum" + sum {} + } + layer { + parents: "sum" + name: "l2" + l2_norm2 {} + } + +} diff --git a/model_zoo/tests/layer_tests/model_l2_norm2.prototext b/model_zoo/tests/layer_tests/model_l2_norm2.prototext new file mode 100644 index 00000000000..8a966d74994 --- /dev/null +++ b/model_zoo/tests/layer_tests/model_l2_norm2.prototext @@ -0,0 +1,63 @@ +model { + data_layout: "data_parallel" + mini_batch_size: 11 + block_size: 256 + num_epochs: 0 + num_parallel_readers: 0 + procs_per_model: 0 + + ################################################### + # Objective function + ################################################### + + objective_function { + layer_term { layer: "l2" } + } + + ################################################### + # Callbacks + ################################################### + callback { print {} } + callback { timer {} } + callback { + gradient_check { + verbose: false + fail_on_error: true + } + } + + 
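+  # Shared recipe for these layer tests: fixed weights feed the layer under test, its output collapses through l2_norm2 into the objective, and the gradient_check callback above compares analytic gradients against finite differences (fail_on_error aborts the run on a mismatch).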
################################################### + # Layers + ################################################### + + layer { + name: "data" + data_layout: "data_parallel" + input { + io_buffer: "partitioned" + } + } + + layer { + name: "x" + weights_layer { + dims: "4" + } + data_layout: "model_parallel" + weights: "x_vals" + } + weights { + name: "x_vals" + value_initializer { + values: "1 2 3 4" + } + } + + layer { + parents: "x" + name: "l2" + l2_norm2 {} + data_layout: "model_parallel" + } + +} diff --git a/model_zoo/tests/layer_tests/model_log_softmax.prototext b/model_zoo/tests/layer_tests/model_log_softmax.prototext new file mode 100644 index 00000000000..ee05fda2890 --- /dev/null +++ b/model_zoo/tests/layer_tests/model_log_softmax.prototext @@ -0,0 +1,83 @@ +model { + data_layout: "data_parallel" + mini_batch_size: 11 + block_size: 256 + num_epochs: 0 + num_parallel_readers: 0 + procs_per_model: 0 + + ################################################### + # Objective function + ################################################### + + objective_function { + layer_term { layer: "l2" } + } + + ################################################### + # Callbacks + ################################################### + callback { print {} } + callback { timer {} } + callback { + gradient_check { + verbose: false + fail_on_error: true + } + } + + ################################################### + # Layers + ################################################### + + layer { + name: "data" + data_layout: "data_parallel" + input { + io_buffer: "partitioned" + } + } + + # Input data + layer { + name: "x" + weights_layer { + dims: "5" + } + data_layout: "model_parallel" + weights: "x_vals" + } + weights { + name: "x_vals" + value_initializer { + values: "-4 -2 0 1 2" + } + } + + # Variations of log softmax layer + layer { + parents: "x" + name: "log_softmax_model_parallel" + log_softmax {} + data_layout: "model_parallel" + } + layer { + parents: "x" + name: "log_softmax_data_parallel" + log_softmax {} + data_layout: "data_parallel" + } + + # Combine into objective function + layer { + parents: "log_softmax_model_parallel log_softmax_data_parallel" + name: "sum" + sum {} + } + layer { + parents: "sum" + name: "l2" + l2_norm2 {} + } + +} diff --git a/model_zoo/tests/layer_tests/model_softmax.prototext b/model_zoo/tests/layer_tests/model_softmax.prototext new file mode 100644 index 00000000000..d01a3601f99 --- /dev/null +++ b/model_zoo/tests/layer_tests/model_softmax.prototext @@ -0,0 +1,83 @@ +model { + data_layout: "data_parallel" + mini_batch_size: 11 + block_size: 256 + num_epochs: 0 + num_parallel_readers: 0 + procs_per_model: 0 + + ################################################### + # Objective function + ################################################### + + objective_function { + layer_term { layer: "l2" } + } + + ################################################### + # Callbacks + ################################################### + callback { print {} } + callback { timer {} } + callback { + gradient_check { + verbose: false + fail_on_error: true + } + } + + ################################################### + # Layers + ################################################### + + layer { + name: "data" + data_layout: "data_parallel" + input { + io_buffer: "partitioned" + } + } + + # Input data + layer { + name: "x" + weights_layer { + dims: "5" + } + data_layout: "model_parallel" + weights: "x_vals" + } + weights { + name: "x_vals" + value_initializer { + values: "-4 -2 0 1 2" + } + } + 
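+  # The same op is instantiated in both layouts below, so one run gradient-checks the model_parallel and data_parallel implementations; summing the variants lets a single l2 objective drive the backward pass for both.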
+ # Variations of softmax layer + layer { + parents: "x" + name: "softmax_model_parallel" + softmax {} + data_layout: "model_parallel" + } + layer { + parents: "x" + name: "softmax_data_parallel" + softmax {} + data_layout: "data_parallel" + } + + # Combine into objective function + layer { + parents: "softmax_model_parallel softmax_data_parallel" + name: "sum" + sum {} + } + layer { + parents: "sum" + name: "l2" + l2_norm2 {} + } + +} diff --git a/model_zoo/tests/layer_tests/model_variance.prototext b/model_zoo/tests/layer_tests/model_variance.prototext new file mode 100644 index 00000000000..e9c28d3d90a --- /dev/null +++ b/model_zoo/tests/layer_tests/model_variance.prototext @@ -0,0 +1,95 @@ +model { + data_layout: "data_parallel" + mini_batch_size: 11 + block_size: 256 + num_epochs: 0 + num_parallel_readers: 0 + procs_per_model: 0 + + ################################################### + # Objective function + ################################################### + + objective_function { + layer_term { layer: "l2" } + } + + ################################################### + # Callbacks + ################################################### + callback { print {} } + callback { timer {} } + callback { + gradient_check { + verbose: false + fail_on_error: true + } + } + + ################################################### + # Layers + ################################################### + + layer { + name: "data" + data_layout: "data_parallel" + input { + io_buffer: "partitioned" + } + } + + # Input data + layer { + name: "x" + weights_layer { + dims: "5" + } + data_layout: "model_parallel" + weights: "x_vals" + } + weights { + name: "x_vals" + value_initializer { + values: "1 -0.5 0.25 -0.125 0.0675" + } + } + + # Variations of variance layer + layer { + parents: "x" + name: "unbiased_variance_model_parallel" + variance { biased: false } + data_layout: "model_parallel" + } + layer { + parents: "x" + name: "biased_variance_model_parallel" + variance { biased: true } + data_layout: "model_parallel" + } + layer { + parents: "x" + name: "unbiased_variance_data_parallel" + variance { biased: false } + data_layout: "data_parallel" + } + layer { + parents: "x" + name: "biased_variance_data_parallel" + variance { biased: true } + data_layout: "data_parallel" + } + + # Combine into objective function + layer { + parents: "unbiased_variance_model_parallel biased_variance_model_parallel unbiased_variance_data_parallel biased_variance_data_parallel" + name: "sum" + sum {} + } + layer { + parents: "sum" + name: "l2" + l2_norm2 {} + } + +} diff --git a/model_zoo/tests/model_lenet_mnist_ckpt.prototext b/model_zoo/tests/model_lenet_mnist_ckpt.prototext index 9da32d6b14d..bb1bb320dd7 100644 --- a/model_zoo/tests/model_lenet_mnist_ckpt.prototext +++ b/model_zoo/tests/model_lenet_mnist_ckpt.prototext @@ -1,5 +1,4 @@ model { - name: "sequential_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 @@ -12,7 +11,7 @@ model { ################################################### objective_function { - cross_entropy {} + layer_term { layer: "cross_entropy" } l2_weight_regularization { scale_factor: 1e-4 } @@ -23,7 +22,11 @@ model { ################################################### metric { - categorical_accuracy {} + layer_metric { + name: "categorical accuracy" + layer: "accuracy" + unit: "%" + } } ################################################### @@ -65,14 +68,27 @@ model { layer { name: "data" - children: "conv1 target" + children: "image label" data_layout: "data_parallel" input { 
io_buffer: "partitioned" } } + layer { + parents: "data" + name: "image" + data_layout: "data_parallel" + split {} + } + layer { + parents: "data" + name: "label" + data_layout: "data_parallel" + split {} + } layer { + parents: "image" name: "conv1" data_layout: "data_parallel" convolution { @@ -86,6 +102,7 @@ model { } layer { + parents: "conv1" name: "pool1" data_layout: "data_parallel" pooling { @@ -98,6 +115,7 @@ model { } layer { + parents: "pool1" name: "conv2" data_layout: "data_parallel" convolution { @@ -111,6 +129,7 @@ model { } layer { + parents: "conv2" name: "pool2" data_layout: "data_parallel" pooling { @@ -123,6 +142,7 @@ model { } layer { + parents: "pool2" name: "ip1" data_layout: "model_parallel" fully_connected { @@ -132,12 +152,14 @@ model { } layer { + parents: "ip1" name: "relu1" data_layout: "model_parallel" relu {} } layer { + parents: "relu1" name: "ip2" data_layout: "model_parallel" fully_connected { @@ -147,16 +169,24 @@ model { } layer { + parents: "ip2" name: "prob" - data_layout: "model_parallel" + data_layout: "data_parallel" softmax {} } layer { - parents: "prob data" - name: "target" + parents: "prob label" + name: "cross_entropy" data_layout: "data_parallel" - target {} + cross_entropy {} + } + + layer { + parents: "prob label" + name: "accuracy" + data_layout: "data_parallel" + categorical_accuracy {} } } diff --git a/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext b/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext index 084b0f83646..7ae5e2cc089 100644 --- a/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext +++ b/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext @@ -1,5 +1,4 @@ model { - name: "sequential_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 @@ -12,7 +11,7 @@ model { ################################################### objective_function { - cross_entropy {} + layer_term { layer: "cross_entropy" } l2_weight_regularization { scale_factor: 1e-4 } @@ -23,7 +22,11 @@ model { ################################################### metric { - categorical_accuracy {} + layer_metric { + name: "categorical accuracy" + layer: "accuracy" + unit: "%" + } } ################################################### @@ -67,14 +70,27 @@ model { layer { name: "data" - children: "conv1 target" + children: "image label" data_layout: "data_parallel" input { io_buffer: "partitioned" } } + layer { + parents: "data" + name: "image" + data_layout: "data_parallel" + split {} + } + layer { + parents: "data" + name: "label" + data_layout: "data_parallel" + split {} + } layer { + parents: "image" name: "conv1" data_layout: "data_parallel" convolution { @@ -88,6 +104,7 @@ model { } layer { + parents: "conv1" name: "pool1" data_layout: "data_parallel" pooling { @@ -100,6 +117,7 @@ model { } layer { + parents: "pool1" name: "conv2" data_layout: "data_parallel" convolution { @@ -113,6 +131,7 @@ model { } layer { + parents: "conv2" name: "pool2" data_layout: "data_parallel" pooling { @@ -125,6 +144,7 @@ model { } layer { + parents: "pool2" name: "ip1" data_layout: "model_parallel" fully_connected { @@ -134,12 +154,14 @@ model { } layer { + parents: "ip1" name: "relu1" data_layout: "model_parallel" relu {} } layer { + parents: "relu1" name: "ip2" data_layout: "model_parallel" fully_connected { @@ -149,16 +171,24 @@ model { } layer { + parents: "ip2" name: "prob" - data_layout: "model_parallel" + data_layout: "data_parallel" softmax {} } layer { - parents: "prob data" - name: "target" + parents: "prob label" + name: "cross_entropy" data_layout: 
"data_parallel" - target {} + cross_entropy {} + } + + layer { + parents: "prob label" + name: "accuracy" + data_layout: "data_parallel" + categorical_accuracy {} } } diff --git a/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext b/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext index de1e4687c16..150e3f3593f 100644 --- a/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext +++ b/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext @@ -1,5 +1,4 @@ model { - name: "sequential_model" data_layout: "data_parallel" mini_batch_size: 64 block_size: 256 @@ -12,7 +11,7 @@ model { ################################################### objective_function { - cross_entropy {} + layer_term { layer: "cross_entropy" } l2_weight_regularization { scale_factor: 1e-4 } @@ -23,7 +22,11 @@ model { ################################################### metric { - categorical_accuracy {} + layer_metric { + name: "categorical accuracy" + layer: "accuracy" + unit: "%" + } } ################################################### @@ -64,14 +67,27 @@ model { layer { name: "data" - children: "conv1 target" + children: "image label" data_layout: "data_parallel" input { io_buffer: "partitioned" } } + layer { + parents: "data" + name: "image" + data_layout: "data_parallel" + split {} + } + layer { + parents: "data" + name: "label" + data_layout: "data_parallel" + split {} + } layer { + parents: "image" name: "conv1" data_layout: "data_parallel" convolution { @@ -85,6 +101,7 @@ model { } layer { + parents: "conv1" name: "pool1" data_layout: "data_parallel" pooling { @@ -97,6 +114,7 @@ model { } layer { + parents: "pool1" name: "conv2" data_layout: "data_parallel" convolution { @@ -110,6 +128,7 @@ model { } layer { + parents: "conv2" name: "pool2" data_layout: "data_parallel" pooling { @@ -122,6 +141,7 @@ model { } layer { + parents: "pool2" name: "ip1" data_layout: "model_parallel" fully_connected { @@ -131,12 +151,14 @@ model { } layer { + parents: "ip1" name: "relu1" data_layout: "model_parallel" relu {} } layer { + parents: "relu1" name: "ip2" data_layout: "model_parallel" fully_connected { @@ -146,16 +168,24 @@ model { } layer { + parents: "ip2" name: "prob" - data_layout: "model_parallel" + data_layout: "data_parallel" softmax {} } layer { - parents: "prob data" - name: "target" + parents: "prob label" + name: "cross_entropy" + data_layout: "data_parallel" + cross_entropy {} + } + + layer { + parents: "prob label" + name: "accuracy" data_layout: "data_parallel" - target {} + categorical_accuracy {} } } diff --git a/model_zoo/tests/model_mnist_conv_graph.prototext b/model_zoo/tests/model_mnist_conv_graph.prototext index ee0b84f119b..54c31db7dc4 100644 --- a/model_zoo/tests/model_mnist_conv_graph.prototext +++ b/model_zoo/tests/model_mnist_conv_graph.prototext @@ -1,5 +1,4 @@ model { - name: "directed_acyclic_graph_model" data_layout: "data_parallel" mini_batch_size: 31 block_size: 257 @@ -10,6 +9,7 @@ model { ################################################### # Objective function ################################################### + objective_function { layer_term { layer: "cross_entropy" } } @@ -17,6 +17,7 @@ model { ################################################### # Callbacks ################################################### + callback { print {} } callback { timer {} } callback { diff --git a/model_zoo/tests/model_mnist_distributed_io.prototext b/model_zoo/tests/model_mnist_distributed_io.prototext index 86a06b7e812..6319ea582a3 100644 --- a/model_zoo/tests/model_mnist_distributed_io.prototext +++ 
b/model_zoo/tests/model_mnist_distributed_io.prototext @@ -1,5 +1,4 @@ model { - name: "sequential_model" data_layout: "model_parallel" mini_batch_size: 10 block_size: 256 @@ -12,7 +11,7 @@ model { ################################################### objective_function { - cross_entropy {} + layer_term { layer: "cross_entropy" } l2_weight_regularization { scale_factor: 1e-4 } @@ -23,7 +22,11 @@ model { ################################################### metric { - categorical_accuracy {} + layer_metric { + name: "categorical accuracy" + layer: "accuracy" + unit: "%" + } } ################################################### @@ -89,19 +92,30 @@ model { ###################### layer { name: "1" - parents: "1" - children: "2 13" + children: "1a 1b" data_layout: "model_parallel" input { io_buffer: "distributed" } } + layer { + parents: "1" + name: "1a" + data_layout: "model_parallel" + split {} + } + layer { + parents: "1" + name: "1b" + data_layout: "model_parallel" + split {} + } # FULLY_CONNECTED 2 ###################### layer { + parents: "1a" name: "2" - parents: "1" data_layout: "model_parallel" fully_connected { num_neurons: 1024 @@ -113,8 +127,8 @@ model { # RELU 3 ###################### layer { - name: "3" parents: "2" + name: "3" data_layout: "model_parallel" relu { } @@ -123,8 +137,8 @@ model { # DROPOUT 4 ###################### layer { - name: "4" parents: "3" + name: "4" data_layout: "model_parallel" dropout { keep_prob: -1 @@ -134,8 +148,8 @@ model { # FULLY_CONNECTED 5 ###################### layer { - name: "5" parents: "4" + name: "5" data_layout: "model_parallel" fully_connected { num_neurons: 1024 @@ -147,8 +161,8 @@ model { # RELU 6 ###################### layer { - name: "6" parents: "5" + name: "6" data_layout: "model_parallel" relu { } @@ -157,8 +171,8 @@ model { # DROPOUT 7 ###################### layer { - name: "7" parents: "6" + name: "7" data_layout: "model_parallel" dropout { keep_prob: -1 @@ -168,8 +182,8 @@ model { # FULLY_CONNECTED 8 ###################### layer { - name: "8" parents: "7" + name: "8" data_layout: "model_parallel" fully_connected { num_neurons: 1024 @@ -181,8 +195,8 @@ model { # RELU 9 ###################### layer { - name: "9" parents: "8" + name: "9" data_layout: "model_parallel" relu { } @@ -191,8 +205,8 @@ model { # DROPOUT 10 ###################### layer { - name: "10" parents: "9" + name: "10" data_layout: "model_parallel" dropout { keep_prob: -1 @@ -202,8 +216,8 @@ model { # FULLY_CONNECTED 11 ###################### layer { - name: "11" parents: "10" + name: "11" data_layout: "model_parallel" fully_connected { num_neurons: 10 @@ -215,19 +229,26 @@ model { # SOFTMAX 12 ###################### layer { - name: "12" parents: "11" + name: "12" data_layout: "model_parallel" softmax { } } - # TARGET 13 + # Evaluation ###################### layer { - name: "13" - parents: "12 1" + parents: "12 1b" + name: "cross_entropy" data_layout: "model_parallel" - target {} + cross_entropy {} + } + layer { + parents: "12 1b" + name: "accuracy" + data_layout: "model_parallel" + categorical_accuracy {} } + } diff --git a/model_zoo/tests/model_mnist_partitioned_io.prototext b/model_zoo/tests/model_mnist_partitioned_io.prototext index 02f750a265e..ea9f52e7d8a 100644 --- a/model_zoo/tests/model_mnist_partitioned_io.prototext +++ b/model_zoo/tests/model_mnist_partitioned_io.prototext @@ -1,5 +1,4 @@ model { - name: "sequential_model" data_layout: "model_parallel" mini_batch_size: 10 block_size: 256 @@ -12,7 +11,7 @@ model { 
################################################### objective_function { - cross_entropy {} + layer_term { layer: "cross_entropy" } l2_weight_regularization { scale_factor: 1e-4 } @@ -23,7 +22,11 @@ model { ################################################### metric { - categorical_accuracy {} + layer_metric { + name: "categorical accuracy" + layer: "accuracy" + unit: "%" + } } ################################################### @@ -101,16 +104,29 @@ model { ###################### layer { name: "1" - children: "2 13" + children: "1a 1b" data_layout: "data_parallel" input { io_buffer: "partitioned" } } + layer { + parents: "1" + name: "1a" + data_layout: "data_parallel" + split {} + } + layer { + parents: "1" + name: "1b" + data_layout: "data_parallel" + split {} + } # FULLY_CONNECTED 2 ###################### layer { + parents: "1a" name: "2" data_layout: "model_parallel" fully_connected { @@ -123,6 +139,7 @@ model { # RELU 3 ###################### layer { + parents: "2" name: "3" data_layout: "model_parallel" relu { @@ -132,6 +149,7 @@ model { # DROPOUT 4 ###################### layer { + parents: "3" name: "4" data_layout: "model_parallel" dropout { @@ -142,6 +160,7 @@ model { # FULLY_CONNECTED 5 ###################### layer { + parents: "4" name: "5" data_layout: "model_parallel" fully_connected { @@ -154,6 +173,7 @@ model { # RELU 6 ###################### layer { + parents: "5" name: "6" data_layout: "model_parallel" relu { @@ -163,6 +183,7 @@ model { # DROPOUT 7 ###################### layer { + parents: "6" name: "7" data_layout: "model_parallel" dropout { @@ -173,6 +194,7 @@ model { # FULLY_CONNECTED 8 ###################### layer { + parents: "7" name: "8" data_layout: "model_parallel" fully_connected { @@ -185,6 +207,7 @@ model { # RELU 9 ###################### layer { + parents: "8" name: "9" data_layout: "model_parallel" relu { @@ -194,6 +217,7 @@ model { # DROPOUT 10 ###################### layer { + parents: "9" name: "10" data_layout: "model_parallel" dropout { @@ -204,6 +228,7 @@ model { # FULLY_CONNECTED 11 ###################### layer { + parents: "10" name: "11" data_layout: "model_parallel" fully_connected { @@ -216,18 +241,26 @@ model { # SOFTMAX 12 ###################### layer { + parents: "11" name: "12" data_layout: "model_parallel" softmax { } } - # TARGET 13 + # Evaluation ###################### layer { - parents: "12 1" - name: "13" - data_layout: "data_parallel" - target {} + parents: "12 1b" + name: "cross_entropy" + data_layout: "model_parallel" + cross_entropy {} + } + layer { + parents: "12 1b" + name: "accuracy" + data_layout: "model_parallel" + categorical_accuracy {} } + } diff --git a/model_zoo/tests/model_mnist_ridge_regression.prototext b/model_zoo/tests/model_mnist_ridge_regression.prototext index 2dfc0bd70a5..19035149839 100644 --- a/model_zoo/tests/model_mnist_ridge_regression.prototext +++ b/model_zoo/tests/model_mnist_ridge_regression.prototext @@ -1,8 +1,7 @@ model { - name: "directed_acyclic_graph_model" data_layout: "data_parallel" - mini_batch_size: 32 - block_size: 256 + mini_batch_size: 131 + block_size: 257 num_epochs: 4 num_parallel_readers: 0 procs_per_model: 0 @@ -44,25 +43,23 @@ model { name: "data" children: "image label" data_layout: "data_parallel" - input { - io_buffer: "partitioned" - } + input { io_buffer: "partitioned" } } layer { - name: "image" parents: "data" + name: "image" data_layout: "model_parallel" split {} } layer { - name: "label" parents: "data" + name: "label" data_layout: "model_parallel" split {} } layer { - name: "fc"
parents: "image" + name: "fc" data_layout: "model_parallel" fully_connected { num_neurons: 10 diff --git a/model_zoo/tests/model_mnist_softmax_regression.prototext b/model_zoo/tests/model_mnist_softmax_classifier.prototext similarity index 54% rename from model_zoo/tests/model_mnist_softmax_regression.prototext rename to model_zoo/tests/model_mnist_softmax_classifier.prototext index 81fcfffdf10..ae0fa72ba13 100644 --- a/model_zoo/tests/model_mnist_softmax_regression.prototext +++ b/model_zoo/tests/model_mnist_softmax_classifier.prototext @@ -1,8 +1,7 @@ model { - name: "sequential_model" data_layout: "data_parallel" - mini_batch_size: 32 - block_size: 256 + mini_batch_size: 103 + block_size: 199 num_epochs: 4 num_parallel_readers: 0 procs_per_model: 0 @@ -12,37 +11,21 @@ model { ################################################### objective_function { - cross_entropy {} - l2_weight_regularization { - scale_factor: 0.01 - } + layer_term { layer: "cross_entropy" } } ################################################### # Metrics ################################################### - - metric { categorical_accuracy {} } + + metric { layer_metric { layer: "accuracy" } } ################################################### # Callbacks ################################################### - callback { - print { - interval: 1 - } - } - callback { - timer { - } - } - callback { - summary { - dir: "." - batch_interval: 1 - mat_interval: 25 - } - } + + callback { print {} } + callback { timer {} } callback { gradient_check { verbose: false @@ -51,54 +34,53 @@ model { } ################################################### - # start of layers + # Layers ################################################### - ####### - # INPUT - ####### layer { name: "data" - children: "fc target" + children: "image label" data_layout: "data_parallel" - input { - io_buffer: "partitioned" - } + input { io_buffer: "partitioned" } + } + layer { + parents: "data" + name: "image" + data_layout: "model_parallel" + split {} + } + layer { + parents: "data" + name: "label" + data_layout: "model_parallel" + split {} } - - ################# - # FULLY_CONNECTED - ################# layer { + parents: "image" name: "fc" data_layout: "model_parallel" fully_connected { num_neurons: 10 - weight_initialization: "glorot_uniform" has_bias: false } } - - ######### - # SOFTMAX - ######### layer { + parents: "fc" name: "prob" data_layout: "model_parallel" softmax {} } - - ######## - # TARGET - ######## layer { - parents: "prob data" - name: "target" - data_layout: "data_parallel" - target {} + parents: "prob label" + name: "cross_entropy" + data_layout: "model_parallel" + cross_entropy {} + } + layer { + parents: "prob label" + name: "accuracy" + data_layout: "model_parallel" + categorical_accuracy {} } - ################################################### - # end of layers - ################################################### } diff --git a/scripts/build_lbann_lc.sh b/scripts/build_lbann_lc.sh index 0d75f85d145..dbeb66b0759 100755 --- a/scripts/build_lbann_lc.sh +++ b/scripts/build_lbann_lc.sh @@ -4,6 +4,7 @@ CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') TOSS=$(uname -r | sed 's/\([0-9][0-9]*\.*\)\-.*/\1/g') ARCH=$(uname -m) +CORAL=$([[ $(hostname) =~ (sierra|lassen|ray) ]] && echo 1 || echo 0) ################################################################ # Default options @@ -44,7 +45,6 @@ if [ "${ARCH}" == "x86_64" ]; then fi fi -#CONDUIT_DIR=/usr/workspace/wsb/icfsi/conduit/install-toss3 ELEMENTAL_MATH_LIBS= PATCH_OPENBLAS=ON 
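# The CORAL flag defined above folds LLNL's POWER machines (sierra, lassen, ray) into a single switch, so later branches test [[ ${CORAL} -eq 1 ]] instead of repeating hostname lists. Bash's =~ is an unanchored regex match against the full hostname, so node names with suffixes still match, e.g.:
#   [[ "lassen708" =~ (sierra|lassen|ray) ]] && echo 1   # prints 1 (hostname hypothetical)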
@@ -56,6 +56,7 @@ DATATYPE=float VERBOSE=0 CMAKE_INSTALL_MESSAGE=LAZY MAKE_NUM_PROCESSES=$(($(nproc) + 1)) +NINJA_NUM_PROCESSES=0 # Let ninja decide GEN_DOC=0 INSTALL_LBANN=0 BUILD_DIR= @@ -63,14 +64,16 @@ INSTALL_DIR= BUILD_SUFFIX= DETERMINISTIC=OFF WITH_CUDA= +WITH_CUDA_2=ON WITH_TOPO_AWARE=ON INSTRUMENT= -WITH_ALUMINUM=OFF +WITH_ALUMINUM= ALUMINUM_WITH_MPI_CUDA=OFF -ALUMINUM_WITH_NCCL=OFF +ALUMINUM_WITH_NCCL= WITH_CONDUIT=OFF WITH_TBINF=OFF RECONFIGURE=0 +USE_NINJA=0 # In case that autoconf fails during on-demand buid on surface, try the newer # version of autoconf installed under '/p/lscratchh/brainusr/autoconf/bin' # by putting it at the beginning of the PATH or use the preinstalled library @@ -123,9 +126,12 @@ Options: ${C}--instrument${N} Use -finstrument-functions flag, for profiling stack traces ${C}--disable-cuda${N} Disable CUDA ${C}--disable-topo-aware${N} Disable topological-aware configuration (no HWLOC) - ${C}--with-aluminum${N} Use Aluminum allreduce library + ${C}--disable-aluminum${N} Disable the Aluminum communication library ${C}--aluminum-with-mpi-cuda Enable MPI-CUDA backend in Aluminum - ${C}--aluminum-with-nccl Enable NCCL backend in Aluminum + ${C}--disable-aluminum-with-nccl Disable the NCCL backend in Aluminum + ${C}--with-conduit Build with conduit interface + ${C}--ninja Generate ninja files instead of makefiles + ${C}--ninja-processes${N} Number of parallel processes for ninja. EOF } @@ -190,6 +196,18 @@ while :; do exit 1 fi ;; + --ninja) + USE_NINJA=1 + ;; + --ninja-processes) + if [ -n "${2}" ]; then + NINJA_NUM_PROCESSES=${2} + shift + else + echo "\"${1}\" option requires a non-empty option argument" >&2 + exit 1 + fi + ;; -v|--verbose) # Verbose output VERBOSE=1 @@ -233,20 +251,20 @@ while :; do ;; --disable-cuda) WITH_CUDA=OFF + WITH_CUDA_2=OFF ;; --disable-topo-aware) WITH_TOPO_AWARE=OFF ;; - --with-aluminum) - WITH_ALUMINUM=ON + --disable-aluminum) + WITH_ALUMINUM=OFF ;; --aluminum-with-mpi-cuda) WITH_ALUMINUM=ON ALUMINUM_WITH_MPI_CUDA=ON ;; - --aluminum-with-nccl) - WITH_ALUMINUM=ON - ALUMINUM_WITH_NCCL=ON + --disable-aluminum-with-nccl) + ALUMINUM_WITH_NCCL=OFF ;; --with-conduit) WITH_CONDUIT=ON @@ -304,7 +322,7 @@ else CMAKE_PATH=/usr/workspace/wsb/brain/utils/toss2/cmake-3.9.6/bin fi -if [ ${CLUSTER} == "ray" -o ${CLUSTER} == "sierra" ]; then +if [[ ${CORAL} -eq 1 ]]; then # the latest version, 3.12.1, has several issues module load cmake/3.9.2 CMAKE_PATH=$(dirname $(which cmake)) @@ -372,7 +390,7 @@ elif [ "${COMPILER}" == "intel" ]; then elif [ "${COMPILER}" == "clang" ]; then # clang # clang depends on gnu fortran library. 
so, find the dependency - if [ "${CLUSTER}" == "ray" -o "{CLUSTER}" == "sierra" ]; then + if [[ ${CORAL} -eq 1 ]]; then #gccdep=`ldd ${COMPILER_BASE}/lib/*.so 2> /dev/null | grep gcc | awk '(NF>2){print $3}' | sort | uniq | head -n 1` #GCC_VERSION=`ls -l $gccdep | awk '{print $NF}' | cut -d '-' -f 2 | cut -d '/' -f 1` # Forcing to gcc 4.9.3 because of the current way of ray's gcc and various clang installation @@ -419,7 +437,7 @@ if [ "${BUILD_TYPE}" == "Release" ]; then C_FLAGS="${C_FLAGS} -mcpu=power8 -mtune=power8" CXX_FLAGS="${CXX_FLAGS} -mcpu=power8 -mtune=power8" Fortran_FLAGS="${Fortran_FLAGS} -mcpu=power8 -mtune=power8" - elif [ "${CLUSTER}" == "sierra" ]; then + elif [ "${CLUSTER}" == "sierra" -o "${CLUSTER}" == "lassen" ]; then # no power9 option shown in the manual C_FLAGS="${C_FLAGS} -mcpu=power8 -mtune=power8" CXX_FLAGS="${CXX_FLAGS} -mcpu=power8 -mtune=power8" @@ -488,9 +506,11 @@ if [ "${MPI}" == "spectrum" ]; then fi # Use CUDA-aware MVAPICH2 on Surface and Pascal -if [ "${CLUSTER}" == "pascal" -o "${CLUSTER}" == "surface" ]; then - MPI_HOME=/usr/workspace/wsb/brain/utils/toss3/mvapich2-2.3rc2-gcc-4.9.3-cuda-9.1-install/ - export MV2_USE_CUDA=1 +if [ "${WITH_CUDA_2}" == "ON" ]; then + if [ "${CLUSTER}" == "pascal" -o "${CLUSTER}" == "surface" ]; then + MPI_HOME=/usr/workspace/wsb/brain/utils/toss3/mvapich2-2.3rc2-gcc-4.9.3-cuda-9.1-install/ + export MV2_USE_CUDA=1 + fi fi if [ -z "${MPI_HOME}" ]; then @@ -559,21 +579,21 @@ fi # Initialize GPU libraries ################################################################ -if [ "${CLUSTER}" == "surface" -o "${CLUSTER}" == "ray" -o \ - "${CLUSTER}" == "pascal" -o "${CLUSTER}" == "sierra" ]; then +if [ "${CLUSTER}" == "surface" -o "${CORAL}" -eq 1 -o "${CLUSTER}" == "pascal" ]; then HAS_GPU=1 WITH_CUDA=${WITH_CUDA:-ON} WITH_CUDNN=ON WITH_CUB=ON ELEMENTAL_USE_CUBLAS=OFF - case $CLUSTER in - ray|sierra) - export NCCL_DIR=/usr/workspace/wsb/brain/nccl2/nccl_2.2.13-1+cuda9.2_ppc64le - ;; - *) - export NCCL_DIR=/usr/workspace/wsb/brain/nccl2/nccl_2.2.12-1+cuda9.0_x86_64 - ;; - esac + WITH_ALUMINUM=${WITH_ALUMINUM:-ON} + ALUMINUM_WITH_NCCL=${ALUMINUM_WITH_NCCL:-ON} + if [[ ${CORAL} -eq 1 ]]; then + export NCCL_DIR=/usr/workspace/wsb/brain/nccl2/nccl_2.3.7-1+cuda9.2_ppc64le + module del cuda + CUDA_TOOLKIT_MODULE=${CUDA_TOOLKIT_MODULE:-cuda/9.2.148} + else + export NCCL_DIR=/usr/workspace/wsb/brain/nccl2/nccl_2.2.12-1+cuda9.0_x86_64 + fi # Hack for surface case $CLUSTER in @@ -581,10 +601,6 @@ if [ "${CLUSTER}" == "surface" -o "${CLUSTER}" == "ray" -o \ . /usr/share/[mM]odules/init/bash CUDA_TOOLKIT_MODULE=cudatoolkit/9.1 ;; - ray|sierra) - module del cuda - CUDA_TOOLKIT_MODULE=${CUDA_TOOLKIT_MODULE:-cuda/9.2.148} - ;; esac fi @@ -634,12 +650,45 @@ fi ################################################################ # Library options ################################################################ -if [ "${CLUSTER}" == "sierra" ]; then +if [ "${CLUSTER}" == "sierra" -o "${CLUSTER}" == "lassen" ]; then OPENBLAS_ARCH="TARGET=POWER8" else OPENBLAS_ARCH= fi +if [ "${WITH_CONDUIT}" = "ON" ] ; then +echo $COMPILER_VERSION + if [ -z ${CONDUIT_DIR} ] || [ ! -d ${CONDUIT_DIR} ] ; then + echo "CONDUIT_DIR not available." 
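+      # Fall back to a site-provided Conduit build matched to the cluster and compiler below; exporting a valid CONDUIT_DIR up front skips this lookup, e.g. (path hypothetical):
+      #   CONDUIT_DIR=$HOME/conduit/install ./scripts/build_lbann_lc.sh --with-conduit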
+ if [ "${CLUSTER}" == "sierra" -o "${CLUSTER}" == "lassen" ]; then + export CONDUIT_DIR=/usr/workspace/wsb/icfsi/conduit/install-blueos-dev + elif [ "${CLUSTER}" = "catalyst" ] && [ "${COMPILER}" == "gnu" ] && [ "${COMPILER_VERSION}" = "7.1.0" ]; then + export CONDUIT_DIR=/p/lscratchh/brainusr/conduit/install-catalyst-gcc7.1 + elif [ "${CLUSTER}" = "catalyst" ] && [ "${COMPILER}" == "gnu" ] && [ "${COMPILER_VERSION}" = "7.3.0" ]; then + export CONDUIT_DIR=/usr/workspace/wsb/icfsi/conduit/install-toss3-7.3.0 + else + # This installation has been built by using gcc 4.9.3 on a TOSS3 platform (quartz) + export CONDUIT_DIR=/usr/workspace/wsb/icfsi/conduit/install-toss3-dev + fi + echo "Set to the default CONDUIT_DIR="$CONDUIT_DIR + fi +fi +################################################################ +# Setup Ninja, if using +################################################################ + +if [ ${USE_NINJA} -ne 0 ]; then + if ! which ninja ; then + if [ "${ARCH}" == "x86_64" ]; then + export PATH=/usr/workspace/wsb/brain/utils/toss3/ninja/bin:$PATH + elif [ "${ARCH}" == "ppc64le" ]; then + export PATH=/usr/workspace/wsb/brain/utils/coral/ninja/bin:$PATH + fi + fi + if ! which ninja ; then + USE_NINJA=0 + fi +fi ################################################################ # Display parameters ################################################################ @@ -731,9 +780,16 @@ fi # ATM: goes after Elemental_DIR #-D OpenCV_DIR=${OpenCV_DIR} \ +# Setup the CMake generator +GENERATOR="\"Unix Makefiles\"" +if [ ${USE_NINJA} -ne 0 ]; then + GENERATOR="Ninja" +fi + # Configure build with CMake CONFIGURE_COMMAND=$(cat << EOF ${CMAKE_PATH}/cmake \ +-G ${GENERATOR} \ -D CMAKE_EXPORT_COMPILE_COMMANDS=ON \ -D CMAKE_BUILD_TYPE=${BUILD_TYPE} \ -D CMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE} \ @@ -764,6 +820,7 @@ CONFIGURE_COMMAND=$(cat << EOF -D LBANN_DETERMINISTIC=${DETERMINISTIC} \ -D LBANN_WITH_ALUMINUM=${WITH_ALUMINUM} \ -D LBANN_WITH_CONDUIT=${WITH_CONDUIT} \ +-D LBANN_NO_OMP_FOR_DATA_READERS=${NO_OMP_FOR_DATA_READERS} \ -D LBANN_CONDUIT_DIR=${CONDUIT_DIR} \ -D LBANN_BUILT_WITH_SPECTRUM=${WITH_SPECTRUM} \ -D OPENBLAS_ARCH_COMMAND=${OPENBLAS_ARCH} \ @@ -773,7 +830,9 @@ EOF if [ ${VERBOSE} -ne 0 ]; then - echo "${CONFIGURE_COMMAND}" + echo "${CONFIGURE_COMMAND}" |& tee cmake_superbuild_invocation.txt +else + echo "${CONFIGURE_COMMAND}" > cmake_superbuild_invocation.txt fi eval ${CONFIGURE_COMMAND} if [ $? -ne 0 ]; then @@ -786,13 +845,21 @@ fi # Build LBANN with make # Note: Ensure Elemental to be built before LBANN. Dependency violation appears to occur only when using cuda_add_library. BUILD_COMMAND="make -j${MAKE_NUM_PROCESSES} VERBOSE=${VERBOSE}" +if [ ${USE_NINJA} -ne 0 ]; then + if [ ${NINJA_NUM_PROCESSES} -ne 0 ]; then + BUILD_COMMAND="ninja -j${NINJA_NUM_PROCESSES}" + else + # Usually equivalent to -j + BUILD_COMMAND="ninja" + fi +fi if [ ${VERBOSE} -ne 0 ]; then echo "${BUILD_COMMAND}" fi eval ${BUILD_COMMAND} if [ $? 
-ne 0 ]; then echo "--------------------" - echo "MAKE FAILED" + echo "BUILD FAILED" echo "--------------------" exit 1 fi @@ -800,13 +867,21 @@ fi # Install LBANN with make if [ ${INSTALL_LBANN} -ne 0 ]; then INSTALL_COMMAND="make install -j${MAKE_NUM_PROCESSES} VERBOSE=${VERBOSE}" + if [ ${USE_NINJA} -ne 0 ]; then + if [ ${NINJA_NUM_PROCESSES} -ne 0 ]; then + INSTALL_COMMAND="ninja -j${NINJA_NUM_PROCESSES} install" + else + # Usually equivalent to -j + INSTALL_COMMAND="ninja install" + fi + fi if [ ${VERBOSE} -ne 0 ]; then echo "${INSTALL_COMMAND}" fi eval ${INSTALL_COMMAND} if [ $? -ne 0 ]; then echo "--------------------" - echo "MAKE INSTALL FAILED" + echo "INSTALL FAILED" echo "--------------------" exit 1 fi @@ -815,13 +890,16 @@ fi # Generate documentation with make if [ ${GEN_DOC} -ne 0 ]; then DOC_COMMAND="make doc" + if [ ${USE_NINJA} -ne 0 ]; then + DOC_COMMAND="ninja doc" + fi if [ ${VERBOSE} -ne 0 ]; then echo "${DOC_COMMAND}" fi eval ${DOC_COMMAND} if [ $? -ne 0 ]; then echo "--------------------" - echo "MAKE DOC FAILED" + echo "BUILDING DOC FAILED" echo "--------------------" exit 1 fi diff --git a/src/callbacks/callback_checkpoint.cpp b/src/callbacks/callback_checkpoint.cpp index a603bc8df25..29418df60b0 100644 --- a/src/callbacks/callback_checkpoint.cpp +++ b/src/callbacks/callback_checkpoint.cpp @@ -81,20 +81,20 @@ bool lbann_callback_checkpoint::need_checkpoint(model *m) { if (!m_checkpoint_shared && m_checkpoint_epochs > 0 && (p.get_cb_type() == callback_type::epoch || p.get_cb_type() == callback_type::validation)){ m_checkpoint_shared = (cur_epoch > 0) && (cur_epoch % m_checkpoint_epochs == 0); } - + if(!m_checkpoint_dist && m_ckpt_dist_epochs > 0 && (p.get_cb_type() == callback_type::epoch || p.get_cb_type() == callback_type::validation)){ m_checkpoint_dist = (cur_epoch > 0) && (cur_epoch % m_ckpt_dist_epochs == 0); } - + // If we are at the end of a training mb step and the training mb step lands on defined interval, trigger checkpoint if (!m_checkpoint_shared && m_checkpoint_steps > 0) { m_checkpoint_shared = (m->get_cur_step() > 0) && (m->get_cur_step() % m_checkpoint_steps == 0); } - + if(!m_checkpoint_dist && m_ckpt_dist_steps > 0){ m_checkpoint_dist = (m->get_cur_step() > 0) && (m->get_cur_step() % m_ckpt_dist_steps == 0); } - + // check the clock if time-based checkpoint is enabled if (!m_checkpoint_shared && m_checkpoint_secs != 0.0) { // have rank 0 determine whether we should checkpoint @@ -108,12 +108,12 @@ bool lbann_callback_checkpoint::need_checkpoint(model *m) { m_checkpoint_shared = (current >= next); } comm->model_broadcast(0, m_checkpoint_shared); - } - // If either checkpoint version is triggered, return true, otherwise false. + } + // If either checkpoint version is triggered, return true, otherwise false. return (m_checkpoint_shared || m_checkpoint_dist); } -// Print last checkpoint to file, used to determine which checkpoint to load from. +// Print last checkpoint to file, used to determine which checkpoint to load from.
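// write_latest/read_latest below bookend every checkpoint: after a save, the master records the epoch/step pair in a one-line marker file ("last.shared.checkpoint" or "last.distributed.checkpoint"), and restart() reads both markers back and resumes from whichever is more recent, e.g.
//   write_latest(dir, "last.shared.checkpoint", epoch, step);
//   read_latest(dir, "last.shared.checkpoint", &epoch, &step);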
static bool write_latest(const char *dir, const char *name, int epoch, int train) { // define filename char filename[1024]; @@ -199,7 +199,7 @@ bool lbann_callback_checkpoint::checkpoint(model *m) { // Print latest checkpoint to file if (comm->am_model_master()) { write_latest(dir, "last.distributed.checkpoint", epoch, step); - } + } } // Shared checkpoint, logic identical to Distributed.i if(m_checkpoint_shared){ @@ -209,7 +209,7 @@ bool lbann_callback_checkpoint::checkpoint(model *m) { if (comm->am_model_master()) { p.open_checkpoint(epochdir); } - // Need to give other ranks knowledge of checkpoint dir for writing of rank specific rng state + // Need to give other ranks knowledge of checkpoint dir for writing of rank specific rng state comm->model_broadcast(0, &(p.m_checkpoint_dir[0]), sizeof(p.m_checkpoint_dir)); m->save_to_checkpoint_shared(p); // close our checkpoint @@ -217,7 +217,7 @@ bool lbann_callback_checkpoint::checkpoint(model *m) { if (comm->am_model_master()) { write_latest(dir, "last.shared.checkpoint", epoch, step); } - } + } uint64_t bytes_count = p.get_bytes(); @@ -252,17 +252,17 @@ bool lbann_callback_checkpoint::restart(model *m) { int step_dist = -1; lbann_comm *comm = m->get_comm(); int shared = 1; - // Grab latest checkpoint information, checks for latest in dist and shared, restarts from most recent between the two. + // Grab latest checkpoint information, checks for latest in dist and shared, restarts from most recent between the two. if (comm->am_model_master()) { if(m_per_rank_dir.length()){ snprintf(dir, sizeof(dir), "%s/%s", m_per_rank_dir.c_str(), m_checkpoint_dir.c_str()); read_latest(dir, "last.distributed.checkpoint", &epoch_dist, &step_dist); - } + } if(m_checkpoint_dir.length()){ strcpy(dir, m_checkpoint_dir.c_str()); read_latest(dir, "last.shared.checkpoint", &epoch, &step); } - + if(epoch > epoch_dist){ strcpy(dir, m_checkpoint_dir.c_str()); shared = 1; @@ -278,7 +278,7 @@ bool lbann_callback_checkpoint::restart(model *m) { shared = 0; } } - // Update other ranks on where we are loading from. + // Update other ranks on where we are loading from. // TODO: we would want to prepend dir with the model name and model rank: // m->get_name() + '.' + std::to_string(comm->get_model_rank()) + '.' 
#if 1 @@ -314,7 +314,7 @@ bool lbann_callback_checkpoint::restart(model *m) { printf("Restart: epoch %d ...\n", epoch); fflush(stdout); } - + char epochdir[1024]; // Create dir to restart from based off last recorded checkpoint (or overriden values in last.shared[distributed].checkpoint if(!shared){ @@ -334,7 +334,7 @@ bool lbann_callback_checkpoint::restart(model *m) { if(comm->am_model_master()) p.close_restart(); } - + // close our checkpoint uint64_t bytes_count = p.get_bytes(); // let user know we've completed reading our restart diff --git a/src/callbacks/callback_dump_minibatch_sample_indices.cpp b/src/callbacks/callback_dump_minibatch_sample_indices.cpp index fdf51704531..257d8b76ebf 100644 --- a/src/callbacks/callback_dump_minibatch_sample_indices.cpp +++ b/src/callbacks/callback_dump_minibatch_sample_indices.cpp @@ -44,6 +44,14 @@ void lbann_callback_dump_minibatch_sample_indices::dump_to_file(model *m, Layer || indices->Width() == 0) { return; } + + std::ostringstream s; + s << "mkdir -p " << m_basename; + const int dir = system(s.str().c_str()); + if (dir < 0) { + LBANN_ERROR("callback_dump_minibatch_sample_indices is unable to create the target directory"); + } + const std::string file = (m_basename + _to_string(m->get_execution_mode()) diff --git a/src/callbacks/callback_gpu_memory_usage.cpp b/src/callbacks/callback_gpu_memory_usage.cpp index 3ee82851c5d..90ac602a15f 100644 --- a/src/callbacks/callback_gpu_memory_usage.cpp +++ b/src/callbacks/callback_gpu_memory_usage.cpp @@ -27,6 +27,30 @@ #include "lbann/callbacks/callback_gpu_memory_usage.hpp" #include <iomanip> +#include <algorithm> + +namespace { +template <typename T> +T get_mean(const std::vector<T> &v) { + return std::accumulate(v.begin(), v.end(), 0.0) / + v.size(); +} +template <typename T> +T get_median(const std::vector<T> &v) { + std::vector<T> tmp = v; + int median_idx = tmp.size() / 2 - 1 + tmp.size() % 2; + std::nth_element(tmp.begin(), tmp.begin() + median_idx, tmp.end()); + return tmp[median_idx]; +} +template <typename T> +T get_max(const std::vector<T> &v) { + return *std::max_element(v.begin(), v.end()); +} +template <typename T> +T get_min(const std::vector<T> &v) { + return *std::min_element(v.begin(), v.end()); +} +} namespace lbann { @@ -36,16 +60,34 @@ void lbann_callback_gpu_memory_usage::on_epoch_begin(model *m) { size_t total; FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total)); size_t used = total - available; - std::cout << "GPU memory usage at epoch " << m->get_cur_epoch() - << " of model " << m->get_comm()->get_model_rank() - << " at rank " << m->get_comm()->get_rank_in_model() - << ": " << used << " bytes (" - << std::setprecision(3) - << (used / 1024.0 / 1024.0 / 1024.0) << " GiB) used out of " - << total << " bytes (" - << std::setprecision(3) - << (total / 1024.0 / 1024.0 / 1024.0) - << " GiB)" << std::endl; + auto comm = m->get_comm(); + if (comm->am_model_master()) { + auto num_procs = comm->get_procs_per_model(); + std::vector<size_t> used_list(num_procs); + comm->model_gather(used, used_list.data()); + double used_mean = get_mean(used_list) / 1024.0 / 1024.0 / 1024.0; + double used_median = get_median(used_list) / 1024.0 / 1024.0 / 1024.0; + double used_max = get_max(used_list) / 1024.0 / 1024.0 / 1024.0; + double used_min = get_min(used_list) / 1024.0 / 1024.0 / 1024.0; + std::stringstream ss; + ss << "Model " << m->get_comm()->get_model_rank() + << " GPU memory usage statistics : " + << std::setprecision(3) + << used_mean << " GiB mean, " + << std::setprecision(3) + << used_median << " GiB median, " + << std::setprecision(3) + << used_max << " GiB max, " + << std::setprecision(3)
+ << used_min << " GiB min " + << "(" + << std::setprecision(3) + << (total / 1024.0 / 1024.0 / 1024.0) + << " GiB total)" << std::endl; + std::cout << ss.str(); + } else { + comm->model_gather(used, comm->get_model_master()); + } #endif } diff --git a/src/callbacks/callback_ltfb.cpp b/src/callbacks/callback_ltfb.cpp index 3d182d66c5e..c9b2a104fb9 100644 --- a/src/callbacks/callback_ltfb.cpp +++ b/src/callbacks/callback_ltfb.cpp @@ -67,7 +67,6 @@ int assign_partners(lbann_comm* comm) { return comm->scatter(0, comm->get_world_comm()); } } - /** Exchange weights with remote model. * Weights from the local model are copied into local_weights and * weights from the remote model are copied into model_weights. @@ -91,8 +90,27 @@ void exchange_weights(lbann_comm* comm, } } if (size > 0 && send) { - comm->sendrecv(local_matrix.LockedBuffer(), size, partner, - remote_matrix->Buffer(), size, partner); + switch (remote_matrix->GetLocalDevice()) { + case El::Device::CPU: + comm->sendrecv(local_matrix.LockedBuffer(), size, partner, + remote_matrix->Buffer(), size, partner, + El::SyncInfo<El::Device::CPU>{}); + break; +#ifdef HYDROGEN_HAVE_CUDA + case El::Device::GPU: + using ValueT + = std::remove_pointer<decltype(remote_matrix->Buffer())>::type; + comm->sendrecv( + local_matrix.LockedBuffer(), size, partner, + remote_matrix->Buffer(), size, partner, + El::SyncInfo<El::Device::GPU>{ + static_cast<El::Matrix<ValueT, El::Device::GPU> const&>( + remote_matrix->LockedMatrix())}); + break; +#endif // HYDROGEN_HAVE_CUDA + default: + El::LogicError("exchange_weights: Bad device type."); + } model_weights[i]->set_values(*remote_matrix); } delete remote_matrix; @@ -117,12 +135,12 @@ EvalType evaluate(model *m, std::unordered_set<std::string>& eval_metrics) { } // namespace -lbann_callback_ltfb::lbann_callback_ltfb(int round_size, +lbann_callback_ltfb::lbann_callback_ltfb(int round_size, std::unordered_set<std::string> eval_metrics, bool increasing_metric_mode, - std::unordered_set<std::string> weights_tosend, + std::unordered_set<std::string> weights_tosend, lbann_summary *summarizer) - : lbann_callback(1, summarizer), m_round_size(round_size), + : lbann_callback(1, summarizer), m_round_size(round_size), m_eval_metrics(std::move(eval_metrics)), m_increasing_metric_mode(increasing_metric_mode), m_weights_tosend(std::move(weights_tosend)){} @@ -144,7 +162,7 @@ lbann_callback_ltfb& lbann_callback_ltfb::operator=(const lbann_callback_ltfb& o m_comm = other.m_comm; m_round_size = other.m_round_size; m_eval_metrics = other.m_eval_metrics; - m_increasing_metric_mode = other.m_increasing_metric_mode; + m_increasing_metric_mode = other.m_increasing_metric_mode; m_weights_tosend = other.m_weights_tosend; // Deep copy @@ -160,9 +178,9 @@ lbann_callback_ltfb::~lbann_callback_ltfb() { } void lbann_callback_ltfb::setup(model *m) { - + if(m_eval_metrics.size() < 1) - LBANN_ERROR("LTFB: specify at least one evaluation metric for tournament voting."); + LBANN_ERROR("LTFB: specify at least one evaluation metric for tournament voting."); m_comm = m->get_comm(); @@ -171,7 +189,7 @@ void lbann_callback_ltfb::setup(model *m) { for (auto& w : m_local_weights) { delete w; } m_local_weights = m->get_weights(); for (auto& w : m_local_weights) { w = w->copy(); } - + // Make sure model does not have inter-model communication callback for (auto&& cb : m->get_callbacks()) { if (dynamic_cast<lbann_callback_imcomm*>(cb) != nullptr) { diff --git a/src/callbacks/callback_print.cpp b/src/callbacks/callback_print.cpp index 0c7c4a2ed98..9e2014cb7a6 100644 --- a/src/callbacks/callback_print.cpp +++ b/src/callbacks/callback_print.cpp @@ -162,7 +162,7 @@ void lbann_callback_print::report_results(model *m) {
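  // Per-model masters feed results to the world master below, which prints one line per model instance; prefixing each line with m->get_name() keeps multi-model runs (e.g. LTFB tournaments) distinguishable in the log.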
comm->intermodel_gather(obj_fn, obj_fn_list); comm->intermodel_gather(obj_fn_samples, num_samples_list); for (int i = 0; i < num_models; ++i) { - std::cout << "Model " << i << " " << mode_string << " " + std::cout << m->get_name() << " (instance " << i << ") " << mode_string << " " << "objective function : " << obj_fn_list[i] << std::endl; } @@ -174,7 +174,7 @@ void lbann_callback_print::report_results(model *m) { / std::accumulate(num_samples_list.begin(), num_samples_list.end(), 0)); - std::cout << "World average " << mode_string << " " + std::cout << m->get_name() << " global average " << mode_string << " " << "objective function : " << avg_obj_fn << std::endl; } @@ -193,7 +193,7 @@ void lbann_callback_print::report_results(model *m) { comm->intermodel_gather(score, score_list); comm->intermodel_gather(score_samples, num_samples_list); for (int i = 0; i < num_models; ++i) { - std::cout << "Model " << i << " " << mode_string << " " + std::cout << m->get_name() << " (instance " << i << ") " << mode_string << " " << met->name() << " : " << score_list[i] << met->get_unit() << std::endl; @@ -206,7 +206,7 @@ void lbann_callback_print::report_results(model *m) { / std::accumulate(num_samples_list.begin(), num_samples_list.end(), 0)); - std::cout << "World " << mode_string << " " + std::cout << m->get_name() << " (global) " << mode_string << " " << met->name() << " : " << avg_score << met->get_unit() << std::endl; diff --git a/src/callbacks/callback_save_images.cpp b/src/callbacks/callback_save_images.cpp index dc541901270..6081182c85d 100644 --- a/src/callbacks/callback_save_images.cpp +++ b/src/callbacks/callback_save_images.cpp @@ -22,65 +22,141 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or // implied. See the License for the specific language governing // permissions and limitations under the license. 
-// -// lbann_callback_save_images .hpp .cpp - Callbacks to save images //////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/callbacks/callback_save_images.hpp" -#include "lbann/data_readers/image_utils.hpp" +#ifdef LBANN_HAS_OPENCV +#include <opencv2/opencv.hpp> +#endif // LBANN_HAS_OPENCV namespace lbann { +namespace { -void lbann_callback_save_images::on_epoch_end(model *m) { - auto tag = "epoch" + std::to_string(m->get_cur_epoch()); - save_image(*m,tag); -} +void save_image(std::string prefix, + std::string format, + const std::vector<Layer*>& layers, + const std::vector<std::string>& layer_names) { +#ifdef LBANN_HAS_OPENCV + for (const auto* l : layers) { + // Only save outputs of layers in list + const auto& name = l->get_name(); + if (std::find(layer_names.begin(), layer_names.end(), name) + == layer_names.end()) { + continue; + } -void lbann_callback_save_images::on_phase_end(model *m) { - const auto phase = m->get_current_phase(); - auto tag = "phase" + std::to_string(phase); - save_image(*m,tag); -} + // Check that tensor dimensions are valid for images + const auto& dims = l->get_output_dims(); + El::Int num_channels(0), height(0), width(0); + if (dims.size() == 2) { + num_channels = 1; + height = dims[0]; + width = dims[1]; + } else if (dims.size() == 3) { + num_channels = dims[0]; + height = dims[1]; + width = dims[2]; + } + if (!(num_channels == 1 || num_channels == 3) + || height < 1 || width < 1) { + std::stringstream err; + err << "images are assumed to either be " + << "2D tensors in HW format or 3D tensors in CHW format, " + << "but the output of layer \"" << l->get_name() << "\" " + << "has dimensions "; + for (size_t i = 0; i < dims.size(); ++i) { + err << (i > 0 ? " x " : "") << dims[i]; + } + LBANN_ERROR(err.str()); + } -void lbann_callback_save_images::on_test_end(model *m) { - save_image(*m,"test"); -} + // Get tensor data + const auto& raw_data = l->get_activations(); + std::unique_ptr<AbsDistMat> raw_data_v(raw_data.Construct(raw_data.Grid(), raw_data.Root())); + El::LockedView(*raw_data_v, raw_data, El::ALL, El::IR(0)); + CircMat circ_data(raw_data_v->Grid(), raw_data_v->Root()); + circ_data = *raw_data_v; -void lbann_callback_save_images::save_image(model& m, - std::string tag) { + // Export tensor as image + if (circ_data.CrossRank() == circ_data.Root()) { + const auto& data = circ_data.LockedMatrix(); - // Save image - if(m_layer_names.empty()) { - if(m.get_comm()->am_world_master()) - std::cout << "Layer list empty, images not saved " << std::endl; - return; - } - //@todo: check that number of neurons (linearized) equal mat heigth? - if(m.get_comm()->am_world_master()) - std::cout << "Saving images to " << m_image_dir << std::endl; - - const auto layers = m.get_layers(); - for(auto& l: layers) { - auto layer_name = l->get_name(); - if(std::find(std::begin(m_layer_names), std::end(m_layer_names), - layer_name) != std::end(m_layer_names)) { - - AbsDistMat* input_col = l->get_activations().Construct( - l->get_activations().Grid(), - l->get_activations().Root()); - El::View(*input_col, l->get_activations(), El::ALL, El::IR(0)); - CircMat input_circ = *input_col; - delete input_col; - - if(m.get_comm()->am_world_master()) - m_reader->save_image(input_circ.Matrix(), - m_image_dir+tag+"-"+layer_name+"."+m_extension); + // Data will be scaled to be in [0,256] + DataType lower = data(0, 0); + DataType upper = data(0, 0); + for (El::Int i = 1; i < data.Height(); ++i) { + lower = std::min(lower, data(i, 0)); + upper = std::max(upper, data(i, 0)); + } + const auto& scale = ((upper > lower) ? + 256 / (upper - lower) : + DataType(1)); + + // Copy data into OpenCV matrix + int type = -1; + if (num_channels == 1) { type = CV_8UC1; } + if (num_channels == 3) { type = CV_8UC3; } + cv::Mat img(height, width, type); + for (El::Int row = 0; row < height; ++row) { + for (El::Int col = 0; col < width; ++col) { + const auto& offset = row * width + col; + if (num_channels == 1) { + img.at<uchar>(row, col) + = cv::saturate_cast<uchar>(scale * (data(offset, 0) - lower)); + } else if (num_channels == 3) { + cv::Vec3b pixel; + pixel[0] = cv::saturate_cast<uchar>(scale * (data(offset, 0) - lower)); + pixel[1] = cv::saturate_cast<uchar>(scale * (data(height*width + offset, 0) - lower)); + pixel[2] = cv::saturate_cast<uchar>(scale * (data(2*height*width + offset, 0) - lower)); + img.at<cv::Vec3b>(row, col) = pixel; + } + } + } + + // Write image to file + cv::imwrite(prefix + "-" + name + "." + format, img); + } + } +#endif // LBANN_HAS_OPENCV +} + +} // namespace + +lbann_callback_save_images::lbann_callback_save_images(std::vector<std::string> layer_names, + std::string image_format, + std::string image_prefix) + : lbann_callback(), + m_layer_names(std::move(layer_names)), + m_image_format(image_format.empty() ? "jpg" : image_format), + m_image_prefix(std::move(image_prefix)) { +#ifndef LBANN_HAS_OPENCV + LBANN_ERROR("OpenCV not detected"); +#endif // LBANN_HAS_OPENCV +} + +void lbann_callback_save_images::on_epoch_end(model *m) { + save_image(m_image_prefix + "epoch" + std::to_string(m->get_cur_epoch()), + m_image_format, + m->get_layers(), + m_layer_names); } +void lbann_callback_save_images::on_phase_end(model *m) { + save_image(m_image_prefix + "phase" + std::to_string(m->get_current_phase()), + m_image_format, + m->get_layers(), + m_layer_names); +} + +void lbann_callback_save_images::on_test_end(model *m) { + save_image(m_image_prefix + "test", + m_image_format, + m->get_layers(), + m_layer_names); +} -} // namespace lbann +} // namespace lbann diff --git a/src/callbacks/callback_timer.cpp b/src/callbacks/callback_timer.cpp index 10dba28f8f8..c8b7bbafe16 100644 --- a/src/callbacks/callback_timer.cpp +++ b/src/callbacks/callback_timer.cpp @@ -22,71 +22,73 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or // implied. See the License for the specific language governing // permissions and limitations under the license.
-//
-// lbann_callback_timer .hpp .cpp - Callback hooks to time training
 ///////////////////////////////////////////////////////////////////////////////
 
-#include <vector>
-#include "lbann/utils/timer.hpp"
 #include "lbann/callbacks/callback_timer.hpp"
+#include "lbann/utils/timer.hpp"
+#include <algorithm>
 
 namespace lbann {
 
-void lbann_callback_timer::batch_timing_begin(model *m) {
-  m_batch_start_time = get_time();
+void lbann_callback_timer::batch_timing_begin(const model& m) {
+  const auto& mode = m.get_execution_mode();
+  m_batch_start_times[mode] = get_time();
 }
 
-void lbann_callback_timer::batch_timing_end(model *m) {
-  const EvalType mb_time = get_time() - m_batch_start_time;
-  m_batch_times.push_back(mb_time);
+void lbann_callback_timer::batch_timing_end(const model& m) {
+  const auto& mode = m.get_execution_mode();
+  const auto& batch_time = get_time() - m_batch_start_times[mode];
+  m_batch_times[mode].push_back(batch_time);
   if (m_summarizer != nullptr) {
-    m_summarizer->reduce_scalar("minibatch_time", mb_time, m->get_cur_step()-1);
-    m_summarizer->reduce_scalar_all("minibatch_time", mb_time, m->get_cur_step()-1);
+    m_summarizer->reduce_scalar("minibatch_time", batch_time, m.get_cur_step()-1);
+    m_summarizer->reduce_scalar_all("minibatch_time", batch_time, m.get_cur_step()-1);
   }
 }
 
-void lbann_callback_timer::timing_begin(model *m) {
-  m_batch_times.clear();
-  m_start_time = get_time();
+void lbann_callback_timer::timing_begin(const model& m) {
+  const auto& mode = m.get_execution_mode();
+  m_start_times[mode] = get_time();
+  m_batch_times[mode].clear();
 }
 
-void lbann_callback_timer::timing_end(model *m) {
-  lbann_comm *comm = m->get_comm();
+void lbann_callback_timer::timing_end(model& m) {
+  constexpr EvalType zero = 0;
 
   // Get run time
-  const EvalType run_time = get_time() - m_start_time;
+  const auto& mode = m.get_execution_mode();
+  const auto& run_time = get_time() - m_start_times[mode];
 
   // Compute minibatch statistics
-  const int num_batches = m_batch_times.size();
+  const auto& batch_times = m_batch_times[mode];
+  const auto& num_batches = batch_times.size();
   EvalType batch_time_mean = std::nan("");
   EvalType batch_time_min = std::nan("");
   EvalType batch_time_max = std::nan("");
   EvalType batch_time_stdev = std::nan("");
   if (num_batches > 0) {
-    batch_time_mean = std::accumulate(m_batch_times.begin(),
-                                      m_batch_times.end(),
-                                      0.0);
-    batch_time_mean /= num_batches;
-    batch_time_min = *std::min_element(m_batch_times.begin(),
-                                       m_batch_times.end());
-    batch_time_max = *std::max_element(m_batch_times.begin(),
-                                       m_batch_times.end());
+    batch_time_mean = std::accumulate(batch_times.begin(),
+                                      batch_times.end(),
+                                      zero) / num_batches;
+    batch_time_min = *std::min_element(batch_times.begin(),
+                                       batch_times.end());
+    batch_time_max = *std::max_element(batch_times.begin(),
+                                       batch_times.end());
   }
   if (num_batches > 1) {
-    const EvalType sqsum = std::inner_product(m_batch_times.begin(),
-                                              m_batch_times.end(),
-                                              m_batch_times.begin(),
-                                              0.0);
-    EvalType var = sqsum / num_batches - batch_time_mean * batch_time_mean;
-    var = num_batches * var / (num_batches - 1);
-    batch_time_stdev = std::sqrt(std::max(var, 0.0));
+    batch_time_stdev = zero;
+    for (const auto& t : batch_times) {
+      const auto& diff = t - batch_time_mean;
+      batch_time_stdev += diff * diff;
+    }
+    batch_time_stdev /= num_batches - 1;
+    batch_time_stdev = std::sqrt(std::max(batch_time_stdev, zero));
   }
 
   // Get string for execution mode
   std::string mode_string;
-  switch(m->get_execution_mode()) {
+  switch(mode) {
   case execution_mode::training:
-    mode_string = "training epoch " + std::to_string(m->get_cur_epoch()-1);
+    mode_string = "training epoch " + std::to_string(m.get_cur_epoch()-1);
     break;
   case execution_mode::validation:
     mode_string = "validation";
@@ -95,15 +97,13 @@ void lbann_callback_timer::timing_end(model *m) {
     mode_string = "test";
     break;
   default:
-    std::stringstream err;
-    err << __FILE__ << " " << __LINE__ << " :: "
-        << "invalid execution mode for reporting results";
-    throw lbann_exception(err.str());
+    LBANN_ERROR("invalid execution mode");
   }
 
   // Report timing results
-  const int num_models = comm->get_num_models();
-  if (comm->am_model_master()) {
+  auto& comm = *m.get_comm();
+  const El::Int num_models = comm.get_num_models();
+  if (comm.am_model_master()) {
 
     // Gather timing results in world master
     std::vector<EvalType> run_time_list(num_models);
@@ -111,29 +111,30 @@ void lbann_callback_timer::timing_end(model *m) {
     std::vector<EvalType> min_list(num_models);
     std::vector<EvalType> max_list(num_models);
     std::vector<EvalType> stdev_list(num_models);
-    if (comm->am_world_master()) {
-      comm->intermodel_gather(run_time, run_time_list);
-      comm->intermodel_gather(batch_time_mean, mean_list);
-      comm->intermodel_gather(batch_time_min, min_list);
-      comm->intermodel_gather(batch_time_max, max_list);
-      comm->intermodel_gather(batch_time_stdev, stdev_list);
+    if (comm.am_world_master()) {
+      comm.intermodel_gather(run_time, run_time_list);
+      comm.intermodel_gather(batch_time_mean, mean_list);
+      comm.intermodel_gather(batch_time_min, min_list);
+      comm.intermodel_gather(batch_time_max, max_list);
+      comm.intermodel_gather(batch_time_stdev, stdev_list);
     } else {
-      comm->intermodel_gather(run_time, comm->get_intermodel_master());
-      comm->intermodel_gather(batch_time_mean, comm->get_intermodel_master());
-      comm->intermodel_gather(batch_time_min, comm->get_intermodel_master());
-      comm->intermodel_gather(batch_time_max, comm->get_intermodel_master());
-      comm->intermodel_gather(batch_time_stdev, comm->get_intermodel_master());
+      const auto& world_master = comm.get_intermodel_master();
+      comm.intermodel_gather(run_time, world_master);
+      comm.intermodel_gather(batch_time_mean, world_master);
+      comm.intermodel_gather(batch_time_min, world_master);
+      comm.intermodel_gather(batch_time_max, world_master);
+      comm.intermodel_gather(batch_time_stdev, world_master);
     }
 
     // Print results
-    if (comm->am_world_master()) {
-      for (int i = 0; i < num_models; ++i) {
-        std::cout << "Model " << i << " " << mode_string << " "
+    if (comm.am_world_master()) {
+      for (El::Int i = 0; i < num_models; ++i) {
+        std::cout << m.get_name() << " (instance "<< i << ") " << mode_string << " "
                   << "run time : " << run_time_list[i] << "s" << std::endl;
       }
-      for (int i = 0; i < num_models; ++i) {
-        std::cout << "Model " << i << " " << mode_string << " "
+      for (El::Int i = 0; i < num_models; ++i) {
+        std::cout << m.get_name() << " (instance " << i << ") " << mode_string << " "
                   << "mini-batch time statistics : ";
         if (std::isnan(mean_list[i])) {
           std::cout << "N/A";
diff --git a/src/comm.cpp b/src/comm.cpp
index aed713c6942..3387cc3d547 100644
--- a/src/comm.cpp
+++ b/src/comm.cpp
@@ -54,8 +54,8 @@ namespace lbann {
 #define checkMPI(status) status
 #endif // #ifdef LBANN_DEBUG
 
-lbann_comm::lbann_comm(int ppm, const El::mpi::Comm world) :
-  world_comm(world), grid(nullptr), procs_per_model(ppm), num_model_barriers(0),
+lbann_comm::lbann_comm(int ppm, El::mpi::Comm world) :
+  world_comm(std::move(world)), grid(nullptr), procs_per_model(ppm), num_model_barriers(0),
   num_intermodel_barriers(0), num_global_barriers(0),
bytes_sent(0), bytes_received(0) { #ifdef LBANN_HAS_ALUMINUM @@ -87,7 +87,6 @@ lbann_comm::~lbann_comm() { } } #ifdef LBANN_HAS_ALUMINUM - m_al_comms.clear(); ::Al::Finalize(); #endif } @@ -141,7 +140,7 @@ void lbann_comm::intermodel_sum_matrix(AbsDistMat& mat) { } void lbann_comm::allreduce(AbsMat& m, - const El::mpi::Comm c, + El::mpi::Comm c, El::mpi::Op op) { if (El::mpi::Size(c) == 1 || m.Height() < 1 || m.Width() < 1) { return; @@ -167,24 +166,25 @@ void lbann_comm::allreduce(AbsMat& m, // If available, use the MPI-CUDA backend for small matrices. #ifdef AL_HAS_MPI_CUDA // Based on runs on Pascal and Ray. - if ((El::mpi::Size(c) > 4 && local_size <= 8192) || + /*if ((El::mpi::Size(c) > 4 && local_size <= 8192) || (El::mpi::Size(c) >= 16 && local_size <= 32768)) { t = std::type_index(typeid(::Al::MPICUDABackend)); - } + }*/ #endif // AL_HAS_MPI_CUDA +#elif defined(AL_HAS_MPI_CUDA) + t = std::type_index(typeid(::Al::MPICUDABackend)); #else - throw lbann_exception("Allreduce on GPU matrix requires NCCL support in" - " Aluminum"); + throw lbann_exception("Allreduce on GPU matrix requires NCCL or MPI-CUDA" + " support in Aluminum"); #endif // AL_HAS_NCCL } #endif // LBANN_HAS_GPU - auto&& comm = get_al_comm(c, t); if (t == std::type_index(typeid(::Al::MPIBackend))) { ::Al::Allreduce<::Al::MPIBackend>( m.Buffer(), local_size, mpi_op_to_al_op(op), - *comm); + c.template GetComm<::Al::MPIBackend>()); } #ifdef AL_HAS_NCCL if (t == std::type_index(typeid(::Al::NCCLBackend))) { @@ -192,7 +192,7 @@ void lbann_comm::allreduce(AbsMat& m, m.Buffer(), local_size, mpi_op_to_al_op(op), - *static_cast<::Al::NCCLBackend::comm_type*>(comm)); + c.template GetComm<::Al::NCCLBackend>()); } #endif // AL_HAS_NCCL #ifdef AL_HAS_MPI_CUDA @@ -202,7 +202,7 @@ void lbann_comm::allreduce(AbsMat& m, m.Buffer(), local_size, mpi_op_to_al_op(op), - *static_cast<::Al::MPICUDABackend::comm_type*>(comm), + c.template GetComm<::Al::MPICUDABackend>(), ::Al::MPICUDAAllreduceAlgorithm::host_transfer); } #endif // AL_HAS_MPI_CUDA @@ -213,13 +213,13 @@ void lbann_comm::allreduce(AbsMat& m, } void lbann_comm::allreduce(AbsDistMat& m, - const El::mpi::Comm c, + El::mpi::Comm c, El::mpi::Op op) { - allreduce(m.Matrix(), c, op); + allreduce(m.Matrix(), std::move(c), op); } void lbann_comm::nb_allreduce(AbsMat& m, - const El::mpi::Comm c, + El::mpi::Comm c, Al::request& req, El::mpi::Op op) { if (El::mpi::Size(c) == 1 || m.Height() < 1 || m.Width() < 1) { @@ -246,24 +246,25 @@ void lbann_comm::nb_allreduce(AbsMat& m, // If available, use the MPI-CUDA backend for small matrices. #ifdef AL_HAS_MPI_CUDA // Based on runs on Pascal and Ray. 
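The allreduce changes above retire the cached `get_al_comm(c, t)` lookup: each backend's communicator now comes straight from the communicator object via `c.template GetComm<Backend>()`. A minimal sketch of that shape with stand-in types only; this is not the real Hydrogen/Aluminum API, and a function-local static stands in for per-`Comm` storage for brevity:

```cpp
#include <cstddef>
#include <iostream>

// Stand-in backend tags, each naming its own communicator type.
struct MPIBackendComm  { const char* name = "mpi";  };
struct NCCLBackendComm { const char* name = "nccl"; };
struct MPIBackend  { using comm_type = MPIBackendComm;  };
struct NCCLBackend { using comm_type = NCCLBackendComm; };

class Comm {
public:
  // One lazily constructed communicator per backend type, instead of
  // an external map keyed by (MPI communicator, std::type_index).
  template <typename Backend>
  typename Backend::comm_type& GetComm() {
    static typename Backend::comm_type comm{};
    return comm;
  }
};

template <typename Backend>
void allreduce(float* buf, std::size_t count, Comm& c) {
  auto& backend_comm = c.template GetComm<Backend>();
  std::cout << "allreduce of " << count << " values via "
            << backend_comm.name << '\n';
  (void)buf;  // a real implementation hands buf to the backend here
}

int main() {
  Comm c;
  float x = 1.0f;
  allreduce<MPIBackend>(&x, 1, c);
  allreduce<NCCLBackend>(&x, 1, c);
}
```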
- if ((El::mpi::Size(c) > 4 && local_size <= 8192) || + /*if ((El::mpi::Size(c) > 4 && local_size <= 8192) || (El::mpi::Size(c) >= 16 && local_size <= 32768)) { t = std::type_index(typeid(::Al::MPICUDABackend)); - } + }*/ #endif // AL_HAS_MPI_CUDA +#elif defined(AL_HAS_MPI_CUDA) + t = std::type_index(typeid(::Al::MPICUDABackend)); #else - throw lbann_exception("Allreduce on GPU matrix requires NCCL support in" - " Aluminum"); + throw lbann_exception("Allreduce on GPU matrix requires NCCL or MPI-CUDA" + " support in Aluminum"); #endif // AL_HAS_NCCL } #endif // LBANN_HAS_GPU - auto&& comm = get_al_comm(c, t); if (t == std::type_index(typeid(::Al::MPIBackend))) { ::Al::NonblockingAllreduce<::Al::MPIBackend>( m.Buffer(), local_size, mpi_op_to_al_op(op), - *comm, + c.template GetComm<::Al::MPIBackend>(), req.mpi_req); } /// @todo MPI-CUDA backend @@ -273,7 +274,7 @@ void lbann_comm::nb_allreduce(AbsMat& m, m.Buffer(), local_size, mpi_op_to_al_op(op), - *static_cast<::Al::NCCLBackend::comm_type*>(comm), + c.template GetComm<::Al::NCCLBackend>(), req.nccl_req); } #endif // AL_HAS_NCCL @@ -284,22 +285,22 @@ void lbann_comm::nb_allreduce(AbsMat& m, m.Buffer(), local_size, mpi_op_to_al_op(op), - *static_cast<::Al::MPICUDABackend::comm_type*>(comm), + c.template GetComm<::Al::MPICUDABackend>(), req.mpicuda_req, ::Al::MPICUDAAllreduceAlgorithm::host_transfer); } #endif // AL_HAS_MPI_CUDA bytes_received += sizeof(DataType) * local_size * (El::mpi::Size(c) - 1); #else - allreduce(m, c, op); + allreduce(m, std::move(c), op); #endif // LBANN_HAS_ALUMINUM } void lbann_comm::nb_allreduce(AbsDistMat& m, - const El::mpi::Comm c, + El::mpi::Comm c, Al::request& req, El::mpi::Op op) { - nb_allreduce(m.Matrix(), c, req, op); + nb_allreduce(m.Matrix(), std::move(c), req, op); } void lbann_comm::wait(Al::request& req) { @@ -351,9 +352,9 @@ void lbann_comm::intermodel_broadcast_matrix(AbsDistMat& mat, int root) { } template<> -void lbann_comm::broadcast(const int root, std::string& str, const El::mpi::Comm c) { +void lbann_comm::broadcast(const int root, std::string& str, El::mpi::Comm c) { std::vector data(str.begin(), str.end()); - broadcast(root, data, c); + broadcast(root, data, std::move(c)); str.assign(data.begin(), data.end()); } @@ -377,11 +378,11 @@ void lbann_comm::barrier(const El::mpi::Comm c) { } void lbann_comm::send(const AbsMat& mat, int model, int rank) { - send(mat.LockedBuffer(), mat.Height() * mat.Width(), model, rank); + El::Send(mat, get_world_comm(), get_world_rank(model, rank)); } void lbann_comm::send(const DistMat& mat, int model, int rank) { - send(mat.LockedBuffer(), mat.LocalHeight() * mat.LocalWidth(), model, rank); + send(mat.LockedMatrix(), model, rank); } void lbann_comm::nb_send(const AbsMat& mat, int model, int rank, @@ -396,19 +397,19 @@ void lbann_comm::nb_send(const DistMat& mat, int model, int rank, } void lbann_comm::recv(AbsMat& mat, int model, int rank) { - recv(mat.Buffer(), mat.Height() * mat.Width(), model, rank); + El::Recv(mat, get_world_comm(), get_world_rank(model, rank)); } void lbann_comm::recv(DistMat& mat, int model, int rank) { - recv(mat.Buffer(), mat.LocalHeight() * mat.LocalWidth(), model, rank); + recv(mat.Matrix(), model, rank); } void lbann_comm::recv(AbsMat& mat) { - recv(mat.Buffer(), mat.Height() * mat.Width()); + El::Recv(mat, get_world_comm(), El::mpi::ANY_SOURCE); } void lbann_comm::recv(DistMat& mat) { - recv(mat.Buffer(), mat.LocalHeight() * mat.LocalWidth()); + recv(mat.Matrix()); } void lbann_comm::nb_recv(AbsMat& mat, int model, int rank, @@ 
-496,6 +497,10 @@ void lbann_comm::recursive_doubling_allreduce_pow2( throw lbann_exception("lbann_comm: recursive doubling allreduce requires" " a power-of-2 number of participating processes"); } + + // FIXME + El::SyncInfo fixmeSyncInfo; + uint8_t *max_recv_buf = get_collective_buffer(max_recv_count); uint8_t *recv_buf = max_recv_buf; unsigned int mask = 1; @@ -522,7 +527,8 @@ void lbann_comm::recursive_doubling_allreduce_pow2( ar_bytes_sent += send_size; double sendrecv_start = get_time(); El::mpi::SendRecv(send_buf, send_size, partner, - recv_buf, recv_size, partner, comm); + recv_buf, recv_size, partner, comm, + fixmeSyncInfo); double sendrecv_tot = get_time() - sendrecv_start; ar_send_time += sendrecv_tot; ar_recv_time += sendrecv_tot; @@ -544,6 +550,9 @@ void lbann_comm::pe_ring_allreduce( std::function recv_transform, std::function recv_apply_transform, const lbann_comm::allreduce_options opts) { + + El::SyncInfo syncInfo{mat}; + double ar_start = get_time(); const int rank = El::mpi::Rank(comm); const int nprocs = El::mpi::Size(comm); @@ -692,7 +701,8 @@ void lbann_comm::pe_ring_allreduce( } double sendrecv_start = get_time(); El::mpi::SendRecv(send_buf, send_size, dst, - recv_buf, max_recv_count, src, comm); + recv_buf, max_recv_count, src, comm, + syncInfo); double sendrecv_tot = get_time() - sendrecv_start; ar_send_time += sendrecv_tot; ar_recv_time += sendrecv_tot; @@ -733,7 +743,8 @@ void lbann_comm::pe_ring_allreduce( ar_ag_bytes_sent += send_size; double sendrecv_start = get_time(); El::mpi::SendRecv(recv_buf, send_size, dst, - recv_buf2, max_recv_count, src, comm); + recv_buf2, max_recv_count, src, comm, + syncInfo); double sendrecv_tot = get_time() - sendrecv_start; ar_send_time += sendrecv_tot; ar_recv_time += sendrecv_tot; @@ -765,6 +776,9 @@ void lbann_comm::ring_allreduce( std::function recv_transform, std::function recv_apply_transform, const lbann_comm::allreduce_options opts) { + + El::SyncInfo syncInfo{mat}; + double ar_start = get_time(); const int rank = El::mpi::Rank(comm); const int nprocs = El::mpi::Size(comm); @@ -831,7 +845,7 @@ void lbann_comm::ring_allreduce( ar_rs_bytes_sent += send_size; double sendrecv_start = get_time(); El::mpi::SendRecv(send_buf, send_size, dst, - recv_buf, recv_size, src, comm); + recv_buf, recv_size, src, comm, syncInfo); double sendrecv_tot = get_time() - sendrecv_start; ar_send_time += sendrecv_tot; ar_recv_time += sendrecv_tot; @@ -870,7 +884,7 @@ void lbann_comm::ring_allreduce( } double sendrecv_start = get_time(); El::mpi::SendRecv(send_buf, send_size, dst, - recv_buf, max_recv_count, src, comm); + recv_buf, max_recv_count, src, comm, syncInfo); double sendrecv_tot = get_time() - sendrecv_start; ar_send_time += sendrecv_tot; ar_recv_time += sendrecv_tot; @@ -907,7 +921,7 @@ void lbann_comm::ring_allreduce( ar_ag_bytes_sent += send_size; double sendrecv_start = get_time(); El::mpi::SendRecv(recv_buf, send_size, dst, - recv_buf2, max_recv_count, src, comm); + recv_buf2, max_recv_count, src, comm, syncInfo); double sendrecv_tot = get_time() - sendrecv_start; ar_send_time += sendrecv_tot; ar_recv_time += sendrecv_tot; @@ -950,6 +964,9 @@ void lbann_comm::rabenseifner_allreduce( throw lbann_exception("lbann_comm: Rabenseifner allreduce requires" " a power-of-2 number of participating processes"); } + + El::SyncInfo syncInfo{mat}; + // Compute the slices on each processor. 
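Aside from the `El::SyncInfo` arguments now threaded into `El::mpi::SendRecv`, the exchange schedule in `recursive_doubling_allreduce_pow2` above is the classic XOR pattern: with a power-of-two process count, rank `r` pairs with `r ^ mask` while `mask` doubles each round, exactly as the `unsigned int mask = 1;` loop does. Printed out for eight ranks (illustration only):

```cpp
#include <cstdio>

int main() {
  const unsigned nprocs = 8;  // must be a power of two, as checked above
  for (unsigned rank = 0; rank < nprocs; ++rank) {
    std::printf("rank %u exchanges with:", rank);
    for (unsigned mask = 1; mask < nprocs; mask <<= 1) {
      std::printf(" %u", rank ^ mask);  // partner in this round
    }
    std::printf("\n");  // log2(nprocs) rounds in total
  }
  return 0;
}
```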
const El::Int cols_per_proc = mat.Width() / nprocs; const El::Int cols_remainder = mat.Width() % nprocs; @@ -1011,7 +1028,7 @@ void lbann_comm::rabenseifner_allreduce( ar_rs_bytes_sent += send_size; double sendrecv_start = get_time(); El::mpi::SendRecv(send_buf, send_size, partner, - recv_buf, recv_size, partner, comm); + recv_buf, recv_size, partner, comm, syncInfo); double sendrecv_tot = get_time() - sendrecv_start; ar_send_time += sendrecv_tot; ar_recv_time += sendrecv_tot; @@ -1085,7 +1102,7 @@ void lbann_comm::rabenseifner_allreduce( ar_ag_bytes_sent += send_size; double sendrecv_start = get_time(); El::mpi::SendRecv(send_buf, send_size, partner, - recv_buf, recv_size, partner, comm); + recv_buf, recv_size, partner, comm, syncInfo); double sendrecv_tot = get_time() - sendrecv_start; ar_send_time += sendrecv_tot; ar_recv_time += sendrecv_tot; @@ -1194,38 +1211,6 @@ uint8_t *lbann_comm::get_collective_buffer(size_t size, size_t idx) { } #ifdef LBANN_HAS_ALUMINUM -::Al::MPICommunicator* lbann_comm::get_al_comm(El::mpi::Comm c, - std::type_index t) { - - // Construct Aluminum communicator if needed - const al_comms_key_type key(c.comm, t); - if (m_al_comms.count(key) == 0) { - if (t == std::type_index(typeid(::Al::MPIBackend))) { - m_al_comms[key] = al_comms_val_type(new ::Al::MPIBackend::comm_type(c.comm)); - } - #ifdef AL_HAS_NCCL - if (t == std::type_index(typeid(::Al::NCCLBackend))) { - auto&& val = new ::Al::NCCLBackend::comm_type(c.comm, El::GPUManager::Stream()); - m_al_comms[key] = al_comms_val_type(val); - } - #endif // AL_HAS_NCCL - #ifdef AL_HAS_MPI_CUDA - if (t == std::type_index(typeid(::Al::MPICUDABackend))) { - auto&& val = new ::Al::MPICUDABackend::comm_type(c.comm, El::GPUManager::Stream()); - m_al_comms[key] = al_comms_val_type(val); - } - #endif // AL_HAS_MPI_CUDA - } - - // Return Aluminum communicator - auto&& comm = m_al_comms[key].get(); - if (comm == nullptr) { - throw lbann_exception("Could not get Aluminum communicator"); - } - return comm; - -} - ::Al::ReductionOperator lbann_comm::mpi_op_to_al_op(El::mpi::Op op) { if (op == El::mpi::SUM) { return ::Al::ReductionOperator::sum; diff --git a/src/data_readers/CMakeLists.txt b/src/data_readers/CMakeLists.txt index 565daa87bdc..5d9d67f0395 100644 --- a/src/data_readers/CMakeLists.txt +++ b/src/data_readers/CMakeLists.txt @@ -26,6 +26,7 @@ set_full_path(THIS_DIR_SOURCES data_reader_merge_samples.cpp data_reader_mesh.cpp data_reader_mnist.cpp + data_reader_moving_mnist.cpp data_reader_nci.cpp data_reader_numpy.cpp data_reader_pilot2_molecular.cpp diff --git a/src/data_readers/cv_utils.cpp b/src/data_readers/cv_utils.cpp index a5dcc93f403..d7fcf3f783c 100644 --- a/src/data_readers/cv_utils.cpp +++ b/src/data_readers/cv_utils.cpp @@ -90,7 +90,7 @@ std::ostream& operator<<(std::ostream& os, const cv_transform& tr) { } -cv::Mat cv_utils::lbann_imread(const std::string& img_file_path, int flags, std::vector& buf) { +cv::Mat cv_utils::lbann_imread(const std::string& img_file_path, int flags, std::vector& buf, cv::Mat* cv_buf) { // Load an image bytestream into memory bool ok = lbann::load_file(img_file_path, buf); if (!ok) { @@ -102,8 +102,8 @@ cv::Mat cv_utils::lbann_imread(const std::string& img_file_path, int flags, std: const cv::Mat inbuf(1, buf.size(), InputBuf_T::T(1), buf.data()); // decode the image data in the memory buffer - cv::Mat image = cv::imdecode(inbuf, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH); - + // Note that if cv_buf is not NULL, then the return value is *cv_buf + cv::Mat image = cv::imdecode(inbuf, 
cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH, cv_buf); return image; } diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp index 6fda322ce0a..78ca187a4df 100644 --- a/src/data_readers/data_reader.cpp +++ b/src/data_readers/data_reader.cpp @@ -28,10 +28,15 @@ #include "lbann/data_readers/data_reader.hpp" #include "lbann/data_store/generic_data_store.hpp" +#include "lbann/utils/omp_pragma.hpp" +#include "lbann/models/model.hpp" #include namespace lbann { +#undef DEBUG +//#define DEBUG + void generic_data_reader::shuffle_indices() { // Shuffle the data if (m_shuffle) { @@ -67,6 +72,18 @@ void generic_data_reader::setup() { int lbann::generic_data_reader::fetch_data(CPUMat& X) { + #ifdef DEBUG + if (m_current_pos == 0) { + if (is_master()) { + std::cout << "role: " << get_role() << " model: " << m_model->get_model_id() + << " shuffled indices: "; + for (size_t j=0; j<15; j++) { + std::cout << m_shuffled_indices[j] << " "; + } + std::cout << "\n"; + } + } + #endif int nthreads = omp_get_max_threads(); if(!position_valid()) { @@ -87,11 +104,16 @@ int lbann::generic_data_reader::fetch_data(CPUMat& X) { } int loaded_batch_size = get_loaded_mini_batch_size(); - const int end_pos = std::min(static_cast(m_current_pos+loaded_batch_size), - m_shuffled_indices.size()); - const int mb_size = std::min( - El::Int{((end_pos - m_current_pos) + m_sample_stride - 1) / m_sample_stride}, - X.Width()); + + const int end_pos = std::min(static_cast(m_current_pos+loaded_batch_size), m_shuffled_indices.size()); + const int mb_size = std::min(El::Int{((end_pos - m_current_pos) + m_sample_stride - 1) / m_sample_stride}, + X.Width()); + + static bool fix_jag = true; + if (m_jag_partitioned && fix_jag) { + fix_jag = false; + set_jag_variables(mb_size); + } if (!m_save_minibatch_indices) { El::Zeros(X, X.Height(), X.Width()); @@ -108,15 +130,15 @@ int lbann::generic_data_reader::fetch_data(CPUMat& X) { else { std::string error_message; -#pragma omp parallel for - for (int s = 0; s < mb_size; s++) { + LBANN_DATA_FETCH_OMP_FOR (int s = 0; s < mb_size; s++) { int n = m_current_pos + (s * m_sample_stride); int index = m_shuffled_indices[n]; - bool valid = fetch_datum(X, index, s, omp_get_thread_num()); + bool valid = fetch_datum(X, index, s, LBANN_OMP_THREAD_NUM); if (!valid) { -#pragma omp critical + LBANN_DATA_FETCH_OMP_CRITICAL error_message = "invalid datum (index " + std::to_string(index) + ")"; } + m_indices_fetched_per_mb.Set(s, 0, index); } if (!error_message.empty()) { LBANN_ERROR(error_message); } @@ -131,6 +153,32 @@ int lbann::generic_data_reader::fetch_data(CPUMat& X) { return mb_size; } +void lbann::generic_data_reader::set_jag_variables(int mb_size) { + // all min_batches have the same number of indices; + // this probably causes a few indices to be discarded, + // but with 1B indices, who cares? 
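`omp_pragma.hpp` itself is not part of this patch, so the definitions of `LBANN_DATA_FETCH_OMP_FOR`, `LBANN_DATA_FETCH_OMP_CRITICAL`, and `LBANN_OMP_THREAD_NUM` are an assumption here; one implementation that would satisfy the call sites in `fetch_data`/`fetch_labels`/`fetch_responses` above is:

```cpp
#ifndef LBANN_OMP_PRAGMA_SKETCH_HPP
#define LBANN_OMP_PRAGMA_SKETCH_HPP

// Assumed compile-time switch: when data-reader threading is disabled,
// the macros degrade to a plain serial loop on thread 0.
#ifdef LBANN_NO_OMP_FOR_DATA_READERS
  #define LBANN_DATA_FETCH_OMP_FOR      for
  #define LBANN_DATA_FETCH_OMP_CRITICAL
  #define LBANN_OMP_THREAD_NUM          0
#else
  #include <omp.h>
  // _Pragma lets a macro expand to `#pragma omp ...` followed by `for`,
  // matching call sites written as LBANN_DATA_FETCH_OMP_FOR (init; ...).
  #define LBANN_DATA_FETCH_OMP_FOR      _Pragma("omp parallel for") for
  #define LBANN_DATA_FETCH_OMP_CRITICAL _Pragma("omp critical")
  #define LBANN_OMP_THREAD_NUM          omp_get_thread_num()
#endif

#endif // LBANN_OMP_PRAGMA_SKETCH_HPP
```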
+ int mb_max = m_comm->model_allreduce(mb_size, El::mpi::MAX); + m_num_iterations_per_epoch = m_shuffled_indices.size() / mb_max; + + m_last_mini_batch_size = m_mini_batch_size; + m_global_mini_batch_size = m_mini_batch_size; + m_global_last_mini_batch_size = m_mini_batch_size; + + m_reset_mini_batch_index = 0; + m_loaded_mini_batch_idx = 0; + m_current_mini_batch_idx = 0; + + m_stride_to_next_mini_batch = mb_size; + m_stride_to_last_mini_batch = mb_size; + + m_base_offset = 0; + m_model_offset = 0; + m_sample_stride = 1; + m_iteration_stride = 1; + + m_world_master_mini_batch_adjustment = 0; +} + int lbann::generic_data_reader::fetch_labels(CPUMat& Y) { if(!position_valid()) { throw lbann_exception( @@ -154,13 +202,12 @@ int lbann::generic_data_reader::fetch_labels(CPUMat& Y) { // else { std::string error_message; -#pragma omp parallel for - for (int s = 0; s < mb_size; s++) { + LBANN_DATA_FETCH_OMP_FOR (int s = 0; s < mb_size; s++) { int n = m_current_pos + (s * m_sample_stride); int index = m_shuffled_indices[n]; - bool valid = fetch_label(Y, index, s, omp_get_thread_num()); + bool valid = fetch_label(Y, index, s, LBANN_OMP_THREAD_NUM); if (!valid) { -#pragma omp critical + LBANN_DATA_FETCH_OMP_CRITICAL error_message = "invalid label (index " + std::to_string(index) + ")"; } } @@ -185,13 +232,12 @@ int lbann::generic_data_reader::fetch_responses(CPUMat& Y) { El::Zeros(Y, Y.Height(), Y.Width()); std::string error_message; -#pragma omp parallel for - for (int s = 0; s < mb_size; s++) { + LBANN_DATA_FETCH_OMP_FOR (int s = 0; s < mb_size; s++) { int n = m_current_pos + (s * m_sample_stride); int index = m_shuffled_indices[n]; - bool valid = fetch_response(Y, index, s, omp_get_thread_num()); + bool valid = fetch_response(Y, index, s, LBANN_OMP_THREAD_NUM); if (!valid) { -#pragma omp critical + LBANN_DATA_FETCH_OMP_CRITICAL error_message = "invalid response (index " + std::to_string(index) + ")"; } } @@ -199,6 +245,7 @@ int lbann::generic_data_reader::fetch_responses(CPUMat& Y) { return mb_size; } +#if 0 bool generic_data_reader::is_data_reader_done(bool is_active_reader) { bool reader_not_done = true; if(is_active_reader) { @@ -208,6 +255,7 @@ bool generic_data_reader::is_data_reader_done(bool is_active_reader) { } return reader_not_done; } +#endif bool generic_data_reader::update(bool is_active_reader) { bool reader_not_done = true; // BVE The sense of this should be fixed @@ -216,11 +264,6 @@ bool generic_data_reader::update(bool is_active_reader) { if(is_active_reader) { m_current_pos = get_next_position(); - /// Maintain the current height of the matrix - if (!m_save_minibatch_indices) { - El::Zeros(m_indices_fetched_per_mb, m_indices_fetched_per_mb.Height(), 1); - } - m_loaded_mini_batch_idx += m_iteration_stride; } if (m_loaded_mini_batch_idx >= m_num_iterations_per_epoch) { @@ -230,7 +273,8 @@ bool generic_data_reader::update(bool is_active_reader) { reader_not_done = false; } if (m_current_mini_batch_idx == m_num_iterations_per_epoch) { - if ((get_rank() < m_num_parallel_readers) && (m_current_pos < (int)m_shuffled_indices.size())) { + // for working with 1B jag samples, we may not process all the data + if ((get_rank() < m_num_parallel_readers) && (m_current_pos < (int)m_shuffled_indices.size()) && !m_jag_partitioned) { throw lbann_exception( std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: generic data reader update error: the epoch is complete," @@ -258,6 +302,9 @@ bool generic_data_reader::update(bool is_active_reader) { } } } + + post_update(); + return 
reader_not_done; } @@ -484,6 +531,11 @@ for j in range(40) : } void generic_data_reader::select_subset_of_data() { + // ensure that all readers have the same number of indices + if (m_jag_partitioned) { + size_t n = m_comm->model_allreduce(m_shuffled_indices.size(), El::mpi::MIN); + m_shuffled_indices.resize(n); + } // optionally partition data set amongst the models if (m_is_partitioned) { diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp index 17936a41bbc..a5f0c98ab28 100644 --- a/src/data_readers/data_reader_image.cpp +++ b/src/data_readers/data_reader_image.cpp @@ -136,6 +136,17 @@ void image_data_reader::load() { select_subset_of_data(); } +void image_data_reader::setup() { + generic_data_reader::setup(); + + using InputBuf_T = lbann::cv_image_type; + auto cvMat = cv::Mat(1, get_linearized_data_size(), InputBuf_T::T(1)); + m_thread_cv_buffer.resize(omp_get_max_threads()); + for(int tid = 0; tid < omp_get_max_threads(); ++tid) { + m_thread_cv_buffer[tid] = cvMat.clone(); + } +} + std::vector image_data_reader::get_image_list_of_current_mb() const { std::vector ret; ret.reserve(m_mini_batch_size); diff --git a/src/data_readers/data_reader_imagenet.cpp b/src/data_readers/data_reader_imagenet.cpp index e06ebacb9b1..6a6d24d067d 100644 --- a/src/data_readers/data_reader_imagenet.cpp +++ b/src/data_readers/data_reader_imagenet.cpp @@ -137,7 +137,7 @@ bool imagenet_reader::fetch_datum(CPUMat& X, int data_id, int mb_idx, int tid) { m_data_store->get_data_buf(data_id, image_buf, 0); ret = lbann::image_utils::load_image(*image_buf, width, height, img_type, *(m_pps[tid]), X_v); } else { - ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v, m_thread_buffer[tid]); + ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v, m_thread_buffer[tid], &m_thread_cv_buffer[tid]); } if(!ret) { diff --git a/src/data_readers/data_reader_imagenet_patches.cpp b/src/data_readers/data_reader_imagenet_patches.cpp index a1d026647ca..374fd9379af 100644 --- a/src/data_readers/data_reader_imagenet_patches.cpp +++ b/src/data_readers/data_reader_imagenet_patches.cpp @@ -154,7 +154,7 @@ bool imagenet_reader_patches::fetch_datum(CPUMat& X, int data_id, int mb_idx, in m_data_store->get_data_buf(data_id, image_buf, 0); ret = lbann::image_utils::load_image(*image_buf, width, height, img_type, *(m_pps[tid]), X_v); } else { - ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v, m_thread_buffer[tid]); + ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v, m_thread_buffer[tid], &m_thread_cv_buffer[tid]); } //ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v); diff --git a/src/data_readers/data_reader_jag.cpp b/src/data_readers/data_reader_jag.cpp index 4279c40cac2..21ef8faac3f 100644 --- a/src/data_readers/data_reader_jag.cpp +++ b/src/data_readers/data_reader_jag.cpp @@ -57,12 +57,14 @@ data_reader_jag::~data_reader_jag() { void data_reader_jag::set_independent_variable_type( - const std::vector independent) { + const std::vector< std::vector >& independent) { if (!independent.empty() && !m_independent.empty() && (m_independent[0] == Undefined)) { m_independent.clear(); } - for (const auto t: independent) { - add_independent_variable_type(t); + for (const auto& group: independent) { + for (const auto type: group) { + add_independent_variable_type(type); + } } } @@ -77,12 +79,14 @@ void 
data_reader_jag::add_independent_variable_type( } void data_reader_jag::set_dependent_variable_type( - const std::vector dependent) { + const std::vector < std::vector >& dependent) { if (!dependent.empty() && !m_dependent.empty() && (m_dependent[0] == Undefined)) { m_dependent.clear(); } - for (const auto t: dependent) { - add_dependent_variable_type(t); + for (const auto& group: dependent) { + for (const auto type: group) { + add_dependent_variable_type(type); + } } } diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index 6be4a9b534c..bf2c79ec168 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -27,13 +27,15 @@ #ifndef _JAG_OFFLINE_TOOL_MODE_ #include "lbann/data_readers/data_reader_jag_conduit.hpp" -#include "lbann/utils/file_utils.hpp" // for add_delimiter() in load() +#include "lbann/io/data_buffers/partitioned_io_buffer.hpp" +#include "lbann/io/data_buffers/distributed_io_buffer.hpp" //#include "lbann/data_store/data_store_jag_conduit.hpp" #else #include "data_reader_jag_conduit.hpp" #endif // _JAG_OFFLINE_TOOL_MODE_ #ifdef LBANN_HAS_CONDUIT +#include "lbann/utils/file_utils.hpp" // for add_delimiter() in load() #include "lbann/data_readers/opencv_extensions.hpp" #include // numeric_limits #include // max_element @@ -46,21 +48,24 @@ #include #include "lbann/utils/timer.hpp" #include "lbann/utils/glob.hpp" +#include "lbann/utils/peek_map.hpp" +#include "conduit/conduit_relay.hpp" +#include "conduit/conduit_relay_hdf5.hpp" // This macro may be moved to a global scope #define _THROW_LBANN_EXCEPTION_(_CLASS_NAME_,_MSG_) { \ - std::stringstream err; \ - err << __FILE__ << ' ' << __LINE__ << " :: " \ + std::stringstream _err; \ + _err << __FILE__ << ' ' << __LINE__ << " :: " \ << (_CLASS_NAME_) << "::" << (_MSG_); \ - throw lbann_exception(err.str()); \ + throw lbann_exception(_err.str()); \ } #define _THROW_LBANN_EXCEPTION2_(_CLASS_NAME_,_MSG1_,_MSG2_) { \ - std::stringstream err; \ - err << __FILE__ << ' ' << __LINE__ << " :: " \ + std::stringstream _err; \ + _err << __FILE__ << ' ' << __LINE__ << " :: " \ << (_CLASS_NAME_) << "::" << (_MSG1_) << (_MSG2_); \ - throw lbann_exception(err.str()); \ + throw lbann_exception(_err.str()); \ } // This comes after all the headers, and is only visible within the current implementation file. 
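The `err` to `_err` rename in the two exception macros above is a hygiene fix: the old macro-local `std::stringstream err` shadowed any `err` already live at the expansion site, silently building the message from the wrong, empty stream. A minimal reproduction with a demo macro (not the LBANN one):

```cpp
#include <iostream>
#include <sstream>
#include <stdexcept>

// Demo macro with the *old* hazard: its local stream is named `err`.
#define THROW_DEMO(_MSG_) {                  \
    std::stringstream err;                   \
    err << _MSG_;                            \
    throw std::runtime_error(err.str());     \
  }

int main() {
  try {
    std::stringstream err;  // caller's own stream
    err << "useful context";
    // Expands to a block that declares its *own* `err`, shadowing
    // ours, so err.str() reads the inner, empty stream instead.
    THROW_DEMO(err.str());  // intended message: "useful context"
  } catch (const std::runtime_error& e) {
    std::cout << '"' << e.what() << "\"\n";  // prints "" -- context lost
  }
}
```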
@@ -69,6 +74,33 @@ namespace lbann { +hdf5_file_handles::~hdf5_file_handles() { + for (auto& h: m_open_hdf5_files) { + conduit::relay::io::hdf5_close_file(h.second); + } + m_open_hdf5_files.clear(); +} + +bool hdf5_file_handles::add(const std::string fname, hid_t hnd) { + auto ret1 = m_open_hdf5_files.insert(std::pair(fname, hnd)); + auto ret2 = m_open_hdf5_handles.insert(std::pair(hnd, fname)); + return ret1.second && ret2.second; +} + +hid_t hdf5_file_handles::get(const std::string& fname) const { + std::unordered_map::const_iterator it = m_open_hdf5_files.find(fname); + if (it == m_open_hdf5_files.end()) { + return static_cast(-1); + } + return it->second; +} + +std::string hdf5_file_handles::get(const hid_t h) const { + return peek_map(m_open_hdf5_handles, h); +} + +std::unordered_map data_reader_jag_conduit::m_num_local_readers; + const std::set data_reader_jag_conduit::non_numeric_vars = { "fusion_reaction", "fusion_model_reaction", @@ -93,6 +125,179 @@ const std::set data_reader_jag_conduit::non_numeric_vars = { "solver_mode" }; +#ifndef _JAG_OFFLINE_TOOL_MODE_ +// These methods are overriden to allow each process to load and consume a unique set of data files +bool data_reader_jag_conduit::position_valid() const { + const bool ok = (static_cast(m_shuffled_indices[m_current_pos]) < m_valid_samples.size()) + && (m_current_pos < (int)m_shuffled_indices.size()); + if (!ok) { + const size_t my_rank = static_cast(m_comm->get_rank_in_model()); + std::stringstream err; + err << "rank " << my_rank << " position invalid: m_shuffled_indices[" + << m_current_pos << "] (" << m_shuffled_indices[m_current_pos] + << ") >= m_valid_samples.size() (" << m_valid_samples.size() << ")" << std::endl; + std::cerr << err.str(); + } + return ok; +} + +void data_reader_jag_conduit::set_base_offset(const int s) { + m_base_offset = 0; +} + +void data_reader_jag_conduit::set_reset_mini_batch_index(const int s) { + m_reset_mini_batch_index = 0; +} + +int data_reader_jag_conduit::get_num_data() const { + return m_global_num_samples_to_use; +} + +void data_reader_jag_conduit::shuffle_indices() { + // Shuffle the data + if (m_shuffle) { + std::shuffle(m_valid_samples.begin(), m_valid_samples.end(), + get_data_seq_generator()); + } + m_valid_samples.resize(m_local_num_samples_to_use); +} + +void data_reader_jag_conduit::select_subset_of_data() { + + m_local_num_samples_to_use = get_num_valid_local_samples(); + shuffle_indices(); + + const size_t count = get_absolute_sample_count(); + const double use_percent = get_use_percent(); + if (count == 0u and use_percent == 0.0) { + throw lbann_exception( + std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: data_reader_jag_conduit::select_subset_of_data() get_use_percent() " + + "and get_absolute_sample_count() are both zero; exactly one " + + "must be zero"); + } + if (!(count == 0u or use_percent == 0.0)) { + throw lbann_exception( + std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: data_reader_jag_conduit::select_subset_of_data() get_use_percent() " + "and get_absolute_sample_count() are both non-zero; exactly one " + "must be zero"); + } + + if (count != 0u) { + if(count > get_num_valid_local_samples()) { + throw lbann_exception( + std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: data_reader_jag_conduit::select_subset_of_data() - absolute_sample_count=" + + std::to_string(count) + " is > get_num_valid_local_samples()=" + + std::to_string(get_num_valid_local_samples())); + } + 
m_valid_samples.resize(get_absolute_sample_count()); + } + + if (use_percent) { + m_valid_samples.resize(get_use_percent()*get_num_valid_local_samples()); + } + + long unused = get_validation_percent()*get_num_valid_local_samples(); + long use_me = get_num_valid_local_samples() - unused; + if (unused > 0) { + m_unused_samples = sample_map_t(m_valid_samples.begin() + use_me, m_valid_samples.end()); + m_valid_samples.resize(use_me); + } + + if(!m_shuffle) { + std::sort(m_valid_samples.begin(), m_valid_samples.end()); + std::sort(m_unused_samples.begin(), m_unused_samples.end()); + } + m_local_num_samples_to_use = get_num_valid_local_samples(); +} + +void data_reader_jag_conduit::use_unused_index_set() { + if ((m_leading_reader != this) && (m_leading_reader != nullptr)) { + return; + } + m_valid_samples.swap(m_unused_samples); + m_unused_samples.clear(); + m_unused_samples.shrink_to_fit(); + adjust_num_samples_to_use(); + m_local_num_samples_to_use = get_num_valid_local_samples(); +} + +void data_reader_jag_conduit::set_io_buffer_type(const std::string io_buffer) { + m_io_buffer_type = io_buffer; +} + +void data_reader_jag_conduit::set_local_id(const std::string role) { + m_local_reader_id = m_num_local_readers[role]++; +} + +int data_reader_jag_conduit::get_local_id(const std::string role) const { + return m_local_reader_id; +} + +void data_reader_jag_conduit::set_open_hdf5_files(std::shared_ptr& f) { + m_open_hdf5_files = f; +} + +std::shared_ptr& data_reader_jag_conduit::get_open_hdf5_files() { + return m_open_hdf5_files; +} + +void data_reader_jag_conduit::set_leading_reader(data_reader_jag_conduit* r) { + m_leading_reader = r; +} + +data_reader_jag_conduit* data_reader_jag_conduit::get_leading_reader() { + return m_leading_reader; +} + +int data_reader_jag_conduit::compute_max_num_parallel_readers() { + if (m_io_buffer_type == "distributed") { + // Use a sufficiently large data set size for the time being, and + // check if it is ok when the actual size of data is available later + long data_set_size = 2 * get_mini_batch_size() * m_comm->get_num_models() * get_num_parallel_readers(); + set_num_parallel_readers(distributed_io_buffer::compute_max_num_parallel_readers( + data_set_size, get_mini_batch_size(), + get_num_parallel_readers(), get_comm())); + set_sample_stride(1); + set_iteration_stride(get_num_parallel_readers()); + } else if (m_io_buffer_type == "partitioned") { + set_num_parallel_readers(partitioned_io_buffer::compute_max_num_parallel_readers( + 0, get_mini_batch_size(), + get_num_parallel_readers(), get_comm())); + set_sample_stride(get_num_parallel_readers()); + set_iteration_stride(1); + } else { + _THROW_LBANN_EXCEPTION_(get_type(), " unknown io_buffer type: " + m_io_buffer_type); + } + return get_num_parallel_readers(); +} + +bool data_reader_jag_conduit::check_num_parallel_readers(long data_set_size) { + if (m_io_buffer_type == "distributed") { + const bool too_many_readers = !distributed_io_buffer::check_num_parallel_readers(data_set_size, get_mini_batch_size(), get_num_parallel_readers(), m_comm); + if (too_many_readers) { + if(m_comm->am_world_master()) { + std::string err = + "The training data set size " + std::to_string(data_set_size) + + " is too small for the number of parallel readers " + + std::to_string(get_num_parallel_readers()); + _THROW_LBANN_EXCEPTION_(get_type(), err); + return false; + } + } + } + return true; +} +#else // _JAG_OFFLINE_TOOL_MODE_ +void data_reader_jag_conduit::set_num_samples(size_t ns) { + m_local_num_samples_to_use = ns; + 
m_global_num_samples_to_use = ns; + m_num_samples = ns; +} +#endif // _JAG_OFFLINE_TOOL_MODE_ + data_reader_jag_conduit::data_reader_jag_conduit(const std::shared_ptr& pp, bool shuffle) : generic_data_reader(shuffle) { set_defaults(); @@ -106,16 +311,19 @@ data_reader_jag_conduit::data_reader_jag_conduit(const std::shared_ptr(0)) { + _THROW_LBANN_EXCEPTION_(get_type(), "Invalid file handle for " + sample_name); + return false; + } -void data_reader_jag_conduit::set_independent_variable_type( - const std::vector independent) { - if (!independent.empty() && !m_independent.empty() && (m_independent[0] == Undefined)) { - m_independent.clear(); + const std::string path = sample_name + key; +#if 0 + // In case that a file handle is closed, reopen and remap it. + if (!conduit::relay::io::hdf5_has_path(h, path)) { + const std::string conduit_file_path = m_open_hdf5_files->get(h); + hid_t hdf5_file_hnd = conduit::relay::io::hdf5_open_file_for_read( conduit_file_path ); + m_open_hdf5_files->add(conduit_file_path, hdf5_file_hnd); } - for (const auto t: independent) { - add_independent_variable_type(t); +#endif + conduit::relay::io::hdf5_read(h, path, node); + + return true; +} + +bool data_reader_jag_conduit::has_conduit_path(const size_t i, const std::string& key) const { + const std::string& sample_name = m_valid_samples[i].first; + hid_t h = m_valid_samples[i].second; + return conduit::relay::io::hdf5_has_path(h, std::string("/") + sample_name + key); +} + + +void data_reader_jag_conduit::set_independent_variable_type( + const std::vector< std::vector >& independent) { + m_independent_groups = independent; + m_independent.clear(); + + for (const auto& group: independent) { + for (const auto type: group) { + add_independent_variable_type(type); + } } } void data_reader_jag_conduit::add_independent_variable_type( const data_reader_jag_conduit::variable_t independent) { - if (!(independent == JAG_Image || independent == JAG_Scalar || - independent == JAG_Input || independent == Undefined)) { + if (!(independent == JAG_Image || independent == JAG_Scalar || independent == JAG_Input)) { _THROW_LBANN_EXCEPTION_(_CN_, "unrecognized independent variable type "); } m_independent.push_back(independent); } void data_reader_jag_conduit::set_dependent_variable_type( - const std::vector dependent) { - if (!dependent.empty() && !m_dependent.empty() && (m_dependent[0] == Undefined)) { - m_dependent.clear(); - } - for (const auto t: dependent) { - add_dependent_variable_type(t); + const std::vector< std::vector >& dependent) { + m_dependent_groups = dependent; + m_dependent.clear(); + + for (const auto& group: dependent) { + for (const auto type: group) { + add_dependent_variable_type(type); + } } } void data_reader_jag_conduit::add_dependent_variable_type( const data_reader_jag_conduit::variable_t dependent) { - if (!(dependent == JAG_Image || dependent == JAG_Scalar || - dependent == JAG_Input || dependent == Undefined)) { + if (!(dependent == JAG_Image || dependent == JAG_Scalar || dependent == JAG_Input)) { _THROW_LBANN_EXCEPTION_(_CN_, "unrecognized dependent variable type "); } m_dependent.push_back(dependent); @@ -258,16 +533,29 @@ data_reader_jag_conduit::get_dependent_variable_type() const { } void data_reader_jag_conduit::set_image_dims(const int width, const int height, const int ch) { - if ((width > 0) && (height > 0)) { // set and valid + if ((width > 0) && (height > 0) && (ch > 0)) { // set and valid m_image_width = width; m_image_height = height; m_image_num_channels = ch; - } else if (!((width == 0) 
&& (height == 0))) { // set but not valid + } else if (!((width == 0) && (height == 0) && (ch == 1))) { // set but not valid _THROW_LBANN_EXCEPTION_(_CN_, "set_image_dims() : invalid image dims"); } set_linearized_image_size(); } +void data_reader_jag_conduit::set_image_choices(const std::vector image_keys) { + m_emi_image_keys = image_keys; + // For example, in the data reader prototext file, have a line similar to the one below + // image_keys: ["(0.0, 0.0)/0.0","(90.0, 0.0)/0.0","(90.0, 78.0)/0.0"]; + + m_num_img_srcs = m_emi_image_keys.size(); +} + +const std::vector& data_reader_jag_conduit::get_image_choices() const { + return m_emi_image_keys; +} + + void data_reader_jag_conduit::add_scalar_filter(const std::string& key) { m_scalar_filter.insert(key); } @@ -291,7 +579,7 @@ void data_reader_jag_conduit::add_input_prefix_filter(const prefix_t& p) { */ bool data_reader_jag_conduit::filter(const std::set& key_filter, const std::vector& prefix_filter, const std::string& key) const { - if (key_filter.find(key) != key_filter.end()) { + if (key_filter.find(key) != key_filter.cend()) { return true; } for (const auto& pf: prefix_filter) { @@ -305,26 +593,17 @@ bool data_reader_jag_conduit::filter(const std::set& key_filter, return false; } -/** - * To use no key, set 'Undefined' to the corresponding variable type, - * or call this with an empty vector argument after loading data. - */ void data_reader_jag_conduit::set_scalar_choices(const std::vector& keys) { m_scalar_keys = keys; - // If this call is made after loading data, check the keys - if (m_is_data_loaded) { - check_scalar_keys(); - } else if (keys.empty()) { - _THROW_LBANN_EXCEPTION2_(_CN_, "set_scalar_choices() : ", \ - "empty keys not allowed before data loading"); - } + check_scalar_keys(); } void data_reader_jag_conduit::set_all_scalar_choices() { - if (m_success_map.size() == 0) { + if (m_valid_samples.empty()) { return; } - const conduit::Node & n_scalar = get_conduit_node(m_success_map[0] + "/outputs/scalars"); + conduit::Node n_scalar; + load_conduit_node(0, "/outputs/scalars", n_scalar); m_scalar_keys.reserve(n_scalar.number_of_children()); const std::vector& child_names = n_scalar.child_names(); for (const auto& key: child_names) { @@ -346,20 +625,15 @@ const std::vector& data_reader_jag_conduit::get_scalar_choices() co */ void data_reader_jag_conduit::set_input_choices(const std::vector& keys) { m_input_keys = keys; - // If this call is made after loading data, check the keys - if (m_is_data_loaded) { - check_input_keys(); - } else if (keys.empty()) { - _THROW_LBANN_EXCEPTION2_(_CN_, "set_input_choices() : ", \ - "empty keys not allowed before data loading"); - } + check_input_keys(); } void data_reader_jag_conduit::set_all_input_choices() { - if (m_success_map.size() == 0) { + if (m_valid_samples.empty()) { return; } - const conduit::Node & n_input = get_conduit_node(m_success_map[0] + "/inputs"); + conduit::Node n_input; + load_conduit_node(0, "/inputs", n_input); m_input_keys.reserve(n_input.number_of_children()); const std::vector& child_names = n_input.child_names(); for (const auto& key: child_names) { @@ -375,88 +649,92 @@ const std::vector& data_reader_jag_conduit::get_input_choices() con } -void data_reader_jag_conduit::set_num_img_srcs() { - m_num_img_srcs = m_emi_selectors.size(); - -#if 0 - - if (m_success_map.size() == 0) { - return; - } - - conduit::NodeConstIterator itr = get_conduit_node(m_success_map[0] + "/outputs/images").children(); - - using view_set = std::set< std::pair >; - view_set views; - - while 
(itr.has_next()) { - const conduit::Node & n_image = itr.next(); - std::stringstream sstr(n_image["view"].as_string()); - double c1, c2; - std::string tmp; - sstr >> tmp >> c1 >> c2; - - views.insert(std::make_pair(c1, c2)); - } - - m_num_img_srcs = views.size(); - if (m_num_img_srcs == 0u) { - m_num_img_srcs = 1u; - } -#endif -} - void data_reader_jag_conduit::set_linearized_image_size() { - m_image_linearized_size = m_image_width * m_image_height; - //m_image_linearized_size = m_image_width * m_image_height * m_image_num_channels; - // TODO: we do not know how multi-channel image data will be formatted yet. + m_image_linearized_size = m_image_width * m_image_height * m_image_num_channels; + m_1ch_image_linearized_size = m_image_width * m_image_height; } -void data_reader_jag_conduit::check_image_size() { - if (m_success_map.size() == 0) { +void data_reader_jag_conduit::check_image_data() { + if (m_valid_samples.empty()) { return; } - const conduit::Node & n_imageset = get_conduit_node(m_success_map[0] + "/outputs/images"); + if (!has_conduit_path(0, "")) { + _THROW_LBANN_EXCEPTION_(_CN_, "check_image_data() : no sample by " + m_valid_samples[0].first); + return; + } + conduit::Node n_imageset; + load_conduit_node(0, "/outputs/images", n_imageset); if (static_cast(n_imageset.number_of_children()) == 0u) { - //m_image_width = 0; - //m_image_height = 0; - //set_linearized_image_size(); - _THROW_LBANN_EXCEPTION_(_CN_, "check_image_size() : no image in data"); + _THROW_LBANN_EXCEPTION_(_CN_, "check_image_data() : no image in data"); return; } - const conduit::Node & n_image = get_conduit_node(m_success_map[0] + "/outputs/images/(0.0, 0.0)/0.0/emi"); - conduit::float32_array emi = n_image.value(); - - m_image_linearized_size = static_cast(emi.number_of_elements()); + if (m_emi_image_keys.size() == 0u) { + _THROW_LBANN_EXCEPTION_(_CN_, "check_image_data() : no image is selected"); + return; + } + for (const auto& emi_tag: m_emi_image_keys) { + if (!has_conduit_path(0, "/outputs/images/" + emi_tag + "/emi")) { + _THROW_LBANN_EXCEPTION_(_CN_, "check_image_data() : no emi image by " + emi_tag); + return; + } + } + conduit::Node n_image; + load_conduit_node(0, "/outputs/images/" + m_emi_image_keys[0] + "/emi", n_image); + conduit_ch_t emi = n_image.value(); if (m_image_linearized_size != static_cast(emi.number_of_elements())) { if ((m_image_width == 0) && (m_image_height == 0)) { m_image_height = 1; m_image_width = static_cast(emi.number_of_elements()); + m_image_num_channels = 1; set_linearized_image_size(); } else { - //_THROW_LBANN_EXCEPTION_(_CN_, "check_image_size() : image size mismatch"); - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - <<"check_image_size() : image size mismatch; m_image_width: " - << m_image_width << " m_image_height: " << m_image_height - << " m_image_linearized_size: " << m_image_linearized_size << std::endl; + std::string msg = "expected linearized emi image size: " + + std::to_string(emi.number_of_elements()) + '\n'; + _THROW_LBANN_EXCEPTION_(_CN_, msg + get_description()); + } + } + + if (m_image_normalization_params.empty()) { + m_image_normalization_params.assign(m_emi_image_keys.size()*m_image_num_channels, linear_transform_t(1.0, 0.0)); + } else if (m_image_normalization_params.size() != m_emi_image_keys.size()*m_image_num_channels) { + _THROW_LBANN_EXCEPTION_(_CN_, "Incorrect number of image normalization parameter sets!" 
\ + + std::to_string(m_image_normalization_params.size()) + " != " \ + + std::to_string(m_emi_image_keys.size()) + '*' + std::to_string(m_image_num_channels)); + } +#if defined(LBANN_DEBUG) + std::cout << "image normalization parameters: " << std::endl; + for (size_t i = 0u, s = 0u; s < m_emi_image_keys.size(); ++s) { + for (int c = 0; c < m_image_num_channels; ++c) { + const auto& param = m_image_normalization_params[i*m_image_num_channels + c]; + std::cout << " scale: \t" << param.first << " \tbias: \t" << param.second + << " \t" << m_emi_image_keys[s] << ":C" << c << std::endl; } } +#endif } void data_reader_jag_conduit::check_scalar_keys() { - if (m_success_map.size() == 0) { - m_scalar_keys.clear(); + if (m_scalar_keys.empty()) { + return; + } + if (!m_is_data_loaded) { + return; + } + if (m_valid_samples.empty()) { + //m_scalar_keys.clear(); return; } + // If this call is made after loading data, check if the keys are in data + size_t num_found = 0u; std::vector found(m_scalar_keys.size(), false); std::set keys_conduit; - const conduit::Node & n_scalar = get_conduit_node(m_success_map[0] + "/outputs/scalars"); + conduit::Node n_scalar; + load_conduit_node(0, "/outputs/scalars", n_scalar); const std::vector& child_names = n_scalar.child_names(); for (const auto& key: child_names) { keys_conduit.insert(key); @@ -464,7 +742,7 @@ void data_reader_jag_conduit::check_scalar_keys() { for (size_t i=0u; i < m_scalar_keys.size(); ++i) { std::set::const_iterator it = keys_conduit.find(m_scalar_keys[i]); - if (it != keys_conduit.end()) { + if (it != keys_conduit.cend()) { num_found ++; found[i] = true; } @@ -479,20 +757,44 @@ void data_reader_jag_conduit::check_scalar_keys() { } _THROW_LBANN_EXCEPTION_(_CN_, "check_scalar_keys() : " + msg); } + + if (m_scalar_normalization_params.empty()) { + m_scalar_normalization_params.assign(m_scalar_keys.size(), linear_transform_t(1.0, 0.0)); + } else if (m_scalar_normalization_params.size() != m_scalar_keys.size()) { + _THROW_LBANN_EXCEPTION_(_CN_, "Incorrect number of scalar normalization parameter sets! 
" \ + + std::to_string(m_scalar_normalization_params.size()) + " != " \ + + std::to_string(m_scalar_keys.size())); + } +#if defined(LBANN_DEBUG) + std::cout << "scalar normalization parameters: " << std::endl; + for (size_t i = 0u; i < m_scalar_normalization_params.size(); ++i) { + const auto& param = m_scalar_normalization_params[i]; + std::cout << " scale: \t" << param.first << " \tbias: \t" << param.second << "\t " << m_scalar_keys[i] << std::endl; + } +#endif } void data_reader_jag_conduit::check_input_keys() { - if (m_success_map.size() == 0) { - m_input_keys.clear(); + if (m_input_keys.empty()) { + return; + } + if (!m_is_data_loaded) { + return; + } + if (m_valid_samples.empty()) { + //m_input_keys.clear(); return; } + // If this call is made after loading data, check if the keys + size_t num_found = 0u; std::vector found(m_input_keys.size(), false); std::map keys_conduit; - const conduit::Node & n_input = get_conduit_node(m_success_map[0] + "/inputs"); + conduit::Node n_input; + load_conduit_node(0, "/inputs", n_input); conduit::NodeConstIterator itr = n_input.children(); while (itr.has_next()) { @@ -504,7 +806,7 @@ void data_reader_jag_conduit::check_input_keys() { for (size_t i=0u; i < m_input_keys.size(); ++i) { std::map::const_iterator it = keys_conduit.find(m_input_keys[i]); - if (it != keys_conduit.end()) { + if (it != keys_conduit.cend()) { num_found ++; found[i] = true; is_input_t = is_input_t && is_same_type(it->second); @@ -522,10 +824,139 @@ void data_reader_jag_conduit::check_input_keys() { } m_uniform_input_type = (m_input_keys.size() == 0u)? false : is_input_t; + + if (m_input_normalization_params.empty()) { + m_input_normalization_params.assign(m_input_keys.size(), linear_transform_t(1.0, 0.0)); + } else if (m_input_normalization_params.size() != m_input_keys.size()) { + _THROW_LBANN_EXCEPTION_(_CN_, "Incorrect number of input normalization parameter sets! " \ + + std::to_string(m_input_normalization_params.size()) + " != " \ + + std::to_string(m_input_keys.size())); + } +#if defined(LBANN_DEBUG) + std::cout << "input normalization parameters: " << std::endl; + for (size_t i = 0u; i < m_input_normalization_params.size(); ++i) { + const auto& param = m_input_normalization_params[i]; + std::cout << " scale: \t" << param.first << " \tbias: \t" << param.second << " \t" << m_input_keys[i] << std::endl; + } +#endif } #ifndef _JAG_OFFLINE_TOOL_MODE_ +void data_reader_jag_conduit::determine_num_samples_to_use() { + // The meaning of m_first_n as well as absolute_sample_count is slightly + // different in this data reader as it represents the first n local samples + // instead of the first n global samples. +#if 1 + if (m_first_n > 0) { + const size_t num_samples = std::min(static_cast(m_first_n), get_num_valid_local_samples()); + m_valid_samples.resize(num_samples); // this does not work with unordered_map but with vector + } +#else + if (m_first_n > 0) { + _THROW_LBANN_EXCEPTION_(_CN_, "load() does not support first_n feature."); + } +#endif + +#if 1 + select_subset_of_data(); +#else + // We do not support "percent_of_data_to_use" or "absolute_sample_count" yet. 
+ if ((get_use_percent() != 1.0) || (get_absolute_sample_count() != static_cast(0u))) { + _THROW_LBANN_EXCEPTION_(get_type(), \ + "'percent_of_data_to_use' and 'absolute_sample_count' are not supported with this data reader"); + } + if (get_validation_percent() != 0.0) { + _THROW_LBANN_EXCEPTION_(get_type(), \ + "'validation_percent' is not supported with this data reader"); + } +#endif + adjust_num_samples_to_use(); +} + +void data_reader_jag_conduit::adjust_num_samples_to_use() { + const size_t num_valid_samples = get_num_valid_local_samples(); + + const int my_rank = m_comm->get_rank_in_model(); + const int num_readers = get_num_parallel_readers(); + + // Find the minimum of the number of valid samples locally available + unsigned long long n_loc = static_cast(num_valid_samples); + unsigned long long n_min = static_cast(num_valid_samples); + + if (my_rank >= num_readers) { + n_loc = std::numeric_limits::max(); + n_min = std::numeric_limits::max(); + } + + m_comm->model_allreduce(&n_loc, 1, &n_min, El::mpi::MIN); + + // Find the first rank that has the minimum number of valid samples + int rank_tmp_1st = (n_loc == n_min)? my_rank : num_readers; + int rank_min_1st; + m_comm->model_allreduce(&rank_tmp_1st, 1, &rank_min_1st, El::mpi::MIN); + + // Determine the number of samples to use + m_global_num_samples_to_use = static_cast(n_min * num_readers + rank_min_1st); + if (m_global_num_samples_to_use == static_cast(0u)) { + _THROW_LBANN_EXCEPTION_(get_type(), "No valid sample found."); + } + + m_local_num_samples_to_use = (my_rank < rank_min_1st)? (n_min+1) : n_min; + if (my_rank >= num_readers) { + m_local_num_samples_to_use = 0u; + } + + + // Compute data yield + unsigned long long n_valid_local = num_valid_samples; + unsigned long long n_valid_global = 0u; + m_comm->model_allreduce(&n_valid_local, 1, &n_valid_global, El::mpi::SUM); + + if (is_master()) { + const double yield = static_cast(m_global_num_samples_to_use)/n_valid_global; + std::cout << "\nData yield: " << yield << std::endl; + } + + check_num_parallel_readers(static_cast(m_global_num_samples_to_use)); + populate_shuffled_indices(m_global_num_samples_to_use); + +#if 0 + std::cout << "rank " << my_rank << '/' << num_readers + << " has L" << m_local_num_samples_to_use << "/G" << m_global_num_samples_to_use + << " samples to use out of total L" << n_valid_local << "/G" << n_valid_global + << " valid samples." 

 void data_reader_jag_conduit::load() {
   if(m_gan_labelling) {
     m_num_labels=2;
@@ -536,79 +967,132 @@ void data_reader_jag_conduit::load() {
               << m_gan_labelling <<" : " << m_gan_label_value << std::endl;
   }

-  // for selecting images, per Luc's advise
-  m_emi_selectors.insert("(0.0, 0.0)");
-  m_emi_selectors.insert("(90.0, 0.0)");
-  m_emi_selectors.insert("(90.0, 78.0)");
+  if ((m_leading_reader != this) && (m_leading_reader != nullptr)) {
+    m_valid_samples = m_leading_reader->get_valid_local_samples();
+    m_unused_samples = m_leading_reader->get_valid_local_samples_unused();
+    m_local_num_samples_to_use = m_leading_reader->get_num_valid_local_samples();
+    m_global_num_samples_to_use = m_leading_reader->get_num_data();
+    m_open_hdf5_files = m_leading_reader->get_open_hdf5_files();
+    return;
+  }

-  //const std::string data_dir = add_delimiter(get_file_dir());
-  //const std::string conduit_file_name = get_data_filename();
-  const std::string pattern = get_file_dir();
-  std::vector<std::string> names = glob(pattern);
-  if (names.size() < 1) {
+  const std::string data_dir = add_delimiter(get_file_dir());
+  const std::string conduit_file_name = get_data_filename();
+  const std::string pattern = data_dir + conduit_file_name;
+  std::vector<std::string> filenames = glob(pattern);
+  if (filenames.size() < 1) {
     _THROW_LBANN_EXCEPTION_(get_type(), " failed to get data filenames");
   }

-  if (m_first_n > 0) {
-    _THROW_LBANN_EXCEPTION_(_CN_, "load() does not support first_n feature.");
+  // Shuffle the file names
+  if (is_shuffled()) {
+    std::shuffle(filenames.begin(), filenames.end(), get_data_seq_generator());
   }

-  int max_files_to_load = INT_MAX;
-  if (m_max_files_to_load > 0) {
-    max_files_to_load = m_max_files_to_load;
+  const size_t my_rank = static_cast<size_t>(m_comm->get_rank_in_model());
+  const size_t num_readers = static_cast<size_t>(compute_max_num_parallel_readers());
+
+  // handle data partitioning among models (e.g., for LTFB)
+  if (m_is_partitioned) {
+    const size_t one_more = filenames.size() % m_num_partitions;
+    const size_t min_num_files_per_partition = filenames.size()/static_cast<size_t>(m_num_partitions);
+    if (min_num_files_per_partition == 0u) {
+      _THROW_LBANN_EXCEPTION_(get_type(), "Insufficient number of files for the number of models.");
+    }
+    const size_t p = static_cast<size_t>(m_my_partition);
+    const size_t idx_start = min_num_files_per_partition * p
+                           + ((p >= one_more)? one_more : p);
+    const size_t idx_end = idx_start + min_num_files_per_partition
+                         + ((p < one_more)? 1u : 0u);
+    std::vector<std::string> filenames_partitioned(filenames.begin()+idx_start, filenames.begin()+idx_end);
+    filenames = filenames_partitioned;
   }
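+  // Editor's example (hypothetical values): 10 files over m_num_partitions == 4
+  // gives one_more == 2 and min_num_files_per_partition == 2, so the partitions
+  // receive the file ranges [0,3), [3,6), [6,8), and [8,10).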
+  const size_t num_files_to_load =
+    (m_max_files_to_load > 0u)? std::min(m_max_files_to_load, filenames.size()) : filenames.size();
+
+  filenames.resize(num_files_to_load);

   double tm1 = get_time();
-  int n = 0;
-  for (auto t : names) {
-    load_conduit(t);
-    ++n;
-    if (is_master()) std::cerr << "time to load: " << n << " files: " << get_time() - tm1 << "\n";
-    if (n >= max_files_to_load) {
-      break;
+
+  // Reserve m_valid_samples
+  const size_t max_num_files_to_load_per_rank = (num_files_to_load + num_readers - 1u) / num_readers;
+  bool valid_samples_reserved = false;
+  size_t idx = static_cast<size_t>(0ul);
+
+  for (size_t n = my_rank; (n < num_files_to_load) && (my_rank < num_readers); n += num_readers) {
+    load_conduit(filenames[n], idx);
+    if (!valid_samples_reserved) {
+      // reserve sufficient capacity, estimated by assuming that all files have the same number of samples
+      m_valid_samples.reserve(m_valid_samples.size() * (max_num_files_to_load_per_rank + 1u));
+      valid_samples_reserved = true;
+    }
+    if (is_master()) {
+      std::cerr << "time to load: " << n + num_readers << " files: " << get_time() - tm1 << std::endl;
     }
   }
-  if (is_master()) std::cerr << "time to load conduit files: " << get_time() - tm1
-                             << " num samples: " << m_data.number_of_children() << "\n";
-
-  // reset indices
-  m_shuffled_indices.resize(get_num_samples());
-  std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);
+  if (is_master()) {
+    std::cerr << "time to load conduit files: " << get_time() - tm1
+              << " number of valid local samples at the master rank: " << m_valid_samples.size() << std::endl;
+  }

-  select_subset_of_data();
+  check_image_data();
+  determine_num_samples_to_use();

   if (is_master()) {
-    std::cout << "\n" << get_description() << "\n\n";
+    std::cout << std::endl << get_description() << std::endl << std::endl;
   }
 }
 #endif // _JAG_OFFLINE_TOOL_MODE_

-void data_reader_jag_conduit::load_conduit(const std::string conduit_file_path) {
-if (is_master()) std::cerr << "loading: " << conduit_file_path<< "\n";
-  conduit::relay::io::load_merged(conduit_file_path, "hdf5", m_data);
+void data_reader_jag_conduit::load_conduit(const std::string conduit_file_path, size_t& idx) {
+  if (!check_if_file_exists(conduit_file_path)) {
+    _THROW_LBANN_EXCEPTION_(get_type(), " failed to open " + conduit_file_path);
+  }
+#ifndef _JAG_OFFLINE_TOOL_MODE_
+  const size_t my_rank = static_cast<size_t>(m_comm->get_rank_in_model());
+  std::cerr << ("rank " + std::to_string(my_rank) + " loading: " + conduit_file_path) << std::endl;
+#else
+  std::cerr << "loading: " << conduit_file_path << std::endl;
+#endif
+
+  hid_t hdf5_file_hnd = conduit::relay::io::hdf5_open_file_for_read( conduit_file_path );
+
+  if (hdf5_file_hnd <= static_cast<hid_t>(0)) {
+    _THROW_LBANN_EXCEPTION_(get_type(), std::string("cannot add invalid file handle for ") + conduit_file_path);
+  }
+  if (!m_open_hdf5_files) {
+    m_open_hdf5_files = std::make_shared<hdf5_file_handles>();
+  }
+  m_open_hdf5_files->add(conduit_file_path, hdf5_file_hnd);

   // set up mapping: need to do this since some of the data may be bad
-  const std::vector<std::string> &children_names = m_data.child_names();
-  int idx = 0;
-  int bad = 0;
-  for (auto t : children_names) {
-    const std::string key = "/" + t + "/performance/success";
-    const conduit::Node& n_ok = get_conduit_node(key);
+  std::vector<std::string> sample_names;
+  conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", sample_names);
+  size_t bad = 0u;
+  for (auto s : sample_names) {
+    conduit::Node n_ok;
+    if (!conduit::relay::io::hdf5_has_path(hdf5_file_hnd, s + "/performance/success")) {
+      _THROW_LBANN_EXCEPTION_(get_type(), s + "/performance/success does not exist");
+    }
+    conduit::relay::io::hdf5_read(hdf5_file_hnd, s + "/performance/success", n_ok);
     int success = n_ok.to_int64();
     if (success == 1) {
-      m_success_map[idx++] = t;
+      m_valid_samples.push_back(sample_locator_t(s, hdf5_file_hnd));
     } else {
       ++bad;
     }
   }
+  idx = m_valid_samples.size();
   if (is_master()) {
-    std::cerr << "data_reader_jag_conduit::load_conduit: num good samples: " << m_success_map.size() << " num bad: " << bad << "\n";
+    std::cerr << "data_reader_jag_conduit::load_conduit: num good samples: "
+              << m_valid_samples.size() << " num bad: " << bad << std::endl;
   }

-  set_num_img_srcs();
-  check_image_size();
-
-  if (!m_is_data_loaded) {
+  m_is_data_loaded = true;
+
+  if (m_scalar_keys.size() == 0u) {
     set_all_scalar_choices(); // use all by default if none is specified
   }
@@ -619,13 +1103,19 @@ if (is_master()) std::cerr << "loading: " << conduit_file_path<< "\n";
     }
     check_input_keys();
   }
+}
+
-  m_is_data_loaded = true;
+size_t data_reader_jag_conduit::get_num_valid_local_samples() const {
+  return m_valid_samples.size();
 }

+const data_reader_jag_conduit::sample_map_t& data_reader_jag_conduit::get_valid_local_samples() const {
+  return m_valid_samples;
+}

-size_t data_reader_jag_conduit::get_num_samples() const {
-  return m_success_map.size();
+const data_reader_jag_conduit::sample_map_t& data_reader_jag_conduit::get_valid_local_samples_unused() const {
+  return m_unused_samples;
 }

 unsigned int data_reader_jag_conduit::get_num_img_srcs() const {
@@ -636,6 +1126,10 @@ size_t data_reader_jag_conduit::get_linearized_image_size() const {
   return m_image_linearized_size;
 }

+size_t data_reader_jag_conduit::get_linearized_1ch_image_size() const {
+  return m_1ch_image_linearized_size;
+}
+
 size_t data_reader_jag_conduit::get_linearized_scalar_size() const {
   return m_scalar_keys.size();
 }
@@ -664,9 +1158,6 @@ size_t data_reader_jag_conduit::get_linearized_size(const data_reader_jag_condui
 int data_reader_jag_conduit::get_linearized_data_size() const {
   size_t sz = 0u;
   for (const auto t: m_independent) {
-    if (t == Undefined) {
-      continue;
-    }
     sz += get_linearized_size(t);
   }
   return static_cast<int>(sz);
@@ -675,9 +1166,6 @@ int data_reader_jag_conduit::get_linearized_response_size() const {
   size_t sz = 0u;
   for (const auto t: m_dependent) {
-    if (t == Undefined) {
-      continue;
-    }
     sz += get_linearized_size(t);
   }
   return static_cast<int>(sz);
@@ -687,11 +1175,11 @@ std::vector<size_t> data_reader_jag_conduit::get_linearized_data_sizes() const {
   std::vector<size_t> all_dim;
   all_dim.reserve(m_independent.size());
   for (const auto t: m_independent) {
-    if (t == Undefined) {
-      continue;
-    }
     all_dim.push_back(get_linearized_size(t));
   }
+  if (all_dim.empty()) {
+    return {0u};
+  }
   return all_dim;
 }

@@ -699,11 +1187,11 @@ std::vector<size_t> data_reader_jag_conduit::get_linearized_response_sizes() con
   std::vector<size_t> all_dim;
   all_dim.reserve(m_dependent.size());
   for (const auto t: m_dependent) {
-    if (t == Undefined) {
-      continue;
-    }
     all_dim.push_back(get_linearized_size(t));
   }
+  if (all_dim.empty()) {
+    return {0u};
+  }
   return all_dim;
 }

@@ -725,20 +1213,42 @@ const std::vector<int> data_reader_jag_conduit::get_dims(const data_reader_jag_c
 }

 const std::vector<int> data_reader_jag_conduit::get_data_dims() const {
+#if 1
   return {get_linearized_data_size()};
-#if 0
+#else
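+  // Editor's sketch of the disabled branch below: instead of one flattened
+  // length from get_linearized_data_size(), it would concatenate per-variable
+  // shapes from get_dims(); hypothetically, one 64x64 image plus 5 inputs
+  // would yield {64, 64, 5} rather than {4101}.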
   std::vector<int> all_dim;
   for (const auto t: m_independent) {
-    if (t == Undefined) {
-      continue;
-    }
     const std::vector<int> ld = get_dims(t);
     all_dim.insert(all_dim.end(), ld.begin(), ld.end());
   }
+  if (all_dim.empty()) {
+    return {0u};
+  }
   return all_dim;
 #endif
 }

+std::vector<int> data_reader_jag_conduit::get_slice_points(const std::vector< std::vector<data_reader_jag_conduit::variable_t> >& var) const {
+  std::vector<int> points(var.size()+1u, static_cast<int>(0));
+  for (size_t i = 0u; i < var.size(); ++i) {
+    const auto& group = var[i];
+    size_t size = 0u;
+    for (const auto type: group) {
+      size += get_linearized_size(type);
+    }
+    points[i+1] = points[i] + static_cast<int>(size);
+  }
+  return points;
+}
+
+std::vector<int> data_reader_jag_conduit::get_slice_points_independent() const {
+  return get_slice_points(m_independent_groups);
+}
+
+std::vector<int> data_reader_jag_conduit::get_slice_points_dependent() const {
+  return get_slice_points(m_dependent_groups);
+}
+
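+// Editor's example (hypothetical): for groups {{JAG_Scalar}, {JAG_Input}}
+// with linearized sizes 15 and 5, get_slice_points() returns {0, 15, 20};
+// each consecutive pair delimits one group's slice of the flattened sample.
+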
"true" : "false") + '\n'; if (!m_scalar_filter.empty()) { ret += " - scalar filter:"; @@ -813,16 +1356,12 @@ std::string data_reader_jag_conduit::get_description() const { bool data_reader_jag_conduit::check_sample_id(const size_t sample_id) const { - if (m_success_map.find(sample_id) == m_success_map.end()) { - return false; - } - return true; - //return (static_cast(sample_id) < m_data.number_of_children()); + return (sample_id < m_valid_samples.size()); } bool data_reader_jag_conduit::check_non_numeric(const std::string key) { std::set::const_iterator kit = non_numeric_vars.find(key); - if (kit != non_numeric_vars.end()) { + if (kit != non_numeric_vars.cend()) { std::string err = "data_reader_jag_conduit::add_val() : non-numeric '" + key + "' requires a conversion method."; #if 1 @@ -836,67 +1375,29 @@ bool data_reader_jag_conduit::check_non_numeric(const std::string key) { } -std::vector data_reader_jag_conduit::choose_image_near_bang_time(const size_t sample_id) const { - std::vector img_indices; - return img_indices; -#if 0 - using view_map = std::map, std::pair >; - - conduit::NodeConstIterator itr = get_conduit_node(m_success_map[sample_id] + "/outputs/images").children(); - view_map near_bang_time; - int idx = 0; - - while (itr.has_next()) { - const conduit::Node & n_image = itr.next(); - std::stringstream sstr(n_image["view"].as_string()); - double c1, c2; - std::string tmp; - sstr >> tmp >> c1 >> c2; - const double t = n_image["time"].value(); - const double t_abs = std::abs(t); - - view_map::iterator it = near_bang_time.find(std::make_pair(c1, c2)); - - if (it == near_bang_time.end()) { - near_bang_time.insert(std::make_pair(std::make_pair(c1, c2), std::make_pair(idx, t_abs))); - } else if ((it->second).second > t) { // currently ignore tie - it->second = std::make_pair(idx, t_abs); - } - - idx++; +std::vector< std::vector > +data_reader_jag_conduit::get_image_data(const size_t sample_id) const { + if (sample_id >= m_valid_samples.size()) { + _THROW_LBANN_EXCEPTION_(_CN_, "get_image_data() : invalid sample index"); } - std::vector img_indices; - img_indices.reserve(near_bang_time.size()); - for(const auto& view: near_bang_time) { - img_indices.push_back(view.second.first); - } - return img_indices; -#endif -} - -std::vector< std::pair > -data_reader_jag_conduit::get_image_ptrs(const size_t sample_id) const { - if (sample_id >= m_success_map.size()) { - _THROW_LBANN_EXCEPTION_(_CN_, "get_images() : invalid sample index"); - } + std::vector< std::vector > image_ptrs; + image_ptrs.reserve(m_emi_image_keys.size()); - std::vector< std::pair >image_ptrs; - std::unordered_map::const_iterator it = m_success_map.find(sample_id); - - for (auto t : m_emi_selectors) { - std::string img_key = it->second + "/outputs/images/" + t + "/0.0/emi"; - const conduit::Node & n_image = get_conduit_node(img_key); - conduit::float32_array emi = n_image.value(); - const size_t num_pixels = emi.number_of_elements(); + for (const auto& emi_tag : m_emi_image_keys) { + conduit::Node n_image; + load_conduit_node(sample_id, "/outputs/images/" + emi_tag + "/emi", n_image); + conduit_ch_t emi = n_image.value(); + const size_t num_vals = emi.number_of_elements(); const ch_t* emi_data = n_image.value(); - image_ptrs.push_back(std::make_pair(num_pixels, emi_data)); + image_ptrs.emplace_back(emi_data, emi_data + num_vals); } return image_ptrs; } -cv::Mat data_reader_jag_conduit::cast_to_cvMat(const std::pair img, const int height) { +cv::Mat data_reader_jag_conduit::cast_to_cvMat( + const std::pair img, const int 
 std::vector<cv::Mat> data_reader_jag_conduit::get_cv_images(const size_t sample_id) const {
-  std::vector< std::pair<size_t, const ch_t*> > img_ptrs(get_image_ptrs(sample_id));
+  const std::vector< std::vector<ch_t> > img_data(get_image_data(sample_id));
   std::vector<cv::Mat> images;
-  images.reserve(img_ptrs.size());

-  for (const auto& img: img_ptrs) {
-    images.emplace_back(cast_to_cvMat(img, m_image_height).clone());
+  if (m_split_channels) {
+    images.reserve(img_data.size()*m_image_num_channels);
+    for (size_t i = 0u; i < img_data.size(); ++i) {
+      const auto& img = img_data[i];
+      cv::Mat ch[m_image_num_channels];
+      cv::split(cast_to_cvMat(std::make_pair(img.size(), img.data()), m_image_height, m_image_num_channels), ch);
+      for(int c = 0; c < m_image_num_channels; ++c) {
+      #if 1 // with normalization
+        image_normalization(ch[c], i, static_cast<size_t>(c));
+      #endif
+        images.emplace_back(ch[c].clone());
+      }
+    }
+  } else {
+    images.reserve(img_data.size());
+    for (size_t i = 0u; i < img_data.size(); ++i) {
+      const auto& img = img_data[i];
+    #if 1 // with normalization
+      cv::Mat ch[m_image_num_channels];
+      cv::split(cast_to_cvMat(std::make_pair(img.size(), img.data()), m_image_height, m_image_num_channels), ch);
+      for(int c = 0; c < m_image_num_channels; ++c) {
+        image_normalization(ch[c], i, static_cast<size_t>(c));
+      }
+      cv::Mat img_normalized;
+      cv::merge(ch, m_image_num_channels, img_normalized);
+      images.emplace_back(img_normalized);
+    #else
+      images.emplace_back(cast_to_cvMat(std::make_pair(img.size(), img.data()), m_image_height, m_image_num_channels).clone());
+    #endif
+    }
   }

   return images;
 }

 std::vector<data_reader_jag_conduit::ch_t> data_reader_jag_conduit::get_images(const size_t sample_id) const {
-  std::vector< std::pair<size_t, const ch_t*> > img_ptrs(get_image_ptrs(sample_id));
+  std::vector< std::vector<ch_t> > img_data(get_image_data(sample_id));
   std::vector<ch_t> images;
-  images.reserve(get_linearized_image_size());

-  for (const auto& img: img_ptrs) {
-    const size_t num_pixels = img.first;
-    const ch_t* ptr = img.second;
-    images.insert(images.end(), ptr, ptr + num_pixels);
+  if (m_split_channels) {
+    images.resize(get_linearized_size(JAG_Image));
+    size_t i = 0u;
+    size_t j = 0u;
+    for (const auto& img: img_data) {
+      const ch_t * const ptr_end = img.data() + img.size();
+      for (int c=0; c < m_image_num_channels; ++c) {
+        const auto& tr = m_image_normalization_params.at(j*m_image_num_channels + c);
+        for (const ch_t* ptr = img.data() + c; ptr < ptr_end; ptr += m_image_num_channels) {
+        #if 1 // with normalization
+          images[i++] = cv::saturate_cast<ch_t>(*ptr * tr.first + tr.second);
+        #else
+          images[i++] = *ptr;
+        #endif
+        }
+      }
+      j ++;
+    }
+  } else {
+    images.reserve(get_linearized_size(JAG_Image));
+    for (const auto& img: img_data) {
+    #if 1 // with normalization
+      // TODO: normalization needed
+      _THROW_LBANN_EXCEPTION_(_CN_, "get_images() : normalization not implemented yet");
+      (void) img;
+    #else
+      images.insert(images.end(), img.cbegin(), img.cend());
+    #endif
+    }
+  }

   return images;
 }

 std::vector<data_reader_jag_conduit::scalar_t> data_reader_jag_conduit::get_scalars(const size_t sample_id) const {
-  if (!check_sample_id(sample_id)) {
+  if (sample_id >= m_valid_samples.size()) {
     _THROW_LBANN_EXCEPTION_(_CN_, "get_scalars() : invalid sample index");
   }

+  #define _LBANN_DATA_READER_JAG_CONDUIT_IO_PER_SCALAR_KEY_ // fetching by individual file I/O per key
+
+  #if !defined(_LBANN_DATA_READER_JAG_CONDUIT_IO_PER_SCALAR_KEY_)
+  conduit::Node n_scalar;
+  load_conduit_node(sample_id, "/outputs/scalars", n_scalar);
+  #endif // !_LBANN_DATA_READER_JAG_CONDUIT_IO_PER_SCALAR_KEY_
+
   std::vector<scalar_t> scalars;
   scalars.reserve(m_scalar_keys.size());

+  auto tr = m_scalar_normalization_params.cbegin();
+
   for(const auto key: m_scalar_keys) {
-    std::unordered_map<size_t, std::string>::const_iterator t2 = m_success_map.find(sample_id);
-    std::string scalar_key = t2->second + "/outputs/scalars/" + key;
-    const conduit::Node & n_scalar = get_conduit_node(scalar_key);
-    // All the scalar output currently seems to be scalar_t
-    //add_val(key, n_scalar, scalars);
-    scalars.push_back(static_cast<scalar_t>(n_scalar.to_value()));
-  }
+  #if defined(_LBANN_DATA_READER_JAG_CONDUIT_IO_PER_SCALAR_KEY_)
+    conduit::Node n_scalar;
+    // TODO: optimize by loading the entire set of scalars of the sample at once
+    load_conduit_node(sample_id, "/outputs/scalars/" + key, n_scalar);
+    // All the scalar outputs currently seem to be of type scalar_t.
+    // If not, use add_val(key, n_scalar, scalars);
+    const scalar_t val_raw = static_cast<scalar_t>(n_scalar.to_value());
+  #else
+    conduit::Node n_scalar_var = get_conduit_node(n_scalar, key);
+    // All the scalar outputs currently seem to be of type scalar_t.
+    // If not, use add_val(key, n_scalar_var, scalars);
+    const scalar_t val_raw = static_cast<scalar_t>(n_scalar_var.to_value());
+  #endif // _LBANN_DATA_READER_JAG_CONDUIT_IO_PER_SCALAR_KEY_
+    const scalar_t val = static_cast<scalar_t>(val_raw * tr->first + tr->second);
+    scalars.push_back(val);
+    tr ++;
+  }
+  #undef _LBANN_DATA_READER_JAG_CONDUIT_IO_PER_SCALAR_KEY_
   return scalars;
 }

 std::vector<data_reader_jag_conduit::input_t> data_reader_jag_conduit::get_inputs(const size_t sample_id) const {
-  if (!check_sample_id(sample_id)) {
+  if (sample_id >= m_valid_samples.size()) {
     _THROW_LBANN_EXCEPTION_(_CN_, "get_inputs() : invalid sample index");
   }

+  //#define _LBANN_DATA_READER_JAG_CONDUIT_IO_PER_INPUT_KEY_ // fetching by individual file I/O per key
+
+  #if !defined(_LBANN_DATA_READER_JAG_CONDUIT_IO_PER_INPUT_KEY_)
+  // fetch the entire set of input parameters of a sample with a single file I/O
+  conduit::Node n_input;
+  load_conduit_node(sample_id, "/inputs", n_input);
+  #endif // !_LBANN_DATA_READER_JAG_CONDUIT_IO_PER_INPUT_KEY_
+
   std::vector<input_t> inputs;
   inputs.reserve(m_input_keys.size());

+  // The sequence of normalization parameters should follow the same order as
+  // that of the variable keys.
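+  // For example (hypothetical numbers): a linear_transform_t of
+  // (scale, bias) = (0.5, -1.0) maps a raw input value of 4.0 to
+  // 4.0 * 0.5 + (-1.0) = 1.0 below.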
+  auto tr = m_input_normalization_params.cbegin();
+
   // automatically determine which method to use based on if all the variables are of input_t
   if (m_uniform_input_type) {
+    // avoid some overhead by taking advantage of the fact that all the variables are of the same type
     for(const auto key: m_input_keys) {
-      std::unordered_map<size_t, std::string>::const_iterator t2 = m_success_map.find(sample_id);
-      std::string input_key = t2->second + "/inputs/" + key;
-      const conduit::Node & n_input = get_conduit_node(input_key);
-      inputs.push_back(n_input.value()); // less overhead
+    #if defined(_LBANN_DATA_READER_JAG_CONDUIT_IO_PER_INPUT_KEY_)
+      // TODO: whether to fetch by individual I/O or not can be dynamically
+      // determined based on how many of the variables are to be fetched.
+      conduit::Node n_input;
+      load_conduit_node(sample_id, "/inputs/" + key, n_input);
+      const input_t val_raw = static_cast<input_t>(n_input.value());
+    #else
+      conduit::Node n_input_var = get_conduit_node(n_input, key);
+      const input_t val_raw = static_cast<input_t>(n_input_var.value());
+    #endif // _LBANN_DATA_READER_JAG_CONDUIT_IO_PER_INPUT_KEY_
+      const input_t val = static_cast<input_t>(val_raw * tr->first + tr->second);
+      inputs.push_back(val);
+      tr ++;
     }
   } else {
     for(const auto key: m_input_keys) {
-      std::unordered_map<size_t, std::string>::const_iterator t2 = m_success_map.find(sample_id);
-      std::string input_key = t2->second + "/inputs/" + key;
-      const conduit::Node & n_input = get_conduit_node(input_key);
+    #if defined(_LBANN_DATA_READER_JAG_CONDUIT_IO_PER_INPUT_KEY_)
+      conduit::Node n_input;
+      load_conduit_node(sample_id, "/inputs/" + key, n_input);
       add_val(key, n_input, inputs); // more overhead but general
+    #else
+      conduit::Node n_input_var = get_conduit_node(n_input, key);
+      add_val(key, n_input_var, inputs); // more overhead but general
+    #endif // _LBANN_DATA_READER_JAG_CONDUIT_IO_PER_INPUT_KEY_
+
+      input_t& val = inputs.back();
+      val = static_cast<input_t>(val * tr->first + tr->second);
+      tr ++;
     }
   }

-  return inputs;
-}
-
-int data_reader_jag_conduit::check_exp_success(const size_t sample_id) const {
-  if (!check_sample_id(sample_id)) {
-    _THROW_LBANN_EXCEPTION_(_CN_, "check_exp_success() : invalid sample index");
-  }
+  #undef _LBANN_DATA_READER_JAG_CONDUIT_IO_PER_INPUT_KEY_

-  std::unordered_map<size_t, std::string>::const_iterator it = m_success_map.find(sample_id);
-  return static_cast<int>(get_conduit_node(it->second + "performance/success").value());
+  return inputs;
 }

@@ -1007,16 +1610,19 @@ bool data_reader_jag_conduit::fetch(CPUMat& X, int data_id, int mb_idx, int tid,
                                     const data_reader_jag_conduit::variable_t vt, const std::string tag) {
   switch (vt) {
     case JAG_Image: {
-      const std::vector<size_t> sizes(get_num_img_srcs(), get_linearized_image_size());
+      const size_t num_images = get_num_img_srcs()
+                              * static_cast<size_t>(m_split_channels? m_image_num_channels : 1u);
+      const size_t image_size = m_split_channels?
+        get_linearized_1ch_image_size() : get_linearized_image_size();
+      const std::vector<size_t> sizes(num_images, image_size);
       std::vector<CPUMat> X_v = create_datum_views(X, sizes, mb_idx);
       std::vector<cv::Mat> images = get_cv_images(data_id);

-      if (images.size() != get_num_img_srcs()) {
+      if (images.size() != num_images) {
         _THROW_LBANN_EXCEPTION2_(_CN_, "fetch() : the number of images is not as expected", \
-          std::to_string(images.size()) + "!=" + std::to_string(get_num_img_srcs()));
+          std::to_string(images.size()) + "!=" + std::to_string(num_images));
       }

-      for(size_t i=0u; i < get_num_img_srcs(); ++i) {
+      for(size_t i=0u; i < num_images; ++i) {
         int width, height, img_type;
         image_utils::process_image(images[i], width, height, img_type, *(m_pps[tid]), X_v[i]);
       }
@@ -1039,6 +1645,52 @@ bool data_reader_jag_conduit::fetch(CPUMat& X, int data_id, int mb_idx, int tid,
   return true;
 }

+int data_reader_jag_conduit::reuse_data(CPUMat& X) {
+  El::Copy(m_data_cache, X);
+  return m_cached_data_mb_size;
+}
+
+int data_reader_jag_conduit::reuse_responses(CPUMat& Y) {
+  El::Copy(m_response_cache, Y);
+  return m_cached_response_mb_size;
+}
+
+int data_reader_jag_conduit::reuse_labels(CPUMat& Y) {
+  El::Copy(m_label_cache, Y);
+  return m_cached_label_mb_size;
+}
+
+int data_reader_jag_conduit::fetch_data(CPUMat& X) {
+  if ((m_leading_reader != this) && (m_leading_reader != nullptr)) {
+    return m_leading_reader->reuse_data(X);
+  }
+  m_cached_data_mb_size = generic_data_reader::fetch_data(X);
+  El::Copy(X, m_data_cache);
+
+  return m_cached_data_mb_size;
+}
+
+int data_reader_jag_conduit::fetch_responses(CPUMat& Y) {
+  if ((m_leading_reader != this) && (m_leading_reader != nullptr)) {
+    return m_leading_reader->reuse_responses(Y);
+  }
+  m_cached_response_mb_size = generic_data_reader::fetch_responses(Y);
+  El::Copy(Y, m_response_cache);
+
+  return m_cached_response_mb_size;
+}
+
+int data_reader_jag_conduit::fetch_labels(CPUMat& Y) {
+  if ((m_leading_reader != this) && (m_leading_reader != nullptr)) {
+    return m_leading_reader->reuse_labels(Y);
+  }
+  m_cached_label_mb_size = generic_data_reader::fetch_labels(Y);
+  El::Copy(Y, m_label_cache);
+
+  return m_cached_label_mb_size;
+}
+
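+// Editor's note: the caches above let follower readers (those whose
+// m_leading_reader is another instance) replay the leading reader's last
+// fetched minibatch via El::Copy() instead of repeating the HDF5 I/O.
+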
 bool data_reader_jag_conduit::fetch_datum(CPUMat& X, int data_id, int mb_idx, int tid) {
   std::vector<size_t> sizes = get_linearized_data_sizes();
   std::vector<CPUMat> X_v = create_datum_views(X, sizes, mb_idx);
@@ -1047,6 +1699,7 @@ bool data_reader_jag_conduit::fetch_datum(CPUMat& X, int data_id, int mb_idx, in
     // The third argument mb_idx below is 0 because it is for the view of X not X itself
     ok = fetch(X_v[i], data_id, 0, tid, m_independent[i], "datum");
   }
+
   return ok;
 }

@@ -1072,15 +1725,6 @@ bool data_reader_jag_conduit::fetch_label(CPUMat& Y, int data_id, int mb_idx, in

 #ifndef _JAG_OFFLINE_TOOL_MODE_
 void data_reader_jag_conduit::setup_data_store(model *m) {
-  if (m_data_store != nullptr) {
-    //delete m_data_store;
-  }
-/*
-  m_data_store = new data_store_jag_conduit(this, m);
-  if (m_data_store != nullptr) {
-    m_data_store->setup();
-  }
-*/
 }
 #endif // _JAG_OFFLINE_TOOL_MODE_

@@ -1090,19 +1734,39 @@ void data_reader_jag_conduit::save_image(Mat& pixels, const std::string filename
 #endif // _JAG_OFFLINE_TOOL_MODE_
 }

-void data_reader_jag_conduit::print_schema() const {
-  m_data.schema().print();
-}
-
 void data_reader_jag_conduit::print_schema(const size_t sample_id) const {
-  if (!check_sample_id(sample_id)) {
+  if (sample_id >= m_valid_samples.size()) {
     _THROW_LBANN_EXCEPTION_(_CN_, "get_inputs() : invalid sample index");
   }

-  std::unordered_map<size_t, std::string>::const_iterator it = m_success_map.find(sample_id);
-  const conduit::Node & n = get_conduit_node(it->second);
+  conduit::Node n;
+  load_conduit_node(sample_id, "", n);
   n.schema().print();
 }

+void data_reader_jag_conduit::clear_image_normalization_params() {
+  m_image_normalization_params.clear();
+}
+
+void data_reader_jag_conduit::clear_scalar_normalization_params() {
+  m_scalar_normalization_params.clear();
+}
+
+void data_reader_jag_conduit::clear_input_normalization_params() {
+  m_input_normalization_params.clear();
+}
+
+void data_reader_jag_conduit::add_image_normalization_param(const data_reader_jag_conduit::linear_transform_t& t) {
+  m_image_normalization_params.push_back(t);
+}
+
+void data_reader_jag_conduit::add_scalar_normalization_param(const data_reader_jag_conduit::linear_transform_t& t) {
+  m_scalar_normalization_params.push_back(t);
+}
+
+void data_reader_jag_conduit::add_input_normalization_param(const data_reader_jag_conduit::linear_transform_t& t) {
+  m_input_normalization_params.push_back(t);
+}
+
 } // end of namespace lbann

 #undef _CN_
diff --git a/src/data_readers/data_reader_jag_conduit_hdf5.cpp b/src/data_readers/data_reader_jag_conduit_hdf5.cpp
index d98c7c12f28..1e4954137b3 100644
--- a/src/data_readers/data_reader_jag_conduit_hdf5.cpp
+++ b/src/data_readers/data_reader_jag_conduit_hdf5.cpp
@@ -25,14 +25,11 @@
 //
 ////////////////////////////////////////////////////////////////////////////////

-#ifndef _JAG_OFFLINE_TOOL_MODE_
 #include "lbann/data_readers/data_reader_jag_conduit_hdf5.hpp"
 #include "lbann/utils/file_utils.hpp" // for add_delimiter() in load()
 #include "lbann/utils/options.hpp" // for add_delimiter() in load()
 #include "lbann/data_store/jag_store.hpp"
-#else
-#include "data_reader_jag_conduit_hdf5.hpp"
-#endif // _JAG_OFFLINE_TOOL_MODE_
+#include "lbann/models/model.hpp"

 #ifdef LBANN_HAS_CONDUIT
 #include "lbann/data_readers/opencv_extensions.hpp"
@@ -40,6 +37,7 @@
 #include "lbann/data_readers/image_utils.hpp"
 #include "lbann/utils/timer.hpp"
 #include "lbann/utils/glob.hpp"
+#include <unordered_map>

 // This macro may be moved to a global scope
@@ -66,7 +64,8 @@ namespace lbann {
 data_reader_jag_conduit_hdf5::data_reader_jag_conduit_hdf5(const std::shared_ptr<cv_process>& pp, bool shuffle)
   : generic_data_reader(shuffle),
     m_jag_store(nullptr),
-    m_owns_jag_store(false) {
+    m_owns_jag_store(false),
+    m_primary_reader(nullptr) {

   set_defaults();

@@ -78,14 +77,11 @@ }

 void data_reader_jag_conduit_hdf5::copy_members(const data_reader_jag_conduit_hdf5& rhs) {
-  //todo: make m_jag_store a shared pointer
   m_jag_store = rhs.m_jag_store;
   m_owns_jag_store = rhs.m_owns_jag_store;
   m_image_width = rhs.m_image_width;
   m_image_height = rhs.m_image_height;
   m_image_num_channels = rhs.m_image_num_channels;
-  //set_linearized_image_size();
-  //m_num_img_srcs = rhs.m_num_img_srcs;
   m_is_data_loaded = rhs.m_is_data_loaded;
   m_scalar_keys = rhs.m_scalar_keys;
   m_input_keys = rhs.m_input_keys;
@@ -96,8 +92,6 @@ void data_reader_jag_conduit_hdf5::copy_members(const data_reader_jag_conduit_hd
   }
   replicate_processor(*rhs.m_pps[0]);
-
-  //m_data = rhs.m_data;
   m_uniform_input_type = rhs.m_uniform_input_type;
 }

@@ -130,17 +124,7 @@ void data_reader_jag_conduit_hdf5::set_defaults() {
   m_image_width = 0;
   m_image_height = 0;
   m_image_num_channels = 1;
-/*
-  m_independent.assign(1u, Undefined);
-  m_dependent.assign(1u, Undefined);
-  set_linearized_image_size();
-  m_num_img_srcs = 1u;
-  m_is_data_loaded = false;
   m_num_labels = 0;
-  m_scalar_keys.clear();
-  m_input_keys.clear();
-  m_uniform_input_type = false;
-*/
 }

 /// Replicate image processor for each OpenMP thread
 bool data_reader_jag_conduit_hdf5::replicate_processor(const cv_process& pp) {
@@ -151,7 +135,6 @@ bool data_reader_jag_conduit_hdf5::replicate_processor(const cv_process& pp) {
   // Construct thread private preprocessing objects out of a shared pointer
   #pragma omp parallel for schedule(static, 1)
   for (int i = 0; i < nthreads; ++i) {
-    //auto ppu = std::make_unique<cv_process>(pp); // c++14
     std::unique_ptr<cv_process> ppu(new cv_process(pp));
     m_pps[i] = std::move(ppu);
   }
@@ -181,6 +164,28 @@ void data_reader_jag_conduit_hdf5::set_image_dims(const int width, const int hei
   m_image_num_channels = ch;
 }

+bool data_reader_jag_conduit_hdf5::fetch_datum(CPUMat& X, int data_id, int mb_idx, int tid) {
+  m_jag_store->load_data(data_id, tid);
+
+  std::vector<size_t> sizes = get_linearized_data_sizes();
+  std::vector<CPUMat> X_v = create_datum_views(X, sizes, mb_idx);
+
+  size_t i = 0;
+  std::vector<cv::Mat> images = get_cv_images(data_id, tid);
+
+  for(size_t k=0u; k < get_num_img_srcs(); ++k) {
+    int width, height, img_type;
+    image_utils::process_image(images[k], width, height, img_type, *(m_pps[tid]), X_v[i++]);
+  }
+
+  const std::vector<scalar_t> &scalars = m_jag_store->fetch_scalars(data_id, tid);
+  set_minibatch_item(X_v[i++], 0, scalars.data(), m_jag_store->get_linearized_scalar_size());
+
+  const std::vector<input_t> &inputs = m_jag_store->fetch_inputs(data_id, tid);
+  set_minibatch_item(X_v[i++], 0, inputs.data(), m_jag_store->get_linearized_input_size());
+  return true;
+}
+
 void data_reader_jag_conduit_hdf5::load() {
   if(m_gan_labelling) {
     m_num_labels=2;
@@ -192,83 +197,24 @@ void data_reader_jag_conduit_hdf5::load() {
   }

   bool setup_jag_store = true;

-  options *opts = options::get();
-  if (is_master()) std::cerr << "data_reader_jag_conduit_hdf5::load() - getting ptrs to data_readers\n";
-  std::vector<void*> p = opts->get_ptrs();
-  for (auto t : p) {
-    data_reader_jag_conduit_hdf5 *other = static_cast<data_reader_jag_conduit_hdf5*>(t);
-    if (other == nullptr) {
-      throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: dynamic_cast failed");
-    }
-    if (other->get_role() == get_role()) {
-      if (is_master()) std::cerr << "data_reader_jag_conduit_hdf5::load() - found compatible reader; role: " << get_role() << "\n";
-      m_jag_store = other->get_jag_store();
-      m_owns_jag_store = false;
-      setup_jag_store = false;
-      break;
-    }
-  }

   if (setup_jag_store) {
     m_jag_store = new jag_store;
-    //m_jag_store = std::make_shared<jag_store>(new jag_store);
+    m_jag_store->set_comm(m_comm);

+    if (is_master()) std::cerr << "calling: m_jag_store->set_image_size\n";
     m_jag_store->set_image_size(m_image_height * m_image_width);
-
-    // for selecting images, per Luc's advise
-    m_emi_selectors.insert("(0.0, 0.0)");
-    m_emi_selectors.insert("(90.0, 0.0)");
-    m_emi_selectors.insert("(90.0, 78.0)");
-
-    //const std::string data_dir = add_delimiter(get_file_dir());
-    //const std::string conduit_file_name = get_data_filename();
-    const std::string pattern = get_file_dir();
-    std::vector<std::string> names = glob(pattern);
-    if (names.size() < 1) {
-      _THROW_LBANN_EXCEPTION_(get_type(), " failed to get data filenames");
-    }
-
+
     if (m_first_n > 0) {
       _THROW_LBANN_EXCEPTION_(_CN_, "load() does not support first_n feature.");
     }
-
-    if (m_max_files_to_load > 0) {
-      if (m_max_files_to_load < names.size()) {
-        names.resize(m_max_files_to_load);
-      }
-    }
-
-    m_jag_store->set_comm(m_comm);
-    if (m_use_inputs) {
-      if (is_master()) {
-        std::cerr << "USING INPUTS\n";
-      }
-      m_jag_store->load_inputs();
-    }
-    if (m_use_scalars) {
-      if (is_master()) {
-        std::cerr << "USING SCALARS\n";
-      }
-      m_jag_store->load_scalars();
-    }
-
-    if (m_use_images) {
-      if (is_master()) {
-        std::cerr << "USING IMAGES\n";
-      }
-      std::vector<std::string> image_names;
-      for (auto t : m_emi_selectors) {
-        image_names.push_back(t);
-      }
-      m_jag_store->load_images(image_names);
-    }
-
-    m_jag_store->setup(names);
+
+    if (is_master()) std::cerr << "data_reader_jag_conduit_hdf5: calling m_jag_store->setup()\n";
+    m_jag_store->setup(this);
   }

   m_is_data_loaded = true;

-  // reset indices
   m_shuffled_indices.resize(get_num_samples());
   std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);
@@ -280,7 +226,6 @@ void data_reader_jag_conduit_hdf5::load() {
   }
 }

-
 size_t data_reader_jag_conduit_hdf5::get_num_samples() const {
   return m_jag_store->get_num_samples();
 }
@@ -289,6 +234,14 @@ unsigned int data_reader_jag_conduit_hdf5::get_num_img_srcs() const {
   return m_jag_store->get_num_img_srcs();
 }

+unsigned int data_reader_jag_conduit_hdf5::get_num_channels() const {
+  return m_jag_store->get_num_channels_per_view();
+}
+
+size_t data_reader_jag_conduit_hdf5::get_linearized_channel_size() const {
+  return m_jag_store->get_linearized_channel_size();
+}
+
 size_t data_reader_jag_conduit_hdf5::get_linearized_image_size() const {
   return m_jag_store->get_linearized_image_size();
 }
@@ -309,16 +262,6 @@ int data_reader_jag_conduit_hdf5::get_linearized_data_size() const {
 int data_reader_jag_conduit_hdf5::get_linearized_response_size() const {
   throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: not implemented");
   return 0;
-#if 0
-  size_t sz = 0u;
-  for (const auto t: m_dependent) {
-    if (t == Undefined) {
-      continue;
-    }
-    sz += get_linearized_size(t);
-  }
-  return static_cast<int>(sz);
-#endif
   return 0;
 }

@@ -330,17 +273,6 @@ std::vector<size_t> data_reader_jag_conduit_hdf5::get_linearized_response_sizes(
   throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: not implemented");
   std::vector<size_t> r;
   return r;
-#if 0
-  std::vector<size_t> all_dim;
-  all_dim.reserve(m_dependent.size());
-  for (const auto t: m_dependent) {
-    if (t == Undefined) {
-      continue;
-    }
-    all_dim.push_back(get_linearized_size(t));
-  }
-  return all_dim;
-#endif
 }

 const std::vector<int> data_reader_jag_conduit_hdf5::get_data_dims() const {
@@ -355,26 +287,7 @@ int data_reader_jag_conduit_hdf5::get_num_labels() const {
 int data_reader_jag_conduit_hdf5::get_linearized_label_size() const {
   throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: not implemented");
   return m_num_labels;
-}
-
-std::string data_reader_jag_conduit_hdf5::to_string(const variable_t t) {
-  switch (t) {
-  case Undefined:  return "Undefined";
-  case JAG_Image:  return "JAG_Image";
-  case JAG_Scalar: return "JAG_Scalar";
-  case JAG_Input:  return "JAG_Input";
-  }
-  return "Undefined";
-}
-
-std::string data_reader_jag_conduit_hdf5::to_string(const std::vector<variable_t>& vec) {
-  std::string str("[");
-  for (const auto& el: vec) {
-    str += ' ' + data_reader_jag_conduit_hdf5::to_string(el);
-  }
-  str += " ]";
-  return str;
+  return 0;
 }

 std::string data_reader_jag_conduit_hdf5::get_description() const {
@@ -399,29 +312,8 @@ std::string data_reader_jag_conduit_hdf5::get_description() const {

 bool data_reader_jag_conduit_hdf5::check_sample_id(const size_t sample_id) const {
-  return m_jag_store->check_sample_id(sample_id);
-}
-
-std::vector< std::pair<size_t, const data_reader_jag_conduit_hdf5::ch_t*> >
-data_reader_jag_conduit_hdf5::get_image_ptrs(const size_t sample_id) const {
-  if (sample_id >= m_success_map.size()) {
index"); - } - - std::vector< std::pair >image_ptrs; -#if 0 - std::unordered_map::const_iterator it = m_success_map.find(sample_id); - - for (auto t : m_emi_selectors) { - std::string img_key = it->second + "/outputs/images/" + t + "/0.0/emi"; - const conduit::Node & n_image = get_conduit_node(img_key); - conduit::float32_array emi = n_image.value(); - const size_t num_pixels = emi.number_of_elements(); - const ch_t* emi_data = n_image.value(); - image_ptrs.push_back(std::make_pair(num_pixels, emi_data)); - } -#endif - return image_ptrs; + m_jag_store->check_sample_id(sample_id); + return true; } cv::Mat data_reader_jag_conduit_hdf5::cast_to_cvMat(const std::pair img, const int height) { @@ -437,10 +329,10 @@ cv::Mat data_reader_jag_conduit_hdf5::cast_to_cvMat(const std::pair data_reader_jag_conduit_hdf5::get_cv_images(const size_t sample_id) const { - const std::vector> &raw_images = m_jag_store->fetch_images(sample_id); +std::vector data_reader_jag_conduit_hdf5::get_cv_images(const size_t sample_id, int tid) const { + const std::vector> &raw_images = m_jag_store->fetch_views(sample_id, tid); std::vector< std::pair > img_ptrs(raw_images.size()); - size_t num_pixels = get_linearized_image_size(); + size_t num_pixels = get_linearized_channel_size(); for (size_t h=0; h data_reader_jag_conduit_hdf5::get_cv_images(const size_t sa return images; } -std::vector data_reader_jag_conduit_hdf5::get_images(const size_t sample_id) const { - std::vector< std::pair > img_ptrs(get_image_ptrs(sample_id)); - std::vector images; - images.reserve(get_linearized_image_size()); - - for (const auto& img: img_ptrs) { - const size_t num_pixels = img.first; - const ch_t* ptr = img.second; - images.insert(images.end(), ptr, ptr + num_pixels); - } - - return images; -} - -std::vector data_reader_jag_conduit_hdf5::get_scalars(const size_t sample_id) const { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: not implemented"); - std::vector r; - return r; -#if 0 - if (!check_sample_id(sample_id)) { - _THROW_LBANN_EXCEPTION_(_CN_, "get_scalars() : invalid sample index"); - } - - std::vector scalars; - scalars.reserve(m_scalar_keys.size()); - - for(const auto key: m_scalar_keys) { - std::unordered_map::const_iterator t2 = m_success_map.find(sample_id); - std::string scalar_key = t2->second + "/outputs/scalars/" + key; - const conduit::Node & n_scalar = get_conduit_node(scalar_key); - // All the scalar output currently seems to be scalar_t - //add_val(key, n_scalar, scalars); - scalars.push_back(static_cast(n_scalar.to_value())); - } - return scalars; -#endif -} - -std::vector data_reader_jag_conduit_hdf5::get_inputs(const size_t sample_id) const { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: not implemented"); - std::vector r; - return r; -#if 0 - if (!check_sample_id(sample_id)) { - _THROW_LBANN_EXCEPTION_(_CN_, "get_inputs() : invalid sample index"); - } - - std::vector inputs; - inputs.reserve(m_input_keys.size()); - - // automatically determine which method to use based on if all the variables are of input_t - if (m_uniform_input_type) { - for(const auto key: m_input_keys) { - std::unordered_map::const_iterator t2 = m_success_map.find(sample_id); - std::string input_key = t2->second + "/inputs/" + key; - const conduit::Node & n_input = get_conduit_node(input_key); - inputs.push_back(n_input.value()); // less overhead - } - } else { - for(const auto key: m_input_keys) { - std::unordered_map::const_iterator t2 = 
-      std::string input_key = t2->second + "/inputs/" + key;
-      //const conduit::Node & n_input = get_conduit_node(input_key);
-      //add_val(key, n_input, inputs); // more overhead but general
-    }
-  }
-  return inputs;
-#endif
-}
-
 std::vector<CPUMat>
 data_reader_jag_conduit_hdf5::create_datum_views(CPUMat& X, const std::vector<size_t>& sizes, const int mb_idx) const {
   std::vector<CPUMat> X_v(sizes.size());
@@ -537,80 +359,6 @@ data_reader_jag_conduit_hdf5::create_datum_views(CPUMat& X, const std::vector<si

-bool data_reader_jag_conduit_hdf5::fetch(CPUMat& X, int data_id, int mb_idx, int tid,
-  const data_reader_jag_conduit_hdf5::variable_t vt, const std::string tag) {
-  switch (vt) {
-  case JAG_Image: {
-    const std::vector<size_t> sizes(get_num_img_srcs(), get_linearized_image_size());
-    std::vector<CPUMat> X_v = create_datum_views(X, sizes, mb_idx);
-    std::vector<cv::Mat> images = get_cv_images(data_id);
-
-    if (images.size() != get_num_img_srcs()) {
-      _THROW_LBANN_EXCEPTION2_(_CN_, "fetch() : the number of images is not as expected", \
-        std::to_string(images.size()) + "!=" + std::to_string(get_num_img_srcs()));
-    }
-
-    for(size_t i=0u; i < get_num_img_srcs(); ++i) {
-      int width, height, img_type;
-      image_utils::process_image(images[i], width, height, img_type, *(m_pps[tid]), X_v[i]);
-    }
-    break;
-  }
-  case JAG_Scalar: {
-    const std::vector<scalar_t> scalars(get_scalars(data_id));
-    set_minibatch_item(X, mb_idx, scalars.data(), get_linearized_scalar_size());
-    break;
-  }
-  case JAG_Input: {
-    const std::vector<input_t> inputs(get_inputs(data_id));
-    set_minibatch_item(X, mb_idx, inputs.data(), get_linearized_input_size());
-    break;
-  }
-  default: { // includes Undefined case
-    _THROW_LBANN_EXCEPTION_(_CN_, "fetch_" + tag + "() : unknown or undefined variable type");
-  }
-  }
-  return true;
-}
-
-bool data_reader_jag_conduit_hdf5::fetch_datum(CPUMat& X, int data_id, int mb_idx, int tid) {
-  bool ok = true;
-
-  const std::vector<size_t> & sizes = get_linearized_data_sizes();
-  std::vector<CPUMat> X_v = create_datum_views(X, sizes, mb_idx);
-
-  size_t i = 0;
-  const std::vector<input_t> &inputs = m_jag_store->fetch_inputs(data_id);
-  set_minibatch_item(X_v[i++], 0, inputs.data(), m_jag_store->get_linearized_input_size());
-
-  std::vector<cv::Mat> images = get_cv_images(data_id);
-
-  if (images.size() != get_num_img_srcs()) {
-    throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: the number of images is not as expected " + std::to_string(images.size()) + "!=" + std::to_string(get_num_img_srcs()));
-  }
-
-  for(size_t k=0u; k < get_num_img_srcs(); ++k) {
-    int width, height, img_type;
-    image_utils::process_image(images[k], width, height, img_type, *(m_pps[tid]), X_v[i]);
-  }
-
-  return ok;
-}
-
-bool data_reader_jag_conduit_hdf5::fetch_response(CPUMat& X, int data_id, int mb_idx, int tid) {
-  throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: not implemented");
-  return true;
-#if 0
-  std::vector<size_t> sizes = get_linearized_response_sizes();
-  std::vector<CPUMat> X_v = create_datum_views(X, sizes, mb_idx);
-  bool ok = true;
-  for(size_t i = 0u; ok && (i < X_v.size()); ++i) {
-    ok = fetch(X_v[i], data_id, 0, tid, m_dependent[i], "response");
-  }
-  return ok;
-#endif
-}
-
 bool data_reader_jag_conduit_hdf5::fetch_label(CPUMat& Y, int data_id, int mb_idx, int tid) {
   if(m_gan_label_value) Y.Set(m_gan_label_value,mb_idx,1); //fake sample is set to 1; adversarial model
   else { //fake sample (second half of minibatch is set to 0;discriminator model
@@ -633,6 +381,9 @@ void data_reader_jag_conduit_hdf5::setup_data_store(model *m) {
 */
 }

+void data_reader_jag_conduit_hdf5::post_update() {
+  return;
+}

 } // end of namespace lbann

diff --git a/src/data_readers/data_reader_moving_mnist.cpp b/src/data_readers/data_reader_moving_mnist.cpp
new file mode 100644
index 00000000000..500235b0470
--- /dev/null
+++ b/src/data_readers/data_reader_moving_mnist.cpp
@@ -0,0 +1,266 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/data_readers/data_reader_moving_mnist.hpp"
+#include "lbann/utils/file_utils.hpp"
+#include <cstring>
+#include <fstream>
+
+namespace lbann {
+
+moving_mnist_reader::moving_mnist_reader(El::Int num_frames,
+                                         El::Int image_height,
+                                         El::Int image_width,
+                                         El::Int num_objects)
+  : generic_data_reader(true),
+    m_num_frames(num_frames),
+    m_image_height(image_height),
+    m_image_width(image_width),
+    m_num_objects(num_objects) {}
+
+// Data dimension access functions
+const std::vector<int> moving_mnist_reader::get_data_dims() const {
+  std::vector<int> dims(4);
+  dims[0] = m_num_frames;
+  dims[1] = 3;
+  dims[2] = m_image_height;
+  dims[3] = m_image_width;
+  return dims;
+}
+int moving_mnist_reader::get_num_labels() const {
+  return 1 + 9 * m_num_objects;
+}
+int moving_mnist_reader::get_linearized_data_size() const {
+  const auto& dims = get_data_dims();
+  return std::accumulate(dims.begin(), dims.end(), 1,
+                         std::multiplies<int>());
+}
+int moving_mnist_reader::get_linearized_label_size() const {
+  return get_num_labels();
+}
+
+bool moving_mnist_reader::fetch_datum(CPUMat& X, int data_id, int col, int tid) {
+
+  // Useful constants
+  constexpr DataType zero = 0;
+  constexpr DataType one = 1;
+
+  // Choose raw images
+  /// @todo Implementation with uniform distribution
+  std::vector<El::Int> raw_image_indices(m_num_objects);
+  for (El::Int obj = 0; obj < m_num_objects; ++obj) {
+    const El::Int hash = (std::hash<int>()(data_id)
+                          ^ std::hash<int>()(col)
+                          ^ std::hash<El::Int>()(obj));
+    raw_image_indices[obj] = hash % m_num_raw_images;
+  }
+
+  // Determine object boundaries
+  std::vector<std::array<El::Int, 4>> bounds(m_num_objects);
+  for (El::Int obj = 0; obj < m_num_objects; ++obj) {
+    auto& xmin = bounds[obj][0] = m_raw_image_width;
+    auto& xmax = bounds[obj][1] = 0;
+    auto& ymin = bounds[obj][2] = m_raw_image_height;
+    auto& ymax = bounds[obj][3] = 0;
+    const auto& raw_image_offset = (raw_image_indices[obj]
+                                    * m_raw_image_height
+                                    * m_raw_image_width);
+    const auto* raw_image = &m_raw_image_data[raw_image_offset];
+    for (El::Int j = 0; j < m_raw_image_height; ++j) {
+      for (El::Int i = 0; i < m_raw_image_width; ++i) {
+        if (raw_image[i + j * m_raw_image_width] != 0) {
+          xmin = std::min(xmin, i);
+          xmax = std::max(xmax, i+1);
+          ymin = std::min(ymin, j);
+          ymax = std::max(ymax, j+1);
+        }
+      }
+    }
+    xmin = std::min(xmin, xmax);
+    ymin = std::min(ymin, ymax);
+  }
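+  // Editor's note: the two clamps above cover the (hypothetical) case of an
+  // all-zero raw image, where no foreground pixel ever updates the bounds;
+  // they collapse the box to zero width/height instead of leaving xmin > xmax.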
+
+  // Initial positions and velocities
+  /// @todo Ensure objects don't overlap
+  std::vector<std::vector<std::array<DataType, 2>>> pos(m_num_objects);
+  std::vector<std::array<DataType, 2>> v(m_num_objects);
+  std::uniform_real_distribution<DataType> dist(zero, one);
+  const DataType vmax = std::hypot(m_image_width, m_image_height) / 5;
+  for (El::Int obj = 0; obj < m_num_objects; ++obj) {
+    const auto& object_width = bounds[obj][1] - bounds[obj][0];
+    const auto& object_height = bounds[obj][3] - bounds[obj][2];
+    pos[obj].resize(m_num_frames);
+    pos[obj][0][0] = (m_image_width - object_width + 1) * dist(get_generator());
+    pos[obj][0][1] = (m_image_height - object_height + 1) * dist(get_generator());
+    const DataType vnorm = vmax * dist(get_generator());
+    const DataType theta = 2 * M_PI * dist(get_generator());
+    v[obj][0] = vnorm * std::sin(theta);
+    v[obj][1] = vnorm * std::cos(theta);
+  }
+
+  // Determine object positions
+  /// @todo Ensure objects don't overlap
+  for (El::Int frame = 1; frame < m_num_frames; ++frame) {
+    for (El::Int obj = 0; obj < m_num_objects; ++obj) {
+
+      // Linear motion
+      auto& x = pos[obj][frame][0];
+      auto& y = pos[obj][frame][1];
+      auto& vx = v[obj][0];
+      auto& vy = v[obj][1];
+      x = pos[obj][frame-1][0] + vx;
+      y = pos[obj][frame-1][1] + vy;
+
+      // Reflections at boundaries
+      const auto& object_width = bounds[obj][1] - bounds[obj][0];
+      const auto& object_height = bounds[obj][3] - bounds[obj][2];
+      const DataType xmax = m_image_width - object_width + 1;
+      const DataType ymax = m_image_height - object_height + 1;
+      if (x <= zero || x >= xmax) {
+        x = std::min(std::max(x, zero), xmax);
+        vx = -vx;
+      }
+      if (y <= zero || y >= ymax) {
+        y = std::min(std::max(y, zero), ymax);
+        vy = -vy;
+      }
+
+    }
+  }
+
+  // Populate frames
+  std::memset(X.Buffer(0, col), 0, X.Height() * sizeof(DataType));
+  for (El::Int obj = 0; obj < m_num_objects; ++obj) {
+
+    // Get raw image
+    const auto& object_width = bounds[obj][1] - bounds[obj][0];
+    const auto& object_height = bounds[obj][3] - bounds[obj][2];
+    const auto& object_width_offset = bounds[obj][0];
+    const auto& object_height_offset = bounds[obj][2];
+    const auto& raw_image_offset = ((raw_image_indices[obj]
+                                     * m_raw_image_height
+                                     * m_raw_image_width)
+                                    + object_width_offset
+                                    + (object_height_offset
+                                       * m_raw_image_width));
+    const auto* raw_image = &m_raw_image_data[raw_image_offset];
+
+    // Copy raw image into each frame
+    const auto& xmax = m_image_width - object_width + 1;
+    const auto& ymax = m_image_height - object_height + 1;
+    for (El::Int frame = 0; frame < m_num_frames; ++frame) {
+
+      // Get image position in current frame
+      El::Int xoff = pos[obj][frame][0];
+      El::Int yoff = pos[obj][frame][1];
+      xoff = std::min(std::max(xoff, El::Int(0)), xmax-1);
+      yoff = std::min(std::max(yoff, El::Int(0)), ymax-1);
+
+      // Copy raw image into position
+      for (El::Int channel = 0; channel < 3; ++channel) {
+        for (El::Int j = 0; j < object_height; ++j) {
+          for (El::Int i = 0; i < object_width; ++i) {
+            const auto& row = (frame * 3 * m_image_height * m_image_width
+                               + channel * m_image_height * m_image_width
+                               + (yoff+j) * m_image_width
+                               + (xoff+i));
+            auto& pixel = X(row, col);
+            pixel += raw_image[i + j * m_raw_image_width] / 255.0;
+            pixel = std::min(pixel, one);
+          }
+        }
+      }
+
+    }
+
+  }
+
+  return true;
+}
+
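+// Editor's note: fetch_datum() and fetch_label() must agree on which digits
+// appear in a given sample; both derive the raw image indices from the same
+// (data_id, col, object) hash, so no state needs to be shared between them.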
+bool moving_mnist_reader::fetch_label(CPUMat& Y, int data_id, int col, int tid) {
+
+  // Choose raw images
+  /// @todo Implementation with uniform distribution
+  std::vector<El::Int> raw_image_indices;
+  for (El::Int i = 0; i < m_num_objects; ++i) {
+    const El::Int hash = (std::hash<int>()(data_id)
+                          ^ std::hash<int>()(col)
+                          ^ std::hash<El::Int>()(i));
+    raw_image_indices.push_back(hash % m_num_raw_images);
+  }
+
+  // Label is sum of raw image labels
+  El::Int sum = 0;
+  for (const auto& i : raw_image_indices) {
+    sum += m_raw_label_data[i];
+  }
+  Y(sum, col) = DataType(1);
+
+  return true;
+}
+
+void moving_mnist_reader::load() {
+
+  // Read image data
+  const auto& image_file = get_file_dir() + "/" + get_data_filename();
+  std::ifstream fs_image(image_file.c_str(),
+                         std::fstream::in | std::fstream::binary);
+  unsigned int num_images = 0;
+  unsigned int image_height = 0;
+  unsigned int image_width = 0;
+  fs_image.ignore(4);
+  fs_image.read(reinterpret_cast<char*>(&num_images), 4);
+  fs_image.read(reinterpret_cast<char*>(&image_height), 4);
+  fs_image.read(reinterpret_cast<char*>(&image_width), 4);
+  __swapEndianInt(num_images);
+  __swapEndianInt(image_height);
+  __swapEndianInt(image_width);
+  m_num_raw_images = num_images;
+  m_raw_image_height = image_height;
+  m_raw_image_width = image_width;
+  m_raw_image_data.resize(num_images * image_height * image_width);
+  fs_image.read(reinterpret_cast<char*>(m_raw_image_data.data()),
+                num_images * image_height * image_width);
+  fs_image.close();
+
+  // Read labels
+  const auto& label_file = get_file_dir() + "/" + get_label_filename();
+  std::ifstream fs_label(label_file.c_str(),
+                         std::fstream::in | std::fstream::binary);
+  fs_label.ignore(8);
+  m_raw_label_data.resize(num_images);
+  fs_label.read(reinterpret_cast<char*>(m_raw_label_data.data()), num_images);
+  fs_label.close();
+
+  // Reset indices
+  m_shuffled_indices.resize(num_images);
+  std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);
+  select_subset_of_data();
+
+}
+
+} // namespace lbann
diff --git a/src/data_readers/data_reader_multi_images.cpp b/src/data_readers/data_reader_multi_images.cpp
index 4fa06f88f96..916ba665c38 100644
--- a/src/data_readers/data_reader_multi_images.cpp
+++ b/src/data_readers/data_reader_multi_images.cpp
@@ -112,7 +112,7 @@ bool data_reader_multi_images::fetch_datum(CPUMat& X, int data_id, int mb_idx, i
       m_data_store->get_data_buf(data_id, image_buf, i);
       ret = lbann::image_utils::load_image(*image_buf, width, height, img_type, *(m_pps[tid]), X_v[i]);
     } else {
-      ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v[i], m_thread_buffer[tid]);
+      ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v[i], m_thread_buffer[tid], &m_thread_cv_buffer[tid]);
     }

     if(!ret) {
diff --git a/src/data_readers/data_reader_synthetic.cpp b/src/data_readers/data_reader_synthetic.cpp
index e1af7f87901..bd826845801 100644
--- a/src/data_readers/data_reader_synthetic.cpp
+++ b/src/data_readers/data_reader_synthetic.cpp
@@ -33,6 +33,20 @@

 namespace lbann {

+namespace {
+
+void fill_matrix(CPUMat& mat) {
+  std::normal_distribution<DataType> dist(DataType(0), DataType(1));
+  auto& gen = get_fast_generator();
+  const El::Int height = mat.Height();  // Width is 1.
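+  // Editor's note: mat is expected to be a single-column view here, so one
+  // linear pass over Height() elements fills a whole minibatch column with
+  // N(0, 1) draws (assuming get_fast_generator() is a thread-local engine).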
+ DataType * __restrict__ buf = mat.Buffer(); + for (El::Int i = 0; i < height; ++i) { + buf[i] = dist(gen); + } +} + +} // anonymous namespace + data_reader_synthetic::data_reader_synthetic(int num_samples, int num_features, bool shuffle) : data_reader_synthetic(num_samples, {num_features}, 0, shuffle) {} @@ -43,23 +57,36 @@ data_reader_synthetic::data_reader_synthetic(int num_samples, : generic_data_reader(shuffle), m_num_samples(num_samples), m_num_labels(num_labels), m_dimensions(dims) {} -bool data_reader_synthetic::fetch_datum(Mat& X, int data_id, int mb_idx, int) { +data_reader_synthetic::data_reader_synthetic(int num_samples, + std::vector dims, + std::vector response_dims, + bool shuffle) + : generic_data_reader(shuffle), m_num_samples(num_samples), + m_num_labels(0), m_dimensions(dims), m_response_dimensions(response_dims) {} + +bool data_reader_synthetic::fetch_datum(CPUMat& X, int data_id, int mb_idx, int) { auto X_v = El::View(X, El::ALL, El::IR(mb_idx, mb_idx + 1)); - std::normal_distribution dist(DataType(0), DataType(1)); - auto& gen = get_fast_generator(); - const El::Int height = X_v.Height(); // Width is 1. - DataType * __restrict__ buf = X_v.Buffer(); - for (El::Int i = 0; i < height; ++i) { - buf[i] = dist(gen); - } + fill_matrix(X_v); return true; } -bool data_reader_synthetic::fetch_label(Mat& Y, int data_id, int mb_idx, int) { +bool data_reader_synthetic::fetch_label(CPUMat& Y, int data_id, int mb_idx, int) { + if (m_num_labels == 0) { + LBANN_ERROR("Synthetic data reader does not have labels"); + } Y.Set(fast_rand_int(get_fast_generator(), m_num_labels), mb_idx, 1); return true; } +bool data_reader_synthetic::fetch_response(CPUMat& Y, int data_id, int mb_idx, int) { + if (m_response_dimensions.empty()) { + LBANN_ERROR("Synthetic data reader does not have responses"); + } + auto Y_v = El::View(Y, El::ALL, El::IR(mb_idx, mb_idx + 1)); + fill_matrix(Y_v); + return true; +} + void data_reader_synthetic::load() { m_shuffled_indices.clear(); m_shuffled_indices.resize(m_num_samples); diff --git a/src/data_readers/data_reader_triplet.cpp b/src/data_readers/data_reader_triplet.cpp index 6382796e56e..4396ba0f4a2 100644 --- a/src/data_readers/data_reader_triplet.cpp +++ b/src/data_readers/data_reader_triplet.cpp @@ -95,7 +95,7 @@ bool data_reader_triplet::fetch_datum(Mat& X, int data_id, int mb_idx, int tid) // This could probably have used image_utils::import_image() ret = lbann::image_utils::load_image(*image_buf, width, height, img_type, *(m_pps[tid]), X_v[i]); } else { - ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v[i], m_thread_buffer[tid]); + ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v[i], m_thread_buffer[tid], &m_thread_cv_buffer[tid]); } if(!ret) { diff --git a/src/data_readers/image_utils.cpp b/src/data_readers/image_utils.cpp index 63c3808c717..2b01a11b2db 100644 --- a/src/data_readers/image_utils.cpp +++ b/src/data_readers/image_utils.cpp @@ -199,9 +199,9 @@ bool image_utils::process_image(cv::Mat& image, int& Width, int& Height, int& Ty * @param buf A thread safe buffer for local, temporary, image decoding */ bool image_utils::load_image(const std::string& filename, - int& Width, int& Height, int& Type, cv_process& pp, ::Mat& data, std::vector& buf) { + int& Width, int& Height, int& Type, cv_process& pp, ::Mat& data, std::vector& buf, cv::Mat* cv_buf) { #ifdef LBANN_HAS_OPENCV - cv::Mat image = cv_utils::lbann_imread(filename, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH, 
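moving_mnist_reader::load() earlier parses MNIST/IDX-style headers: a 4-byte magic number (skipped with ignore(4)) followed by big-endian 32-bit fields that __swapEndianInt converts to host order. A standalone sketch of that header read; "mnist_images.idx" is a hypothetical path:

    #include <cstdint>
    #include <fstream>
    #include <iostream>

    // IDX files store all header fields big-endian; swap to host order.
    static void swap_endian_32(uint32_t& x) {
      x = ((x >> 24) & 0x000000FF) | ((x >> 8) & 0x0000FF00) |
          ((x << 8) & 0x00FF0000) | ((x << 24) & 0xFF000000);
    }

    int main() {
      std::ifstream in("mnist_images.idx", std::ios::binary);
      uint32_t magic = 0, num_images = 0, height = 0, width = 0;
      in.read(reinterpret_cast<char*>(&magic), 4);  // 0x00000803 for images
      in.read(reinterpret_cast<char*>(&num_images), 4);
      in.read(reinterpret_cast<char*>(&height), 4);
      in.read(reinterpret_cast<char*>(&width), 4);
      swap_endian_32(magic);
      swap_endian_32(num_images);
      swap_endian_32(height);
      swap_endian_32(width);
      std::cout << num_images << " images of "
                << height << "x" << width << "\n";
    }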
buf); + cv::Mat image = cv_utils::lbann_imread(filename, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH, buf, cv_buf); return process_image(image, Width, Height, Type, pp, data); #else @@ -220,9 +220,9 @@ bool image_utils::load_image(const std::string& filename, * @param buf A thread safe buffer for local, temporary, image decoding */ bool image_utils::load_image(const std::string& filename, - int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector<::Mat>& data, std::vector& buf) { + int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector<::Mat>& data, std::vector& buf, cv::Mat* cv_buf) { #ifdef LBANN_HAS_OPENCV - cv::Mat image = cv_utils::lbann_imread(filename, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH, buf); + cv::Mat image = cv_utils::lbann_imread(filename, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH, buf, cv_buf); return process_image(image, Width, Height, Type, pp, data); #else @@ -241,9 +241,9 @@ bool image_utils::load_image(const std::string& filename, * @param data The pre-processed image data to be stored in El::Matrix format */ bool image_utils::load_image(std::vector& image_buf, - int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector<::Mat>& data) { + int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector<::Mat>& data, cv::Mat* cv_buf) { - return import_image(image_buf, Width, Height, Type, pp, data); + return import_image(image_buf, Width, Height, Type, pp, data, cv_buf); } /** @@ -281,9 +281,14 @@ bool image_utils::save_image(const std::string& filename, * @param data The pre-processed image data. A set of sub-matrix Views can be used to store the data. */ bool image_utils::import_image(cv::InputArray inbuf, - int& Width, int& Height, int& Type, cv_process& pp, ::Mat& data) { + int& Width, int& Height, int& Type, cv_process& pp, ::Mat& data, cv::Mat* cv_buf) { #ifdef LBANN_HAS_OPENCV - cv::Mat image = cv::imdecode(inbuf, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH); + cv::Mat image; + if(cv_buf != nullptr) { + image = cv::imdecode(inbuf, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH, cv_buf); + }else { + image = cv::imdecode(inbuf, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH); + } return process_image(image, Width, Height, Type, pp, data); #else @@ -303,9 +308,14 @@ bool image_utils::import_image(cv::InputArray inbuf, * @param data The pre-processed image data. A set of sub-matrix Views can be used to store the data. 
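The import_image change above threads an optional cv::Mat* through to cv::imdecode, whose three-argument overload decodes into caller-provided storage, so a long-lived per-thread scratch image can be reused across decodes instead of allocating a fresh one per call. A minimal sketch of that reuse (requires OpenCV; names are illustrative):

    #include <opencv2/opencv.hpp>
    #include <vector>

    // The returned header aliases scratch's storage when it is large
    // enough, so repeated calls on same-sized images avoid reallocation.
    cv::Mat decode_reusing(const std::vector<unsigned char>& encoded,
                           cv::Mat& scratch) {
      return cv::imdecode(encoded,
                          cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH,
                          &scratch);
    }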
*/ bool image_utils::import_image(cv::InputArray inbuf, - int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector<::Mat>& data) { + int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector<::Mat>& data, cv::Mat* cv_buf) { #ifdef LBANN_HAS_OPENCV - cv::Mat image = cv::imdecode(inbuf, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH); + cv::Mat image; + if(cv_buf != nullptr) { + image = cv::imdecode(inbuf, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH, cv_buf); + }else { + image = cv::imdecode(inbuf, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH); + } return process_image(image, Width, Height, Type, pp, data); #else @@ -353,8 +363,8 @@ bool image_utils::export_image(const std::string& fileExt, std::vector& o bool image_utils::load_image(std::vector& image_buf, - int& Width, int& Height, int& Type, cv_process& pp, ::Mat& data) { - return import_image(image_buf, Width, Height, Type, pp, data); + int& Width, int& Height, int& Type, cv_process& pp, ::Mat& data, cv::Mat* cv_buf) { + return import_image(image_buf, Width, Height, Type, pp, data, cv_buf); } } // namespace lbann diff --git a/src/data_store/data_store_pilot2_molecular.cpp b/src/data_store/data_store_pilot2_molecular.cpp index e68cf28efdc..5ccf304787e 100644 --- a/src/data_store/data_store_pilot2_molecular.cpp +++ b/src/data_store/data_store_pilot2_molecular.cpp @@ -266,7 +266,7 @@ void data_store_pilot2_molecular::exchange_data() { for (auto data_id : required_molecules) { m_my_molecules[data_id].resize(num_features); m_comm->nb_tagged_recv( - m_my_molecules[data_id].data(), num_features, m_owner_rank, + m_my_molecules[data_id].data(), num_features, m_owner_rank, data_id, recv_req[jj++], m_comm->get_world_comm()); } @@ -280,7 +280,7 @@ void data_store_pilot2_molecular::exchange_data() { send_req[p].resize(required_molecules.size()); for (auto data_id : required_molecules) { m_comm->nb_tagged_send( - m_data[data_id].data(), num_features, p, + m_data[data_id].data(), num_features, p, data_id, send_req[p][jj++], m_comm->get_world_comm()); } } diff --git a/src/data_store/jag_store.cpp b/src/data_store/jag_store.cpp index b1ff7db582a..ee622efa0cc 100644 --- a/src/data_store/jag_store.cpp +++ b/src/data_store/jag_store.cpp @@ -3,292 +3,989 @@ #ifdef LBANN_HAS_CONDUIT #include "lbann/utils/exception.hpp" -#include "lbann/utils/timer.hpp" #include "lbann/utils/options.hpp" #include "conduit/conduit_relay.hpp" #include "conduit/conduit_relay_hdf5.hpp" +#include "lbann/data_readers/data_reader_jag_conduit_hdf5.hpp" +#include "lbann/utils/glob.hpp" +#include +#include #include "hdf5.h" #include namespace lbann { jag_store::jag_store() - : m_is_setup(false), - m_load_inputs(false), - m_load_scalars(false), - m_image_size(0), - m_run_tests(false) + : m_image_size(0), + m_comm(nullptr), + m_master(false), + m_max_samples(INT_MAX) { } -void jag_store::load_inputs() { - m_load_inputs = true; +void load_keys(std::vector &v, const std::string &keys) { + std::stringstream s; + s << keys; + std::string key; + while (s >> key) { + v.push_back(key); + } } -void jag_store::load_scalars() { - m_load_scalars = true; +void jag_store::load_scalars_to_use(const std::string &keys) { + m_scalars_to_use.clear(); + load_keys(m_scalars_to_use, keys); } - -void jag_store::load_images(const std::vector &keys) { - for (auto t : keys) { - m_images_to_load.push_back(t); - } +void jag_store::load_inputs_to_use(const std::string &keys) { + m_inputs_to_use.clear(); + load_keys(m_inputs_to_use, keys); } -void jag_store::setup( - const std::vector 
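The data_store_pilot2_molecular change above adds data_id as the message tag on the non-blocking sends and receives, so concurrent transfers between the same pair of ranks cannot be matched to the wrong molecule. A raw-MPI sketch of that matching scheme (the lbann_comm nb_tagged_* calls wrap this style of operation; buffers and types here are illustrative):

    #include <mpi.h>
    #include <vector>

    // Using data_id as the tag pairs this receive with the matching send
    // even when many transfers between the same ranks are in flight.
    void post_recv(std::vector<double>& recv_buf, int num_features,
                   int owner_rank, int data_id, MPI_Request* req) {
      MPI_Irecv(recv_buf.data(), num_features, MPI_DOUBLE,
                owner_rank, /*tag=*/data_id, MPI_COMM_WORLD, req);
    }

    void post_send(const std::vector<double>& data, int num_features,
                   int dest_rank, int data_id, MPI_Request* req) {
      MPI_Isend(data.data(), num_features, MPI_DOUBLE,
                dest_rank, /*tag=*/data_id, MPI_COMM_WORLD, req);
    }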
conduit_filenames, - bool num_stores, - int my_rank) { - - // quick hack to get every processor to read a unique - // subset of the data - if (options::get()->has_string("every_n")) { - my_rank = m_comm->get_rank_in_world(); - num_stores = m_comm->get_procs_in_world(); - } - - if (options::get()->has_string("test_jag_store")) { - m_run_tests = true; - if (! (my_rank == 0 && num_stores == 1)) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: --test_jag_store is incompatible with --every_n"); +void jag_store::load_image_views_to_use(const std::string &keys) { + m_image_views_to_use.clear(); + size_t last = 0; + while (true) { + size_t j1 = keys.find('(', last); + size_t j2 = keys.find(')', last); + if (j1 == std::string::npos || j2 == std::string::npos) { + break; } + std::string key = keys.substr(j1, j2-j1+1); + m_image_views_to_use.push_back(key); + last = j2+1; } +} - bool master = m_comm->am_world_master(); - if (master) std::cerr << "starting jag_store::setup for " << conduit_filenames.size() << " conduit files\n"; +void jag_store::load_image_channels_to_use(const std::string &keys) { + std::stringstream s; + s << keys; + int channel; + while (s >> channel) { + m_image_channels_to_use.push_back(channel); + } +} - if (m_image_size == 0) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: image_size = 0; probably set_image_size() has not been called"); +void jag_store::build_conduit_index(const std::vector &filenames) { + options *opts = options::get(); + const std::string base_dir = opts->get_string("base_dir"); + const std::string output_fn = opts->get_string("build_conduit_index"); + std::stringstream ss; + ss << output_fn << "." << m_rank_in_world; + std::ofstream out(ss.str().c_str()); + if (!out.good()) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: failed to open " + output_fn + " for writing"); + } + if (m_master) std::cerr << "writing index file: " << output_fn << "\n"; + if (m_rank_in_world == 0) { + out << base_dir << "\n"; } - std::string test_file; - std::string test_sample_id; - - // get the sample_ids of successful samples; mostly need to do this - // to figure out the proper memory allocation for m_data - double tm1 = get_time(); - conduit::Node n_ok; - size_t failed = 0; - for (size_t j = my_rank; j cnames; conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", cnames); - for (auto t : cnames) { - const std::string key = "/" + t + "/performance/success"; - conduit::relay::io::hdf5_read(hdf5_file_hnd, key, n_ok); + size_t is_good = 0; + size_t is_bad = 0; + std::stringstream s5; + conduit::Node n_ok; + for (size_t h=0; hglobal_barrier(); + + int num_samples; + MPI_Reduce(&global_num_samples, &num_samples, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); + //m_comm->reduce(&global_num_samples, 1, 0, m_comm->get_world_comm(), El::mpi::SUM); + + std::stringstream s3; + s3 << "echo " << global_num_samples << " " << filenames.size() << " > num_samples_tmp"; + system(s3.str().c_str()); + s3.clear(); + s3.str(""); + s3 << "cat num_samples_tmp "; + for (int k=0; k " << output_fn; + system(s3.str().c_str()); + s3.clear(); + s3.str(""); + s3 << "chmod 660 " << output_fn; + system(s3.str().c_str()); + s3.clear(); + s3.str(""); + s3 << "rm -f num_samples_tmp "; + for (int k=0; kam_world_master(); + options *opts = options::get(); + m_reader = reader; + + m_max_samples = INT_MAX; + if (opts->has_int("max_samples")) { + m_max_samples = 
(size_t)opts->get_int("max_samples"); } - double tm2 = get_time(); - //optionally get input and scalar names - if (m_load_inputs || m_load_scalars) { - if (test_file == "" || test_sample_id == "") { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: failed to find any good samples"); + bool has_conduit_filenames = false; + if (opts->has_string("conduit_filelist")) { + std::string f = opts->get_string("conduit_filelist"); + std::ifstream in(f.c_str()); + if (!in) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: failed to open " + f + " for reading"); } - if (m_load_inputs) { - get_default_keys(test_file, test_sample_id, "inputs", master); + std::string line; + while (getline(in, line)) { + m_conduit_filenames.push_back(line); } - if (m_load_scalars) { - get_default_keys(test_file, test_sample_id, "outputs/scalars", master); + in.close(); + if (m_max_samples < m_conduit_filenames.size()) { + m_conduit_filenames.resize(m_max_samples); } + has_conduit_filenames = true; } - //allocate memory - m_data_inputs.resize(m_num_samples); - m_data_scalars.resize(m_num_samples); - m_data_images.resize(m_num_samples); - for (size_t j=0; jhas_string("build_conduit_index")) { + if (! has_conduit_filenames) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: you must pass --conduit_filenames= on the cmd line when building a conduit index"); + } + build_conduit_index(m_conduit_filenames); + exit(0); + } + + load_variable_names(); + build_data_sizes(); + report_linearized_sizes(); + allocate_memory(); + load_normalization_values(); + + if (!opts->has_int("mode")) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: you must pass --mode= on cmd line, where is 1 (to use conduit files) or 2 or 3 (for testing) (to use binary files)"); + } + m_mode = opts->get_int("mode"); + if (! (m_mode == 1 || m_mode == 2 || m_mode == 3)) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: you must pass --mode= on cmd line, where is 1 (to use conduit files) or 2 (to use binary files); or 4 (for testing) you passed: " + std::to_string(m_mode)); + } + if (m_master) std::cerr << "Running in mode: " << m_mode << "\n"; + + // optionally convert conduit files to our binary format, then exit + if (opts->has_string("convert_conduit")) { + if (! has_conduit_filenames) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: you must pass --conduit_filenames= on the cmd line when converting conduit filenames to binary"); + } + setup_conduit(); + convert_conduit_to_binary(m_conduit_filenames); + exit(0); + } + + if (m_mode == 1) { + setup_conduit(); + } else if (m_mode == 2) { + setup_binary(); + } else { + setup_testing(); + } + + if (m_master) { + std::cerr << "jag_store::setup time: " << get_time() - tm1 << "; num samples: " << m_num_samples << std::endl; + } + + if (m_mode == 3) { + test_converted_files(); + m_comm->global_barrier(); + exit(0); + } + + // optionally compute min/max values, then exit. 
+ // This is only needed for one-time computation of normalization values + if (opts->has_string("compute_min_max")) { + compute_min_max(); + exit(0); + } + + // optionally check bandwidth (sort of), then exit + if (opts->has_int("bandwidth")) { + if (m_mode == 0) { + compute_bandwidth(); + } else { + compute_bandwidth_binary(); + } + exit(0); + } +} + +size_t jag_store::get_linearized_data_size() const { + size_t n = m_image_views_to_use.size() * m_image_channels_to_use.size() * get_linearized_channel_size() + + m_scalars_to_use.size() + + m_inputs_to_use.size(); + return n; +} + +void jag_store::build_data_sizes() { + for (size_t i=0; i 0.0) { + m_data_sizes.push_back(get_linearized_scalar_size()); + } + if (get_linearized_input_size() > 0.0) { + m_data_sizes.push_back(get_linearized_input_size()); + } +} + +void jag_store::report_linearized_sizes() { + if (! m_master) { + return; + } + std::cerr + << "===================================================================\n" + << "LINEARIZED SIZES REPORT:\n" + << "get_linearized_data_size: " << get_linearized_data_size() << "\n" + << "get_linearized_image_size: " << get_linearized_image_size() << "\n" + << "get_linearized_channel_size: " << get_linearized_channel_size() << "\n" + << "get_num_channels: " << get_num_channels_per_view() << "\n" + << "get_linearized_scalar_size: " << get_linearized_scalar_size() << "\n" + << "get_linearized_input_size: " << get_linearized_input_size() << "\n" + << "get_num_img_srcs: " << get_num_img_srcs() << "\n" + << "sizes vector: "; + size_t total = 0; + for (auto t : m_data_sizes) { + std::cerr << t << " "; + total += t; + } + std::cerr << "\n"; + std::cerr << "total, from m_data_sizes; should be same as above: " + << total << "\n" + << "===================================================================\n"; +} + +void jag_store::load_data_binary(int data_id, int tid) { + const int file_idx = m_sample_map[data_id].first; +// std::string fn = m_binary_filenames[file_idx]; + const int sample_idx = m_sample_map[data_id].second; + + // std::ifstream in(fn.c_str(), std::ios::out | std::ios::binary); + /* + if (!in.good()) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: failed to open: " + fn + " for reading; data_id: " + std::to_string(data_id) + " tid: " + std::to_string(tid)); + } + */ + +// in.seekg(sample_idx*m_sample_len); + m_streams[tid][file_idx]->seekg(sample_idx*m_sample_len); + m_streams[tid][file_idx]->read((char*)m_scratch[tid].data(), m_sample_len); + //in.read((char*)m_scratch[tid].data(), m_sample_len); +// in.close(); + +// size_t offset = sample_idx * m_sample_len; + +// in.seekg(offset); + // in.read((char*)m_scratch[tid].data(), m_sample_len); + + for (size_t j=0; j &filenames, const std::string &dir) { + if (m_master) std::cerr << "starting jag_store::write_binary\n"; + options *opts = options::get(); + const std::string output_dir = opts->get_string("convert_conduit"); + + m_global_file_idx = 0; + m_num_converted_samples = 0; + m_binary_output_filename = ""; + open_binary_file_for_output(output_dir); + + size_t num_samples_written = 0; + std::string fn; + for (size_t k=0; k> fn; + hid_t hdf5_file_hnd = conduit::relay::io::hdf5_open_file_for_read( fn ); std::vector cnames; conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", cnames); - for (auto t : cnames) { - m_id_to_name[idx] = t; - if (m_valid_samples.find(t) != m_valid_samples.end()) { - for (auto input_name : m_inputs_to_load) { - const std::string key = "/" + t + "/inputs/" + 
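load_data_binary above addresses sample k of a converted file at byte offset k * m_sample_len through a per-thread stream, so any sample can be fetched with one seek and one contiguous read. A minimal sketch of that fixed-stride random access (assumes the stream is already open in binary mode; the caller reinterprets the record bytes as input, scalar, and channel fields afterwards):

    #include <fstream>
    #include <vector>

    std::vector<char> read_record(std::ifstream& in, std::size_t sample_idx,
                                  std::size_t sample_len) {
      std::vector<char> record(sample_len);
      in.seekg(sample_idx * sample_len);   // constant-time lookup
      in.read(record.data(), sample_len);  // one contiguous read
      return record;
    }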
input_name; + if (m_master) std::cerr << " num samples this file: " << cnames.size() << "\n"; + + conduit::Node n_ok; + conduit::Node node; + for (auto sample_name : cnames) { + const std::string key_1 = "/" + sample_name + "/performance/success"; + conduit::relay::io::hdf5_read(hdf5_file_hnd, key_1, n_ok); + int success = n_ok.to_int64(); + if (success == 1) { + m_binary_output_file_names << sample_name << "\n"; + for (auto input_name : m_inputs_to_use) { + const std::string key = "/" + sample_name + "/inputs/" + input_name; conduit::relay::io::hdf5_read(hdf5_file_hnd, key, node); - //this is fragile; will break if scalar_t changes - m_data_inputs[idx].push_back( node.to_float64() ); + //this is fragile; will break if input_t changes + double tmp = node.to_float64(); + m_binary_output_file.write((char*)&tmp, sizeof(data_reader_jag_conduit_hdf5::input_t)); } - for (auto scalar_name : m_scalars_to_load) { - const std::string key = "/" + t + "/outputs/scalars/" + scalar_name; + + for (auto scalar_name : m_scalars_to_use) { + const std::string key = "/" + sample_name + "/outputs/scalars/" + scalar_name; conduit::relay::io::hdf5_read(hdf5_file_hnd, key, node); - //this is fragile; will break if input_t changes - m_data_inputs[idx].push_back( node.to_float64() ); + //this is fragile; will break if scalar_t changes + double tmp = node.to_float64(); + m_binary_output_file.write((char*)&tmp, sizeof(data_reader_jag_conduit_hdf5::scalar_t)); } - size_t k = 0; - for (auto image_name : m_images_to_load) { - const std::string key = "/" + t + "/outputs/images/" + image_name + "/0.0/emi"; + + for (auto image_name : m_image_views_to_use) { + const std::string key = "/" + sample_name + "/outputs/images/" + image_name + "/0.0/emi"; conduit::relay::io::hdf5_read(hdf5_file_hnd, key, node); - - conduit::float32_array emi = node.value(); const size_t image_size = emi.number_of_elements(); - - //conduit::DataType dtype = node.dtype(); - //size_t image_size = dtype.number_of_elements(); - if (image_size != m_image_size) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: m_image_size = " + std::to_string(m_image_size) + " but conduit image size is " + std::to_string(image_size)); - } //this is fragile; will break if ch_t changes - //float *p = node.value(); - for (size_t h=0; h= m_max_samples) { + conduit::relay::io::hdf5_close_file(hdf5_file_hnd); + goto EARLY_EXIT; + break; + } + ++num_samples_written; + if (num_samples_written == MAX_SAMPLES_PER_BINARY_FILE) { + num_samples_written = 0; + open_binary_file_for_output(output_dir); + } } } conduit::relay::io::hdf5_close_file(hdf5_file_hnd); } +EARLY_EXIT : + m_binary_output_file.close(); + m_binary_output_file_names.close(); + if (m_master) std::cerr << "LEAVING jag_store::write_binary\n"; +} - build_data_sizes(); - m_is_setup = true; - if (master) { - std::cerr << "jag_store::setup; time to load the data: " << get_time() - tm2 << std::endl; +void jag_store::read_key_map(const std::string &filename) { + if (m_master) std::cerr << "starting jag_store::read_key_map; opening file: " << filename << "\n"; + std::ifstream in(filename.c_str()); + if (!in.good()) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: failed to open: " + filename); + } + + std::string line; + getline(in, line); + getline(in, line); + getline(in, line); + + std::string key; + int n; + for (int k=0; k<3; k++) { + getline(in, line); + std::stringstream s; + s << line; + s >> key >> n; + for (int j=0; j children; - 
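write_binary above packs each sample as inputs, then scalars, then image channels, back to back, and write_binary_metadata records the byte offset of every key inside such a record so read_key_map can locate fields without parsing. A sketch of that offset bookkeeping (names illustrative; real field widths come from input_t, scalar_t, and ch_t):

    #include <cstddef>
    #include <map>
    #include <string>
    #include <vector>

    std::map<std::string, std::size_t>
    build_offsets(const std::vector<std::string>& inputs,
                  const std::vector<std::string>& scalars,
                  std::size_t bytes_per_value) {
      std::map<std::string, std::size_t> offset;
      std::size_t pos = 0;
      for (const auto& k : inputs)  { offset[k] = pos; pos += bytes_per_value; }
      for (const auto& k : scalars) { offset[k] = pos; pos += bytes_per_value; }
      offset["TOTAL"] = pos;  // mirrors the "TOTAL <offset>" line in the key map
      return offset;
    }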
conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, key, children); - for (auto t : children) { - if (key1 == "inputs") { - m_inputs_to_load.push_back(t); - } else { - m_scalars_to_load.push_back(t); - } + out << "VIEWS " << m_image_views_to_use.size() << "\n"; + for (auto t : m_image_views_to_use) { + out << t << " " << offset << "\n"; + offset += sizeof(data_reader_jag_conduit_hdf5::ch_t)*128*128; //magic number! } - conduit::relay::io::hdf5_close_file(hdf5_file_hnd); + out << "TOTAL " << offset << "\n"; + out.close(); } -size_t jag_store::get_linearized_data_size() const { - return m_inputs_to_load.size() - + m_scalars_to_load.size() - + m_images_to_load.size() * get_linearized_image_size(); +void jag_store::convert_conduit_to_binary(const std::vector &conduit_filenames) { + m_num_converted_samples = 0; + + if (m_comm->get_procs_in_world() != 1) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: you must run convert_conduit with a single processor"); + } + + options *opts = options::get(); + std::string output_dir = opts->get_string("convert_conduit"); + if (m_master) { + char b[128]; + sprintf(b, "mkdir --mode=770 -p %s", output_dir.c_str()); + system(b); + write_binary_metadata(output_dir); + } + write_binary(conduit_filenames, output_dir); } -void jag_store::build_data_sizes() { - for (auto t : m_inputs_to_load) { - m_data_sizes.push_back(get_linearized_input_size()); +void jag_store::load_variable_names() { + load_inputs_to_use(m_reader->m_input_keys); + load_scalars_to_use(m_reader->m_scalar_keys); + load_image_views_to_use(m_reader->m_image_views); + load_image_channels_to_use(m_reader->m_image_channels); + + if (m_master) { + std::cerr << "using these inputs:\n"; + for (auto t : m_inputs_to_use) { + std::cerr << " " << t << "\n"; + } + std::cerr << "\nusing these scalars:\n"; + for (auto t : m_scalars_to_use) { + std::cerr << " " << t << "\n"; + } + std::cerr << "\nusing these views:\n"; + for (auto t : m_image_views_to_use) { + std::cerr << " " << t << "\n"; + } + std::cerr << "\nusing these image channels: "; + for (auto t : m_image_channels_to_use) { + std::cerr << t << " "; + } + std::cerr << "\n"; } - for (auto t : m_scalars_to_load) { - m_data_sizes.push_back(get_linearized_scalar_size()); +} + +void jag_store::allocate_memory() { + size_t nthreads = omp_get_max_threads(); + if (m_master) std::cerr << "starting jag_store::allocate_memory; nthreads: " << nthreads << "\n"; + m_data_inputs.resize(nthreads); + m_data_scalars.resize(nthreads); + for (size_t j=0; j &conduit_filenames) { - conduit::Node node; - bool master = m_comm->am_world_master(); - if (master) { - std::cout << "\n=======================================================================\n" - << "starting jag_store::run_tests\n"; +void jag_store::test_converted_files() { + int np = m_comm->get_procs_in_world(); + if (np != 1) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: mode 3 (test converted binary files) must be run with a single process"); } - for (auto t : conduit_filenames) { - if (master) { - std::cout << " loading conduit::Node from file: " << t << "\n"; - } - conduit::relay::io::load_merged(t, "hdf5", node); + std::cerr << "\nstarting jag_store::test_converted_files()\n"; + + std::vector> inputs; + std::vector> scalars; + std::vector>> images; + + int tid = 0; + options *opts = options::get(); + if (!opts->has_int("num_to_test")) { + throw lbann_exception(std::string{} + __FILE__ + " " + 
std::to_string(__LINE__) + " :: when running in test mode you must pass --num_to_test= on the cmd line"); } + size_t num_to_test = opts->get_int("num_to_test"); + std::cerr << "\nnum to test: " << num_to_test << "\n"; + for (size_t data_id=0; data_id= m_data_id_to_sample_id.size()) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: data_id: " + std::to_string(data_id) + " >= m_data_id_to_sample_id.size(): " + std::to_string(m_data_id_to_sample_id.size())); + } + + const std::string sample_id = m_data_id_to_sample_id[data_id]; - std::cout << "jag_store::run_tests; testing inputs\n"; - for (size_t j=0; j & inputs = fetch_inputs(j); - size_t i = 0; - for (auto input_name : m_inputs_to_load) { - const std::string key = "/" + sample_id + "/inputs/" + input_name; - if (master) { - //std::cout << " next key: " << key << "\n"; + std::cerr << "testing sample: " << sample_id << " data_id: " << data_id << " global_id: " << global_id << "\n"; + + load_data_conduit(data_id, tid); + inputs = m_data_inputs; + scalars = m_data_scalars; + images = m_data_images; + + load_data_binary(global_id, tid); + + if (inputs != m_data_inputs) { + std::cerr << "inputs for data_id " << data_id << " failed.\n" + << "values from conduit: "; + for (auto t : inputs[tid]) std::cerr << t << " "; + std::cerr << "\nvalues from binary: "; + for (auto t : m_data_inputs[tid]) std::cerr << t << " "; + std::cerr << "\n"; + exit(9); + } + if (scalars != m_data_scalars) { + std::cerr << "scalars != m_data_scalars\n"; + exit(9); + } + + std::cerr << "1. num channels: " << images[0].size() << "\n"; + std::cerr << "2. num channels: " << m_data_images[0].size() << "\n"; + for (size_t j=0; j::epsilon(); + std::cerr << x << " " << images[0][j][x] << " " << m_data_images[0][j][x] << " epsilon? 
" << testme << "\n"; + } + } + //exit(9); + } else { + std::cerr << "PASSED: images[0][" << j << "] == m_data_images[0][" << j << "]\n"; } - const conduit::Node& nd = node[key]; - if (inputs[i] != nd.to_float64()) { - std::cout << "FAILED; id: " << j << " sample name: " << sample_id << " key: " << key << " value from jag_store: " << inputs[j] << " value from conduit: " << nd.to_float64() << "\n"; + } + } + } + std::cerr << "\ntested " << m_max_samples << "; all passed\n"; +} + +void jag_store::setup_conduit() { + if (m_master) std::cerr << "starting jag_store::setup_conduit\n"; + + std::string filename; + std::string sample_id; + int j = -1; + std::vector tmp; + for (auto t : m_conduit_filenames) { + if (m_data_id_to_sample_id.size() == m_max_samples) { + break; + } + ++j; + std::stringstream s(t); + s >> filename; + tmp.push_back(filename); + while (s >> sample_id) { + m_data_id_to_conduit_filename_idx.push_back(j); + m_data_id_to_sample_id.push_back(sample_id); + if (m_data_id_to_sample_id.size() == m_max_samples) { + break; } - ++i; - } - } - if (master) { - std::cout << "all inputs match!\n"; - } - - std::cout << "jag_store::run_tests; testing images\n"; - for (size_t j=0; j> & images = fetch_images(j); - std::string sample_id = m_id_to_name[j]; - //if (master) std::cout << " next sample: " << sample_id << "\n"; - for (size_t k=0; khas_string("binary_filelist")) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: you must pass --binary_filelist= on the cmd line"); + } + + const std::string fn = opts->get_string("binary_filelist"); + std::ifstream in(fn.c_str()); + if (!in.good()) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: failed to open " + fn + " for reading"); + } + if (m_master) std::cerr << "opened " << fn << " for reading\n"; + + std::string filename; + size_t num_files = 0; + while (in >> filename) { + ++num_files; + } + in.close(); + + in.open(fn.c_str()); + size_t nthreads = omp_get_max_threads(); + m_streams.resize(nthreads); + for (size_t j=0; j> filename) { + if (m_master) std::cerr << "next binary filename: " << filename << "\n"; + ++file_idx; + + for (size_t tid=0; tid> sample_id) { + //maps global index (shuffled index subscript) to get_procs_in_world(); + + #pragma omp parallel + { + const auto threadId = omp_get_thread_num(); + + #pragma omp parallel for + for (size_t j = me; jglobal_barrier(); + if (m_master) std::cerr << "time to load all data: " << get_time() - tm1 << "\n"; +} + +void jag_store::compute_bandwidth() { + if (m_master) std::cerr << "starting bandwidth test\n"; + double tm1 = get_time(); + int me = get_rank_in_world(); + int np = m_comm->get_procs_in_world(); + size_t n = 0; + for (size_t j = me; jglobal_barrier(); + if (m_master) std::cerr << "time to load all data: " << get_time() - tm1 << "\n"; +} + +void jag_store::compute_min_max() { + std::vector inputs_max(m_inputs_to_use.size(), DBL_MIN); + std::vector inputs_min(m_inputs_to_use.size(), DBL_MAX); + std::vector inputs_avg(m_inputs_to_use.size(), 0.); + std::vector scalars_max(m_scalars_to_use.size(), DBL_MIN);; + std::vector scalars_min(m_scalars_to_use.size(), DBL_MAX);; + std::vector scalars_avg(m_scalars_to_use.size(), 0.);; + + for (size_t j = 0; j &t1 = fetch_inputs(j, 0); + for (size_t h=0; h inputs_max[h]) inputs_max[h] = t1[h]; + if (t1[h] < inputs_min[h]) inputs_min[h] = t1[h]; } } + + const std::vector &t2 = fetch_scalars(j, 0); + for (size_t h=0; h scalars_max[h]) scalars_max[h] = t2[h]; + if (t2[h] < 
scalars_min[h]) scalars_min[h] = t2[h]; + } } - if (master) { - std::cout << "all images match!\n"; - std::cout << "=======================================================================\n"; + std::cerr << "\n\ninputs min: "; + for (auto t : inputs_min) std::cerr << t << " "; + std::cerr << "\ninputs max: "; + for (auto t : inputs_max) std::cerr << t << " "; + std::cerr << "\ninputs avg: "; + for (auto t : inputs_avg) std::cerr << t/m_data_id_to_conduit_filename_idx.size() << " "; + std::cerr << "\n\n"; + std::cerr << "\n\nscalars min: "; + for (auto t : scalars_min) std::cerr << t << " "; + std::cerr << "\nscalars max: "; + for (auto t : scalars_max) std::cerr << t << " "; + std::cerr << "\nscalars avg: "; + for (auto t : scalars_avg) std::cerr << t/m_data_id_to_conduit_filename_idx.size() << " "; + std::cerr << "\n\n"; +} + +void jag_store::load_normalization_values_impl( + std::vector> &values, + const std::vector &variables) { + values.resize(variables.size()); + for (size_t j=0; jhas_string("normalization_fn")) { + if (m_master) { + std::cerr << "\nWARNING! missing --normalization_fn option on command line; inputs, scalars, and possibly images will not be normalized. This is probably a bad thing.\n"; + } + } else { + const std::string fn = opts->get_string("normalization_fn"); + std::unordered_map> m; + std::ifstream in(fn.c_str()); + if (!in.good()) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: failed to open " + fn + " for reading"); + } + std::string variable; + double scale; + double bias; + while (in >> variable >> scale >> bias) { + m[variable] = std::make_pair(scale, bias); + } + in.close(); + for (size_t j=0; j channels_to_use; + for (int j=0; j<4; j++) { + std::string s = "C" + std::to_string(j); + channels_to_use.push_back(s); + } + load_normalization_values_impl(m_normalize_views, channels_to_use); } diff --git a/src/io/data_buffers/distributed_io_buffer.cpp b/src/io/data_buffers/distributed_io_buffer.cpp index c9b3ddeb31c..5852e593556 100644 --- a/src/io/data_buffers/distributed_io_buffer.cpp +++ b/src/io/data_buffers/distributed_io_buffer.cpp @@ -172,26 +172,30 @@ bool lbann::distributed_io_buffer::is_data_set_processed(generic_data_reader *da * parallel readers requested. */ int lbann::distributed_io_buffer::compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers) const { + return distributed_io_buffer::compute_max_num_parallel_readers(data_set_size, mini_batch_size, requested_num_parallel_readers, m_comm); +} + +int lbann::distributed_io_buffer::compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers, const lbann_comm* comm) { int num_parallel_readers = requested_num_parallel_readers; /// Are there enough ranks in the model to support the requested /// number of parallel readers - if(m_comm->get_model_grid().Size() < num_parallel_readers) { - if(m_comm->am_model_master()) { - std::cout << "Warning the grid size " << m_comm->get_model_grid().Size() + if(comm->get_model_grid().Size() < num_parallel_readers) { + if(comm->am_model_master()) { + std::cout << "Warning the grid size " << comm->get_model_grid().Size() << "is smaller than the number of requested parallel readers " << num_parallel_readers << "." 
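load_normalization_values above reads whitespace-separated "variable scale bias" triples into a map keyed by variable name, so fetched values can later be normalized as scale * x + bias. A standalone sketch of the parse; "normalize.txt" is a hypothetical filename:

    #include <fstream>
    #include <string>
    #include <unordered_map>
    #include <utility>

    std::unordered_map<std::string, std::pair<double, double>>
    read_normalization(const std::string& filename) {
      std::unordered_map<std::string, std::pair<double, double>> m;
      std::ifstream in(filename);
      std::string variable;
      double scale, bias;
      while (in >> variable >> scale >> bias) {
        m[variable] = {scale, bias};  // later applied as scale * x + bias
      }
      return m;
    }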
<< std::endl; } - num_parallel_readers = m_comm->get_model_grid().Size(); + num_parallel_readers = comm->get_model_grid().Size(); } /// Check to make sure that there is enough data for all of the parallel readers if(data_set_size != 0) { int max_num_parallel_readers = num_parallel_readers; - while(ceil((float)data_set_size / (float)(mini_batch_size * m_comm->get_num_models())) < max_num_parallel_readers) { + while (!check_num_parallel_readers(data_set_size, mini_batch_size, max_num_parallel_readers, comm)) { max_num_parallel_readers--; } - if(m_comm->am_world_master() && max_num_parallel_readers != num_parallel_readers) { + if(comm->am_world_master() && max_num_parallel_readers != num_parallel_readers) { std::cout << "Warning the training data set size " << data_set_size << " is too small for the number of requested parallel readers " << num_parallel_readers << ", using " << max_num_parallel_readers << "." @@ -203,6 +207,10 @@ int lbann::distributed_io_buffer::compute_max_num_parallel_readers(long data_set } } +bool lbann::distributed_io_buffer::check_num_parallel_readers(long data_set_size, int mini_batch_size, int num_parallel_readers, const lbann_comm* comm) { + return !(ceil((float)data_set_size / (float)(mini_batch_size * comm->get_num_models())) < num_parallel_readers); +} + void lbann::distributed_io_buffer::calculate_num_iterations_per_epoch(int num_models, int model_rank, int max_mini_batch_size, generic_data_reader *data_reader) { if(data_reader == nullptr) { return; } // If the data reader does not have any data bail out (e.g. unused validation reader) diff --git a/src/io/data_buffers/partitioned_io_buffer.cpp b/src/io/data_buffers/partitioned_io_buffer.cpp index 1ef1839c84c..fcf9e0e39d7 100644 --- a/src/io/data_buffers/partitioned_io_buffer.cpp +++ b/src/io/data_buffers/partitioned_io_buffer.cpp @@ -84,22 +84,26 @@ bool lbann::partitioned_io_buffer::is_data_set_processed(generic_data_reader *da } int lbann::partitioned_io_buffer::compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers) const { + return partitioned_io_buffer::compute_max_num_parallel_readers(data_set_size, mini_batch_size, requested_num_parallel_readers, m_comm); +} + +int lbann::partitioned_io_buffer::compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers, const lbann_comm* comm) { int num_parallel_readers = requested_num_parallel_readers; - if(m_comm->get_procs_per_model() != num_parallel_readers) { - if (m_comm->am_model_master()) { + if(comm->get_procs_per_model() != num_parallel_readers) { + if (comm->am_model_master()) { std::cout << "Warning the requested number of parallel readers " << num_parallel_readers << " does not match the grid size " - << m_comm->get_procs_per_model() + << comm->get_procs_per_model() << " OVERRIDING requested number of parallel readers." 
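check_num_parallel_readers above encodes the invariant behind the decrement loop: a reader count is feasible while ceil(data_set_size / (mini_batch_size * num_models)) >= num_parallel_readers, i.e. while every reader has at least one mini-batch to fetch. A small worked example:

    #include <cmath>
    #include <iostream>

    int clamp_readers(long data_set_size, int mini_batch_size,
                      int num_models, int requested) {
      int readers = requested;
      while (readers > 1 &&
             std::ceil((double)data_set_size /
                       (mini_batch_size * num_models)) < readers) {
        --readers;  // same decrement the io_buffer performs
      }
      return readers;
    }

    int main() {
      // 1000 samples, batch 128, 2 models -> ceil(1000/256) = 4 readers max.
      std::cout << clamp_readers(1000, 128, 2, 8) << "\n";  // prints 4
    }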
<< std::endl; } - num_parallel_readers = m_comm->get_procs_per_model(); + num_parallel_readers = comm->get_procs_per_model(); } if(mini_batch_size < num_parallel_readers) { - if (m_comm->am_model_master()) { + if (comm->am_model_master()) { std::cout << "Warning the requested number of parallel readers " << num_parallel_readers << " is larger than the requested mini-batch size " diff --git a/src/layers/CMakeLists.txt b/src/layers/CMakeLists.txt index 95612fbdc50..2bd2ea2db36 100644 --- a/src/layers/CMakeLists.txt +++ b/src/layers/CMakeLists.txt @@ -5,8 +5,11 @@ set_full_path(THIS_DIR_SOURCES # Add the subdirectories add_subdirectory(activations) +add_subdirectory(image) add_subdirectory(learning) add_subdirectory(loss) +add_subdirectory(math) +add_subdirectory(misc) add_subdirectory(regularizers) add_subdirectory(transform) diff --git a/src/layers/activations/CMakeLists.txt b/src/layers/activations/CMakeLists.txt index 77de09365a5..be0922ebb97 100644 --- a/src/layers/activations/CMakeLists.txt +++ b/src/layers/activations/CMakeLists.txt @@ -1,18 +1,18 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES relu.cpp - tanh.cpp sigmoid.cpp softmax.cpp + log_softmax.cpp ) if (LBANN_HAS_CUDA) # Add the CUDA source files for this directory set_full_path(THIS_DIR_CU_SOURCES - abs.cu relu.cu sigmoid.cu softmax.cu + log_softmax.cu ) endif () diff --git a/src/layers/activations/abs.cu b/src/layers/activations/abs.cu deleted file mode 100644 index b8cc462dccf..00000000000 --- a/src/layers/activations/abs.cu +++ /dev/null @@ -1,122 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-//////////////////////////////////////////////////////////////////////////////// - -#include "math.h" -#include "lbann/layers/activations/abs.hpp" - -namespace lbann { -namespace { - -__global__ void fp_kernel(int height, int width, - const DataType* __restrict__ input, - int input_leading_dim, - DataType* __restrict__ output, - int output_leading_dim) { - const auto& gid = threadIdx.x + blockIdx.x * blockDim.x; - const auto& size = height * width; - const auto& num_threads = blockDim.x * gridDim.x; - for (int pos = gid; pos < size; pos += num_threads) { - const auto& row = pos % height; - const auto& col = pos / height; - const auto& x = input[row + col * input_leading_dim]; - auto& y = output[row + col * output_leading_dim]; - if (x >= DataType(0)) { - y = x; - } else { - y = -x; - } - } -} - -__global__ void bp_kernel(int height, int width, - const DataType* __restrict__ input, - int input_leading_dim, - const DataType* __restrict__ gradient_wrt_output, - int gradient_wrt_output_leading_dim, - DataType* __restrict__ gradient_wrt_input, - int gradient_wrt_input_leading_dim) { - const auto& gid = threadIdx.x + blockIdx.x * blockDim.x; - const auto& size = height * width; - const auto& num_threads = blockDim.x * gridDim.x; - for (int pos = gid; pos < size; pos += num_threads) { - const auto& row = pos % height; - const auto& col = pos / height; - const auto& x = input[row + col * input_leading_dim]; - const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_leading_dim]; - auto& dx = gradient_wrt_input[row + col * gradient_wrt_input_leading_dim]; - if (x > DataType(0)) { - dx = dy; - } else if (x < DataType(0)) { - dx = -dy; - } else { - dx = DataType(0); - } - } -} - -} // namespace - -namespace abs_cuda { - -void fp(int height, - int width, - const DataType* input, - int input_leading_dim, - DataType* output, - int output_leading_dim) { - const int block_dim = 256; - const int grid_dim = (height * width + block_dim - 1) / block_dim; - if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); - fp_kernel<<>>( - height, width, - input, input_leading_dim, - output, output_leading_dim); - } -} - -void bp(int height, - int width, - const DataType* input, - int input_leading_dim, - const DataType* gradient_wrt_output, - int gradient_wrt_output_leading_dim, - DataType* gradient_wrt_input, - int gradient_wrt_input_leading_dim) { - const int block_dim = 256; - const int grid_dim = (height * width + block_dim - 1) / block_dim; - if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); - bp_kernel<<>>( - height, width, - input, input_leading_dim, - gradient_wrt_output, gradient_wrt_output_leading_dim, - gradient_wrt_input, gradient_wrt_input_leading_dim); - } -} - -} // namespace abs_cuda -} // namespace lbann diff --git a/src/layers/activations/log_softmax.cpp b/src/layers/activations/log_softmax.cpp new file mode 100644 index 00000000000..94ebbb38096 --- /dev/null +++ b/src/layers/activations/log_softmax.cpp @@ -0,0 +1,157 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. 
For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/activations/log_softmax.hpp" + +namespace lbann { + +namespace { + +void fp(lbann_comm& comm, + const AbsDistMat& input, + AbsDistMat& output, + AbsDistMat& workspace) { + + // Local matrices + const auto& local_input = input.LockedMatrix(); + auto& local_output = output.Matrix(); + auto& local_workspace = workspace.Matrix(); + const auto& local_height = local_input.Height(); + const auto& local_width = local_input.Width(); + + // Find column-wise maximum entries + El::Fill(workspace, std::numeric_limits::lowest()); +#pragma omp parallel for + for (El::Int col = 0; col < local_width; ++col) { + auto& max_entry = local_workspace(0, col); + for (El::Int row = 0; row < local_height; ++row) { + max_entry = std::max(max_entry, local_input(row, col)); + } + } + comm.allreduce(workspace, workspace.RedundantComm(), El::mpi::MAX); + + // Shift inputs and compute sum(exp(x)) for each column + // Note: Shifting by the max prevents LogSumExp from blowing up. +#pragma omp parallel for + for (El::Int col = 0; col < local_width; ++col) { + const auto shift = local_workspace(0, col); + DataType sum = 0; + for (El::Int row = 0; row < local_height; ++row) { + const auto& x = local_input(row, col); + auto& y = local_output(row, col); + y = x - shift; + sum += std::exp(y); + } + local_workspace(0, col) = sum; + } + comm.allreduce(workspace, workspace.RedundantComm()); + + // Compute output by subtracting LogSumExp +#pragma omp parallel for + for (El::Int col = 0; col < local_width; ++col) { + const DataType log_sum_exp = std::log(local_workspace(0, col)); + for (El::Int row = 0; row < local_height; ++row) { + auto& y = local_output(row, col); + y -= log_sum_exp; + } + } + +} + +void bp(lbann_comm& comm, + const AbsDistMat& output, + const AbsDistMat& gradient_wrt_output, + AbsDistMat& gradient_wrt_input, + AbsDistMat& workspace) { + + // Local matrices + const auto& local_output = output.LockedMatrix(); + const auto& local_gradient_wrt_output = gradient_wrt_output.LockedMatrix(); + auto& local_gradient_wrt_input = gradient_wrt_input.Matrix(); + auto& local_workspace = workspace.Matrix(); + const auto& local_height = local_output.Height(); + const auto& local_width = local_output.Width(); + + // Compute sum of entries in gradient w.r.t. output + El::Zero(workspace); +#pragma omp parallel for + for (El::Int col = 0; col < local_width; ++col) { + auto& sum = local_workspace(0, col); + for (El::Int row = 0; row < local_height; ++row) { + const auto& dy = local_gradient_wrt_output(row, col); + sum += dy; + } + } + comm.allreduce(workspace, workspace.RedundantComm()); + + // Compute gradient w.r.t. 
input +#pragma omp parallel for + for (El::Int col = 0; col < local_width; ++col) { + const auto& sum = local_workspace(0, col); + for (El::Int row = 0; row < local_height; ++row) { + const auto& y = local_output(row, col); + const auto& dy = local_gradient_wrt_output(row, col); + auto& dx = local_gradient_wrt_input(row, col); + dx = dy - std::exp(y) * sum; + } + } + +} + +} // namespace + +template <> +void log_softmax_layer::fp_compute() { + fp(*get_comm(), + get_prev_activations(), + get_activations(), + *m_workspace); +} +template <> +void log_softmax_layer::bp_compute() { + bp(*get_comm(), + get_activations(), + get_prev_error_signals(), + get_error_signals(), + *m_workspace); +} +template <> +void log_softmax_layer::fp_compute() { + fp(*get_comm(), + get_prev_activations(), + get_activations(), + *m_workspace); +} +template <> +void log_softmax_layer::bp_compute() { + bp(*get_comm(), + get_activations(), + get_prev_error_signals(), + get_error_signals(), + *m_workspace); +} + +} // namespace lbann diff --git a/src/layers/activations/log_softmax.cu b/src/layers/activations/log_softmax.cu new file mode 100644 index 00000000000..83f08c99c88 --- /dev/null +++ b/src/layers/activations/log_softmax.cu @@ -0,0 +1,391 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/activations/log_softmax.hpp" + +namespace lbann { + +namespace { + +/** Find largest entry within each CUDA block. + * Each block is assigned several entries from the same mini-batch + * sample and it finds the largest entry. Results are output to an + * nblocksx x width matrix. 
+ */ +template +__global__ void reduce_max_kernel(El::Int height, El::Int width, + const DataType* __restrict__ values, + El::Int values_ldim, + DataType* __restrict__ max_values) { + + // Indices + const El::Int tid = threadIdx.x; + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidx = blockIdx.x; + const El::Int bidy = blockIdx.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + const El::Int nblocksx = gridDim.x; + const El::Int nblocksy = gridDim.y; + + // Reduce each matrix column independently + for (El::Int col = bidy; col < width; col += nblocksy) { + + // Find largest value for each thread + DataType private_max_val = -cuda::infinity(); + for (El::Int row = gidx; row < height; row += nthreadsx) { + private_max_val = cuda::max(private_max_val, + values[row + col * values_ldim]); + } + + // Shared memory reduction to get largest value for each block + __shared__ DataType shared_max_vals[block_size]; + shared_max_vals[tid] = private_max_val; + for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { + __syncthreads(); + if (tid < stride) { + shared_max_vals[tid] = cuda::max(shared_max_vals[tid], + shared_max_vals[tid + stride]); + } + } + if (tid == 0) { + max_values[bidx + col*nblocksx] = shared_max_vals[0]; + } + + } + +} + +/** Exponentiate inputs and compute sum(exp(x)). + * Inputs are shifted by the column max to prevent LogSumExp from + * blowing up. + */ +template +__global__ void fp_exp_kernel(El::Int height, El::Int width, + const DataType* __restrict__ input, + El::Int input_ldim, + DataType* __restrict__ output, + El::Int output_ldim, + const DataType* __restrict__ shifts, + El::Int shifts_stride, + DataType* __restrict__ sums, + El::Int sums_stride) { + + // Indices + const El::Int tid = threadIdx.x; + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + const El::Int nblocksy = gridDim.y; + + // Reduce each matrix column independently + for (El::Int col = bidy; col < width; col += nblocksy) { + const auto& shift = shifts[col * shifts_stride]; + + // Exponentiate inputs and compute sum for each thread + DataType private_sum = 0; + for (El::Int row = gidx; row < height; row += nthreadsx) { + const auto& x = input[row + col * input_ldim]; + auto& y = output[row + col * output_ldim]; + y = x - shift; + private_sum += cuda::exp(y); + } + + // Shared memory reduction to get sum for each block + __shared__ DataType shared_sums[block_size]; + shared_sums[tid] = private_sum; + for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { + __syncthreads(); + if (tid < stride) { + shared_sums[tid] += shared_sums[tid + stride]; + } + } + + // Atomic add to global sum + if (tid == 0) { + cuda::atomic_add(&sums[col * sums_stride], shared_sums[0]); + } + + } + +} + +/** Subtract LogSumExp from outputs. + * sums should contain sum(exp(x)) for each column. 
+ */ +__global__ void fp_lse_kernel(El::Int height, El::Int width, + DataType* __restrict__ output, + El::Int output_ldim, + const DataType* __restrict__ sums, + El::Int sums_stride) { + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + const El::Int nblocksy = gridDim.y; + for (El::Int col = bidy; col < width; col += nblocksy) { + const auto& log_sum_exp = cuda::log(sums[col * sums_stride]); + for (El::Int row = gidx; row < height; row += nthreadsx) { + auto& y = output[row + col * output_ldim]; + y -= log_sum_exp; + } + } +} + +/** Compute sum of entries in gradient w.r.t. output. */ +template +__global__ void bp_sum_kernel(El::Int height, El::Int width, + const DataType* __restrict__ gradient_wrt_output, + El::Int gradient_wrt_output_ldim, + DataType* __restrict__ sums, + El::Int sums_stride) { + + // Indices + const El::Int tid = threadIdx.x; + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + const El::Int nblocksy = gridDim.y; + + // Compute sum for each matrix column independently + for (El::Int col = bidy; col < width; col += nblocksy) { + + // Compute sum for each thread + DataType private_sum = 0; + for (El::Int row = gidx; row < height; row += nthreadsx) { + const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_ldim]; + private_sum += dy; + } + + // Shared memory reduction to get sum for each block + __shared__ DataType shared_sums[block_size]; + shared_sums[tid] = private_sum; + for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { + __syncthreads(); + if (tid < stride) { + shared_sums[tid] += shared_sums[tid + stride]; + } + } + + // Atomic add to global sum + if (tid == 0) { + cuda::atomic_add(&sums[col * sums_stride], shared_sums[0]); + } + + } + +} + +/** Compute gradient w.r.t. input. 
*/ +template +__global__ void bp_kernel(El::Int height, El::Int width, + const DataType* __restrict__ output, + El::Int output_ldim, + const DataType* __restrict__ gradient_wrt_output, + El::Int gradient_wrt_output_ldim, + const DataType* __restrict__ sums, + El::Int sums_stride, + DataType* __restrict__ gradient_wrt_input, + El::Int gradient_wrt_input_ldim) { + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + const El::Int nblocksy = gridDim.y; + for (El::Int col = bidy; col < width; col += nblocksy) { + const auto& sum = sums[col * sums_stride]; + for (El::Int row = gidx; row < height; row += nthreadsx) { + const auto& y = output[row + col * output_ldim]; + const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_ldim]; + auto& dx = gradient_wrt_input[row + col * gradient_wrt_input_ldim]; + dx = dy - cuda::exp(y) * sum; + } + } +} + +} // namespace + +template <> +void log_softmax_layer::fp_compute() { + constexpr DataType zero = 0; + constexpr DataType one = 1; + const auto& local_input = get_local_prev_activations(); + auto& local_output = get_local_activations(); + if (!local_input.IsEmpty()) { + CHECK_CUDNN(cudnnSoftmaxForward(cudnn::get_handle(), + CUDNN_SOFTMAX_LOG, + CUDNN_SOFTMAX_MODE_INSTANCE, + &one, + m_tensors_cudnn_desc.get_prev_activations(), + local_input.LockedBuffer(), + &zero, + m_tensors_cudnn_desc.get_activations(), + local_output.Buffer())); + } +} + +template <> +void log_softmax_layer::bp_compute() { + constexpr DataType zero = 0; + constexpr DataType one = 1; + const auto& local_output = get_local_activations(); + const auto& local_gradient_wrt_output = get_local_prev_error_signals(); + auto& local_gradient_wrt_input = get_local_error_signals(); + if (!local_output.IsEmpty()) { + CHECK_CUDNN(cudnnSoftmaxBackward(cudnn::get_handle(), + CUDNN_SOFTMAX_LOG, + CUDNN_SOFTMAX_MODE_INSTANCE, + &one, + m_tensors_cudnn_desc.get_activations(), + local_output.LockedBuffer(), + m_tensors_cudnn_desc.get_prev_error_signals(), + local_gradient_wrt_output.LockedBuffer(), + &zero, + m_tensors_cudnn_desc.get_error_signals(), + local_gradient_wrt_input.Buffer())); + } +} + +template <> +void log_softmax_layer::fp_compute() { + + // Local matrices + const auto& local_input = get_local_prev_activations(); + auto& local_output = get_local_activations(); + auto& local_workspace = m_workspace->Matrix(); + const auto& local_height = local_input.Height(); + const auto& local_width = local_input.Width(); + + // GPU objects + auto&& stream = El::GPUManager::Stream(); + auto&& event = El::GPUManager::Event(); + El::SyncInfo sync_info{stream, event}; + + // Initialize CUDA threads/blocks + // Note: kernels use a 2D thread distribution with a 256 x 1 block + // and nblocksx x local_width grid. 
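The CPU implementation earlier and the kernels above compute the same two-pass stable form: shift each column by its maximum, then subtract log(sum(exp(...))); the backward pass uses dx_i = dy_i - exp(y_i) * sum_j dy_j. A plain-C++ reference sketch for one column (assumes a non-empty input):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Forward: y_i = (x_i - max_j x_j) - log(sum_j exp(x_j - max_j x_j)).
    // Shifting by the max keeps every exp() argument <= 0, so nothing
    // overflows and LogSumExp stays finite.
    std::vector<double> log_softmax(const std::vector<double>& x) {
      const double shift = *std::max_element(x.begin(), x.end());
      std::vector<double> y(x.size());
      double sum = 0;
      for (std::size_t i = 0; i < x.size(); ++i) {
        y[i] = x[i] - shift;
        sum += std::exp(y[i]);
      }
      const double log_sum_exp = std::log(sum);
      for (auto& yi : y) { yi -= log_sum_exp; }
      return y;
    }

    // Backward: dx_i = dy_i - exp(y_i) * sum_j dy_j, matching bp() above.
    std::vector<double> log_softmax_grad(const std::vector<double>& y,
                                         const std::vector<double>& dy) {
      double sum = 0;
      for (double d : dy) { sum += d; }
      std::vector<double> dx(y.size());
      for (std::size_t i = 0; i < y.size(); ++i) {
        dx[i] = dy[i] - std::exp(y[i]) * sum;
      }
      return dx;
    }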
+  constexpr El::Int block_size = 256;
+  dim3 block_dims, grid_dims;
+  block_dims.x = block_size;
+  grid_dims.y = local_width;
+
+  // Find column-wise maximum entries
+  grid_dims.x = (local_height + block_size - 1) / block_size;
+  if (grid_dims.x < 1) { grid_dims.x = 1; }
+  cuda::thrust::vector<DataType> max_vals(grid_dims.x * local_width);
+  reduce_max_kernel<block_size><<<grid_dims, block_dims, 0, stream>>>(
+    local_height, local_width,
+    local_input.LockedBuffer(), local_input.LDim(),
+    max_vals.data().get());
+  while (grid_dims.x > 1) {
+    const El::Int prev_height = grid_dims.x;
+    grid_dims.x = (prev_height + block_size - 1) / block_size;
+    cuda::thrust::vector<DataType> prev_vals(std::move(max_vals));
+    max_vals.resize(grid_dims.x * local_width);
+    reduce_max_kernel<block_size><<<grid_dims, block_dims, 0, stream>>>(
+      prev_height, local_width,
+      prev_vals.data().get(), prev_height,
+      max_vals.data().get());
+  }
+  El::mpi::AllReduce(max_vals.data().get(), max_vals.size(),
+                     El::mpi::MAX, m_workspace->RedundantComm(),
+                     sync_info);
+
+  // Shift inputs and compute sum(exp(x)) for each column
+  El::Zero(*m_workspace);
+  if (!local_output.IsEmpty()) {
+    grid_dims.x = (local_height + block_size - 1) / block_size;
+    fp_exp_kernel<block_size><<<grid_dims, block_dims, 0, stream>>>(
+      local_height, local_width,
+      local_input.LockedBuffer(), local_input.LDim(),
+      local_output.Buffer(), local_output.LDim(),
+      max_vals.data().get(), 1,
+      local_workspace.Buffer(), 1);
+  }
+  El::AllReduce(*m_workspace, m_workspace->RedundantComm());
+
+  // Compute output by subtracting LogSumExp
+  if (!local_output.IsEmpty()) {
+    grid_dims.x = (local_height + block_size - 1) / block_size;
+    fp_lse_kernel<<<grid_dims, block_dims, 0, stream>>>(
+      local_height, local_width,
+      local_output.Buffer(), local_output.LDim(),
+      local_workspace.LockedBuffer(), 1);
+  }
+
+}
+
+template <>
+void log_softmax_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>::bp_compute() {
+
+  // Local matrices
+  const auto& local_output = get_local_activations();
+  const auto& local_gradient_wrt_output = get_local_prev_error_signals();
+  auto& local_gradient_wrt_input = get_local_error_signals();
+  auto& local_workspace = m_workspace->Matrix();
+  const auto& local_height = local_output.Height();
+  const auto& local_width = local_output.Width();
+
+  // GPU objects
+  auto&& stream = El::GPUManager::Stream();
+  auto&& event = El::GPUManager::Event();
+  El::SyncInfo<El::Device::GPU> sync_info{stream, event};
+
+  // Initialize CUDA threads/blocks
+  // Note: kernels use a 2D thread distribution with a 256 x 1 block
+  // and nblocksx x local_width grid.
+  constexpr El::Int block_size = 256;
+  dim3 block_dims, grid_dims;
+  block_dims.x = block_size;
+  grid_dims.y = local_width;
+
+  // Compute sum of entries in gradient w.r.t. output
+  El::Zero(local_workspace);
+  if (!local_output.IsEmpty()) {
+    grid_dims.x = (local_height + block_size - 1) / block_size;
+    bp_sum_kernel<block_size>
+      <<<grid_dims, block_dims, 0, stream>>>(
+        local_height, local_width,
+        local_gradient_wrt_output.LockedBuffer(),
+        local_gradient_wrt_output.LDim(),
+        local_workspace.Buffer(), 1);
+  }
+  El::AllReduce(*m_workspace, m_workspace->RedundantComm());
+
+  // Compute gradient w.r.t.
input + if (!local_output.IsEmpty()) { + grid_dims.x = (local_height + block_size - 1) / block_size; + bp_kernel<<>>( + local_height, local_width, + local_output.LockedBuffer(), + local_output.LDim(), + local_gradient_wrt_output.LockedBuffer(), + local_gradient_wrt_output.LDim(), + local_workspace.Buffer(), 1, + local_gradient_wrt_input.Buffer(), + local_gradient_wrt_input.LDim()); + } + +} + +} // namespace lbann diff --git a/src/layers/activations/relu.cpp b/src/layers/activations/relu.cpp index 9ffdb86ee3a..6345c7c28d7 100644 --- a/src/layers/activations/relu.cpp +++ b/src/layers/activations/relu.cpp @@ -25,24 +25,54 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/layers/activations/relu.hpp" +#include "lbann/utils/entrywise_operator.hpp" namespace lbann { +namespace { + +/** Entry-wise operator. */ +struct op { + inline DataType operator()(DataType x) const { + return x > DataType(0) ? x : DataType(0); + } +}; + +/** Entry-wise operator for backprop. + * If the forward propagation step computes \f$ y = f(x) \f$, the + * backward propagation step computes + * \f$ \frac{dL}{dx} = \frac{dL}{dy} f'(x) \f$. + */ +struct op_backprop { + inline DataType operator()(DataType x, DataType dy) const { + return x > DataType(0) ? dy : DataType(0); + } +}; + +} // namespace + +// Template instantiation template <> void relu_layer::fp_compute() { - entrywise_activation_layer::fp_compute_cpu(); + apply_entrywise_unary_operator(get_prev_activations(), + get_activations()); } template <> void relu_layer::bp_compute() { - entrywise_activation_layer::bp_compute_cpu(); + apply_entrywise_binary_operator(get_prev_activations(), + get_prev_error_signals(), + get_error_signals()); } template <> void relu_layer::fp_compute() { - entrywise_activation_layer::fp_compute_cpu(); + apply_entrywise_unary_operator(get_prev_activations(), + get_activations()); } template <> void relu_layer::bp_compute() { - entrywise_activation_layer::bp_compute_cpu(); + apply_entrywise_binary_operator(get_prev_activations(), + get_prev_error_signals(), + get_error_signals()); } - + } // namespace lbann diff --git a/src/layers/activations/relu.cu b/src/layers/activations/relu.cu index 59342054aff..72e035fb0d1 100644 --- a/src/layers/activations/relu.cu +++ b/src/layers/activations/relu.cu @@ -28,101 +28,51 @@ #include "lbann/utils/cuda.hpp" namespace lbann { -namespace { - -__global__ void fp_kernel(int height, int width, - const DataType* __restrict__ input, - int input_leading_dim, - DataType* __restrict__ output, - int output_leading_dim) { - const auto& gid = threadIdx.x + blockIdx.x * blockDim.x; - const auto& size = height * width; - const auto& num_threads = blockDim.x * gridDim.x; - for (int pos = gid; pos < size; pos += num_threads) { - const auto& row = pos % height; - const auto& col = pos / height; - const auto& x = input[row + col * input_leading_dim]; - auto& y = output[row + col * output_leading_dim]; - y = cuda::max(x, DataType(0)); - } -} -__global__ void bp_kernel(int height, int width, - const DataType* __restrict__ input, - int input_leading_dim, - const DataType* __restrict__ gradient_wrt_output, - int gradient_wrt_output_leading_dim, - DataType* __restrict__ gradient_wrt_input, - int gradient_wrt_input_leading_dim) { - const auto& gid = threadIdx.x + blockIdx.x * blockDim.x; - const auto& size = height * width; - const auto& num_threads = blockDim.x * gridDim.x; - for (int pos = gid; pos < size; pos += num_threads) { - const auto& row = pos % height; - const auto& 
col = pos / height; - const auto& x = input[row + col * input_leading_dim]; - const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_leading_dim]; - auto& dx = gradient_wrt_input[row + col * gradient_wrt_input_leading_dim]; - if (x > DataType(0)) { - dx = dy; - } else { - dx = DataType(0); - } - } -} +namespace { -void fp(const AbsMat& input, AbsMat& output) { - const auto& height = input.Height(); - const auto& width = input.Width(); - const auto& block_dim = 256; - const auto& grid_dim = (height * width + block_dim - 1) / block_dim; - if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); - fp_kernel<<>>( - height, width, - input.LockedBuffer(), input.LDim(), - output.Buffer(), output.LDim()); +/** Entry-wise operator. */ +struct op { + inline __device__ DataType operator()(DataType x) const { + return x > DataType(0) ? x : DataType(0); } -} - -void bp(const AbsMat& input, - const AbsMat& gradient_wrt_output, - AbsMat& gradient_wrt_input) { - const auto& height = input.Height(); - const auto& width = input.Width(); - const auto& block_dim = 256; - const auto& grid_dim = (height * width + block_dim - 1) / block_dim; - if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); - bp_kernel<<>>( - height, width, - input.LockedBuffer(), input.LDim(), - gradient_wrt_output.LockedBuffer(), gradient_wrt_output.LDim(), - gradient_wrt_input.Buffer(), gradient_wrt_input.LDim()); +}; + +/** Entry-wise operator for backprop. + * If the forward propagation step computes \f$ y = f(x) \f$, the + * backward propagation step computes + * \f$ \frac{dL}{dx} = \frac{dL}{dy} f'(x) \f$. + */ +struct op_backprop { + inline __device__ DataType operator()(DataType x, DataType dy) const { + return x > DataType(0) ? dy : DataType(0); } -} - +}; + } // namespace +// Template instantiation template <> void relu_layer::fp_compute() { - fp(get_local_prev_activations(), get_local_activations()); + cuda::apply_entrywise_unary_operator(get_prev_activations(), + get_activations()); } template <> void relu_layer::bp_compute() { - bp(get_local_prev_activations(), - get_local_prev_error_signals(), - get_local_error_signals()); + cuda::apply_entrywise_binary_operator(get_prev_activations(), + get_prev_error_signals(), + get_error_signals()); } template <> void relu_layer::fp_compute() { - fp(get_local_prev_activations(), get_local_activations()); + cuda::apply_entrywise_unary_operator(get_prev_activations(), + get_activations()); } template <> void relu_layer::bp_compute() { - bp(get_local_prev_activations(), - get_local_prev_error_signals(), - get_local_error_signals()); + cuda::apply_entrywise_binary_operator(get_prev_activations(), + get_prev_error_signals(), + get_error_signals()); } } // namespace lbann diff --git a/src/layers/activations/sigmoid.cpp b/src/layers/activations/sigmoid.cpp index f0d89e38608..b195ed09d4b 100644 --- a/src/layers/activations/sigmoid.cpp +++ b/src/layers/activations/sigmoid.cpp @@ -25,24 +25,68 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/layers/activations/sigmoid.hpp" +#include "lbann/utils/entrywise_operator.hpp" namespace lbann { +namespace { + +// Useful constants +constexpr DataType zero = 0; +constexpr DataType one = 1; +constexpr DataType eps = std::numeric_limits::epsilon(); + +/** Entry-wise operator. 
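+ * Computes the logistic sigmoid y = 1 / (1 + exp(-x)). When
+ * LBANN_ENABLE_SIGMOID_CUTOFF is defined, y is clamped to
+ * [eps, 1 - eps] so that neither output saturates exactly to 0 or 1.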
*/ +struct op { + inline DataType operator()(DataType x) const { + const DataType y = 1 / (one + std::exp(-x)); +#ifdef LBANN_ENABLE_SIGMOID_CUTOFF + if (y <= eps) { return eps; } + else if (y >= one - eps) { return one - eps; } +#endif // LBANN_ENABLE_SIGMOID_CUTOFF + return y; + } +}; + +/** Entry-wise operator for backprop. + * If the forward propagation step computes \f$ y = f(x) \f$, the + * backward propagation step computes + * \f$ \frac{dL}{dx} = \frac{dL}{dy} f'(x) \f$. + */ +struct op_backprop { + inline DataType operator()(DataType x, DataType dy) const { + const auto& y = op()(x); +#ifdef LBANN_ENABLE_SIGMOID_CUTOFF + if (y <= eps || y >= one - eps) { return zero; } +#endif // LBANN_ENABLE_SIGMOID_CUTOFF + return dy * y * (one - y); + } +}; + +} // namespace + +// Template instantiation template <> -void sigmoid_layer::fp_compute() { - entrywise_activation_layer::fp_compute(); +void sigmoid_layer::fp_compute() { + apply_entrywise_unary_operator(get_prev_activations(), + get_activations()); } template <> -void sigmoid_layer::bp_compute() { - entrywise_activation_layer::bp_compute(); +void sigmoid_layer::bp_compute() { + apply_entrywise_binary_operator(get_prev_activations(), + get_prev_error_signals(), + get_error_signals()); } template <> -void sigmoid_layer::fp_compute() { - entrywise_activation_layer::fp_compute(); +void sigmoid_layer::fp_compute() { + apply_entrywise_unary_operator(get_prev_activations(), + get_activations()); } template <> -void sigmoid_layer::bp_compute() { - entrywise_activation_layer::bp_compute(); +void sigmoid_layer::bp_compute() { + apply_entrywise_binary_operator(get_prev_activations(), + get_prev_error_signals(), + get_error_signals()); } - + } // namespace lbann diff --git a/src/layers/activations/sigmoid.cu b/src/layers/activations/sigmoid.cu index 01aca9ebbe8..d7dece4b01c 100644 --- a/src/layers/activations/sigmoid.cu +++ b/src/layers/activations/sigmoid.cu @@ -26,134 +26,65 @@ #include "math.h" #include "lbann/layers/activations/sigmoid.hpp" +#include namespace lbann { namespace { - -// Sigmoid function -#if __CUDA_ARCH__ >= 530 -__device__ inline __half sigmoid(__half x) { - static_cast(static_cast<__half (*)(__half)>(sigmoid)); // Suppress "unused function" warning - return __hdiv(__float2half(1.f), - __hadd(__float2half(1.f), hexp(__hneg(x)))); -} -#endif // __CUDA_ARCH__ >= 530 -__device__ inline float sigmoid(float x) { - static_cast(static_cast(sigmoid)); // Suppress "unused function" warning - return 1 / (1.0f + expf(-x)); -} -__device__ inline double sigmoid(double x) { - static_cast(static_cast(sigmoid)); // Suppress "unused function" warning - return 1 / (1.0 + exp(-x)); -} -__global__ void fp_kernel(El::Int height, El::Int width, - const DataType* __restrict__ input, - El::Int input_leading_dim, - DataType* __restrict__ output, - El::Int output_leading_dim, - DataType eps) { - const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int size = height * width; - const El::Int num_threads = blockDim.x * gridDim.x; - for (El::Int pos = gid; pos < size; pos += num_threads) { - const auto& row = pos % height; - const auto& col = pos / height; - const auto& x = input[row + col * input_leading_dim]; - auto y = sigmoid(x); +/** Entry-wise operator. 
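+ * Device-side counterpart of the CPU functor: computes
+ * y = 1 / (1 + exp(-x)) with cuda::exp, applying the same
+ * LBANN_ENABLE_SIGMOID_CUTOFF clamp to [eps, 1 - eps].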
*/ +struct op { + inline __device__ DataType operator()(DataType x) const { + constexpr DataType one = 1; + const DataType y = 1 / (one + cuda::exp(-x)); #ifdef LBANN_ENABLE_SIGMOID_CUTOFF - if (y <= eps) { y = eps; } - else if (y >= DataType(1) - eps) { y = DataType(1) - eps; } + constexpr DataType eps = cuda::epsilon(); + if (y <= eps) { return eps; } + else if (y >= one - eps) { return one - eps; } #endif // LBANN_ENABLE_SIGMOID_CUTOFF - output[row + col * output_leading_dim] = y; + return y; } -} - -__global__ void bp_kernel(El::Int height, El::Int width, - const DataType* __restrict__ input, - El::Int input_leading_dim, - const DataType* __restrict__ gradient_wrt_output, - El::Int gradient_wrt_output_leading_dim, - DataType* __restrict__ gradient_wrt_input, - El::Int gradient_wrt_input_leading_dim, - DataType eps) { - const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int size = height * width; - const El::Int num_threads = blockDim.x * gridDim.x; - for (El::Int pos = gid; pos < size; pos += num_threads) { - const auto& row = pos % height; - const auto& col = pos / height; - const auto& x = input[row + col * input_leading_dim]; - const auto& y = sigmoid(x); - const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_leading_dim]; - auto& dx = gradient_wrt_input[row + col * gradient_wrt_input_leading_dim]; +}; + +/** Entry-wise operator for backprop. + * If the forward propagation step computes \f$ y = f(x) \f$, the + * backward propagation step computes + * \f$ \frac{dL}{dx} = \frac{dL}{dy} f'(x) \f$. + */ +struct op_backprop { + inline __device__ DataType operator()(DataType x, DataType dy) const { + const auto& y = op()(x); #ifdef LBANN_ENABLE_SIGMOID_CUTOFF - if (y <= eps || y >= DataType(1) - eps) { - dx = DataType(0); - continue; - } + constexpr DataType eps = cuda::epsilon(); + if (y <= eps || y >= DataType(1) - eps) { return DataType(0); } #endif // LBANN_ENABLE_SIGMOID_CUTOFF - dx = dy * y * (DataType(1) - y); - } -} - -void fp(const AbsMat& input, AbsMat& output, DataType eps) { - const auto& height = input.Height(); - const auto& width = input.Width(); - const auto& block_dim = 256; - const auto& grid_dim = (height * width + block_dim - 1) / block_dim; - if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); - fp_kernel<<>>( - height, width, - input.LockedBuffer(), input.LDim(), - output.Buffer(), output.LDim(), - eps); - } -} - -void bp(const AbsMat& input, - const AbsMat& gradient_wrt_output, - AbsMat& gradient_wrt_input, - DataType eps) { - const auto& height = input.Height(); - const auto& width = input.Width(); - const auto& block_dim = 256; - const auto& grid_dim = (height * width + block_dim - 1) / block_dim; - if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); - bp_kernel<<>>( - height, width, - input.LockedBuffer(), input.LDim(), - gradient_wrt_output.LockedBuffer(), gradient_wrt_output.LDim(), - gradient_wrt_input.Buffer(), gradient_wrt_input.LDim(), - eps); + return dy * y * (DataType(1) - y); } -} +}; } // namespace +// Template instantiation template <> -void sigmoid_layer::fp_compute() { - fp(get_local_prev_activations(), get_local_activations(), eps); +void sigmoid_layer::fp_compute() { + cuda::apply_entrywise_unary_operator(get_prev_activations(), + get_activations()); } template <> -void sigmoid_layer::bp_compute() { - bp(get_local_prev_activations(), - get_local_prev_error_signals(), - get_local_error_signals(), - eps); +void sigmoid_layer::bp_compute() { + 
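+  // Apply dL/dx = dL/dy * y * (1 - y) entrywise via op_backprop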
cuda::apply_entrywise_binary_operator(get_prev_activations(), + get_prev_error_signals(), + get_error_signals()); } template <> -void sigmoid_layer::fp_compute() { - fp(get_local_prev_activations(), get_local_activations(), eps); +void sigmoid_layer::fp_compute() { + cuda::apply_entrywise_unary_operator(get_prev_activations(), + get_activations()); } template <> -void sigmoid_layer::bp_compute() { - bp(get_local_prev_activations(), - get_local_prev_error_signals(), - get_local_error_signals(), - eps); +void sigmoid_layer::bp_compute() { + cuda::apply_entrywise_binary_operator(get_prev_activations(), + get_prev_error_signals(), + get_error_signals()); } } // namespace lbann diff --git a/src/layers/activations/softmax.cpp b/src/layers/activations/softmax.cpp index 1cd3ec4b093..cbafc96daaf 100644 --- a/src/layers/activations/softmax.cpp +++ b/src/layers/activations/softmax.cpp @@ -25,231 +25,144 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/layers/activations/softmax.hpp" -#ifdef LBANN_HAS_CUDNN -#include "lbann/utils/cublas.hpp" -#endif // LBANN_HAS_CUDNN namespace lbann { -template <> -void softmax_layer - ::setup_matrices(const El::Grid& grid) { - activation_layer::setup_matrices(grid); - if (m_workspace != nullptr) { delete m_workspace; } - m_workspace = new StarMRMat(grid); -} +namespace { -template <> -void softmax_layer - ::setup_matrices(const El::Grid& grid) { - activation_layer::setup_matrices(grid); - if (m_workspace != nullptr) { delete m_workspace; } - m_workspace = new StarVCMat(grid); -} - -#ifdef LBANN_HAS_GPU -template <> -void softmax_layer - ::setup_matrices(const El::Grid& grid) { - activation_layer::setup_matrices(grid); - if (m_workspace != nullptr) { delete m_workspace; } - m_workspace = new StarMRMat(grid); -} - -template <> -void softmax_layer - ::setup_matrices(const El::Grid& grid) { - activation_layer::setup_matrices(grid); - if (m_workspace != nullptr) { delete m_workspace; } - m_workspace = new StarVCMat(grid); -} -#endif // LBANN_HAS_GPU - -template <> -void softmax_layer::fp_compute() { - fp_compute_cpu(); -} - -template <> -void softmax_layer::bp_compute() { - bp_compute_cpu(); -} +// Minimum output value to avoid denormalized floats +#ifdef LBANN_ENABLE_SOFTMAX_CUTOFF +const DataType min_output = std::sqrt(std::numeric_limits::min()); +#else +const DataType min_output = 0; +#endif // LBANN_ENABLE_SOFTMAX_CUTOFF -#ifdef LBANN_HAS_GPU -template <> -void softmax_layer::fp_compute() { - - // Local matrices. - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); - auto& local_workspace = m_workspace->Matrix(); - - const El::Int local_height = local_input.Height(); - const El::Int local_width = local_input.Width(); - - // Find the maximum entry in each local column. - if (local_height == 0) { - // When there's no local data, fill the workspace with a small value so the - // maximum across processors is still computed correctly. 
- El::Fill(local_workspace, std::numeric_limits::lowest()); - } else { - softmax_cuda::max_local_col_entry( - local_height, local_width, local_input.LockedBuffer(), - local_input.LDim(), local_workspace.Buffer(), El::GPUManager::Stream()); +void fp(lbann_comm& comm, + const AbsDistMat& input, + AbsDistMat& output, + AbsDistMat& workspace) { + + // Local matrices + const auto& local_input = input.LockedMatrix(); + auto& local_output = output.Matrix(); + auto& local_workspace = workspace.Matrix(); + const auto& local_height = local_input.Height(); + const auto& local_width = local_input.Width(); + + // Find column-wise maximum entries + El::Fill(workspace, std::numeric_limits::lowest()); +#pragma omp parallel for + for (El::Int col = 0; col < local_width; ++col) { + auto& max_entry = local_workspace(0, col); + for (El::Int row = 0; row < local_height; ++row) { + max_entry = std::max(max_entry, local_input(row, col)); + } } - // Find the global max entry in each column. - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm(), El::mpi::MAX); - - // Exponentiate activations and compute column sums. - // This subtracts by the column max for stability. - if (local_height == 0) { - // Zero out so that we contribute nothing to the sum. - El::Zero(local_workspace); - } else { - softmax_cuda::exp_and_col_sum( - local_height, local_width, local_input.LockedBuffer(), - local_input.LDim(), local_output.Buffer(), local_output.LDim(), - local_workspace.Buffer(), El::GPUManager::Stream()); + comm.allreduce(workspace, workspace.RedundantComm(), El::mpi::MAX); + + // Exponentiate outputs and compute column sums + // Note: Subtracting by the column max prevents output from blowing + // up. Large negative values underflow to 0. +#pragma omp parallel for + for (El::Int col = 0; col < local_width; ++col) { + const auto shift = local_workspace(0, col); + DataType sum = 0; + for (El::Int row = 0; row < local_height; ++row) { + const auto& x = local_input(row, col); + auto& y = local_output(row, col); + y = std::exp(x - shift); + sum += y; + } + local_workspace(0, col) = sum; } - // Compute the global sums for each column. - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm(), El::mpi::SUM); + comm.allreduce(workspace, workspace.RedundantComm()); - // Divide activations by the column sums. - // This rounds small values to avoid denormalization. - softmax_cuda::div_by_col_sums_and_cutoff( - local_height, local_width, local_output.Buffer(), - local_output.LDim(), local_workspace.LockedBuffer(), m_min_output, - El::GPUManager::Stream()); + // Divide outputs by column sums + // Note: Small values can be rounded to minimum output value to + // avoid denormalized floats. +#pragma omp parallel for + for (El::Int col = 0; col < local_width; ++col) { + const auto& scale = 1 / local_workspace(0, col); + for (El::Int row = 0; row < local_height; ++row) { + auto& y = local_output(row, col); + y = std::max(scale * y, min_output); + } + } } -template <> -void softmax_layer::bp_compute() { - - // Local matrices. 
- const auto& local_output = get_local_activations(); - const auto& local_grad_wrt_output = get_local_prev_error_signals(); - auto& local_grad_wrt_input = get_local_error_signals(); - auto& local_workspace = m_workspace->Matrix(); - - const El::Int local_height = local_output.Height(); - const El::Int local_width = local_output.Width(); +void bp(lbann_comm& comm, + const AbsDistMat& output, + const AbsDistMat& gradient_wrt_output, + AbsDistMat& gradient_wrt_input, + AbsDistMat& workspace) { + + // Local matrices + const auto& local_output = output.LockedMatrix(); + const auto& local_gradient_wrt_output = gradient_wrt_output.LockedMatrix(); + auto& local_gradient_wrt_input = gradient_wrt_input.Matrix(); + auto& local_workspace = workspace.Matrix(); + const auto& local_height = local_output.Height(); + const auto& local_width = local_output.Width(); + + // Compute dot products between output and gradient w.r.t. output + El::Zero(local_workspace); +#pragma omp parallel for + for (El::Int col = 0; col < local_width; ++col) { + auto& y_dot_dy = local_workspace(0, col); + for (El::Int row = 0; row < local_height; ++row) { + const auto& y = local_output(row, col); + const auto& dy = local_gradient_wrt_output(row, col); + y_dot_dy += y * dy; + } + } + comm.allreduce(workspace, workspace.RedundantComm()); - // Compute dot products between output and gradient w.r.t. output. - auto&& handle = El::GPUManager::cuBLASHandle(); - CHECK_CUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE)); + // Compute gradient w.r.t. input +#pragma omp parallel for for (El::Int col = 0; col < local_width; ++col) { - cublas::dot(handle, local_height, - local_output.LockedBuffer(0, col), 1, - local_grad_wrt_output.LockedBuffer(0, col), 1, - local_workspace.Buffer(0, col)); + const auto& y_dot_dy = local_workspace(0, col); + for (El::Int row = 0; row < local_height; ++row) { + const auto& y = local_output(row, col); + const auto& dy = local_gradient_wrt_output(row, col); + auto& dx = local_gradient_wrt_input(row, col); + dx = (y > min_output) ? y * (dy - y_dot_dy) : DataType(0); + } } - CHECK_CUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm(), El::mpi::SUM); - - // Compute gradient w.r.t. input. - // Applies a cutoff if needed to avoid denormalized floats. 
- softmax_cuda::grad_wrt_input_and_cutoff( - local_height, local_width, local_output.LockedBuffer(), - local_output.LDim(), local_workspace.LockedBuffer(), - local_grad_wrt_output.LockedBuffer(), local_grad_wrt_output.LDim(), - local_grad_wrt_input.Buffer(), local_grad_wrt_input.LDim(), - m_min_output, El::GPUManager::Stream()); } -#endif // LBANN_HAS_GPU + +} // namespace template <> void softmax_layer::fp_compute() { - fp_compute_cpu(); + fp(*get_comm(), + get_prev_activations(), + get_activations(), + *m_workspace); } - template <> void softmax_layer::bp_compute() { - bp_compute_cpu(); + bp(*get_comm(), + get_activations(), + get_prev_error_signals(), + get_error_signals(), + *m_workspace); } - -#ifdef LBANN_HAS_GPU template <> -void softmax_layer::fp_compute() { -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); - if (local_input.Height() > 0 && local_input.Width() > 0) { - - // Useful constants - const DataType zero = DataType(0); - const DataType one = DataType(1); - - // Apply softmax - CHECK_CUDNN(cudnnSoftmaxForward(cudnn::get_handle(), - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_INSTANCE, - &one, - m_tensors_cudnn_desc.get_prev_activations(), - local_input.LockedBuffer(), - &zero, - m_tensors_cudnn_desc.get_activations(), - local_output.Buffer())); - -#ifdef LBANN_ENABLE_SOFTMAX_CUTOFF - // Round to minimum value to avoid denormalized floats - softmax_cuda::fp_cutoff(local_output.Height(), - local_output.Width(), - local_output.Buffer(), - local_output.LDim(), - m_min_output, - El::GPUManager::Stream()); -#endif // LBANN_ENABLE_SOFTMAX_CUTOFF - - } -#endif // LBANN_HAS_CUDNN +void softmax_layer::fp_compute() { + fp(*get_comm(), + get_prev_activations(), + get_activations(), + *m_workspace); } - template <> -void softmax_layer::bp_compute() { -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); - if (local_output.Height() > 0 && local_output.Width() > 0) { - - // Useful constants - const DataType zero = DataType(0); - const DataType one = DataType(1); - - // Perform backprop - CHECK_CUDNN(cudnnSoftmaxBackward(cudnn::get_handle(), - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_INSTANCE, - &one, - m_tensors_cudnn_desc.get_activations(), - local_output.LockedBuffer(), - m_tensors_cudnn_desc.get_prev_error_signals(), - local_gradient_wrt_output.LockedBuffer(), - &zero, - m_tensors_cudnn_desc.get_error_signals(), - local_gradient_wrt_input.Buffer())); - -#ifdef LBANN_ENABLE_SOFTMAX_CUTOFF - // Round to minimum value to avoid denormalized floats - softmax_cuda::bp_cutoff(local_output.Height(), - local_output.Width(), - local_output.LockedBuffer(), - local_output.LDim(), - local_gradient_wrt_input.Buffer(), - local_gradient_wrt_input.LDim(), - m_min_output, - El::GPUManager::Stream()); -#endif // LBANN_ENABLE_SOFTMAX_CUTOFF - - } -#endif // LBANN_HAS_CUDNN +void softmax_layer::bp_compute() { + bp(*get_comm(), + get_activations(), + get_prev_error_signals(), + get_error_signals(), + *m_workspace); } -#endif // LBANN_HAS_GPU } // namespace lbann diff --git a/src/layers/activations/softmax.cu b/src/layers/activations/softmax.cu index 3e33ef2bbee..8ef0f909ebe 100644 --- a/src/layers/activations/softmax.cu +++ b/src/layers/activations/softmax.cu @@ -22,245 +22,415 
@@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or // implied. See the License for the specific language governing // permissions and limitations under the license. -// -// softmax_cuda.cu - GPU helper routines for softmax layer //////////////////////////////////////////////////////////////////////////////// #include "lbann/layers/activations/softmax.hpp" +namespace lbann { + namespace { -__global__ void fp_cutoff_kernel(int height, int width, - lbann::DataType* output, - int output_leading_dim, - lbann::DataType cutoff) { - const auto gid = threadIdx.x + blockIdx.x * blockDim.x; - const auto size = height * width; - const auto num_threads = blockDim.x * gridDim.x; - for (int pos = gid; pos < size; pos += num_threads) { - const int row = pos % height; - const int col = pos / height; - auto& y = output[row + col * output_leading_dim]; - if (y < cutoff) { y = cutoff; } - } +/** Minimum output value to avoid denormalized floats. */ +inline __device__ DataType get_min_output() { +#ifdef LBANN_ENABLE_SOFTMAX_CUTOFF + return cuda::sqrt(cuda::min()); +#else + return DataType(0); +#endif // LBANN_ENABLE_SOFTMAX_CUTOFF } -__global__ void bp_cutoff_kernel(int height, int width, - const lbann::DataType* __restrict__ output, - int output_leading_dim, - lbann::DataType* __restrict__ gradient_wrt_input, - int gradient_wrt_input_leading_dim, - lbann::DataType cutoff) { - const auto gid = threadIdx.x + blockIdx.x * blockDim.x; - const auto size = height * width; - const auto num_threads = blockDim.x * gridDim.x; - for (int pos = gid; pos < size; pos += num_threads) { - const int row = pos % height; - const int col = pos / height; - const auto& y = output[row + col * output_leading_dim]; - if (y < cutoff) { - auto& dx = gradient_wrt_input[row + col * gradient_wrt_input_leading_dim]; - dx = lbann::DataType(0); +#ifdef LBANN_ENABLE_SOFTMAX_CUTOFF +/** Operator for thresholding output. */ +struct fp_threshold_op { + const DataType min_output = get_min_output(); + inline __device__ DataType operator()(const DataType& y) const { + return cuda::max(y, min_output); + } +}; +/** Operator for thresholding gradient w.r.t. input. */ +struct bp_threshold_op { + const DataType min_output = get_min_output(); + inline __device__ DataType operator()(const DataType& y, + const DataType& dx) const { + return (y > min_output) ? dx : DataType(0); + } +}; +#endif // LBANN_ENABLE_SOFTMAX_CUTOFF + +/** Find largest entry within each CUDA block. + * Each block is assigned several entries from the same mini-batch + * sample and it finds the largest entry. Results are output to an + * nblocksx x width matrix. 
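+ * The caller shrinks the matrix by a factor of block_size on each
+ * pass and re-applies the kernel until a single row of column maxima
+ * remains (e.g. 1000 rows -> 4 partial maxima -> 1 per column with
+ * 256-thread blocks).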
+ */ +template +__global__ void reduce_max_kernel(El::Int height, El::Int width, + const DataType* __restrict__ values, + El::Int values_ldim, + DataType* __restrict__ max_values) { + + // Indices + const El::Int tid = threadIdx.x; + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidx = blockIdx.x; + const El::Int bidy = blockIdx.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + const El::Int nblocksx = gridDim.x; + const El::Int nblocksy = gridDim.y; + + // Reduce each matrix column independently + for (El::Int col = bidy; col < width; col += nblocksy) { + + // Find largest value for each thread + DataType private_max_val = -cuda::infinity(); + for (El::Int row = gidx; row < height; row += nthreadsx) { + private_max_val = cuda::max(private_max_val, + values[row + col * values_ldim]); + } + + // Shared memory reduction to get largest value for each block + __shared__ DataType shared_max_vals[block_size]; + shared_max_vals[tid] = private_max_val; + for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { + __syncthreads(); + if (tid < stride) { + shared_max_vals[tid] = cuda::max(shared_max_vals[tid], + shared_max_vals[tid + stride]); + } + } + if (tid == 0) { + max_values[bidx + col*nblocksx] = shared_max_vals[0]; } + } + } -__global__ void max_local_col_entry_kernel( - int height, int width, - const lbann::DataType * __restrict__ input, - int input_ldim, - lbann::DataType * __restrict__ workspace) { - const int tid = threadIdx.x + blockIdx.x*blockDim.x; - const int num_threads = blockDim.x * gridDim.x; - for (int col = tid; col < width; col += num_threads) { - const int col_offset = col*input_ldim; - lbann::DataType max_entry = input[col_offset]; - for (int row = 1; row < height; ++row) { - max_entry = fmax(max_entry, input[row + col_offset]); +/** Exponentiate outputs and compute column sums. + * Subtracting by the column max prevents output from blowing + * up. Large negative values underflow to 0. 
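+ * This shift is safe because softmax is shift-invariant:
+ * exp(x_i - m) / sum_j exp(x_j - m) = softmax(x)_i for any m, and
+ * m = max_j x_j keeps every exponent at or below zero.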
+ */ +template +__global__ void fp_exp_kernel(El::Int height, El::Int width, + const DataType* __restrict__ input, + El::Int input_ldim, + DataType* __restrict__ output, + El::Int output_ldim, + const DataType* __restrict__ shifts, + El::Int shifts_stride, + DataType* __restrict__ sums, + El::Int sums_stride) { + + // Indices + const El::Int tid = threadIdx.x; + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + const El::Int nblocksy = gridDim.y; + + // Reduce each matrix column independently + for (El::Int col = bidy; col < width; col += nblocksy) { + const auto& shift = shifts[col * shifts_stride]; + + // Exponentiate and compute sum for each thread + DataType private_sum = 0; + for (El::Int row = gidx; row < height; row += nthreadsx) { + const auto& x = input[row + col * input_ldim]; + auto& y = output[row + col * output_ldim]; + y = cuda::exp(x - shift); + private_sum += y; } - workspace[col] = max_entry; + + // Shared memory reduction to get sum for each block + __shared__ DataType shared_sums[block_size]; + shared_sums[tid] = private_sum; + for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { + __syncthreads(); + if (tid < stride) { + shared_sums[tid] += shared_sums[tid + stride]; + } + } + + // Atomic add to global sum + if (tid == 0) { + cuda::atomic_add(&sums[col * sums_stride], shared_sums[0]); + } + } + } -__global__ void exp_and_col_sum_kernel( - int height, int width, - const lbann::DataType * __restrict__ input, - int input_ldim, - lbann::DataType * __restrict__ output, - int output_ldim, - lbann::DataType * __restrict__ workspace) { - const int tid = threadIdx.x + blockIdx.x*blockDim.x; - const int num_threads = blockDim.x * gridDim.x; - for (int col = tid; col < width; col += num_threads) { - const int input_col_offset = col*input_ldim; - const int output_col_offset = col*output_ldim; - // Shift by the pre-computed maximum value for the column. - const lbann::DataType shift = workspace[col]; - lbann::DataType sum = lbann::DataType(0); - for (int row = 0; row < height; ++row) { - const lbann::DataType y = exp(input[row + input_col_offset] - shift); - output[row + output_col_offset] = y; - sum += y; +/** Divide outputs by column sums. + * Small values can be rounded to minimum output value to avoid + * denormalized floats. 
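+ * That is, y <- max(y / sum, min_output), where min_output is the
+ * square root of the smallest normalized DataType when
+ * LBANN_ENABLE_SOFTMAX_CUTOFF is defined and 0 otherwise.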
+ */ +__global__ void fp_scale_kernel(El::Int height, El::Int width, + DataType* __restrict__ output, + El::Int output_ldim, + const DataType* __restrict__ sums, + El::Int sums_stride) { + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + const El::Int nblocksy = gridDim.y; + const auto& min_output = get_min_output(); + for (El::Int col = bidy; col < width; col += nblocksy) { + const auto& scale = 1 / sums[col * sums_stride]; + for (El::Int row = gidx; row < height; row += nthreadsx) { + auto& y = output[row + col * output_ldim]; + y = cuda::max(scale * y, min_output); } - workspace[col] = sum; } } -__global__ void div_by_col_sums_and_cutoff_kernel( - int height, int width, - lbann::DataType * __restrict__ output, - int output_ldim, - const lbann::DataType * __restrict__ workspace, - const lbann::DataType cutoff) { - const int tid = threadIdx.x + blockIdx.x*blockDim.x; - const int num_threads = blockDim.x * gridDim.x; - for (int col = tid; col < width; col += num_threads) { - const int col_offset = col*output_ldim; - const lbann::DataType scale = lbann::DataType(1) / workspace[col]; - for (int row = 0; row < height; ++row) { -#ifdef LBANN_ENABLE_SOFTMAX_CUTOFF - output[row + col_offset] = fmax(scale*output[row + col_offset], cutoff); -#else - output[row + col_offset] *= scale; -#endif +/** Compute dot products between output and gradient w.r.t. output. */ +template +__global__ void bp_dot_product_kernel(El::Int height, El::Int width, + const DataType* __restrict__ output, + El::Int output_ldim, + const DataType* __restrict__ gradient_wrt_output, + El::Int gradient_wrt_output_ldim, + DataType* __restrict__ dot_products, + El::Int dot_products_stride) { + + // Indices + const El::Int tid = threadIdx.x; + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + const El::Int nblocksy = gridDim.y; + + // Compute dot product for each matrix column independently + for (El::Int col = bidy; col < width; col += nblocksy) { + + // Compute dot product contribution for each thread + DataType private_dot_product = 0; + for (El::Int row = gidx; row < height; row += nthreadsx) { + const auto& y = output[row + col * output_ldim]; + const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_ldim]; + private_dot_product += y * dy; + } + + // Shared memory reduction to get contribution for each block + __shared__ DataType shared_dot_products[block_size]; + shared_dot_products[tid] = private_dot_product; + for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { + __syncthreads(); + if (tid < stride) { + shared_dot_products[tid] += shared_dot_products[tid + stride]; + } } + + // Atomic add to global dot product + if (tid == 0) { + cuda::atomic_add(&dot_products[col * dot_products_stride], + shared_dot_products[0]); + } + } + } -__global__ void grad_wrt_input_and_cutoff_kernel( - int height, int width, - const lbann::DataType * __restrict__ output, - int output_ldim, - const lbann::DataType * __restrict__ workspace, - const lbann::DataType * __restrict__ grad_wrt_output, - int grad_wrt_output_ldim, - lbann::DataType * __restrict__ grad_wrt_input, - int grad_wrt_input_ldim, - const lbann::DataType cutoff) { - const int tid = threadIdx.x + blockIdx.x*blockDim.x; - const int num_threads = blockDim.x * gridDim.x; - for (int col = tid; col < width; col += num_threads) { - const lbann::DataType y_dot_dy = 
workspace[col]; - const int output_col_offset = col * output_ldim; - const int grad_wrt_output_offset = col * grad_wrt_output_ldim; - const int grad_wrt_input_offset = col * grad_wrt_input_ldim; - for (int row = 0; row < height; ++row) { - const auto& y = output[row + output_col_offset]; - auto& dx = grad_wrt_input[row + grad_wrt_input_offset]; -#ifdef LBANN_ENABLE_SOFTMAX_CUTOFF - if (y <= cutoff) { - dx = lbann::DataType(0); - } - else -#endif // LBANN_ENABLE_SOFTMAX_CUTOFF - { - const auto& dy = grad_wrt_output[row + grad_wrt_output_offset]; - dx = y * (dy - y_dot_dy); - } +/** Compute gradient w.r.t. input. */ +template +__global__ void bp_kernel(El::Int height, El::Int width, + const DataType* __restrict__ output, + El::Int output_ldim, + const DataType* __restrict__ gradient_wrt_output, + El::Int gradient_wrt_output_ldim, + const DataType* __restrict__ dot_products, + El::Int dot_products_stride, + DataType* __restrict__ gradient_wrt_input, + El::Int gradient_wrt_input_ldim) { + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + const El::Int nblocksy = gridDim.y; + const auto& min_output = get_min_output(); + for (El::Int col = bidy; col < width; col += nblocksy) { + const auto& y_dot_dy = dot_products[col * dot_products_stride]; + for (El::Int row = gidx; row < height; row += nthreadsx) { + const auto& y = output[row + col * output_ldim]; + const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_ldim]; + auto& dx = gradient_wrt_input[row + col * gradient_wrt_input_ldim]; + dx = (y > min_output) ? y * (dy - y_dot_dy) : DataType(0); } } } -} // anonymous namespace +} // namespace -namespace lbann { -namespace softmax_cuda { - -void fp_cutoff(int height, int width, - DataType* output, - int output_leading_dim, - DataType cutoff, - cudaStream_t stream) { - const int size = height * width; - if (size == 0) return; - const int block_dim = 256; - const int grid_dim = (size + block_dim - 1) / block_dim; - fp_cutoff_kernel<<>>( - height, width, output, output_leading_dim, cutoff); +template <> +void softmax_layer::fp_compute() { + constexpr DataType zero = 0; + constexpr DataType one = 1; + const auto& local_input = get_local_prev_activations(); + auto& local_output = get_local_activations(); + if (!local_input.IsEmpty()) { + CHECK_CUDNN(cudnnSoftmaxForward(cudnn::get_handle(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, + &one, + m_tensors_cudnn_desc.get_prev_activations(), + local_input.LockedBuffer(), + &zero, + m_tensors_cudnn_desc.get_activations(), + local_output.Buffer())); +#ifdef LBANN_ENABLE_SOFTMAX_CUTOFF + cuda::apply_entrywise_unary_operator(local_output, + local_output); +#endif // LBANN_ENABLE_SOFTMAX_CUTOFF + } } -void bp_cutoff(int height, int width, - const DataType* output, - int output_leading_dim, - DataType* gradient_wrt_input, - int gradient_wrt_input_leading_dim, - DataType cutoff, - cudaStream_t stream) { - const int size = height * width; - if (size == 0) return; - const int block_dim = 256; - const int grid_dim = (size + block_dim - 1) / block_dim; - bp_cutoff_kernel<<>>( - height, width, - output, output_leading_dim, - gradient_wrt_input, gradient_wrt_input_leading_dim, - cutoff); +template <> +void softmax_layer::bp_compute() { + constexpr DataType zero = 0; + constexpr DataType one = 1; + const auto& local_output = get_local_activations(); + const auto& local_gradient_wrt_output = get_local_prev_error_signals(); + auto& 
local_gradient_wrt_input = get_local_error_signals(); + if (!local_output.IsEmpty()) { + CHECK_CUDNN(cudnnSoftmaxBackward(cudnn::get_handle(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, + &one, + m_tensors_cudnn_desc.get_activations(), + local_output.LockedBuffer(), + m_tensors_cudnn_desc.get_prev_error_signals(), + local_gradient_wrt_output.LockedBuffer(), + &zero, + m_tensors_cudnn_desc.get_error_signals(), + local_gradient_wrt_input.Buffer())); +#ifdef LBANN_ENABLE_SOFTMAX_CUTOFF + cuda::apply_entrywise_binary_operator(local_output, + local_gradient_wrt_input, + local_gradient_wrt_input); +#endif // LBANN_ENABLE_SOFTMAX_CUTOFF + } } -void max_local_col_entry(int height, int width, - const DataType * __restrict__ input, - int input_ldim, - DataType * __restrict__ workspace, - cudaStream_t stream) { - if (width <= 0 || height <= 0) { - return; +template <> +void softmax_layer::fp_compute() { + + // Local matrices + const auto& local_input = get_local_prev_activations(); + auto& local_output = get_local_activations(); + auto& local_workspace = m_workspace->Matrix(); + const auto& local_height = local_input.Height(); + const auto& local_width = local_input.Width(); + + // GPU objects + auto&& stream = El::GPUManager::Stream(); + auto&& event = El::GPUManager::Event(); + El::SyncInfo sync_info{stream, event}; + + // Initialize CUDA threads/blocks + // Note: kernels use a 2D thread distribution with a 256 x 1 block + // and nblocksx x local_width grid. + constexpr El::Int block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.y = local_width; + + // Find column-wise maximum entries + grid_dims.x = (local_height + block_size - 1) / block_size; + if (grid_dims.x < 1) { grid_dims.x = 1; } + cuda::thrust::vector max_vals(grid_dims.x * local_width); + reduce_max_kernel<<>>( + local_height, local_width, + local_input.LockedBuffer(), local_input.LDim(), + max_vals.data().get()); + while (grid_dims.x > 1) { + const El::Int prev_height = grid_dims.x; + grid_dims.x = (prev_height + block_size - 1) / block_size; + cuda::thrust::vector prev_vals(std::move(max_vals)); + max_vals.resize(grid_dims.x * local_width); + reduce_max_kernel<<>>( + prev_height, local_width, + prev_vals.data().get(), prev_height, + max_vals.data().get()); } - const int block_dim = 256; - const int grid_dim = (width + block_dim - 1) / block_dim; - max_local_col_entry_kernel<<>>( - height, width, input, input_ldim, workspace); -} + El::mpi::AllReduce(max_vals.data().get(), max_vals.size(), + El::mpi::MAX, m_workspace->RedundantComm(), + sync_info); -void exp_and_col_sum(int height, int width, - const DataType * __restrict__ input, - int input_ldim, - DataType * __restrict__ output, - int output_ldim, - DataType * __restrict__ workspace, - cudaStream_t stream) { - if (width <= 0 || height <= 0) { - return; + // Exponentiate outputs and compute column sums + El::Zero(*m_workspace); + if (!local_output.IsEmpty()) { + grid_dims.x = (local_height + block_size - 1) / block_size; + fp_exp_kernel<<>>( + local_height, local_width, + local_input.LockedBuffer(), local_input.LDim(), + local_output.Buffer(), local_output.LDim(), + max_vals.data().get(), 1, + local_workspace.Buffer(), 1); } - const int block_dim = 256; - const int grid_dim = (width + block_dim - 1) / block_dim; - exp_and_col_sum_kernel<<>>( - height, width, input, input_ldim, output, output_ldim, workspace); -} + El::AllReduce(*m_workspace, m_workspace->RedundantComm()); -void div_by_col_sums_and_cutoff(int height, int width, - 
DataType * __restrict__ output, - int output_ldim, - const DataType * __restrict__ workspace, - const DataType cutoff, - cudaStream_t stream) { - if (width <= 0 || height <= 0) { - return; + // Divide activations by column sums + if (!local_output.IsEmpty()) { + grid_dims.x = (local_height + block_size - 1) / block_size; + fp_scale_kernel<<>>( + local_height, local_width, + local_output.Buffer(), local_output.LDim(), + local_workspace.LockedBuffer(), 1); } - const int block_dim = 256; - const int grid_dim = (width + block_dim - 1) / block_dim; - div_by_col_sums_and_cutoff_kernel<<>>( - height, width, output, output_ldim, workspace, cutoff); + } -void grad_wrt_input_and_cutoff(int height, int width, - const DataType * __restrict__ output, - int output_ldim, - const DataType * __restrict__ workspace, - const DataType * __restrict__ grad_wrt_output, - int grad_wrt_output_ldim, - DataType * __restrict__ grad_wrt_input, - int grad_wrt_input_ldim, - const DataType cutoff, - cudaStream_t stream) { - if (width <= 0 || height <= 0) { - return; +template <> +void softmax_layer::bp_compute() { + + // Local matrices + const auto& local_output = get_local_activations(); + const auto& local_gradient_wrt_output = get_local_prev_error_signals(); + auto& local_gradient_wrt_input = get_local_error_signals(); + auto& local_workspace = m_workspace->Matrix(); + const auto& local_height = local_output.Height(); + const auto& local_width = local_output.Width(); + + // GPU objects + auto&& stream = El::GPUManager::Stream(); + auto&& event = El::GPUManager::Event(); + El::SyncInfo sync_info{stream, event}; + + // Initialize CUDA threads/blocks + // Note: kernels use a 2D thread distribution with a 256 x 1 block + // and nblocksx x local_width grid. + constexpr El::Int block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.y = local_width; + + // Compute dot products between output and gradient w.r.t. output + El::Zero(local_workspace); + if (!local_output.IsEmpty()) { + grid_dims.x = (local_height + block_size - 1) / block_size; + bp_dot_product_kernel + <<>>( + local_height, local_width, + local_output.LockedBuffer(), + local_output.LDim(), + local_gradient_wrt_output.LockedBuffer(), + local_gradient_wrt_output.LDim(), + local_workspace.Buffer(), 1); + } + El::AllReduce(*m_workspace, m_workspace->RedundantComm()); + + // Compute gradient w.r.t. input + if (!local_output.IsEmpty()) { + grid_dims.x = (local_height + block_size - 1) / block_size; + bp_kernel<<>>( + local_height, local_width, + local_output.LockedBuffer(), + local_output.LDim(), + local_gradient_wrt_output.LockedBuffer(), + local_gradient_wrt_output.LDim(), + local_workspace.Buffer(), 1, + local_gradient_wrt_input.Buffer(), + local_gradient_wrt_input.LDim()); } - const int block_dim = 256; - const int grid_dim = (width + block_dim - 1) / block_dim; - grad_wrt_input_and_cutoff_kernel<<>>( - height, width, output, output_ldim, workspace, grad_wrt_output, - grad_wrt_output_ldim, grad_wrt_input, grad_wrt_input_ldim, cutoff); + } -} // namespace softmax_cuda } // namespace lbann diff --git a/src/layers/activations/tanh.cpp b/src/layers/activations/tanh.cpp deleted file mode 100644 index 407b7759b64..00000000000 --- a/src/layers/activations/tanh.cpp +++ /dev/null @@ -1,155 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. 
-// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/layers/activations/tanh.hpp" - -namespace lbann { - -// Model-parallel CPU forward/backward prop -template <> -void tanh_layer::fp_compute() { - entrywise_activation_layer::fp_compute_cpu(); -} -template <> -void tanh_layer::bp_compute() { - entrywise_activation_layer::bp_compute_cpu(); -} - -// Data-parallel CPU forward/backward prop -template <> -void tanh_layer::fp_compute() { - entrywise_activation_layer::fp_compute_cpu(); -} -template <> -void tanh_layer::bp_compute() { - entrywise_activation_layer::bp_compute_cpu(); -} - -#ifdef LBANN_HAS_GPU - -// Model-parallel GPU forward/backward prop -template <> -void tanh_layer::fp_compute() { -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - const DataType zero = DataType(0); - const DataType one = DataType(1); - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); - if (local_input.Height() > 0 && local_input.Width() > 0) { - CHECK_CUDNN(cudnnActivationForward(cudnn::get_handle(), - m_activation_cudnn_desc, - &one, - m_tensors_cudnn_desc.get_prev_activations(), - local_input.LockedBuffer(), - &zero, - m_tensors_cudnn_desc.get_activations(), - local_output.Buffer())); - } -#endif // LBANN_HAS_CUDNN -} -template <> -void tanh_layer::bp_compute() { -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - const DataType zero = DataType(0); - const DataType one = DataType(1); - const auto& local_input = get_local_prev_activations(); - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); - if (local_input.Height() > 0 && local_input.Width() > 0) { - CHECK_CUDNN(cudnnActivationBackward(cudnn::get_handle(), - m_activation_cudnn_desc, - &one, - m_tensors_cudnn_desc.get_activations(), - local_output.LockedBuffer(), - m_tensors_cudnn_desc.get_prev_error_signals(), - local_gradient_wrt_output.LockedBuffer(), - m_tensors_cudnn_desc.get_prev_activations(), - local_input.LockedBuffer(), - &zero, - m_tensors_cudnn_desc.get_error_signals(), - local_gradient_wrt_input.Buffer())); - } -#endif // LBANN_HAS_CUDNN -} - -// Data-parallel GPU forward/backward prop -template <> -void tanh_layer::fp_compute() { -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - const DataType zero = DataType(0); - const DataType one = DataType(1); - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); - if 
(local_input.Height() > 0 && local_input.Width() > 0) { - CHECK_CUDNN(cudnnActivationForward(cudnn::get_handle(), - m_activation_cudnn_desc, - &one, - m_tensors_cudnn_desc.get_prev_activations(), - local_input.LockedBuffer(), - &zero, - m_tensors_cudnn_desc.get_activations(), - local_output.Buffer())); - } -#endif // LBANN_HAS_CUDNN -} -template <> -void tanh_layer::bp_compute() { -#ifndef LBANN_HAS_CUDNN - LBANN_ERROR("cuDNN not detected"); -#else - const DataType zero = DataType(0); - const DataType one = DataType(1); - const auto& local_input = get_local_prev_activations(); - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); - if (local_input.Height() > 0 && local_input.Width() > 0) { - CHECK_CUDNN(cudnnActivationBackward(cudnn::get_handle(), - m_activation_cudnn_desc, - &one, - m_tensors_cudnn_desc.get_activations(), - local_output.LockedBuffer(), - m_tensors_cudnn_desc.get_prev_error_signals(), - local_gradient_wrt_output.LockedBuffer(), - m_tensors_cudnn_desc.get_prev_activations(), - local_input.LockedBuffer(), - &zero, - m_tensors_cudnn_desc.get_error_signals(), - local_gradient_wrt_input.Buffer())); - } -#endif // LBANN_HAS_CUDNN -} - -#endif // LBANN_HAS_GPU - -} // namespace lbann diff --git a/src/layers/image/CMakeLists.txt b/src/layers/image/CMakeLists.txt new file mode 100644 index 00000000000..cbebaa686c4 --- /dev/null +++ b/src/layers/image/CMakeLists.txt @@ -0,0 +1,15 @@ +# Add the source files for this directory +set_full_path(THIS_DIR_SOURCES + bilinear_resize.cpp + ) + +if (LBANN_HAS_CUDA) + # Add the CUDA source files for this directory + set_full_path(THIS_DIR_CU_SOURCES + bilinear_resize.cu + ) +endif () + +# Propagate the files up the tree +set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE) +set(CUDA_SOURCES "${CUDA_SOURCES}" "${THIS_DIR_CU_SOURCES}" PARENT_SCOPE) diff --git a/src/layers/image/bilinear_resize.cpp b/src/layers/image/bilinear_resize.cpp new file mode 100644 index 00000000000..f7ed43ca2cc --- /dev/null +++ b/src/layers/image/bilinear_resize.cpp @@ -0,0 +1,113 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/image/bilinear_resize.hpp" + +namespace lbann { + +template <> +void bilinear_resize_layer::fp_compute() { + + // Useful constants + constexpr DataType half = 0.5; + constexpr DataType one = 1; + + // Matrices + const auto& local_input = get_local_prev_activations(); + auto& local_output = get_local_activations(); + + // Dimensions + const auto& input_dims = get_input_dims(); + const auto& num_dims = input_dims.size(); + const auto& num_samples = local_input.Width(); + const El::Int num_channels = std::accumulate(input_dims.begin(), + input_dims.end()-2, + 1, + std::multiplies()); + const El::Int input_height = input_dims[num_dims-2]; + const El::Int input_width = input_dims[num_dims-1]; + + // Perform bilinear interpolation for each output pixel + const auto& x_stride = static_cast(input_width) / m_width; + const auto& y_stride = static_cast(input_height) / m_height; +#pragma omp parallel for collapse(4) + for (El::Int sample = 0; sample < num_samples; ++sample) { + for (El::Int channel = 0; channel < num_channels; ++channel) { + for (El::Int output_row = 0; output_row < m_height; ++output_row) { + for (El::Int output_col = 0; output_col < m_width; ++output_col) { + + // Interpolation point + const auto& x = (output_col + half) * x_stride; + const auto& y = (output_row + half) * y_stride; + + // Find input pixels near interpolation point + const auto input_col = static_cast(std::floor(x - half)); + const auto& input_col0 = std::max(input_col, El::Int(0)); + const auto& input_col1 = std::min(input_col+1, input_width-1); + const auto input_row = static_cast(std::floor(y - half)); + const auto& input_row0 = std::max(input_row, El::Int(0)); + const auto& input_row1 = std::min(input_row+1, input_height-1); + + // Interpolation point relative to input pixel centers + const auto& unit_x = x - (input_col + half); + const auto& unit_y = y - (input_row + half); + + // Input and output pixels + const auto& pixel00 = local_input(channel * input_height * input_width + + input_row0 * input_width + + input_col0, + sample); + const auto& pixel01 = local_input(channel * input_height * input_width + + input_row0 * input_width + + input_col1, + sample); + const auto& pixel10 = local_input(channel * input_height * input_width + + input_row1 * input_width + + input_col0, + sample); + const auto& pixel11 = local_input(channel * input_height * input_width + + input_row1 * input_width + + input_col1, + sample); + auto& result = local_output(channel * m_height * m_width + + output_row * m_width + + output_col, + sample); + + // Bilinear interpolation + result = (pixel00 * (one - unit_x) * (one - unit_y) + + pixel01 * unit_x * (one - unit_y) + + pixel10 * (one - unit_x) * unit_y + + pixel11 * unit_x * unit_y); + + } + } + } + } + +} + +} // namespace lbann diff --git a/src/layers/image/bilinear_resize.cu b/src/layers/image/bilinear_resize.cu new file mode 100644 index 00000000000..f970875aa50 --- /dev/null +++ b/src/layers/image/bilinear_resize.cu @@ -0,0 +1,159 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/image/bilinear_resize.hpp" +#include "lbann/utils/cuda.hpp" + +namespace lbann { + +namespace { + +template +__global__ void fp_kernel(El::Int num_samples, + El::Int num_channels, + El::Int input_height, + El::Int input_width, + const DataType* __restrict__ input, + El::Int input_ldim, + El::Int output_height, + El::Int output_width, + DataType* __restrict__ output, + El::Int output_ldim) { + + // Useful constants + constexpr DataType half = 0.5; + constexpr DataType one = 1; + const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int num_threads = blockDim.x * gridDim.x; + + // Stride between interpolation points + const auto& x_stride = static_cast(input_width) / output_width; + const auto& y_stride = static_cast(input_height) / output_height; + + const auto& size = (num_samples * num_channels + * output_height * output_width); + for (El::Int pos = gid; pos < size; pos += num_threads) { + + // Indices + const auto& sample = pos / (num_channels * output_height * output_width); + const auto& channel = (pos / (output_height * output_width)) % num_channels; + const auto& output_row = (pos / output_width) % output_height; + const auto& output_col = pos % output_width; + + // Interpolation point + const auto& x = (output_col + half) * x_stride; + const auto& y = (output_row + half) * y_stride; + + // Find input pixels near interpolation point + const auto input_col = static_cast(cuda::floor(x - half)); + const auto& input_col0 = cuda::max(input_col, El::Int(0)); + const auto& input_col1 = cuda::min(input_col+1, input_width-1); + const auto input_row = static_cast(cuda::floor(y - half)); + const auto& input_row0 = cuda::max(input_row, El::Int(0)); + const auto& input_row1 = cuda::min(input_row+1, input_height-1); + + // Interpolation point relative to input pixel centers + const auto& unit_x = x - (input_col + half); + const auto& unit_y = y - (input_row + half); + + // Input and output pixels + const auto& pixel00 = input[sample * input_ldim + + channel * input_height * input_width + + input_row0 * input_width + + input_col0]; + const auto& pixel01 = input[sample * input_ldim + + channel * input_height * input_width + + input_row0 * input_width + + input_col1]; + const auto& pixel10 = input[sample * input_ldim + + channel * input_height * input_width + + input_row1 * input_width + + input_col0]; + const auto& pixel11 = input[sample * input_ldim + + channel * input_height * input_width + + input_row1 * input_width + + input_col1]; + auto& result = output[sample * output_ldim + + channel * output_height * output_width + + output_row * output_width + + output_col]; + + // Bilinear interpolation + result = (pixel00 * (one - unit_x) * (one - 
unit_y)
+              + pixel01 * unit_x * (one - unit_y)
+              + pixel10 * (one - unit_x) * unit_y
+              + pixel11 * unit_x * unit_y);
+
+  }
+
+}
+
+}
+
+
+template <>
+void bilinear_resize_layer<data_layout::DATA_PARALLEL, El::Device::GPU>::fp_compute() {
+
+  // Matrices
+  const auto& local_input = get_local_prev_activations();
+  auto& local_output = get_local_activations();
+
+  // Dimensions
+  const auto& input_dims = get_input_dims();
+  const auto& num_dims = input_dims.size();
+  const auto& num_samples = local_input.Width();
+  const El::Int num_channels = std::accumulate(input_dims.begin(),
+                                               input_dims.end()-2,
+                                               1,
+                                               std::multiplies<El::Int>());
+  const El::Int input_height = input_dims[num_dims-2];
+  const El::Int input_width = input_dims[num_dims-1];
+
+  // Get CUDA grid dimensions
+  // Note: Maximum CUDA grid dimension is 2^32-1
+  // (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications).
+  const El::Int size = local_output.Height() * local_output.Width();
+  constexpr El::Int block_dim = 256;
+  El::Int grid_dim = (size + block_dim - 1) / block_dim;
+  if (sizeof(El::Int) > sizeof(uint32_t)
+      && grid_dim > std::numeric_limits<uint32_t>::max()) {
+    grid_dim = std::numeric_limits<uint32_t>::max();
+  }
+
+  // Launch CUDA kernel
+  if (grid_dim > 0) {
+    fp_kernel
+      <<<grid_dim, block_dim, 0, El::GPUManager::Stream()>>>(
+        num_samples, num_channels,
+        input_height, input_width,
+        local_input.LockedBuffer(), local_input.LDim(),
+        m_height, m_width,
+        local_output.Buffer(), local_output.LDim());
+  }
+
+}
+
+} // namespace lbann
diff --git a/src/layers/layer.cpp b/src/layers/layer.cpp
index dc1df262124..76c92e93f52 100644
--- a/src/layers/layer.cpp
+++ b/src/layers/layer.cpp
@@ -34,6 +34,12 @@
 #include
 #include
+// Asynchronous memory transfers for input data
+// Note: This introduces a race condition. It is possible for the
+// input data to be modified by another layer before it is used by
+// this layer.
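Reviewer note: the commented-out guard below exists because of this race. A standard-C++ analogue of the hazard (illustrative only, with hypothetical names; Elemental's CopyAsync is the real mechanism in the patch):

```cpp
#include <algorithm>
#include <future>
#include <vector>

int main() {
  std::vector<float> parent_output(1 << 20, 1.0f);  // source buffer
  std::vector<float> input(parent_output.size());   // destination buffer

  // Analogue of an asynchronous copy: it runs concurrently with the caller.
  auto copy_done = std::async(std::launch::async, [&] {
    std::copy(parent_output.begin(), parent_output.end(), input.begin());
  });

  // If another layer wrote to parent_output here, before the copy finished,
  // `input` could observe a mix of old and new values. That is exactly the
  // race the note above describes. Uncommenting the next line makes it UB:
  // parent_output.assign(parent_output.size(), 2.0f);

  copy_done.wait();  // safe ordering: the copy completes before reuse
  return 0;
}
```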
+// #define ASYNC_INPUT_MEMORY_TRANSFER + namespace lbann { Layer::Layer(lbann_comm *comm) @@ -47,7 +53,7 @@ Layer::Layer(lbann_comm *comm) // Reset timing counters reset_counters(); - + } Layer::Layer(const Layer& other) : @@ -577,7 +583,7 @@ void Layer::setup_pointers() { LBANN_ERROR(err.str()); } } - + // Check that the number of parents/children are valid if(m_expected_num_parent_layers >= 0 && get_num_parents() != m_expected_num_parent_layers) { @@ -625,7 +631,7 @@ void Layer::setup_dims() { } } } - + void Layer::setup_matrices(const El::Grid& grid) { // Destroy previously setup matrices @@ -653,7 +659,6 @@ void Layer::setup_matrices(const El::Grid& grid) { m_gradient_wrt_inputs[i] = construct_matrix(grid, "gradient_wrt_input", i); } - } std::unique_ptr Layer::construct_matrix(const El::Grid& grid, @@ -682,7 +687,7 @@ std::unique_ptr Layer::construct_matrix(const El::Grid& grid, std::unique_ptr mat; mat.reset(AbsDistMat::Instantiate(grid, 0, col_dist, row_dist, wrap, device)); - + #ifdef LBANN_HAS_GPU // Allocate GPU memory with the CUDA API if (device == El::Device::GPU) { mat->Matrix().SetMemoryMode(0); } @@ -885,7 +890,21 @@ void Layer::fp_setup_inputs(El::Int mini_batch_size) { if (parent_output.DistData() == input.DistData()) { El::LockedView(input, parent_output); } else { - El::Copy(parent_output, input); + bool async_copy = false; +#if defined(LBANN_HAS_GPU) && defined(ASYNC_INPUT_MEMORY_TRANSFER) + // Asynchronously copy CPU data to GPU data if they are otherwise aligned + if (parent_output.GetLocalDevice() == El::Device::CPU + && input.GetLocalDevice() == El::Device::GPU) { + auto parent_dist_data = parent_output.DistData(); + parent_dist_data.device = El::Device::GPU; + async_copy = parent_dist_data == input.DistData(); + } +#endif // defined(LBANN_HAS_GPU) && defined(ASYNC_INPUT_MEMORY_TRANSFER) + if (async_copy) { + El::CopyAsync(parent_output, input); + } else { + El::Copy(parent_output, input); + } } // Check input matrix dimensions @@ -937,7 +956,21 @@ void Layer::bp_setup_gradient_wrt_outputs(El::Int mini_batch_size) { == gradient_wrt_output.DistData()) { El::LockedView(gradient_wrt_output, child_gradient_wrt_input); } else { - El::Copy(child_gradient_wrt_input, gradient_wrt_output); + bool async_copy = false; +#if defined(LBANN_HAS_GPU) && defined(ASYNC_INPUT_MEMORY_TRANSFER) + // Asynchronously copy CPU data to GPU data if they are otherwise aligned + if (child_gradient_wrt_input.GetLocalDevice() == El::Device::CPU + && gradient_wrt_output.GetLocalDevice() == El::Device::GPU) { + auto child_dist_data = child_gradient_wrt_input.DistData(); + child_dist_data.device = El::Device::GPU; + async_copy = child_dist_data == gradient_wrt_output.DistData(); + } +#endif // defined(LBANN_HAS_GPU) && defined(ASYNC_INPUT_MEMORY_TRANSFER) + if (async_copy) { + El::CopyAsync(child_gradient_wrt_input, gradient_wrt_output); + } else { + El::Copy(child_gradient_wrt_input, gradient_wrt_output); + } } // Check gradient w.r.t. 
output matrix dimensions diff --git a/src/layers/loss/CMakeLists.txt b/src/layers/loss/CMakeLists.txt index c97419f78e0..d3194e6f2bf 100644 --- a/src/layers/loss/CMakeLists.txt +++ b/src/layers/loss/CMakeLists.txt @@ -1,6 +1,9 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES + categorical_accuracy.cpp cross_entropy.cpp + entrywise.cpp + l2_norm2.cpp mean_squared_error.cpp top_k_categorical_accuracy.cpp ) @@ -8,7 +11,10 @@ set_full_path(THIS_DIR_SOURCES if (LBANN_HAS_CUDA) # Add the CUDA source files for this directory set_full_path(THIS_DIR_CU_SOURCES + categorical_accuracy.cu cross_entropy.cu + entrywise.cu + l2_norm2.cu mean_squared_error.cu top_k_categorical_accuracy.cu ) diff --git a/src/layers/loss/categorical_accuracy.cpp b/src/layers/loss/categorical_accuracy.cpp new file mode 100644 index 00000000000..b5ff2964822 --- /dev/null +++ b/src/layers/loss/categorical_accuracy.cpp @@ -0,0 +1,216 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/loss/categorical_accuracy.hpp" +#include + + +namespace lbann { + +namespace { + +/** CPU implementation of categorical accuracy layer forward prop. 
*/ +void fp_cpu(lbann_comm& comm, + const AbsDistMat& predictions, + const AbsDistMat& labels, + AbsDistMat& loss) { + + // Local matrices + const auto& local_predictions = predictions.LockedMatrix(); + const auto& local_labels = labels.LockedMatrix(); + auto& local_loss = loss.Matrix(); + + // Dimensions + const auto& height = predictions.Height(); + const auto& local_height = local_predictions.Height(); + const auto& local_width = local_predictions.Width(); + if (local_width < 1) { return; } + + // Column communicator + auto&& col_comm = predictions.ColComm(); + const auto& col_comm_rank = El::mpi::Rank(col_comm); + const auto& col_comm_size = El::mpi::Size(col_comm); + const auto& col_comm_root = loss.RowOwner(0); + + // Find largest prediction entries in local data + std::vector prediction_vals(local_width); + std::vector prediction_inds(local_width); +#pragma omp parallel for + for (El::Int col = 0; col < local_width; ++col) { + DataType max_val = -std::numeric_limits::infinity(); + El::Int max_ind = height; + for (El::Int row = 0; row < local_height; ++row) { + const auto& val = local_predictions(row, col); + if (val > max_val) { + max_val = val; + max_ind = predictions.GlobalRow(row); + } + } + prediction_vals[col] = max_val; + prediction_inds[col] = max_ind; + } + + // Gather large prediction entries + /// @todo Non-blocking gather + Al::request prediction_vals_req, prediction_inds_req; + std::vector gathered_prediction_vals; + std::vector gathered_prediction_inds; + if (col_comm_size > 1) { + if (col_comm_rank != col_comm_root) { + comm.gather(prediction_vals.data(), prediction_vals.size(), + col_comm_root, col_comm, + El::SyncInfo{}); + comm.gather(prediction_inds.data(), prediction_inds.size(), + col_comm_root, col_comm, + El::SyncInfo{}); + } else { + gathered_prediction_vals.resize(prediction_vals.size() * col_comm_size); + gathered_prediction_inds.resize(prediction_inds.size() * col_comm_size); + comm.gather(prediction_vals.data(), prediction_vals.size(), + gathered_prediction_vals.data(), + col_comm, El::SyncInfo{}); + comm.gather(prediction_inds.data(), prediction_inds.size(), + gathered_prediction_inds.data(), + col_comm, El::SyncInfo{}); + } + } + + // Find largest label entries in local data + std::vector label_vals(local_width); + std::vector label_inds(local_width); +#pragma omp parallel for + for (El::Int col = 0; col < local_width; ++col) { + DataType max_val = -std::numeric_limits::infinity(); + El::Int max_ind = height; + for (El::Int row = 0; row < local_height; ++row) { + const auto& val = local_labels(row, col); + if (val > max_val) { + max_val = val; + max_ind = labels.GlobalRow(row); + } + } + label_vals[col] = max_val; + label_inds[col] = max_ind; + } + + // Gather large label entries + /// @todo Non-blocking gather + Al::request label_vals_req, label_inds_req; + std::vector gathered_label_vals; + std::vector gathered_label_inds; + if (col_comm_size > 1) { + if (col_comm_rank != col_comm_root) { + comm.gather(label_vals.data(), label_vals.size(), + col_comm_root, col_comm, + El::SyncInfo{}); + comm.gather(label_inds.data(), label_inds.size(), + col_comm_root, col_comm, + El::SyncInfo{}); + } else { + gathered_label_vals.resize(label_vals.size() * col_comm_size); + gathered_label_inds.resize(label_inds.size() * col_comm_size); + comm.gather(label_vals.data(), label_vals.size(), + gathered_label_vals.data(), + col_comm, El::SyncInfo{}); + comm.gather(label_inds.data(), label_inds.size(), + gathered_label_inds.data(), + col_comm, El::SyncInfo{}); + } + } + + // 
Find largest prediction entry in global data
+  comm.wait(prediction_vals_req);
+  comm.wait(prediction_inds_req);
+  if (col_comm_size > 1 && col_comm_rank == col_comm_root) {
+#pragma omp parallel for
+    for (El::Int col = 0; col < local_width; ++col) {
+      DataType max_val = -std::numeric_limits<DataType>::infinity();
+      El::Int max_ind = height;
+      for (El::Int rank = 0; rank < col_comm_size; ++rank) {
+        const auto& val = gathered_prediction_vals[col + rank * local_width];
+        const auto& ind = gathered_prediction_inds[col + rank * local_width];
+        if (val > max_val || (val == max_val && ind < max_ind)) {
+          max_val = val;
+          max_ind = ind;
+        }
+      }
+      prediction_vals[col] = max_val;
+      prediction_inds[col] = max_ind;
+    }
+  }
+
+  // Find largest label entry in global data
+  comm.wait(label_vals_req);
+  comm.wait(label_inds_req);
+  if (col_comm_size > 1 && col_comm_rank == col_comm_root) {
+#pragma omp parallel for
+    for (El::Int col = 0; col < local_width; ++col) {
+      DataType max_val = -std::numeric_limits<DataType>::infinity();
+      El::Int max_ind = height;
+      for (El::Int rank = 0; rank < col_comm_size; ++rank) {
+        const auto& val = gathered_label_vals[col + rank * local_width];
+        const auto& ind = gathered_label_inds[col + rank * local_width];
+        if (val > max_val || (val == max_val && ind < max_ind)) {
+          max_val = val;
+          max_ind = ind;
+        }
+      }
+      label_vals[col] = max_val;
+      label_inds[col] = max_ind;
+    }
+  }
+
+  // Compute categorical accuracy
+  if (col_comm_rank == col_comm_root) {
+#pragma omp parallel for
+    for (El::Int col = 0; col < local_width; ++col) {
+      local_loss(0, col) = (prediction_inds[col] == label_inds[col] ?
+                            DataType(1) : DataType(0));
+    }
+  }
+
+}
+
+} // namespace
+
+template <>
+void categorical_accuracy_layer<data_layout::DATA_PARALLEL, El::Device::CPU>
+     ::fp_compute() {
+  fp_cpu(*get_comm(),
+         get_prev_activations(0),
+         get_prev_activations(1),
+         get_activations());
+}
+template <>
+void categorical_accuracy_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>
+     ::fp_compute() {
+  fp_cpu(*get_comm(),
+         get_prev_activations(0),
+         get_prev_activations(1),
+         get_activations());
+}
+
+} // namespace lbann
diff --git a/src/layers/loss/categorical_accuracy.cu b/src/layers/loss/categorical_accuracy.cu
new file mode 100644
index 00000000000..870ff989b1d
--- /dev/null
+++ b/src/layers/loss/categorical_accuracy.cu
@@ -0,0 +1,386 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/loss/categorical_accuracy.hpp" +#include "lbann/utils/cuda.hpp" + +namespace lbann { + +namespace { + +/** Fill matrix with corresponding indices. + * Indices are equivalent to the global row indices of the input + * matrix. + */ +__global__ void fill_indices_kernel(El::Int local_height, + El::Int local_width, + El::Int col_shift, + El::Int col_stride, + El::Int* __restrict__ indices) { + const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int nthreads = blockDim.x * gridDim.x; + const El::Int size = local_height * local_width; + for (El::Int pos = gid; pos < size; pos += nthreads) { + const auto& row = pos % local_height; + const auto& col = pos / local_height; + indices[row + col*local_height] = col_shift + row * col_stride; + } +} + +/** Find largest entry within each CUDA block. + * Each block is assigned several entries from the same mini-batch + * sample and it finds the largest entry. Results are output to + * nblocksx x width matrices. + */ +template +__global__ void reduce_max_entries_kernel(El::Int height, El::Int width, + const DataType* __restrict__ values, + El::Int values_row_stride, + El::Int values_col_stride, + const El::Int* __restrict__ indices, + El::Int indices_row_stride, + El::Int indices_col_stride, + DataType* __restrict__ max_values, + El::Int* __restrict__ max_indices) { + + // Indices + const El::Int tid = threadIdx.x; + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidx = blockIdx.x; + const El::Int bidy = blockIdx.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + const El::Int nblocksx = gridDim.x; + + // Reduce each matrix column independently + for (El::Int col = bidy; col < width; col += gridDim.y) { + + // Find largest entry for each thread + DataType private_max_val = -cuda::infinity(); + El::Int private_max_ind = cuda::max(); + for (El::Int row = gidx; row < height; row += nthreadsx) { + const auto& val = values[row * values_row_stride + + col * values_col_stride]; + const auto& ind = indices[row * indices_row_stride + + col * indices_col_stride]; + if (val > private_max_val + || (val == private_max_val && ind < private_max_ind)) { + private_max_val = val; + private_max_ind = ind; + } + } + + // Shared memory reduction to get largest entry for each block + __shared__ DataType shared_max_vals[block_size]; + __shared__ El::Int shared_max_inds[block_size]; + shared_max_vals[tid] = private_max_val; + shared_max_inds[tid] = private_max_ind; + for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { + __syncthreads(); + if (tid < stride) { + const auto& val = shared_max_vals[tid + stride]; + const auto& ind = shared_max_inds[tid + stride]; + if (val > shared_max_vals[tid] + || (val == shared_max_vals[tid] && ind < shared_max_inds[tid])) { + shared_max_vals[tid] = val; + shared_max_inds[tid] = ind; + } + } + } + if (tid == 0) { + max_values[bidx + col*nblocksx] = shared_max_vals[0]; + max_indices[bidx + col*nblocksx] = shared_max_inds[0]; + } + + } + +} + +/** Compute sample-wise categorical accuracy. + * Outputs one if the prediction and label indices match and + * otherwise outputs zero. 
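Reviewer note on reduce_max_entries_kernel above: the whole multi-stage reduction (thread, then block, then gathered ranks) relies on the combine rule being order-independent: keep the larger value, break ties toward the smaller index. That rule is associative and commutative, so any grouping yields the same argmax. A small host-side C++ check of the rule (hypothetical names):

```cpp
#include <cstdio>
#include <limits>
#include <vector>

struct Entry { float val; long ind; };

// Combine rule used by the reduction above: larger value wins, ties are
// broken in favor of the smaller index. Associative and commutative.
Entry combine(const Entry& a, const Entry& b) {
  if (b.val > a.val || (b.val == a.val && b.ind < a.ind)) { return b; }
  return a;
}

int main() {
  const std::vector<Entry> col = {{0.1f, 0}, {0.9f, 1}, {0.9f, 2}, {0.3f, 3}};
  Entry best = {-std::numeric_limits<float>::infinity(),
                std::numeric_limits<long>::max()};  // identity element
  for (const auto& e : col) { best = combine(best, e); }
  std::printf("argmax index = %ld\n", best.ind);  // prints 1, not 2
  return 0;
}
```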
+ */ +__global__ void compute_accuracy_kernel(El::Int local_width, + const El::Int* __restrict__ prediction_indices, + const El::Int* __restrict__ label_indices, + DataType* __restrict__ loss, + El::Int loss_ldim) { + const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int nthreads = blockDim.x * gridDim.x; + constexpr El::Int max_ind = cuda::max(); + for (El::Int col = gid; col < local_width; col += nthreads) { + const auto& prediction = prediction_indices[col]; + const auto& label = label_indices[col]; + loss[col*loss_ldim] = (prediction == label && prediction < max_ind ? + DataType(1) : DataType(0)); + } +} + +/** GPU implementation of categorical accuracy layer forward prop. */ +void fp_gpu(lbann_comm& comm, + const AbsDistMat& predictions, + const AbsDistMat& labels, + AbsDistMat& loss) { + + // Local matrices + const auto& local_predictions = predictions.LockedMatrix(); + const auto& local_labels = labels.LockedMatrix(); + auto& local_loss = loss.Matrix(); + + // Dimensions + const auto& height = predictions.Height(); + const auto& local_height = local_predictions.Height(); + const auto& local_width = local_predictions.Width(); + if (local_width < 1) { return; } + + // Column communicator + auto&& col_comm = predictions.ColComm(); + const auto& col_comm_rank = El::mpi::Rank(col_comm); + const auto& col_comm_size = El::mpi::Size(col_comm); + const auto& col_comm_root = loss.RowOwner(0); + + // GPU objects + auto&& stream = El::GPUManager::Stream(); + auto&& event = El::GPUManager::Event(); + El::SyncInfo sync_info{stream, event}; + + // Initialize CUDA threads/blocks for reduction kernel + // Note: reduce_max_entries_kernel uses a 2D thread distribution + // with a 256 x 1 block and nblocksx x local_width grid. + constexpr El::Int block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.y = local_width; + + // Get indices for all input entries + cuda::thrust::vector full_inds(local_height * local_width); + if (full_inds.size() > 0) { + const El::Int grid_size = (full_inds.size() + block_size - 1) / block_size; + fill_indices_kernel<<>>( + local_height, local_width, + predictions.ColShift(), predictions.ColStride(), + full_inds.data().get()); + } + + // Find largest prediction entries in local data + grid_dims.x = (local_height + block_size - 1) / block_size; + if (grid_dims.x < 1) { grid_dims.x = 1; } + cuda::thrust::vector prediction_vals(grid_dims.x * local_width); + cuda::thrust::vector prediction_inds(grid_dims.x * local_width); + reduce_max_entries_kernel + <<>>( + local_height, local_width, + local_predictions.LockedBuffer(), 1, local_predictions.LDim(), + full_inds.data().get(), 1, local_height, + prediction_vals.data().get(), + prediction_inds.data().get()); + while (grid_dims.x > 1) { + const El::Int prev_height = grid_dims.x; + grid_dims.x = (prev_height + block_size - 1) / block_size; + cuda::thrust::vector prev_vals(std::move(prediction_vals)); + cuda::thrust::vector prev_inds(std::move(prediction_inds)); + prediction_vals.resize(grid_dims.x * local_width); + prediction_inds.resize(grid_dims.x * local_width); + reduce_max_entries_kernel + <<>>( + prev_height, local_width, + prev_vals.data().get(), 1, prev_height, + prev_inds.data().get(), 1, prev_height, + prediction_vals.data().get(), + prediction_inds.data().get()); + } + + // Gather large prediction entries + /// @todo Non-blocking gather + Al::request prediction_vals_req, prediction_inds_req; + cuda::thrust::vector gathered_prediction_vals; + cuda::thrust::vector 
gathered_prediction_inds; + if (col_comm_size > 1) { + if (col_comm_rank != col_comm_root) { + comm.gather(prediction_vals.data().get(), prediction_vals.size(), + col_comm_root, col_comm, sync_info); + comm.gather(prediction_inds.data().get(), prediction_inds.size(), + col_comm_root, col_comm, sync_info); + } else { + gathered_prediction_vals.resize(prediction_vals.size() * col_comm_size); + gathered_prediction_inds.resize(prediction_inds.size() * col_comm_size); + comm.gather(prediction_vals.data().get(), prediction_vals.size(), + gathered_prediction_vals.data().get(), + col_comm, sync_info); + comm.gather(prediction_inds.data().get(), prediction_inds.size(), + gathered_prediction_inds.data().get(), + col_comm, sync_info); + } + } + + // Find largest label entries in local data + grid_dims.x = (local_height + block_size - 1) / block_size; + if (grid_dims.x < 1) { grid_dims.x = 1; } + cuda::thrust::vector label_vals(grid_dims.x * local_width); + cuda::thrust::vector label_inds(grid_dims.x * local_width); + reduce_max_entries_kernel + <<>>( + local_height, local_width, + local_labels.LockedBuffer(), 1, local_labels.LDim(), + full_inds.data().get(), 1, local_height, + label_vals.data().get(), + label_inds.data().get()); + while (grid_dims.x > 1) { + const El::Int prev_height = grid_dims.x; + grid_dims.x = (prev_height + block_size - 1) / block_size; + cuda::thrust::vector prev_vals(std::move(label_vals)); + cuda::thrust::vector prev_inds(std::move(label_inds)); + label_vals.resize(grid_dims.x * local_width); + label_inds.resize(grid_dims.x * local_width); + reduce_max_entries_kernel + <<>>( + prev_height, local_width, + prev_vals.data().get(), 1, prev_height, + prev_inds.data().get(), 1, prev_height, + label_vals.data().get(), + label_inds.data().get()); + } + + // Gather large label entries + /// @todo Non-blocking gather + Al::request label_vals_req, label_inds_req; + cuda::thrust::vector gathered_label_vals; + cuda::thrust::vector gathered_label_inds; + if (col_comm_size > 1) { + if (col_comm_rank != col_comm_root) { + comm.gather(label_vals.data().get(), label_vals.size(), + col_comm_root, col_comm, sync_info); + comm.gather(label_inds.data().get(), label_inds.size(), + col_comm_root, col_comm, sync_info); + } else { + gathered_label_vals.resize(label_vals.size() * col_comm_size); + gathered_label_inds.resize(label_inds.size() * col_comm_size); + comm.gather(label_vals.data().get(), label_vals.size(), + gathered_label_vals.data().get(), + col_comm, sync_info); + comm.gather(label_inds.data().get(), label_inds.size(), + gathered_label_inds.data().get(), + col_comm, sync_info); + } + } + + // Clean up temporary arrays + full_inds.clear(); + + // Find largest prediction entry in global data + comm.wait(prediction_vals_req); + comm.wait(prediction_inds_req); + if (col_comm_size > 1 && col_comm_rank == col_comm_root) { + grid_dims.x = (col_comm_size + block_size - 1) / block_size; + if (grid_dims.x < 1) { grid_dims.x = 1; } + prediction_vals.resize(grid_dims.x * local_width); + prediction_inds.resize(grid_dims.x * local_width); + reduce_max_entries_kernel + <<>>( + col_comm_size, local_width, + gathered_prediction_vals.data().get(), col_comm_size, 1, + gathered_prediction_inds.data().get(), col_comm_size, 1, + prediction_vals.data().get(), + prediction_inds.data().get()); + while (grid_dims.x > 1) { + const El::Int prev_height = grid_dims.x; + grid_dims.x = (prev_height + block_size - 1) / block_size; + cuda::thrust::vector prev_vals(std::move(prediction_vals)); + cuda::thrust::vector 
prev_inds(std::move(prediction_inds)); + prediction_vals.resize(grid_dims.x * local_width); + prediction_inds.resize(grid_dims.x * local_width); + reduce_max_entries_kernel + <<>>( + prev_height, local_width, + prev_vals.data().get(), 1, prev_height, + prev_inds.data().get(), 1, prev_height, + prediction_vals.data().get(), + prediction_inds.data().get()); + } + } + + // Find largest label entry in global data + comm.wait(label_vals_req); + comm.wait(label_inds_req); + if (col_comm_size > 1 && col_comm_rank == col_comm_root) { + grid_dims.x = (col_comm_size + block_size - 1) / block_size; + if (grid_dims.x < 1) { grid_dims.x = 1; } + label_vals.resize(grid_dims.x * local_width); + label_inds.resize(grid_dims.x * local_width); + reduce_max_entries_kernel + <<>>( + col_comm_size, local_width, + gathered_label_vals.data().get(), col_comm_size, 1, + gathered_label_inds.data().get(), col_comm_size, 1, + label_vals.data().get(), + label_inds.data().get()); + while (grid_dims.x > 1) { + const El::Int prev_height = grid_dims.x; + grid_dims.x = (prev_height + block_size - 1) / block_size; + cuda::thrust::vector prev_vals(std::move(label_vals)); + cuda::thrust::vector prev_inds(std::move(label_inds)); + label_vals.resize(grid_dims.x * local_width); + label_inds.resize(grid_dims.x * local_width); + reduce_max_entries_kernel + <<>>( + prev_height, local_width, + prev_vals.data().get(), 1, prev_height, + prev_inds.data().get(), 1, prev_height, + label_vals.data().get(), + label_inds.data().get()); + } + } + + // Compute categorical accuracy + if (col_comm_rank == col_comm_root) { + const El::Int grid_size = (local_width + block_size - 1) / block_size; + compute_accuracy_kernel<<>>( + local_width, + prediction_inds.data().get(), label_inds.data().get(), + local_loss.Buffer(), local_loss.LDim()); + } + +} + +} // namespace + +template <> +void categorical_accuracy_layer + ::fp_compute() { + fp_gpu(*get_comm(), + get_prev_activations(0), + get_prev_activations(1), + get_activations()); +} +template <> +void categorical_accuracy_layer + ::fp_compute() { + fp_gpu(*get_comm(), + get_prev_activations(0), + get_prev_activations(1), + get_activations()); +} + +} // namespace lbann diff --git a/src/layers/loss/cross_entropy.cu b/src/layers/loss/cross_entropy.cu index 0d990ff893c..db90a46d314 100644 --- a/src/layers/loss/cross_entropy.cu +++ b/src/layers/loss/cross_entropy.cu @@ -32,48 +32,6 @@ namespace lbann { namespace { -// Atomic add functions -#if __CUDA_ARCH__ >= 530 -__device__ inline __half atomic_add(__half* address, __half val) { -#if 0 // TODO: replace this once Nvidia implements atomicAdd for __half - return atomicAdd(address, val); -#else - unsigned int* address_as_uint = (unsigned int*) address; - unsigned int old = *address_as_uint; - __half* old_as_half = (__half*) &old; - unsigned int assumed; - unsigned int updated; - __half* updated_as_half = (__half*) &updated; - do { - assumed = old; - updated = old; - *updated_as_half += value; - old = atomicCAS(address_as_uint, assumed, updated); - } while (assumed != old); - return *old_as_half; -#endif // 0 -} -#endif // __CUDA_ARCH__ >= 530 -__device__ inline float atomic_add(float* address, float val) { - return atomicAdd(address, val); -} -__device__ inline double atomic_add(double* address, double val) { -#if __CUDA_ARCH__ >= 600 - return atomicAdd(address, val); -#else - unsigned long long int* address_as_ull = - (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = 
atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + - __longlong_as_double(assumed))); - } while (assumed != old); - return __longlong_as_double(old); -#endif // __CUDA_ARCH__ < 600 -} - template __global__ void fp_kernel(int height, int width, const DataType* __restrict__ prediction, @@ -112,7 +70,7 @@ __global__ void fp_kernel(int height, int width, } } if (tid == 0) { - atomic_add(&contribution[col], shared_contribution[0]); + cuda::atomic_add(&contribution[col], shared_contribution[0]); } } diff --git a/src/layers/loss/entrywise.cpp b/src/layers/loss/entrywise.cpp new file mode 100644 index 00000000000..e6f93ac9620 --- /dev/null +++ b/src/layers/loss/entrywise.cpp @@ -0,0 +1,245 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/loss/entrywise.hpp" +#include "lbann/utils/entrywise_operator.hpp" + +namespace lbann { + +namespace { + +// Helpful constants +constexpr DataType zero = 0; +constexpr DataType one = 1; + +/** Apply a binary backprop operator to CPU data. + * The input and output data must be on CPU and must have the same + * dimensions. Given a binary function \f$ y = f(x_1,x_2) \f$, the + * corresponding BinaryBackPropOperator is a 5-ary function with the + * arguments \f$ x_1 \f$, \f$ x_2 \f$, \f$ dL/dy \f$, \f$ dL/dx_1\f$, + * \f$ dL/dx_2 \f$. The last two arguments should be overwritten when + * the BinaryBackPropOperator is called. 
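To make the 5-ary contract described above concrete, here is a toy operator for f(x1, x2) = x1 * x2, purely illustrative and not one of the operators in this patch. The binary call is forward prop; the 5-ary call overwrites dx1 and dx2 with dL/dx1 and dL/dx2:

```cpp
#include <cstdio>
#include <vector>

// Hypothetical example operator for f(x1, x2) = x1 * x2.
struct multiply_op {
  float operator()(float x1, float x2) const { return x1 * x2; }
  void operator()(float x1, float x2, float dy, float& dx1, float& dx2) const {
    dx1 = x2 * dy;  // df/dx1 = x2
    dx2 = x1 * dy;  // df/dx2 = x1
  }
};

// Entry-wise application, mirroring the contiguous-buffer path above.
template <typename Op>
void apply_backprop(const std::vector<float>& x1, const std::vector<float>& x2,
                    const std::vector<float>& dy,
                    std::vector<float>& dx1, std::vector<float>& dx2) {
  Op op;
  for (size_t i = 0; i < x1.size(); ++i) {
    op(x1[i], x2[i], dy[i], dx1[i], dx2[i]);
  }
}

int main() {
  std::vector<float> x1 = {1, 2}, x2 = {3, 4}, dy = {1, 1};
  std::vector<float> dx1(2), dx2(2);
  apply_backprop<multiply_op>(x1, x2, dy, dx1, dx2);
  std::printf("dx1 = {%g, %g}, dx2 = {%g, %g}\n",
              dx1[0], dx1[1], dx2[0], dx2[1]);  // dx1 = {3, 4}, dx2 = {1, 2}
  return 0;
}
```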
+ */ +template +void apply_binary_backprop_operator(const AbsMat& x1, + const AbsMat& x2, + const AbsMat& dy, + AbsMat& dx1, + AbsMat& dx2) { + if (x1.Contiguous() && x2.Contiguous() && dy.Contiguous() + && dx1.Contiguous() && dx2.Contiguous()) { + const auto* x1_buffer = x1.LockedBuffer(); + const auto* x2_buffer = x2.LockedBuffer(); + const auto* dy_buffer = dy.LockedBuffer(); + auto* dx1_buffer = dx1.Buffer(); + auto* dx2_buffer = dx2.Buffer(); + const size_t size = x1.Height() * x1.Width(); +#pragma omp parallel for + for (size_t i = 0; i < size; ++i) { + BinaryBackPropOperator op; + op(x1_buffer[i], x2_buffer[i], dy_buffer[i], + dx1_buffer[i], dx2_buffer[i]); + } + } else { + auto const width = x1.Width(); + auto const height = x1.Height(); +#pragma omp parallel for collapse(2) + for (El::Int col = 0; col < width; ++col) { + for (El::Int row = 0; row < height; ++row) { + BinaryBackPropOperator op; + op(x1(row, col), x2(row, col), dy(row, col), + dx1(row, col), dx2(row, col)); + } + } + } + +} + +// ========================================================= +// Operator objects for entry-wise binary layers +// ========================================================= +// Note: Binary operator corresponds to forward prop step +// (\f$ y = f(x_1,x_2) \f$) and 5-ary operator corresponds +// to back prop step +// (\f$ \frac{dL}{dx_i} = \frac{dL}{dy} \frac{df}{dx_i}(x_1,x_2) \f$). + +/** Binary cross entropy operator. */ +struct binary_cross_entropy_op { + inline DataType operator()(const DataType& x1, + const DataType& x2) const { + DataType y = zero; + if (x2 > zero) { y += -x2 * std::log(x1); } + if (x2 < one) { y += -(one-x2) * std::log(one-x1); } + return y; + } + inline void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + dx1 = zero; + dx2 = zero; + if (dy == zero) { return; } + if (x2 > zero) { + dx1 += -x2 / x1 * dy; + dx2 += -std::log(x1) * dy; + } + if (x2 < one) { + dx1 += (one-x2) / (one-x1) * dy; + dx2 += std::log(one-x1) * dy; + } + } +}; + +/** Sigmoid binary cross entropy operator. + * Equivalent to applying a sigmoid function to the first operand and + * then computing the binary cross entropy. Numerically stable + * implementation is taken from + * https://www.tensorflow.org/api_docs/python/tf/nn/sigmoid_cross_entropy_with_logits. + */ +struct sigmoid_binary_cross_entropy_op { + inline DataType operator()(const DataType& x1, + const DataType& x2) const { + const auto& z = std::max(zero, std::min(x2, one)); + if (x1 > zero) { + return (one - z) * x1 + std::log1p(std::exp(-x1)); + } else { + return - x1 * z + std::log1p(std::exp(x1)); + } + } + inline void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + const auto& z = std::max(zero, std::min(x2, one)); + if (x1 > zero) { + dx1 = -z + 1 / (one + std::exp(-x1)); + } else { + dx1 = one - z - 1 / (one + std::exp(x1)); + } + dx1 *= dy; + dx2 = (x2 == z) ? -x1 * dy : zero; + } +}; + +/** Boolean accuracy operator. */ +struct boolean_accuracy_op { + inline DataType operator()(const DataType& x1, + const DataType& x2) const { + const auto& b1 = x1 >= DataType(0.5); + const auto& b2 = x2 >= DataType(0.5); + return b1 == b2 ? one : zero; + } + inline void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + dx1 = zero; + dx2 = zero; + } +}; + +/** Boolean false negative operator. 
*/ +struct boolean_false_negative_op { + inline DataType operator()(const DataType& x1, + const DataType& x2) const { + const auto& b1 = x1 >= DataType(0.5); + const auto& b2 = x2 >= DataType(0.5); + return (!b1 && b2) ? one : zero; + } + inline void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + dx1 = zero; + dx2 = zero; + } +}; + +/** Boolean false positive operator. */ +struct boolean_false_positive_op { + inline DataType operator()(const DataType& x1, + const DataType& x2) const { + const auto& b1 = x1 >= DataType(0.5); + const auto& b2 = x2 >= DataType(0.5); + return (b1 && !b2) ? one : zero; + } + inline void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + dx1 = zero; + dx2 = zero; + } +}; + +} // namespace + +// Template instantiation +#define INSTANTIATE(layer, op) \ + template <> \ + void layer \ + ::fp_compute() { \ + apply_entrywise_binary_operator(get_prev_activations(0), \ + get_prev_activations(1), \ + get_activations()); \ + } \ + template <> \ + void layer \ + ::bp_compute() { \ + apply_binary_backprop_operator(get_local_prev_activations(0), \ + get_local_prev_activations(1), \ + get_local_prev_error_signals(), \ + get_local_error_signals(0), \ + get_local_error_signals(1)); \ + } \ + template <> \ + void layer \ + ::fp_compute() { \ + apply_entrywise_binary_operator(get_prev_activations(0), \ + get_prev_activations(1), \ + get_activations()); \ + } \ + template <> \ + void layer \ + ::bp_compute() { \ + apply_binary_backprop_operator(get_local_prev_activations(0), \ + get_local_prev_activations(1), \ + get_local_prev_error_signals(), \ + get_local_error_signals(0), \ + get_local_error_signals(1)); \ + } + INSTANTIATE(binary_cross_entropy_layer, binary_cross_entropy_op) + INSTANTIATE(sigmoid_binary_cross_entropy_layer, sigmoid_binary_cross_entropy_op) + INSTANTIATE(boolean_accuracy_layer, boolean_accuracy_op) + INSTANTIATE(boolean_false_negative_layer, boolean_false_negative_op) + INSTANTIATE(boolean_false_positive_layer, boolean_false_positive_op) + +} // namespace lbann diff --git a/src/layers/loss/entrywise.cu b/src/layers/loss/entrywise.cu new file mode 100644 index 00000000000..b27d2557df5 --- /dev/null +++ b/src/layers/loss/entrywise.cu @@ -0,0 +1,276 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/loss/entrywise.hpp" + +namespace lbann { + +namespace { + +/** CUDA kernel to apply an binary backprop operator. */ +template +__global__ +void binary_backprop_operator_kernel(El::Int height, El::Int width, + const DataType* __restrict__ x1, + El::Int x1_ldim, + const DataType* __restrict__ x2, + El::Int x2_ldim, + const DataType* __restrict__ dy, + El::Int dy_ldim, + DataType* __restrict__ dx1, + El::Int dx1_ldim, + DataType* __restrict__ dx2, + El::Int dx2_ldim) { + const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int size = height * width; + const El::Int num_threads = blockDim.x * gridDim.x; + BinaryBackPropOperator op; + for (El::Int pos = gid; pos < size; pos += num_threads) { + const auto& row = pos % height; + const auto& col = pos / height; + op(x1[row + col * x1_ldim], + x2[row + col * x2_ldim], + dy[row + col * dy_ldim], + dx1[row + col * dx1_ldim], + dx2[row + col * dx2_ldim]); + } +} + +/** Apply a binary backprop operator to CPU data. + * The input and output data must be on CPU and must have the same + * dimensions. Given a binary function \f$ y = f(x_1,x_2) \f$, the + * corresponding BinaryBackPropOperator is a 5-ary function with the + * arguments \f$ x_1 \f$, \f$ x_2 \f$, \f$ dL/dy \f$, \f$ dL/dx_1\f$, + * \f$ dL/dx_2 \f$. The last two arguments should be overwritten when + * the BinaryBackPropOperator is called. + */ +template +void apply_binary_backprop_operator(const AbsMat& x1, + const AbsMat& x2, + const AbsMat& dy, + AbsMat& dx1, + AbsMat& dx2) { + + // Get CUDA grid dimensions + // Note: Maximum CUDA grid dimension is 2^32-1 + // (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications). + const El::Int height = x1.Height(); + const El::Int width = x1.Width(); + const El::Int block_dim = 256; + El::Int grid_dim = (height * width + block_dim - 1) / block_dim; + if (sizeof(El::Int) > sizeof(unsigned int) + && grid_dim > std::numeric_limits::max()) { + grid_dim = std::numeric_limits::max(); + } + + // Launch CUDA kernel + if (grid_dim > 0) { + CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + binary_backprop_operator_kernel + <<>>( + height, width, + x1.LockedBuffer(), x1.LDim(), + x2.LockedBuffer(), x2.LDim(), + dy.LockedBuffer(), dy.LDim(), + dx1.Buffer(), dx1.LDim(), + dx2.Buffer(), dx2.LDim()); + } + +} + +// ========================================================= +// Operator objects for entry-wise binary layers +// ========================================================= +// Note: Binary operator corresponds to forward prop step +// (\f$ y = f(x_1,x_2) \f$) and 5-ary operator corresponds +// to back prop step +// (\f$ \frac{dL}{dx_i} = \frac{dL}{dy} \frac{df}{dx_i}(x_1,x_2) \f$). + +/** Binary cross entropy operator. 
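Reviewer note: the grid-size note in apply_binary_backprop_operator above works because binary_backprop_operator_kernel uses a grid-stride loop, so clamping the grid to the 2^32-1 hardware limit never skips elements; each thread just processes more than one. The launch arithmetic in isolation (plain C++, hypothetical function name):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>

// One thread per element, rounded up, clamped to the CUDA grid limit.
// A grid-stride loop in the kernel covers the remainder when clamped.
long long choose_grid_dim(long long size, long long block_dim) {
  const long long max_grid = std::numeric_limits<std::uint32_t>::max();
  return std::min((size + block_dim - 1) / block_dim, max_grid);
}

int main() {
  std::printf("%lld\n", choose_grid_dim(1000, 256));       // 4 blocks
  std::printf("%lld\n", choose_grid_dim(1LL << 40, 256));  // clamped: 4294967295
  return 0;
}
```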
*/ +struct binary_cross_entropy_op { + inline __device__ DataType operator()(const DataType& x1, + const DataType& x2) const { + constexpr DataType zero = 0; + constexpr DataType one = 1; + DataType y = zero; + if (x2 > zero) { y += -x2 * cuda::log(x1); } + if (x2 < one) { y += -(one-x2) * cuda::log(one-x1); } + return y; + } + inline __device__ void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + constexpr DataType zero = 0; + constexpr DataType one = 1; + dx1 = zero; + dx2 = zero; + if (dy == zero) { return; } + if (x2 > zero) { + dx1 += -x2 / x1 * dy; + dx2 += -cuda::log(x1) * dy; + } + if (x2 < one) { + dx1 += (one-x2) / (one-x1) * dy; + dx2 += cuda::log(one-x1) * dy; + } + } +}; + +/** Sigmoid binary cross entropy operator. + * Equivalent to applying a sigmoid function to the first operand and + * then computing the binary cross entropy. Numerically stable + * implementation is taken from + * https://www.tensorflow.org/api_docs/python/tf/nn/sigmoid_cross_entropy_with_logits. + */ +struct sigmoid_binary_cross_entropy_op { + inline __device__ DataType operator()(const DataType& x1, + const DataType& x2) const { + constexpr DataType zero = 0; + constexpr DataType one = 1; + const auto& z = cuda::max(zero, cuda::min(x2, one)); + if (x1 > zero) { + return (one - z) * x1 + cuda::log1p(cuda::exp(-x1)); + } else { + return - x1 * z + cuda::log1p(cuda::exp(x1)); + } + } + inline __device__ void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + constexpr DataType zero = 0; + constexpr DataType one = 1; + const auto& z = cuda::max(zero, cuda::min(x2, one)); + if (x1 > zero) { + dx1 = -z + 1 / (one + cuda::exp(-x1)); + } else { + dx1 = one - z - 1 / (one + cuda::exp(x1)); + } + dx1 *= dy; + dx2 = (x2 == z) ? -x1 * dy : zero; + } +}; + +/** Boolean accuracy operator. */ +struct boolean_accuracy_op { + inline __device__ DataType operator()(const DataType& x1, + const DataType& x2) const { + const auto& b1 = x1 >= DataType(0.5); + const auto& b2 = x2 >= DataType(0.5); + return b1 == b2 ? DataType(1) : DataType(0); + } + inline __device__ void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + dx1 = DataType(0); + dx2 = DataType(0); + } +}; + +/** Boolean false negative operator. */ +struct boolean_false_negative_op { + inline __device__ DataType operator()(const DataType& x1, + const DataType& x2) const { + const auto& b1 = x1 >= DataType(0.5); + const auto& b2 = x2 >= DataType(0.5); + return (!b1 && b2) ? DataType(1) : DataType(0); + } + inline __device__ void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + dx1 = DataType(0); + dx2 = DataType(0); + } +}; + +/** Boolean false positive operator. */ +struct boolean_false_positive_op { + inline __device__ DataType operator()(const DataType& x1, + const DataType& x2) const { + const auto& b1 = x1 >= DataType(0.5); + const auto& b2 = x2 >= DataType(0.5); + return (b1 && !b2) ? 
DataType(1) : DataType(0); + } + inline __device__ void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + dx1 = DataType(0); + dx2 = DataType(0); + } +}; + +} // namespace + +// Template instantiation +#define INSTANTIATE(layer, op) \ + template <> \ + void layer \ + ::fp_compute() { \ + cuda::apply_entrywise_binary_operator(get_prev_activations(0), \ + get_prev_activations(1), \ + get_activations()); \ + } \ + template <> \ + void layer \ + ::bp_compute() { \ + apply_binary_backprop_operator(get_local_prev_activations(0), \ + get_local_prev_activations(1), \ + get_local_prev_error_signals(), \ + get_local_error_signals(0), \ + get_local_error_signals(1)); \ + } \ + template <> \ + void layer \ + ::fp_compute() { \ + cuda::apply_entrywise_binary_operator(get_prev_activations(0), \ + get_prev_activations(1), \ + get_activations()); \ + } \ + template <> \ + void layer \ + ::bp_compute() { \ + apply_binary_backprop_operator(get_local_prev_activations(0), \ + get_local_prev_activations(1), \ + get_local_prev_error_signals(), \ + get_local_error_signals(0), \ + get_local_error_signals(1)); \ + } + INSTANTIATE(binary_cross_entropy_layer, binary_cross_entropy_op) + INSTANTIATE(sigmoid_binary_cross_entropy_layer, sigmoid_binary_cross_entropy_op) + INSTANTIATE(boolean_accuracy_layer, boolean_accuracy_op) + INSTANTIATE(boolean_false_negative_layer, boolean_false_negative_op) + INSTANTIATE(boolean_false_positive_layer, boolean_false_positive_op) + +} // namespace lbann diff --git a/src/layers/loss/l2_norm2.cpp b/src/layers/loss/l2_norm2.cpp new file mode 100644 index 00000000000..f7e7cdb63d5 --- /dev/null +++ b/src/layers/loss/l2_norm2.cpp @@ -0,0 +1,95 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/loss/l2_norm2.hpp" + +namespace lbann { + +namespace { + +void local_fp_cpu(const AbsMat& local_input, + AbsMat& local_contribution) { +#pragma omp parallel for + for (El::Int col = 0; col < local_input.Width(); ++col) { + DataType sum = 0; + for (El::Int row = 0; row < local_input.Height(); ++row) { + const auto& x = local_input(row, col); + sum += x * x; + } + local_contribution(0, col) = sum; + } +} + +void local_bp_cpu(const AbsMat& local_input, + const AbsMat& local_gradient_wrt_output, + AbsMat& local_gradient_wrt_input) { + auto const width = local_input.Width(); + auto const height = local_input.Height(); +#pragma omp parallel for collapse(2) + for (El::Int col = 0; col < width; ++col) { + for (El::Int row = 0; row < height; ++row) { + const auto& x = local_input(row, col); + const auto& dy = local_gradient_wrt_output(0, col); + auto& dx = local_gradient_wrt_input(row, col); + dx = 2 * x * dy; + } + } +} + +} // namespace + +template <> +void l2_norm2_layer + ::local_fp_compute(const AbsMat& local_input, + AbsMat& local_contribution) { + local_fp_cpu(local_input, local_contribution); +} +template <> +void l2_norm2_layer + ::local_bp_compute(const AbsMat& local_input, + const AbsMat& local_gradient_wrt_output, + AbsMat& local_gradient_wrt_input) { + local_bp_cpu(local_input, + local_gradient_wrt_output, + local_gradient_wrt_input); +} +template <> +void l2_norm2_layer + ::local_fp_compute(const AbsMat& local_input, + AbsMat& local_contribution) { + local_fp_cpu(local_input, local_contribution); +} +template <> +void l2_norm2_layer + ::local_bp_compute(const AbsMat& local_input, + const AbsMat& local_gradient_wrt_output, + AbsMat& local_gradient_wrt_input) { + local_bp_cpu(local_input, + local_gradient_wrt_output, + local_gradient_wrt_input); +} + +} // namespace lbann diff --git a/src/layers/loss/l2_norm2.cu b/src/layers/loss/l2_norm2.cu new file mode 100644 index 00000000000..84fda9d3508 --- /dev/null +++ b/src/layers/loss/l2_norm2.cu @@ -0,0 +1,168 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
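For the record, the backprop rule dx = 2 * x * dy implemented in local_bp_cpu above is the chain rule applied to the squared L2 norm; a quick derivation:

```latex
L(x) = \lVert x \rVert_2^2 = \sum_i x_i^2
\quad\Longrightarrow\quad
\frac{\partial L}{\partial x_i} = 2 x_i
\quad\Longrightarrow\quad
\frac{\partial E}{\partial x_i}
  = \frac{\partial E}{\partial L} \cdot 2 x_i ,
```

where dE/dL is the incoming gradient dy from the objective function.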
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/loss/l2_norm2.hpp" + +namespace lbann { + +namespace { + +template +__global__ void fp_kernel(El::Int local_height, + El::Int local_width, + const DataType* __restrict__ input, + El::Int input_ldim, + DataType* __restrict__ contribution) { + + // Indices + const El::Int tid = threadIdx.x; + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + + // Compute local contribution for each matrix column + for (El::Int col = bidy; col < local_width; col += gridDim.y) { + + // Compute contributions for each thread + DataType private_contribution = 0; + for (El::Int row = gidx; row < local_height; row += nthreadsx) { + const auto& x = input[row + col * input_ldim]; + private_contribution += x * x; + } + + // Shared memory reduction to get contribution for each block + /// @todo unroll loops + __shared__ DataType shared_contribution[block_size]; + shared_contribution[tid] = private_contribution; + for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { + __syncthreads(); + if (tid < stride) { + shared_contribution[tid] += shared_contribution[tid + stride]; + } + } + if (tid == 0) { + cuda::atomic_add(&contribution[col], shared_contribution[0]); + } + + } + +} + +void local_fp_gpu(const AbsMat& local_input, AbsMat& local_contribution) { + El::Zero(local_contribution); + if (!local_input.IsEmpty()) { + const auto& local_height = local_input.Height(); + const auto& local_width = local_input.Width(); + const El::Int block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.x = (local_height + block_size - 1) / block_size; + grid_dims.y = local_width; + CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + fp_kernel + <<>>( + local_height, local_width, + local_input.LockedBuffer(), local_input.LDim(), + local_contribution.Buffer()); + } +} + +template +__global__ void bp_kernel(El::Int local_height, El::Int local_width, + const DataType* __restrict__ input, + El::Int input_ldim, + const DataType* __restrict__ gradient_wrt_output, + DataType* __restrict__ gradient_wrt_input, + El::Int gradient_wrt_input_ldim) { + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + for (El::Int col = bidy; col < local_width; col += gridDim.y) { + const auto& dy = gradient_wrt_output[col]; + for (El::Int row = gidx; row < local_height; row += nthreadsx) { + const auto& x = input[row + col * input_ldim]; + auto& dx = gradient_wrt_input[row + col * gradient_wrt_input_ldim]; + dx = 2 * x * dy; + } + } +} + +void local_bp_gpu(const AbsMat& local_input, + const AbsMat& local_gradient_wrt_output, + AbsMat& local_gradient_wrt_input) { + if (!local_input.IsEmpty()) { + const auto& local_height = local_input.Height(); + const auto& local_width = local_input.Width(); + const El::Int block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.x = (local_height + block_size - 1) / block_size; + grid_dims.y = local_width; + CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + bp_kernel + <<>>( + local_height, local_width, + local_input.LockedBuffer(), local_input.LDim(), + local_gradient_wrt_output.LockedBuffer(), + local_gradient_wrt_input.Buffer(), + local_gradient_wrt_input.LDim()); + } +} + +} // namespace + +template <> +void l2_norm2_layer + ::local_fp_compute(const 
AbsMat& local_input, + AbsMat& local_contribution) { + local_fp_gpu(local_input, local_contribution); +} +template <> +void l2_norm2_layer + ::local_bp_compute(const AbsMat& local_input, + const AbsMat& local_gradient_wrt_output, + AbsMat& local_gradient_wrt_input) { + local_bp_gpu(local_input, + local_gradient_wrt_output, + local_gradient_wrt_input); +} +template <> +void l2_norm2_layer + ::local_fp_compute(const AbsMat& local_input, + AbsMat& local_contribution) { + local_fp_gpu(local_input, local_contribution); +} +template <> +void l2_norm2_layer + ::local_bp_compute(const AbsMat& local_input, + const AbsMat& local_gradient_wrt_output, + AbsMat& local_gradient_wrt_input) { + local_bp_gpu(local_input, + local_gradient_wrt_output, + local_gradient_wrt_input); +} + +} // namespace lbann diff --git a/src/layers/loss/top_k_categorical_accuracy.cpp b/src/layers/loss/top_k_categorical_accuracy.cpp index f1ceac48ca9..70b780bf38b 100644 --- a/src/layers/loss/top_k_categorical_accuracy.cpp +++ b/src/layers/loss/top_k_categorical_accuracy.cpp @@ -81,7 +81,7 @@ void fp_cpu(lbann_comm& comm, } else if (local_width < 1) { return; } - + // Column communicator auto&& col_comm = predictions.ColComm(); const auto& col_comm_rank = El::mpi::Rank(col_comm); @@ -129,13 +129,13 @@ void fp_cpu(lbann_comm& comm, comm.gather(reinterpret_cast(top_entries.data()), top_entries.size() * sizeof(entry), col_comm_root, - col_comm); + col_comm, El::SyncInfo{}); } else { std::vector global_top_entries(col_comm_size * local_width * k); comm.gather(reinterpret_cast(top_entries.data()), top_entries.size() * sizeof(entry), reinterpret_cast(global_top_entries.data()), - col_comm); + col_comm, El::SyncInfo{}); #pragma omp parallel for for (El::Int col = 0; col < local_width; ++col) { std::vector col_entries(col_comm_size * k); @@ -168,7 +168,7 @@ void fp_cpu(lbann_comm& comm, } } } - + } } // namespace diff --git a/src/layers/loss/top_k_categorical_accuracy.cu b/src/layers/loss/top_k_categorical_accuracy.cu index c037e939743..db0ffeec781 100644 --- a/src/layers/loss/top_k_categorical_accuracy.cu +++ b/src/layers/loss/top_k_categorical_accuracy.cu @@ -28,11 +28,7 @@ #include "lbann/utils/cuda.hpp" #include "lbann/utils/exception.hpp" -#include -#include #include -#include -#include #include namespace lbann { @@ -41,42 +37,22 @@ namespace { /** Sparse vector entry. */ struct entry { - /** Vector entry value. */ DataType value; /** Vector entry index. */ El::Int index; - - /** Minimum possible value. */ - static constexpr DataType min_value = -std::numeric_limits::infinity(); - /** Maximum possible index. */ - static constexpr El::Int max_index = std::numeric_limits::max(); - }; /** Comparison operation to sort sparse vector entries. * Entries are sorted by value in decreasing order, with ties broken * in favor of entries with smaller indices. */ -struct entry_compare : thrust::binary_function { +struct entry_compare : ::thrust::binary_function { __host__ __device__ bool operator()(const entry& a, const entry& b) const { return a.value > b.value || (a.value == b.value && a.index < b.index); } }; -/** Reduction operation to get largest sparse vector entry. - * Ties are broken in favor of entries with smaller indices. - */ -struct entry_reduce : thrust::binary_function { - __host__ __device__ entry operator()(const entry& a, const entry& b) const { - if (a.value > b.value || (a.value == b.value && a.index < b.index)) { - return a; - } else { - return b; - } - } -}; - /** Convert columns of a dense matrix into sparse vectors. 
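Reviewer note: with entry_reduce removed above, every path now selects the top-k entries by sorting with entry_compare, so ties still resolve deterministically toward smaller indices. A host-side analogue of the per-column selection (illustrative, hypothetical names; the GPU path uses Thrust sorts instead):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

struct Entry { float value; long index; };

// Same ordering as entry_compare above: larger values first, ties broken
// toward smaller indices, so top-k results are deterministic.
bool entry_less(const Entry& a, const Entry& b) {
  return a.value > b.value || (a.value == b.value && a.index < b.index);
}

// Select the k best entries of one column by partial sorting.
std::vector<Entry> top_k(std::vector<Entry> col, size_t k) {
  k = std::min(k, col.size());
  std::partial_sort(col.begin(), col.begin() + k, col.end(), entry_less);
  col.resize(k);
  return col;
}

int main() {
  const auto top = top_k({{0.2f, 0}, {0.9f, 1}, {0.9f, 2}, {0.5f, 3}}, 2);
  for (const auto& e : top) { std::printf("(%g, %ld) ", e.value, e.index); }
  std::printf("\n");  // (0.9, 1) (0.9, 2)
  return 0;
}
```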
  *  The matrix and vectors are both distributed, so entry indices in
  *  the sparse vectors correspond to global row indices in the dense
@@ -105,7 +81,7 @@ __global__ void dense_matrix_to_sparse_vectors(El::Int local_vector_size,
         current_entry.value = local_matrix[local_row + local_col * local_matrix_ldim];
         current_entry.index = global_row;
       } else {
-        current_entry.value = entry::min_value;
+        current_entry.value = -cuda::infinity<DataType>();
         current_entry.index = global_matrix_height;
       }
     }
@@ -130,7 +106,7 @@ __global__ void fill_with_tensor_index(El::Int tensor_size,
   const El::Int num_threads = blockDim.x * gridDim.x;
   for (El::Int i = gid; i < tensor_size; i += num_threads) {
     tensor[i] = (i / dim_stride) % dim;
-  }
+  }
 }
 
 /** Get indices corresponding to one-hot matrix.
@@ -156,7 +132,7 @@ __global__ void one_hot_matrix_to_indices(El::Int local_height,
                               + local_row * global_matrix_col_stride);
       indices[local_col] = global_row;
     }
-  }
+  }
 }
 
 /** Compute categorical accuracy for each matrix column.
@@ -182,7 +158,7 @@ __global__ void compute_categorical_accuracy(El::Int k,
         && label_index <= max_entry) {
       loss[col * loss_stride] = DataType(1);
     }
-  }
+  }
 }
 
 /** GPU implementation of top-k categorical accuracy layer forward prop. */
@@ -191,20 +167,14 @@ void fp_gpu(lbann_comm& comm,
             const AbsDistMat& predictions,
             const AbsDistMat& labels,
             AbsDistMat& loss) {
-  if (predictions.Wrap() != El::ELEMENT
-      || labels.Wrap() != El::ELEMENT
-      || loss.Wrap() != El::ELEMENT) {
-    LBANN_ERROR("top-k categorical accuracy layer GPU implementation "
-                "assumes elemental distributed matrices");
-  }
 
   // Local matrices
   const auto& local_predictions = predictions.LockedMatrix();
   const auto& local_labels = labels.LockedMatrix();
   auto& local_loss = loss.Matrix();
-  const El::Int height = predictions.Height();
-  const El::Int local_height = local_predictions.Height();
-  const El::Int local_width = local_predictions.Width();
+  const auto& height = predictions.Height();
+  const auto& local_height = local_predictions.Height();
+  const auto& local_width = local_predictions.Width();
 
   // Trivial cases
   if (k < 1) {
@@ -225,20 +195,16 @@ void fp_gpu(lbann_comm& comm,
 
   // GPU objects
   auto&& stream = El::GPUManager::Stream();
+  auto&& event = El::GPUManager::Event();
+  El::SyncInfo<El::Device::GPU> syncInfo{stream, event};
   cuda::thrust::allocator<> alloc(stream);
-  using entry_array = thrust::device_vector<entry, cuda::thrust::allocator<entry>>;
-  using index_array = thrust::device_vector<El::Int, cuda::thrust::allocator<El::Int>>;
 
   // Get label indices
-  index_array label_indices(local_width);
+  cuda::thrust::vector<El::Int> label_indices(local_width, height);
   {
     const auto& local_size = local_height * local_width;
     const auto& block_dim = 256;
     const auto& grid_dim = (local_size + block_dim - 1) / block_dim;
-    thrust::fill(thrust::cuda::par(alloc).on(stream),
-                 label_indices.begin(),
-                 label_indices.end(),
-                 height);
     one_hot_matrix_to_indices<<<grid_dim, block_dim, 0, stream>>>(
       local_height, local_width,
       labels.ColShift(), labels.ColStride(),
@@ -249,18 +215,18 @@ void fp_gpu(lbann_comm& comm,
     El::mpi::AllReduce(label_indices.data().get(),
                        label_indices.size(),
                        El::mpi::MIN,
-                       col_comm);
+                       col_comm, syncInfo);
   }
 
   // Find top-k entries in each column of local prediction matrix
-  entry_array top_entries(local_width * k);
+  cuda::thrust::vector<entry> top_entries(local_width * k);
   {
     const auto& num_local_entries_per_col = std::max(local_height, k);
     const auto& num_local_entries = local_width * num_local_entries_per_col;
     const auto& block_dim = 256;
     const auto& grid_dim = (num_local_entries + block_dim - 1) / block_dim;
-    entry_array local_entries(num_local_entries);
-    index_array local_entries_cols(num_local_entries);
+    cuda::thrust::vector<entry> local_entries(num_local_entries);
+    cuda::thrust::vector<El::Int> local_entries_cols(num_local_entries);
     dense_matrix_to_sparse_vectors<<<grid_dim, block_dim, 0, stream>>>(
       num_local_entries_per_col, local_height, local_width, height,
       predictions.ColShift(), predictions.ColStride(),
@@ -269,34 +235,23 @@ void fp_gpu(lbann_comm& comm,
     fill_with_tensor_index<<<grid_dim, block_dim, 0, stream>>>(
       num_local_entries, local_width, num_local_entries_per_col,
       local_entries_cols.data().get());
-    if (k == 1) {
-      thrust::reduce_by_key(thrust::cuda::par(alloc).on(stream),
-                            local_entries_cols.begin(),
-                            local_entries_cols.end(),
-                            local_entries.begin(),
-                            thrust::make_discard_iterator(),
-                            top_entries.begin(),
-                            thrust::equal_to<El::Int>(),
-                            entry_reduce());
-    } else {
-      thrust::sort_by_key(thrust::cuda::par(alloc).on(stream),
+    ::thrust::sort_by_key(alloc.system(),
                           local_entries.begin(),
                           local_entries.end(),
                           local_entries_cols.begin(),
                           entry_compare());
-      thrust::stable_sort_by_key(thrust::cuda::par(alloc).on(stream),
+    ::thrust::stable_sort_by_key(alloc.system(),
                                  local_entries_cols.begin(),
                                  local_entries_cols.end(),
                                  local_entries.begin());
-      CHECK_CUDA(cudaMemcpy2DAsync(top_entries.data().get(),
-                                   k * sizeof(entry),
-                                   local_entries.data().get(),
-                                   num_local_entries_per_col * sizeof(entry),
-                                   k * sizeof(entry),
-                                   local_width,
-                                   cudaMemcpyDeviceToDevice,
-                                   stream));
-    }
+    CHECK_CUDA(cudaMemcpy2DAsync(top_entries.data().get(),
+                                 k * sizeof(entry),
+                                 local_entries.data().get(),
+                                 num_local_entries_per_col * sizeof(entry),
+                                 k * sizeof(entry),
+                                 local_width,
+                                 cudaMemcpyDeviceToDevice,
+                                 stream));
   }
 
   // Find top-k entries in each column of global prediction matrix
@@ -309,25 +264,25 @@ void fp_gpu(lbann_comm& comm,
       comm.gather(reinterpret_cast<El::byte*>(top_entries.data().get()),
                   top_entries.size() * sizeof(entry),
                   col_comm_root,
-                  col_comm);
+                  col_comm, syncInfo);
     } else {
-      entry_array global_top_entries(num_entries);
-      index_array global_top_entries_cols(num_entries);
+      cuda::thrust::vector<entry> global_top_entries(num_entries);
+      cuda::thrust::vector<El::Int> global_top_entries_cols(num_entries);
       comm.gather(reinterpret_cast<El::byte*>(top_entries.data().get()),
                   top_entries.size() * sizeof(entry),
                   reinterpret_cast<El::byte*>(global_top_entries.data().get()),
-                  col_comm);
+                  col_comm, syncInfo);
       fill_with_tensor_index<<<grid_dim, block_dim, 0, stream>>>(
         num_entries, local_width, k, global_top_entries_cols.data().get());
-      thrust::sort_by_key(thrust::cuda::par(alloc).on(stream),
-                          global_top_entries.begin(),
-                          global_top_entries.end(),
-                          global_top_entries_cols.begin(),
-                          entry_compare());
-      thrust::stable_sort_by_key(thrust::cuda::par(alloc).on(stream),
-                                 global_top_entries_cols.begin(),
-                                 global_top_entries_cols.end(),
-                                 global_top_entries.begin());
+      ::thrust::sort_by_key(alloc.system(),
+                            global_top_entries.begin(),
+                            global_top_entries.end(),
+                            global_top_entries_cols.begin(),
+                            entry_compare());
+      ::thrust::stable_sort_by_key(alloc.system(),
+                                   global_top_entries_cols.begin(),
+                                   global_top_entries_cols.end(),
+                                   global_top_entries.begin());
       CHECK_CUDA(cudaMemcpy2DAsync(top_entries.data().get(),
                                    k * sizeof(entry),
                                    global_top_entries.data().get(),
@@ -336,7 +291,7 @@ void fp_gpu(lbann_comm& comm,
                                    local_width,
                                    cudaMemcpyDeviceToDevice,
                                    stream));
-    }
+    }
   }
 
   // Compute categorical accuracy
diff --git a/src/layers/math/CMakeLists.txt b/src/layers/math/CMakeLists.txt
new file mode 100644
index 00000000000..badf6cb2f5e
--- /dev/null
+++ b/src/layers/math/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Add the source files for this directory
+set_full_path(THIS_DIR_SOURCES
+  unary.cpp
+  binary.cpp
+  )
+
+if (LBANN_HAS_CUDA)
+  # Add the CUDA source files for this directory
+  set_full_path(THIS_DIR_CU_SOURCES
+    unary.cu
+    binary.cu
+    )
+endif ()
+
+# Propagate the files up the tree
+set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE)
+set(CUDA_SOURCES "${CUDA_SOURCES}" "${THIS_DIR_CU_SOURCES}" PARENT_SCOPE)
diff --git a/src/layers/math/binary.cpp b/src/layers/math/binary.cpp
new file mode 100644
index 00000000000..508c42b56a4
--- /dev/null
+++ b/src/layers/math/binary.cpp
@@ -0,0 +1,466 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/layers/math/binary.hpp"
+#include "lbann/utils/entrywise_operator.hpp"
+
+namespace lbann {
+
+namespace {
+
+// Helpful constants
+constexpr DataType zero = 0;
+constexpr DataType one = 1;
+
+/** Apply a binary backprop operator to CPU data.
+ *  The input and output data must be on CPU and must have the same
+ *  dimensions. Given a binary function \f$ y = f(x_1,x_2) \f$, the
+ *  corresponding BinaryBackPropOperator is a 5-ary function with the
+ *  arguments \f$ x_1 \f$, \f$ x_2 \f$, \f$ dL/dy \f$, \f$ dL/dx_1 \f$,
+ *  \f$ dL/dx_2 \f$. The last two arguments should be overwritten when
+ *  the BinaryBackPropOperator is called.
+ */
+template <typename BinaryBackPropOperator>
+void apply_binary_backprop_operator(const AbsMat& x1,
+                                    const AbsMat& x2,
+                                    const AbsMat& dy,
+                                    AbsMat& dx1,
+                                    AbsMat& dx2) {
+  if (x1.Contiguous() && x2.Contiguous() && dy.Contiguous()
+      && dx1.Contiguous() && dx2.Contiguous()) {
+    const auto* x1_buffer = x1.LockedBuffer();
+    const auto* x2_buffer = x2.LockedBuffer();
+    const auto* dy_buffer = dy.LockedBuffer();
+    auto* dx1_buffer = dx1.Buffer();
+    auto* dx2_buffer = dx2.Buffer();
+    const size_t size = x1.Height() * x1.Width();
+#pragma omp parallel for
+    for (size_t i = 0; i < size; ++i) {
+      BinaryBackPropOperator op;
+      op(x1_buffer[i], x2_buffer[i], dy_buffer[i],
+         dx1_buffer[i], dx2_buffer[i]);
+    }
+  } else {
+    auto const width = x1.Width();
+    auto const height = x1.Height();
+#pragma omp parallel for collapse(2)
+    for (El::Int col = 0; col < width; ++col) {
+      for (El::Int row = 0; row < height; ++row) {
+        BinaryBackPropOperator op;
+        op(x1(row, col), x2(row, col), dy(row, col),
+           dx1(row, col), dx2(row, col));
+      }
+    }
+  }
+
+}
+
+// =========================================================
+// Operator objects for entry-wise binary layers
+// =========================================================
+// Note: Binary operator corresponds to forward prop step
+// (\f$ y = f(x_1,x_2) \f$) and 5-ary operator corresponds
+// to back prop step
+// (\f$ \frac{dL}{dx_i} = \frac{dL}{dy} \frac{df}{dx_i}(x_1,x_2) \f$).
+
+/** Add operator. */
+struct add_op {
+  inline DataType operator()(const DataType& x1,
+                             const DataType& x2) const {
+    return x1 + x2;
+  }
+  inline void operator()(const DataType& x1,
+                         const DataType& x2,
+                         const DataType& dy,
+                         DataType& dx1,
+                         DataType& dx2) const {
+    dx1 = dy;
+    dx2 = dy;
+  }
+};
+
+/** Subtract operator. */
+struct subtract_op {
+  inline DataType operator()(const DataType& x1,
+                             const DataType& x2) const {
+    return x1 - x2;
+  }
+  inline void operator()(const DataType& x1,
+                         const DataType& x2,
+                         const DataType& dy,
+                         DataType& dx1,
+                         DataType& dx2) const {
+    dx1 = dy;
+    dx2 = -dy;
+  }
+};
+
+/** Multiply operator. */
+struct multiply_op {
+  inline DataType operator()(const DataType& x1,
+                             const DataType& x2) const {
+    return x1 * x2;
+  }
+  inline void operator()(const DataType& x1,
+                         const DataType& x2,
+                         const DataType& dy,
+                         DataType& dx1,
+                         DataType& dx2) const {
+    dx1 = dy * x2;
+    dx2 = dy * x1;
+  }
+};
+
+/** Divide operator. */
+struct divide_op {
+  inline DataType operator()(const DataType& x1,
+                             const DataType& x2) const {
+    return x1 / x2;
+  }
+  inline void operator()(const DataType& x1,
+                         const DataType& x2,
+                         const DataType& dy,
+                         DataType& dx1,
+                         DataType& dx2) const {
+    dx1 = dy / x2;
+    dx2 = -dy * x1 / (x2*x2);
+  }
+};
+
+/** Modulo operator. */
+struct mod_op {
+  inline DataType operator()(const DataType& x1,
+                             const DataType& x2) const {
+    return std::fmod(x1, x2);
+  }
+  inline void operator()(const DataType& x1,
+                         const DataType& x2,
+                         const DataType& dy,
+                         DataType& dx1,
+                         DataType& dx2) const {
+    dx1 = dy;
+    dx2 = -dy * std::floor(x1 / x2);
+  }
+};
+
+/** Power operator. */
+struct pow_op {
+  inline DataType operator()(const DataType& x1,
+                             const DataType& x2) const {
+    return std::pow(x1, x2);
+  }
+  inline void operator()(const DataType& x1,
+                         const DataType& x2,
+                         const DataType& dy,
+                         DataType& dx1,
+                         DataType& dx2) const {
+    dx1 = dy * x2 * std::pow(x1, x2 - one);
+    dx2 = dy * std::log(x1) * std::pow(x1, x2);
+  }
+};
+
+/** Safe divide operator.
+ *  If a standard division produces an infinity or NaN, zero is output
+ *  instead.
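+ *  For example (illustrative): safe_divide_op()(DataType(1), DataType(0))
+ *  evaluates to 0, where a plain division would produce inf.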
+ */ +struct safe_divide_op { + inline DataType operator()(const DataType& x1, + const DataType& x2) const { + const auto& y = x1 / x2; + if (std::isfinite(y)) { return y; } + else { return zero; } + } + inline void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + const auto& y = x1 / x2; + if (std::isfinite(y)) { + dx1 = dy / x2; + dx2 = -dy * x1 / (x2*x2); + } else { + dx1 = zero; + dx2 = zero; + } + } +}; + +/** Maximum operator. */ +struct max_op { + inline DataType operator()(const DataType& x1, + const DataType& x2) const { + return std::max(x1, x2); + } + inline void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + if (x1 > x2) { + dx1 = dy; + dx2 = zero; + } else if (x2 > x1) { + dx1 = zero; + dx2 = dy; + } else { + dx1 = dy / 2; + dx2 = dy / 2; + } + } +}; + +/** Minimum operator. */ +struct min_op { + inline DataType operator()(const DataType& x1, + const DataType& x2) const { + return std::min(x1, x2); + } + inline void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + if (x1 < x2) { + dx1 = dy; + dx2 = zero; + } else if (x2 < x1) { + dx1 = zero; + dx2 = dy; + } else { + dx1 = dy / 2; + dx2 = dy / 2; + } + } +}; + +/** Equal operator. */ +struct equal_op { + inline DataType operator()(const DataType& x1, + const DataType& x2) const { + return x1 == x2 ? one : zero; + } + inline void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + dx1 = zero; + dx2 = zero; + } +}; + +/** Not equal operator. */ +struct not_equal_op { + inline DataType operator()(const DataType& x1, + const DataType& x2) const { + return x1 == x2 ? zero : one; + } + inline void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + dx1 = zero; + dx2 = zero; + } +}; + +/** Less than operator. */ +struct less_op { + inline DataType operator()(const DataType& x1, + const DataType& x2) const { + return x1 < x2 ? one : zero; + } + inline void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + dx1 = zero; + dx2 = zero; + } +}; + +/** Less than or equal operator. */ +struct less_equal_op { + inline DataType operator()(const DataType& x1, + const DataType& x2) const { + return x1 <= x2 ? one : zero; + } + inline void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + dx1 = zero; + dx2 = zero; + } +}; + +/** Greater than operator. */ +struct greater_op { + inline DataType operator()(const DataType& x1, + const DataType& x2) const { + return x1 > x2 ? one : zero; + } + inline void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + dx1 = zero; + dx2 = zero; + } +}; + +/** Greater than or equal operator. */ +struct greater_equal_op { + inline DataType operator()(const DataType& x1, + const DataType& x2) const { + return x1 >= x2 ? one : zero; + } + inline void operator()(const DataType& x1, + const DataType& x2, + const DataType& dy, + DataType& dx1, + DataType& dx2) const { + dx1 = zero; + dx2 = zero; + } +}; + +/** Logical and operator. 
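+ *  Operands are interpreted as Booleans: any value that is non-zero and
+ *  not NaN is treated as true.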
+ */
+struct and_op {
+  inline DataType operator()(const DataType& x1,
+                             const DataType& x2) const {
+    const auto& b1 = x1 != zero && !std::isnan(x1);
+    const auto& b2 = x2 != zero && !std::isnan(x2);
+    return (b1 && b2) ? one : zero;
+  }
+  inline void operator()(const DataType& x1,
+                         const DataType& x2,
+                         const DataType& dy,
+                         DataType& dx1,
+                         DataType& dx2) const {
+    dx1 = zero;
+    dx2 = zero;
+  }
+};
+
+/** Logical or operator. */
+struct or_op {
+  inline DataType operator()(const DataType& x1,
+                             const DataType& x2) const {
+    const auto& b1 = x1 != zero && !std::isnan(x1);
+    const auto& b2 = x2 != zero && !std::isnan(x2);
+    return (b1 || b2) ? one : zero;
+  }
+  inline void operator()(const DataType& x1,
+                         const DataType& x2,
+                         const DataType& dy,
+                         DataType& dx1,
+                         DataType& dx2) const {
+    dx1 = zero;
+    dx2 = zero;
+  }
+};
+
+/** Logical xor operator. */
+struct xor_op {
+  inline DataType operator()(const DataType& x1,
+                             const DataType& x2) const {
+    const auto& b1 = x1 != zero && !std::isnan(x1);
+    const auto& b2 = x2 != zero && !std::isnan(x2);
+    return (b1 || b2) && !(b1 && b2) ? one : zero;
+  }
+  inline void operator()(const DataType& x1,
+                         const DataType& x2,
+                         const DataType& dy,
+                         DataType& dx1,
+                         DataType& dx2) const {
+    dx1 = zero;
+    dx2 = zero;
+  }
+};
+
+} // namespace
+
+// Template instantiation
+#define INSTANTIATE(layer, op)                                          \
+  template <>                                                           \
+  void layer<data_layout::MODEL_PARALLEL, El::Device::CPU>              \
+       ::fp_compute() {                                                 \
+    apply_entrywise_binary_operator<op>(get_prev_activations(0),        \
+                                        get_prev_activations(1),        \
+                                        get_activations());             \
+  }                                                                     \
+  template <>                                                           \
+  void layer<data_layout::MODEL_PARALLEL, El::Device::CPU>              \
+       ::bp_compute() {                                                 \
+    apply_binary_backprop_operator<op>(get_local_prev_activations(0),   \
+                                       get_local_prev_activations(1),   \
+                                       get_local_prev_error_signals(),  \
+                                       get_local_error_signals(0),      \
+                                       get_local_error_signals(1));     \
+  }                                                                     \
+  template <>                                                           \
+  void layer<data_layout::DATA_PARALLEL, El::Device::CPU>               \
+       ::fp_compute() {                                                 \
+    apply_entrywise_binary_operator<op>(get_prev_activations(0),        \
+                                        get_prev_activations(1),        \
+                                        get_activations());             \
+  }                                                                     \
+  template <>                                                           \
+  void layer<data_layout::DATA_PARALLEL, El::Device::CPU>               \
+       ::bp_compute() {                                                 \
+    apply_binary_backprop_operator<op>(get_local_prev_activations(0),   \
+                                       get_local_prev_activations(1),   \
+                                       get_local_prev_error_signals(),  \
+                                       get_local_error_signals(0),      \
+                                       get_local_error_signals(1));     \
+  }
+  INSTANTIATE(add_layer, add_op)
+  INSTANTIATE(subtract_layer, subtract_op)
+  INSTANTIATE(multiply_layer, multiply_op)
+  INSTANTIATE(divide_layer, divide_op)
+  INSTANTIATE(mod_layer, mod_op)
+  INSTANTIATE(pow_layer, pow_op)
+  INSTANTIATE(safe_divide_layer, safe_divide_op)
+  INSTANTIATE(max_layer, max_op)
+  INSTANTIATE(min_layer, min_op)
+  INSTANTIATE(equal_layer, equal_op)
+  INSTANTIATE(not_equal_layer, not_equal_op)
+  INSTANTIATE(less_layer, less_op)
+  INSTANTIATE(less_equal_layer, less_equal_op)
+  INSTANTIATE(greater_layer, greater_op)
+  INSTANTIATE(greater_equal_layer, greater_equal_op)
+  INSTANTIATE(and_layer, and_op)
+  INSTANTIATE(or_layer, or_op)
+  INSTANTIATE(xor_layer, xor_op)
+
+} // namespace lbann
diff --git a/src/layers/math/binary.cu b/src/layers/math/binary.cu
new file mode 100644
index 00000000000..c36ec01ad89
--- /dev/null
+++ b/src/layers/math/binary.cu
@@ -0,0 +1,490 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/layers/math/binary.hpp"
+
+namespace lbann {
+
+namespace {
+
+/** CUDA kernel to apply a binary backprop operator. */
+template <typename BinaryBackPropOperator>
+__global__
+void binary_backprop_operator_kernel(El::Int height, El::Int width,
+                                     const DataType* __restrict__ x1,
+                                     El::Int x1_ldim,
+                                     const DataType* __restrict__ x2,
+                                     El::Int x2_ldim,
+                                     const DataType* __restrict__ dy,
+                                     El::Int dy_ldim,
+                                     DataType* __restrict__ dx1,
+                                     El::Int dx1_ldim,
+                                     DataType* __restrict__ dx2,
+                                     El::Int dx2_ldim) {
+  const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x;
+  const El::Int size = height * width;
+  const El::Int num_threads = blockDim.x * gridDim.x;
+  BinaryBackPropOperator op;
+  for (El::Int pos = gid; pos < size; pos += num_threads) {
+    const auto& row = pos % height;
+    const auto& col = pos / height;
+    op(x1[row + col * x1_ldim],
+       x2[row + col * x2_ldim],
+       dy[row + col * dy_ldim],
+       dx1[row + col * dx1_ldim],
+       dx2[row + col * dx2_ldim]);
+  }
+}
+
+/** Apply a binary backprop operator to GPU data.
+ *  The input and output data must be on GPU and must have the same
+ *  dimensions. Given a binary function \f$ y = f(x_1,x_2) \f$, the
+ *  corresponding BinaryBackPropOperator is a 5-ary function with the
+ *  arguments \f$ x_1 \f$, \f$ x_2 \f$, \f$ dL/dy \f$, \f$ dL/dx_1 \f$,
+ *  \f$ dL/dx_2 \f$. The last two arguments should be overwritten when
+ *  the BinaryBackPropOperator is called.
+ */
+template <typename BinaryBackPropOperator>
+void apply_binary_backprop_operator(const AbsMat& x1,
+                                    const AbsMat& x2,
+                                    const AbsMat& dy,
+                                    AbsMat& dx1,
+                                    AbsMat& dx2) {
+
+  // Get CUDA grid dimensions
+  // Note: Maximum CUDA grid dimension is 2^32-1
+  // (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications).
+  const El::Int height = x1.Height();
+  const El::Int width = x1.Width();
+  const El::Int block_dim = 256;
+  El::Int grid_dim = (height * width + block_dim - 1) / block_dim;
+  if (sizeof(El::Int) > sizeof(unsigned int)
+      && grid_dim > std::numeric_limits<uint32_t>::max()) {
+    grid_dim = std::numeric_limits<uint32_t>::max();
+  }
+
+  // Launch CUDA kernel
+  if (grid_dim > 0) {
+    CHECK_CUDA(cudaSetDevice(El::GPUManager::Device()));
+    binary_backprop_operator_kernel<BinaryBackPropOperator>
+      <<<grid_dim, block_dim, 0, El::GPUManager::Stream()>>>(
+        height, width,
+        x1.LockedBuffer(), x1.LDim(),
+        x2.LockedBuffer(), x2.LDim(),
+        dy.LockedBuffer(), dy.LDim(),
+        dx1.Buffer(), dx1.LDim(),
+        dx2.Buffer(), dx2.LDim());
+  }
+
+}
+
+// =========================================================
+// Operator objects for entry-wise binary layers
+// =========================================================
+// Note: Binary operator corresponds to forward prop step
+// (\f$ y = f(x_1,x_2) \f$) and 5-ary operator corresponds
+// to back prop step
+// (\f$ \frac{dL}{dx_i} = \frac{dL}{dy} \frac{df}{dx_i}(x_1,x_2) \f$).
+
+/** Add operator. */
+struct add_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    return x1 + x2;
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = dy;
+    dx2 = dy;
+  }
+};
+
+/** Subtract operator. */
+struct subtract_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    return x1 - x2;
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = dy;
+    dx2 = -dy;
+  }
+};
+
+/** Multiply operator. */
+struct multiply_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    return x1 * x2;
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = dy * x2;
+    dx2 = dy * x1;
+  }
+};
+
+/** Divide operator. */
+struct divide_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    return x1 / x2;
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = dy / x2;
+    dx2 = -dy * x1 / (x2*x2);
+  }
+};
+
+/** Modulo operator. */
+struct mod_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    return cuda::mod(x1, x2);
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = dy;
+    dx2 = -dy * cuda::floor(x1 / x2);
+  }
+};
+
+/** Power operator. */
+struct pow_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    return cuda::pow(x1, x2);
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = dy * x2 * cuda::pow(x1, x2 - DataType(1));
+    dx2 = dy * cuda::log(x1) * cuda::pow(x1, x2);
+  }
+};
+
+/** Safe divide operator.
+ *  If a standard division produces an infinity or NaN, zero is output
+ *  instead.
+ */
+struct safe_divide_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    const auto& y = x1 / x2;
+    if (isfinite(y)) { return y; }
+    else { return DataType(0); }
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    const auto& y = x1 / x2;
+    if (isfinite(y)) {
+      dx1 = dy / x2;
+      dx2 = -dy * x1 / (x2*x2);
+    } else {
+      dx1 = DataType(0);
+      dx2 = DataType(0);
+    }
+  }
+};
+
+/** Maximum operator. */
+struct max_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    return cuda::max(x1, x2);
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    if (x1 > x2) {
+      dx1 = dy;
+      dx2 = DataType(0);
+    } else if (x2 > x1) {
+      dx1 = DataType(0);
+      dx2 = dy;
+    } else {
+      dx1 = dy / 2;
+      dx2 = dy / 2;
+    }
+  }
+};
+
+/** Minimum operator. */
+struct min_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    return cuda::min(x1, x2);
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    if (x1 < x2) {
+      dx1 = dy;
+      dx2 = DataType(0);
+    } else if (x2 < x1) {
+      dx1 = DataType(0);
+      dx2 = dy;
+    } else {
+      dx1 = dy / 2;
+      dx2 = dy / 2;
+    }
+  }
+};
+
+/** Equal operator. */
+struct equal_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    return x1 == x2 ? DataType(1) : DataType(0);
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = DataType(0);
+    dx2 = DataType(0);
+  }
+};
+
+/** Not equal operator. */
+struct not_equal_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    return x1 == x2 ? DataType(0) : DataType(1);
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = DataType(0);
+    dx2 = DataType(0);
+  }
+};
+
+/** Less than operator. */
+struct less_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    return x1 < x2 ? DataType(1) : DataType(0);
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = DataType(0);
+    dx2 = DataType(0);
+  }
+};
+
+/** Less than or equal operator. */
+struct less_equal_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    return x1 <= x2 ? DataType(1) : DataType(0);
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = DataType(0);
+    dx2 = DataType(0);
+  }
+};
+
+/** Greater than operator. */
+struct greater_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    return x1 > x2 ? DataType(1) : DataType(0);
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = DataType(0);
+    dx2 = DataType(0);
+  }
+};
+
+/** Greater than or equal operator.
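+ *  Like the other comparison operators, the output is piecewise
+ *  constant, so backprop produces zero gradients.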
+ */
+struct greater_equal_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    return x1 >= x2 ? DataType(1) : DataType(0);
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = DataType(0);
+    dx2 = DataType(0);
+  }
+};
+
+/** Logical and operator. */
+struct and_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    const auto& b1 = x1 != DataType(0) && !isnan(x1);
+    const auto& b2 = x2 != DataType(0) && !isnan(x2);
+    return (b1 && b2) ? DataType(1) : DataType(0);
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = DataType(0);
+    dx2 = DataType(0);
+  }
+};
+
+/** Logical or operator. */
+struct or_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    const auto& b1 = x1 != DataType(0) && !isnan(x1);
+    const auto& b2 = x2 != DataType(0) && !isnan(x2);
+    return (b1 || b2) ? DataType(1) : DataType(0);
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = DataType(0);
+    dx2 = DataType(0);
+  }
+};
+
+/** Logical xor operator. */
+struct xor_op {
+  inline __device__ DataType operator()(const DataType& x1,
+                                        const DataType& x2) const {
+    const auto& b1 = x1 != DataType(0) && !isnan(x1);
+    const auto& b2 = x2 != DataType(0) && !isnan(x2);
+    return (b1 || b2) && !(b1 && b2) ? DataType(1) : DataType(0);
+  }
+  inline __device__ void operator()(const DataType& x1,
+                                    const DataType& x2,
+                                    const DataType& dy,
+                                    DataType& dx1,
+                                    DataType& dx2) const {
+    dx1 = DataType(0);
+    dx2 = DataType(0);
+  }
+};
+
+} // namespace
+
+// Template instantiation
+#define INSTANTIATE(layer, op)                                               \
+  template <>                                                                \
+  void layer<data_layout::MODEL_PARALLEL, El::Device::GPU>                   \
+       ::fp_compute() {                                                      \
+    cuda::apply_entrywise_binary_operator<op>(get_prev_activations(0),       \
+                                              get_prev_activations(1),       \
+                                              get_activations());            \
+  }                                                                          \
+  template <>                                                                \
+  void layer<data_layout::MODEL_PARALLEL, El::Device::GPU>                   \
+       ::bp_compute() {                                                      \
+    apply_binary_backprop_operator<op>(get_local_prev_activations(0),        \
+                                       get_local_prev_activations(1),        \
+                                       get_local_prev_error_signals(),       \
+                                       get_local_error_signals(0),           \
+                                       get_local_error_signals(1));          \
+  }                                                                          \
+  template <>                                                                \
+  void layer<data_layout::DATA_PARALLEL, El::Device::GPU>                    \
+       ::fp_compute() {                                                      \
+    cuda::apply_entrywise_binary_operator<op>(get_prev_activations(0),       \
+                                              get_prev_activations(1),       \
+                                              get_activations());            \
+  }                                                                          \
+  template <>                                                                \
+  void layer<data_layout::DATA_PARALLEL, El::Device::GPU>                    \
+       ::bp_compute() {                                                      \
+    apply_binary_backprop_operator<op>(get_local_prev_activations(0),        \
+                                       get_local_prev_activations(1),        \
+                                       get_local_prev_error_signals(),       \
+                                       get_local_error_signals(0),           \
+                                       get_local_error_signals(1));          \
+  }
+  INSTANTIATE(add_layer, add_op)
+  INSTANTIATE(subtract_layer, subtract_op)
+  INSTANTIATE(multiply_layer, multiply_op)
+  INSTANTIATE(divide_layer, divide_op)
+  INSTANTIATE(mod_layer, mod_op)
+  INSTANTIATE(pow_layer, pow_op)
+  INSTANTIATE(safe_divide_layer, safe_divide_op)
+  INSTANTIATE(max_layer, max_op)
+  INSTANTIATE(min_layer, min_op)
+  INSTANTIATE(equal_layer, equal_op)
+  INSTANTIATE(not_equal_layer, not_equal_op)
+  INSTANTIATE(less_layer, less_op)
+  INSTANTIATE(less_equal_layer, less_equal_op)
+  INSTANTIATE(greater_layer, greater_op)
+  INSTANTIATE(greater_equal_layer, greater_equal_op)
+  INSTANTIATE(and_layer, and_op)
+  INSTANTIATE(or_layer, or_op)
+  INSTANTIATE(xor_layer, xor_op)
+
+} // namespace lbann
diff --git a/src/layers/math/unary.cpp
b/src/layers/math/unary.cpp new file mode 100644 index 00000000000..45b5ad88dbb --- /dev/null +++ b/src/layers/math/unary.cpp @@ -0,0 +1,402 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/math/unary.hpp" +#include "lbann/utils/entrywise_operator.hpp" + +namespace lbann { + +namespace { + +// Helpful constants +constexpr DataType zero = 0; +constexpr DataType one = 1; + +// ========================================================= +// Operator objects for entry-wise unary layers +// ========================================================= +// Note: Unary operator corresponds to forward prop step +// (\f$ y = f(x) \f$) and binary operator corresponds to +// back prop step +// (\f$ \frac{dL}{dx} = \frac{dL}{dy} f'(x) \f$). + +/** Logical not operator. */ +struct not_op { + inline DataType operator()(const DataType& x) const { + const auto& b = x != zero && !std::isnan(x); + return !b ? one : zero; + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return zero; + } +}; + +/** Absolute value operator. */ +struct abs_op { + inline DataType operator()(const DataType& x) const { + return x >= zero ? x : -x; + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + if (x > zero) { return dy; } + else if (x < zero) { return -dy; } + else { return zero; } + } +}; + +/** Negative operator. */ +struct negative_op { + inline DataType operator()(const DataType& x) const { + return -x; + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return -dy; + } +}; + +/** Sign operator. */ +struct sign_op { + inline DataType operator()(const DataType& x) const { + if (x > zero) { return one; } + else if (x < zero) { return -one; } + else { return zero; } + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return zero; + } +}; + +/** Round operator. */ +struct round_op { + inline DataType operator()(const DataType& x) const { + return std::round(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return zero; + } +}; + +/** Ceiling operator. */ +struct ceil_op { + inline DataType operator()(const DataType& x) const { + return std::ceil(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return zero; + } +}; + +/** Floor operator. 
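+ *  The floor function is piecewise constant, so backprop produces a
+ *  zero gradient.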
*/ +struct floor_op { + inline DataType operator()(const DataType& x) const { + return std::floor(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return zero; + } +}; + +/** Reciprocal operator. + * If a standard reciprocal produces an infinity or NaN, zero is + * output instead. + */ +struct reciprocal_op { + inline DataType operator()(const DataType& x) const { + return 1 / x; + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + if (dy == zero) { return zero; } + else { return - dy / (x*x); } + } +}; + +/** Square operator. */ +struct square_op { + inline DataType operator()(const DataType& x) const { + return x*x; + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return 2*x * dy; + } +}; + + +/** Square root operator. */ +struct sqrt_op { + inline DataType operator()(const DataType& x) const { + return std::sqrt(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return dy / (2 * std::sqrt(x)); + } +}; + +/** Reciprocal square root operator. */ +struct rsqrt_op { + inline DataType operator()(const DataType& x) const { + return 1 / std::sqrt(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + const auto& s = std::sqrt(x); + return - dy / (2 * x * s); + } +}; + +/** Safe reciprocal operator. */ +struct safe_reciprocal_op { + inline DataType operator()(const DataType& x) const { + const auto& y = 1 / x; + if (std::isfinite(y)) { return y; } + else { return zero; } + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + const auto& y = 1 / x; + if (std::isfinite(y)) { return - dy * y*y; } + else { return zero; } + } +}; + +/** Exponential operator. */ +struct exp_op { + inline DataType operator()(const DataType& x) const { + return std::exp(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return dy * std::exp(x); + } +}; + +/** Exponential minus one operator. */ +struct expm1_op { + inline DataType operator()(const DataType& x) const { + return std::expm1(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return dy * std::exp(x); + } +}; + +/** Natural logarithm operator. */ +struct log_op { + inline DataType operator()(const DataType& x) const { + return std::log(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return dy / x; + } +}; + +/** Natural logarithm one plus operator. */ +struct log1p_op { + inline DataType operator()(const DataType& x) const { + return std::log1p(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return dy / (x + one); + } +}; + +/** Cosine operator. */ +struct cos_op { + inline DataType operator()(const DataType& x) const { + return std::cos(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return -dy * std::sin(x); + } +}; + +/** Sine operator. */ +struct sin_op { + inline DataType operator()(const DataType& x) const { + return std::sin(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return dy * std::cos(x); + } +}; + +/** Tangent operator. */ +struct tan_op { + inline DataType operator()(const DataType& x) const { + return std::tan(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + const auto& c = std::cos(x); + return dy / (c*c); + } +}; + +/** Arccosine operator. 
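+ *  Backprop applies d/dx acos(x) = -1 / sqrt(1 - x^2).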
*/ +struct acos_op { + inline DataType operator()(const DataType& x) const { + return std::acos(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return -dy / std::sqrt(one - x*x); + } +}; + +/** Arcsine operator. */ +struct asin_op { + inline DataType operator()(const DataType& x) const { + return std::asin(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return dy / std::sqrt(one - x*x); + } +}; + +/** Arctangent operator. */ +struct atan_op { + inline DataType operator()(const DataType& x) const { + return std::atan(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return dy / (one + x*x); + } +}; + +/** Hyperbolic cosine operator. */ +struct cosh_op { + inline DataType operator()(const DataType& x) const { + return std::cosh(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return dy * std::sinh(x); + } +}; + +/** Hyperbolic sine operator. */ +struct sinh_op { + inline DataType operator()(const DataType& x) const { + return std::sinh(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return dy * std::cosh(x); + } +}; + +/** Hyperbolic tangent operator. */ +struct tanh_op { + inline DataType operator()(const DataType& x) const { + return std::tanh(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + const auto& c = std::cosh(x); + return dy / (c*c); + } +}; + +/** Hyperbolic arccosine operator. */ +struct acosh_op { + inline DataType operator()(const DataType& x) const { + return std::acosh(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return -dy / (std::sqrt(x - one) * std::sqrt(x + one)); + } +}; + +/** Hyperbolic arcsine operator. */ +struct asinh_op { + inline DataType operator()(const DataType& x) const { + return std::asinh(x); + } + inline DataType operator()(const DataType& x, const DataType& dy) const { + return dy / std::sqrt(one + x*x); + } +}; + +/** Hyperbolic arctangent operator. 
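+ *  Backprop applies d/dx atanh(x) = 1 / (1 - x^2).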
+ */
+struct atanh_op {
+  inline DataType operator()(const DataType& x) const {
+    return std::atanh(x);
+  }
+  inline DataType operator()(const DataType& x, const DataType& dy) const {
+    return dy / (one - x*x);
+  }
+};
+
+} // namespace
+
+// Template instantiation
+#define INSTANTIATE(layer, op)                                           \
+  template <>                                                             \
+  void layer<data_layout::MODEL_PARALLEL, El::Device::CPU>                \
+       ::fp_compute() {                                                   \
+    apply_entrywise_unary_operator<op>(get_prev_activations(),            \
+                                       get_activations());                \
+  }                                                                       \
+  template <>                                                             \
+  void layer<data_layout::MODEL_PARALLEL, El::Device::CPU>                \
+       ::bp_compute() {                                                   \
+    apply_entrywise_binary_operator<op>(get_prev_activations(),           \
+                                        get_prev_error_signals(),         \
+                                        get_error_signals());             \
+  }                                                                       \
+  template <>                                                             \
+  void layer<data_layout::DATA_PARALLEL, El::Device::CPU>                 \
+       ::fp_compute() {                                                   \
+    apply_entrywise_unary_operator<op>(get_prev_activations(),            \
+                                       get_activations());                \
+  }                                                                       \
+  template <>                                                             \
+  void layer<data_layout::DATA_PARALLEL, El::Device::CPU>                 \
+       ::bp_compute() {                                                   \
+    apply_entrywise_binary_operator<op>(get_prev_activations(),           \
+                                        get_prev_error_signals(),         \
+                                        get_error_signals());             \
+  }
+  INSTANTIATE(not_layer, not_op)
+  INSTANTIATE(abs_layer, abs_op)
+  INSTANTIATE(negative_layer, negative_op)
+  INSTANTIATE(sign_layer, sign_op)
+  INSTANTIATE(round_layer, round_op)
+  INSTANTIATE(ceil_layer, ceil_op)
+  INSTANTIATE(floor_layer, floor_op)
+  INSTANTIATE(reciprocal_layer, reciprocal_op)
+  INSTANTIATE(square_layer, square_op)
+  INSTANTIATE(sqrt_layer, sqrt_op)
+  INSTANTIATE(rsqrt_layer, rsqrt_op)
+  INSTANTIATE(safe_reciprocal_layer, safe_reciprocal_op)
+  INSTANTIATE(exp_layer, exp_op)
+  INSTANTIATE(expm1_layer, expm1_op)
+  INSTANTIATE(log_layer, log_op)
+  INSTANTIATE(log1p_layer, log1p_op)
+  INSTANTIATE(cos_layer, cos_op)
+  INSTANTIATE(sin_layer, sin_op)
+  INSTANTIATE(tan_layer, tan_op)
+  INSTANTIATE(acos_layer, acos_op)
+  INSTANTIATE(asin_layer, asin_op)
+  INSTANTIATE(atan_layer, atan_op)
+  INSTANTIATE(cosh_layer, cosh_op)
+  INSTANTIATE(sinh_layer, sinh_op)
+  INSTANTIATE(tanh_layer, tanh_op)
+  INSTANTIATE(acosh_layer, acosh_op)
+  INSTANTIATE(asinh_layer, asinh_op)
+  INSTANTIATE(atanh_layer, atanh_op)
+
+} // namespace lbann
diff --git a/src/layers/math/unary.cu b/src/layers/math/unary.cu
new file mode 100644
index 00000000000..9d02f534ebf
--- /dev/null
+++ b/src/layers/math/unary.cu
@@ -0,0 +1,401 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/math/unary.hpp" + +namespace lbann { + +namespace { + +// ========================================================= +// Operator objects for entry-wise unary layers +// ========================================================= +// Note: Unary operator corresponds to forward prop step +// (\f$ y = f(x) \f$) and binary operator corresponds to +// back prop step +// (\f$ \frac{dL}{dx} = \frac{dL}{dy} f'(x) \f$). + +/** Logical not operator. */ +struct not_op { + inline __device__ DataType operator()(const DataType& x) const { + const auto& b = x != DataType(0) && !isnan(x); + return !b ? DataType(1) : DataType(0); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return DataType(0); + } +}; + +/** Absolute value operator. */ +struct abs_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::abs(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + constexpr DataType zero = 0; + if (x > zero) { return dy; } + else if (x < zero) { return -dy; } + else { return zero; } + } +}; + +/** Negative operator. */ +struct negative_op { + inline __device__ DataType operator()(const DataType& x) const { + return -x; + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return -dy; + } +}; + +/** Sign operator. */ +struct sign_op { + inline __device__ DataType operator()(const DataType& x) const { + constexpr DataType zero = 0; + constexpr DataType one = 1; + if (x > zero) { return one; } + else if (x < zero) { return -one; } + else { return zero; } + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return DataType(0); + } +}; + +/** Round operator. */ +struct round_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::round(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return DataType(0); + } +}; + +/** Ceiling operator. */ +struct ceil_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::ceil(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return DataType(0); + } +}; + +/** Floor operator. */ +struct floor_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::floor(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return DataType(0); + } +}; + +/** Reciprocal operator. */ +struct reciprocal_op { + inline __device__ DataType operator()(const DataType& x) const { + return 1 / x; + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + if (dy == DataType(0)) { return DataType(0); } + else { return - dy / (x*x); } + + } +}; + +/** Square operator. */ +struct square_op { + inline __device__ DataType operator()(const DataType& x) const { + return x*x; + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return 2*x * dy; + } +}; + + +/** Square root operator. */ +struct sqrt_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::sqrt(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return dy / (2 * cuda::sqrt(x)); + } +}; + +/** Reciprocal square root operator. 
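+ *  Backprop applies d/dx x^(-1/2) = -1 / (2 x^(3/2)).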
*/ +struct rsqrt_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::rsqrt(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + const auto& s = cuda::sqrt(x); + return - dy / (2 * x * s); + } +}; + +/** Safe reciprocal operator. + * If a standard reciprocal produces an infinity or NaN, zero is + * output instead. + */ +struct safe_reciprocal_op { + inline __device__ DataType operator()(const DataType& x) const { + const auto& y = 1 / x; + if (isfinite(y)) { return y; } + else { return DataType(0); } + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + const auto& y = 1 / x; + if (isfinite(y)) { return - dy * y*y; } + else { return DataType(0); } + } +}; + +/** Exponential operator. */ +struct exp_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::exp(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return dy * cuda::exp(x); + } +}; + +/** Exponential minus one operator. */ +struct expm1_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::expm1(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return dy * cuda::exp(x); + } +}; + +/** Natural logarithm operator. */ +struct log_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::log(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return dy / x; + } +}; + +/** Natural logarithm one plus operator. */ +struct log1p_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::log1p(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return dy / (x + DataType(1)); + } +}; + +/** Cosine operator. */ +struct cos_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::cos(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return -dy * cuda::sin(x); + } +}; + +/** Sine operator. */ +struct sin_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::sin(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return dy * cuda::cos(x); + } +}; + +/** Tangent operator. */ +struct tan_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::tan(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + const auto& c = cuda::cos(x); + return dy / (c*c); + } +}; + +/** Arccosine operator. */ +struct acos_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::acos(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return -dy / cuda::sqrt(DataType(1) - x*x); + } +}; + +/** Arcsine operator. */ +struct asin_op { + inline __device__ DataType operator()(const DataType& x) const { + return cuda::asin(x); + } + inline __device__ DataType operator()(const DataType& x, const DataType& dy) const { + return dy / cuda::sqrt(DataType(1) - x*x); + } +}; + +/** Arctangent operator. 
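+ *  Backprop applies d/dx atan(x) = 1 / (1 + x^2).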
+ */
+struct atan_op {
+  inline __device__ DataType operator()(const DataType& x) const {
+    return cuda::atan(x);
+  }
+  inline __device__ DataType operator()(const DataType& x, const DataType& dy) const {
+    return dy / (DataType(1) + x*x);
+  }
+};
+
+/** Hyperbolic cosine operator. */
+struct cosh_op {
+  inline __device__ DataType operator()(const DataType& x) const {
+    return cuda::cosh(x);
+  }
+  inline __device__ DataType operator()(const DataType& x, const DataType& dy) const {
+    return dy * cuda::sinh(x);
+  }
+};
+
+/** Hyperbolic sine operator. */
+struct sinh_op {
+  inline __device__ DataType operator()(const DataType& x) const {
+    return cuda::sinh(x);
+  }
+  inline __device__ DataType operator()(const DataType& x, const DataType& dy) const {
+    return dy * cuda::cosh(x);
+  }
+};
+
+/** Hyperbolic tangent operator. */
+struct tanh_op {
+  inline __device__ DataType operator()(const DataType& x) const {
+    return cuda::tanh(x);
+  }
+  inline __device__ DataType operator()(const DataType& x, const DataType& dy) const {
+    const auto& c = cuda::cosh(x);
+    return dy / (c*c);
+  }
+};
+
+/** Hyperbolic arccosine operator. */
+struct acosh_op {
+  inline __device__ DataType operator()(const DataType& x) const {
+    return cuda::acosh(x);
+  }
+  inline __device__ DataType operator()(const DataType& x, const DataType& dy) const {
+    return -dy / (cuda::sqrt(x - DataType(1)) * cuda::sqrt(x + DataType(1)));
+  }
+};
+
+/** Hyperbolic arcsine operator. */
+struct asinh_op {
+  inline __device__ DataType operator()(const DataType& x) const {
+    return cuda::asinh(x);
+  }
+  inline __device__ DataType operator()(const DataType& x, const DataType& dy) const {
+    return dy / cuda::sqrt(DataType(1) + x*x);
+  }
+};
+
+/** Hyperbolic arctangent operator. */
+struct atanh_op {
+  inline __device__ DataType operator()(const DataType& x) const {
+    return cuda::atanh(x);
+  }
+  inline __device__ DataType operator()(const DataType& x, const DataType& dy) const {
+    return dy / (DataType(1) - x*x);
+  }
+};
+
+} // namespace
+
+// Template instantiation
+#define INSTANTIATE(layer, op)                                               \
+  template <>                                                                \
+  void layer<data_layout::MODEL_PARALLEL, El::Device::GPU>                   \
+       ::fp_compute() {                                                      \
+    cuda::apply_entrywise_unary_operator<op>(get_prev_activations(),         \
+                                             get_activations());             \
+  }                                                                          \
+  template <>                                                                \
+  void layer<data_layout::MODEL_PARALLEL, El::Device::GPU>                   \
+       ::bp_compute() {                                                      \
+    cuda::apply_entrywise_binary_operator<op>(get_prev_activations(),        \
+                                              get_prev_error_signals(),      \
+                                              get_error_signals());          \
+  }                                                                          \
+  template <>                                                                \
+  void layer<data_layout::DATA_PARALLEL, El::Device::GPU>                    \
+       ::fp_compute() {                                                      \
+    cuda::apply_entrywise_unary_operator<op>(get_prev_activations(),         \
+                                             get_activations());             \
+  }                                                                          \
+  template <>                                                                \
+  void layer<data_layout::DATA_PARALLEL, El::Device::GPU>                    \
+       ::bp_compute() {                                                      \
+    cuda::apply_entrywise_binary_operator<op>(get_prev_activations(),        \
+                                              get_prev_error_signals(),      \
+                                              get_error_signals());          \
+  }
+  INSTANTIATE(not_layer, not_op)
+  INSTANTIATE(abs_layer, abs_op)
+  INSTANTIATE(negative_layer, negative_op)
+  INSTANTIATE(sign_layer, sign_op)
+  INSTANTIATE(round_layer, round_op)
+  INSTANTIATE(ceil_layer, ceil_op)
+  INSTANTIATE(floor_layer, floor_op)
+  INSTANTIATE(reciprocal_layer, reciprocal_op)
+  INSTANTIATE(square_layer, square_op)
+  INSTANTIATE(sqrt_layer, sqrt_op)
+  INSTANTIATE(safe_reciprocal_layer, safe_reciprocal_op)
+  INSTANTIATE(rsqrt_layer, rsqrt_op)
+  INSTANTIATE(exp_layer, exp_op)
+  INSTANTIATE(expm1_layer, expm1_op)
+  INSTANTIATE(log_layer, log_op)
+  INSTANTIATE(log1p_layer, log1p_op)
+  INSTANTIATE(cos_layer, cos_op)
+  INSTANTIATE(sin_layer, sin_op)
+  INSTANTIATE(tan_layer, tan_op)
+  INSTANTIATE(acos_layer, acos_op)
+  INSTANTIATE(asin_layer, asin_op)
+  INSTANTIATE(atan_layer, atan_op)
INSTANTIATE(cosh_layer, cosh_op) + INSTANTIATE(sinh_layer, sinh_op) + INSTANTIATE(tanh_layer, tanh_op) + INSTANTIATE(acosh_layer, acosh_op) + INSTANTIATE(asinh_layer, asinh_op) + INSTANTIATE(atanh_layer, atanh_op) + +} // namespace lbann diff --git a/src/layers/misc/CMakeLists.txt b/src/layers/misc/CMakeLists.txt new file mode 100644 index 00000000000..e1c9713f9a9 --- /dev/null +++ b/src/layers/misc/CMakeLists.txt @@ -0,0 +1,17 @@ +# Add the source files for this directory +set_full_path(THIS_DIR_SOURCES + covariance.cpp + variance.cpp + ) + +if (LBANN_HAS_CUDA) + # Add the CUDA source files for this directory + set_full_path(THIS_DIR_CU_SOURCES + covariance.cu + variance.cu + ) +endif () + +# Propagate the files up the tree +set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE) +set(CUDA_SOURCES "${CUDA_SOURCES}" "${THIS_DIR_CU_SOURCES}" PARENT_SCOPE) diff --git a/src/layers/misc/covariance.cpp b/src/layers/misc/covariance.cpp new file mode 100644 index 00000000000..426be208c5d --- /dev/null +++ b/src/layers/misc/covariance.cpp @@ -0,0 +1,190 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/misc/covariance.hpp" + +namespace lbann { + +namespace { + +/** CPU forward prop implementation. + * We use a two-pass algorithm since it is more numerically stable + * than the naive single-pass algorithm. 
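+ *  Concretely, with h = height, the first pass computes the column
+ *  means, mean_j = (1/h) * sum_i x_ij, and the second pass accumulates
+ *  cov_j = (1/(h or h-1)) * sum_i (x0_ij - mean0_j) * (x1_ij - mean1_j),
+ *  where the denominator depends on the 'biased' flag.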
+
+namespace lbann {
+
+namespace {
+
+/** CPU forward prop implementation.
+ *  We use a two-pass algorithm since it is more numerically stable
+ *  than the naive single-pass algorithm.
+ */
+void fp_cpu(const AbsDistMat& input0,
+            const AbsDistMat& input1,
+            AbsDistMat& output,
+            AbsDistMat& means,
+            AbsDistMat& workspace,
+            bool biased) {
+
+  // Local matrices
+  const auto& local_input0 = static_cast<const CPUMat&>(input0.LockedMatrix());
+  const auto& local_input1 = static_cast<const CPUMat&>(input1.LockedMatrix());
+  auto& local_means = static_cast<CPUMat&>(means.Matrix());
+  auto& local_workspace = static_cast<CPUMat&>(workspace.Matrix());
+
+  // Dimensions
+  const auto& height = input0.Height();
+  const auto& width = input0.Width();
+  const auto& local_height = local_input0.Height();
+  const auto& local_width = local_input0.Width();
+
+  // Compute column-wise mean
+  means.Empty(false);
+  means.AlignWith(input0);
+  means.Resize(2, width);
+#pragma omp parallel for
+  for (El::Int col = 0; col < local_width; ++col) {
+    DataType sum0 = 0, sum1 = 0;
+    for (El::Int row = 0; row < local_height; ++row) {
+      sum0 += local_input0(row, col);
+      sum1 += local_input1(row, col);
+    }
+    local_means(0, col) = sum0 / height;
+    local_means(1, col) = sum1 / height;
+  }
+  El::AllReduce(means, means.RedundantComm());
+
+  // Compute column-wise covariance
+  workspace.Empty(false);
+  workspace.AlignWith(input0);
+  workspace.Resize(1, width);
+#pragma omp parallel for
+  for (El::Int col = 0; col < local_width; ++col) {
+    const auto& mean0 = local_means(0, col);
+    const auto& mean1 = local_means(1, col);
+    DataType sum = 0;
+    for (El::Int row = 0; row < local_height; ++row) {
+      const auto& x0 = local_input0(row, col);
+      const auto& x1 = local_input1(row, col);
+      sum += (x0 - mean0) * (x1 - mean1);
+    }
+    local_workspace(0, col) = sum / (biased ? height : height - 1);
+  }
+  El::AllReduce(workspace, workspace.RedundantComm());
+  El::Copy(workspace, output);
+
+}
+
+/** CPU backprop implementation.
+ *  Means have already been computed in forward prop.
+ */
+void bp_cpu(const AbsDistMat& input0,
+            const AbsDistMat& input1,
+            const AbsDistMat& gradient_wrt_output,
+            AbsDistMat& gradient_wrt_input0,
+            AbsDistMat& gradient_wrt_input1,
+            const AbsDistMat& means,
+            AbsDistMat& workspace,
+            bool biased) {
+
+  // Local matrices
+  const auto& local_input0 = static_cast<const CPUMat&>(input0.LockedMatrix());
+  const auto& local_input1 = static_cast<const CPUMat&>(input1.LockedMatrix());
+  auto& local_gradient_wrt_input0 = static_cast<CPUMat&>(gradient_wrt_input0.Matrix());
+  auto& local_gradient_wrt_input1 = static_cast<CPUMat&>(gradient_wrt_input1.Matrix());
+  const auto& local_means = static_cast<const CPUMat&>(means.LockedMatrix());
+  auto& local_workspace = static_cast<CPUMat&>(workspace.Matrix());
+
+  // Dimensions
+  const auto& height = input0.Height();
+  const auto& local_height = local_input0.Height();
+  const auto& local_width = local_input0.Width();
+
+  // Initialize workspace with gradients w.r.t. output
+  El::Copy(gradient_wrt_output, workspace);
+
+  // Compute gradients w.r.t. input
+  const DataType scale = DataType(1) / (biased ? height : height - 1);
+#pragma omp parallel for collapse(2)
+  for (El::Int col = 0; col < local_width; ++col) {
+    for (El::Int row = 0; row < local_height; ++row) {
+      const auto& dy = local_workspace(0, col);
+      const auto& x0 = local_input0(row, col);
+      const auto& x1 = local_input1(row, col);
+      const auto& mean0 = local_means(0, col);
+      const auto& mean1 = local_means(1, col);
+      auto& dx0 = local_gradient_wrt_input0(row, col);
+      auto& dx1 = local_gradient_wrt_input1(row, col);
+      dx0 = dy * scale * (x1 - mean1);
+      dx1 = dy * scale * (x0 - mean0);
+    }
+  }
+
+}
+
+} // namespace
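
The simple form of the gradients above is exact, not an approximation: differentiating cov = sum_i (x0_i - m0)(x1_i - m1) / (n-1) with respect to x0_j also produces a -(1/n) * sum_i (x1_i - m1) term from the mean's dependence on x0_j, but that sum is identically zero, leaving d cov / d x0_j = (x1_j - m1)/(n-1). A self-contained numerical confirmation (a sketch, not part of the patch):

#include <cstdio>
#include <vector>

int main() {
  // Unbiased covariance of two equal-length columns
  auto cov = [](const std::vector<double>& a, const std::vector<double>& b) {
    double ma = 0, mb = 0;
    for (std::size_t i = 0; i < a.size(); ++i) { ma += a[i]; mb += b[i]; }
    ma /= a.size(); mb /= b.size();
    double s = 0;
    for (std::size_t i = 0; i < a.size(); ++i) { s += (a[i] - ma) * (b[i] - mb); }
    return s / (a.size() - 1);
  };

  const std::vector<double> x0{1.0, 2.5, -0.5, 4.0};
  const std::vector<double> x1{0.3, 1.1, 2.0, -1.2};
  const std::size_t j = 2;
  const double h = 1e-6;

  // Central finite difference w.r.t. x0[j]
  auto xp = x0, xm = x0;
  xp[j] += h; xm[j] -= h;
  const double numeric = (cov(xp, x1) - cov(xm, x1)) / (2 * h);

  // Analytic gradient used by bp_cpu (with dy = 1): (x1[j] - mean1) / (n - 1)
  double m1 = 0;
  for (double v : x1) { m1 += v; }
  m1 /= x1.size();
  const double analytic = (x1[j] - m1) / (x1.size() - 1);

  std::printf("numeric = %.8f, analytic = %.8f\n", numeric, analytic);
  return 0;
}
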
+
+template <>
+void covariance_layer<data_layout::DATA_PARALLEL, El::Device::CPU>
+     ::fp_compute() {
+  fp_cpu(get_prev_activations(0),
+         get_prev_activations(1),
+         get_activations(),
+         *m_means,
+         *m_workspace,
+         m_biased);
+}
+
+template <>
+void covariance_layer<data_layout::DATA_PARALLEL, El::Device::CPU>
+     ::bp_compute() {
+  bp_cpu(get_prev_activations(0),
+         get_prev_activations(1),
+         get_prev_error_signals(),
+         get_error_signals(0),
+         get_error_signals(1),
+         *m_means,
+         *m_workspace,
+         m_biased);
+}
+
+template <>
+void covariance_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>
+     ::fp_compute() {
+  fp_cpu(get_prev_activations(0),
+         get_prev_activations(1),
+         get_activations(),
+         *m_means,
+         *m_workspace,
+         m_biased);
+}
+
+template <>
+void covariance_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>
+     ::bp_compute() {
+  bp_cpu(get_prev_activations(0),
+         get_prev_activations(1),
+         get_prev_error_signals(),
+         get_error_signals(0),
+         get_error_signals(1),
+         *m_means,
+         *m_workspace,
+         m_biased);
+}
+
+} // namespace lbann
diff --git a/src/layers/misc/covariance.cu b/src/layers/misc/covariance.cu
new file mode 100644
index 00000000000..c750b7b7b1c
--- /dev/null
+++ b/src/layers/misc/covariance.cu
@@ -0,0 +1,337 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/layers/misc/covariance.hpp"
+
+namespace lbann {
+
+namespace {
+
+/** Compute local contributions to means.
+ *  Computes column-wise sums of two input matrices and multiplies
+ *  them by a scaling factor (which should be
+ *  1/height). 'contribution' is interpreted as a 2 x width matrix
+ *  where the first row corresponds to 'input0' and the second row to
+ *  'input1'.
+ */
+template <El::Int block_size>
+__global__ void mean_contribution_kernel(El::Int height,
+                                         El::Int width,
+                                         DataType scale,
+                                         const DataType* __restrict__ input0,
+                                         El::Int input0_ldim,
+                                         const DataType* __restrict__ input1,
+                                         El::Int input1_ldim,
+                                         DataType* __restrict__ contribution) {
+
+  // Indices
+  const El::Int tid = threadIdx.x;
+  const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x;
+  const El::Int bidy = blockIdx.y;
+  const El::Int nthreadsx = blockDim.x * gridDim.x;
+
+  // Compute local contribution for each matrix column
+  for (El::Int col = bidy; col < width; col += gridDim.y) {
+
+    // Compute contributions for each thread
+    DataType private_contribution0 = 0;
+    DataType private_contribution1 = 0;
+    for (El::Int row = gidx; row < height; row += nthreadsx) {
+      private_contribution0 += input0[row + col * input0_ldim];
+      private_contribution1 += input1[row + col * input1_ldim];
+    }
+
+    // Shared memory reduction to get contribution for each block
+    /// @todo unroll loops
+    __shared__ DataType shared_contribution0[block_size];
+    __shared__ DataType shared_contribution1[block_size];
+    shared_contribution0[tid] = private_contribution0;
+    shared_contribution1[tid] = private_contribution1;
+    for (El::Int stride = block_size / 2; stride > 0; stride /= 2) {
+      __syncthreads();
+      if (tid < stride) {
+        shared_contribution0[tid] += shared_contribution0[tid + stride];
+        shared_contribution1[tid] += shared_contribution1[tid + stride];
+      }
+    }
+    if (tid == 0) {
+      cuda::atomic_add(&contribution[2*col],
+                       scale * shared_contribution0[0]);
+      cuda::atomic_add(&contribution[2*col+1],
+                       scale * shared_contribution1[0]);
+    }
+
+  }
+
+}
+
+/** Compute local contributions to covariances. */
+template <El::Int block_size>
+__global__ void covariance_contribution_kernel(El::Int height,
+                                               El::Int width,
+                                               DataType scale,
+                                               const DataType* __restrict__ input0,
+                                               El::Int input0_ldim,
+                                               const DataType* __restrict__ input1,
+                                               El::Int input1_ldim,
+                                               const DataType* __restrict__ means,
+                                               DataType* __restrict__ contribution) {
+
+  // Indices
+  const El::Int tid = threadIdx.x;
+  const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x;
+  const El::Int bidy = blockIdx.y;
+  const El::Int nthreadsx = blockDim.x * gridDim.x;
+
+  // Compute local contribution for each matrix column
+  for (El::Int col = bidy; col < width; col += gridDim.y) {
+    const auto& mean0 = means[2*col];
+    const auto& mean1 = means[2*col+1];
+
+    // Compute contributions for each thread
+    DataType private_contribution = 0;
+    for (El::Int row = gidx; row < height; row += nthreadsx) {
+      const auto& x0 = input0[row + col * input0_ldim];
+      const auto& x1 = input1[row + col * input1_ldim];
+      private_contribution += (x0 - mean0) * (x1 - mean1);
+    }
+
+    // Shared memory reduction to get contribution for each block
+    /// @todo unroll loops
+    __shared__ DataType shared_contribution[block_size];
+    shared_contribution[tid] = private_contribution;
+    for (El::Int stride = block_size / 2; stride > 0; stride /= 2) {
+      __syncthreads();
+      if (tid < stride) {
+        shared_contribution[tid] += shared_contribution[tid + stride];
+      }
+    }
+    if (tid == 0) {
+      cuda::atomic_add(&contribution[col],
+                       scale * shared_contribution[0]);
+    }
+
+  }
+
+}
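
Both contribution kernels above share one reduction shape: blockIdx.y walks the matrix columns, each thread strides down a column accumulating a private partial sum, the partials are folded with a shared-memory halving reduction, and a single atomic per block publishes the result to global memory. The pattern in isolation, as a hypothetical standalone kernel (block_sum_kernel is not part of the patch; block_size must be a power of two, as with the 256 used here):

// float is used so that plain atomicAdd suffices everywhere; the patch goes
// through cuda::atomic_add instead, which wraps the device-specific cases
// (the old inline atomic_add helpers removed further down handled double
// and __half by hand).
template <int block_size>
__global__ void block_sum_kernel(const float* __restrict__ in,
                                 int n,
                                 float* __restrict__ out) {
  __shared__ float shared[block_size];
  const int tid = threadIdx.x;
  const int gid = threadIdx.x + blockIdx.x * blockDim.x;

  // Phase 1: each thread accumulates a grid-strided partial sum in a register
  float partial = 0;
  for (int i = gid; i < n; i += blockDim.x * gridDim.x) {
    partial += in[i];
  }
  shared[tid] = partial;

  // Phase 2: halving tree reduction across the block's shared memory
  for (int stride = block_size / 2; stride > 0; stride /= 2) {
    __syncthreads();
    if (tid < stride) { shared[tid] += shared[tid + stride]; }
  }

  // Phase 3: one atomic per block combines results across blocks, keeping
  // global-memory contention proportional to the grid size, not the input size
  if (tid == 0) { atomicAdd(out, shared[0]); }
}
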
+
+/** Compute gradients w.r.t. inputs. */
+__global__
+void covariance_backprop_kernel(El::Int height,
+                                El::Int width,
+                                DataType scale,
+                                const DataType* __restrict__ gradient_wrt_output,
+                                const DataType* __restrict__ input0,
+                                El::Int input0_ldim,
+                                const DataType* __restrict__ input1,
+                                El::Int input1_ldim,
+                                const DataType* __restrict__ means,
+                                DataType* __restrict__ gradient_wrt_input0,
+                                El::Int gradient_wrt_input0_ldim,
+                                DataType* __restrict__ gradient_wrt_input1,
+                                El::Int gradient_wrt_input1_ldim) {
+  const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x;
+  const El::Int size = height * width;
+  const El::Int nthreads = blockDim.x * gridDim.x;
+  for (El::Int pos = gid; pos < size; pos += nthreads) {
+    const auto& row = pos % height;
+    const auto& col = pos / height;
+    const auto& dy = gradient_wrt_output[col];
+    const auto& x0 = input0[row + col * input0_ldim];
+    const auto& x1 = input1[row + col * input1_ldim];
+    const auto& mean0 = means[2*col];
+    const auto& mean1 = means[2*col+1];
+    auto& dx0 = gradient_wrt_input0[row + col * gradient_wrt_input0_ldim];
+    auto& dx1 = gradient_wrt_input1[row + col * gradient_wrt_input1_ldim];
+    dx0 = dy * scale * (x1 - mean1);
+    dx1 = dy * scale * (x0 - mean0);
+  }
+}
+
+/** GPU forward prop implementation.
+ *  We use a two-pass algorithm since it is more numerically stable
+ *  than the naive single-pass algorithm.
+ */
+void fp_gpu(const AbsDistMat& input0,
+            const AbsDistMat& input1,
+            AbsDistMat& output,
+            AbsDistMat& means,
+            AbsDistMat& workspace,
+            bool biased) {
+
+  // Local matrices
+  const auto& local_input0 = static_cast<const GPUMat&>(input0.LockedMatrix());
+  const auto& local_input1 = static_cast<const GPUMat&>(input1.LockedMatrix());
+  auto& local_means = static_cast<GPUMat&>(means.Matrix());
+  auto& local_workspace = static_cast<GPUMat&>(workspace.Matrix());
+
+  // Dimensions
+  const auto& height = input0.Height();
+  const auto& width = input0.Width();
+  const auto& local_height = local_input0.Height();
+  const auto& local_width = local_input0.Width();
+
+  // Compute column-wise mean
+  means.Empty(false);
+  means.AlignWith(input0);
+  El::Zeros(means, 2, width);
+  if (!local_input0.IsEmpty()) {
+    constexpr El::Int block_size = 256;
+    dim3 block_dims, grid_dims;
+    block_dims.x = block_size;
+    grid_dims.x = (local_height + block_size - 1) / block_size;
+    grid_dims.y = local_width;
+    const auto& scale = DataType(1) / height;
+    mean_contribution_kernel<block_size>
+      <<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>(
+        local_height, local_width, scale,
+        local_input0.LockedBuffer(), local_input0.LDim(),
+        local_input1.LockedBuffer(), local_input1.LDim(),
+        local_means.Buffer());
+  }
+  El::AllReduce(means, means.RedundantComm());
+
+  // Compute column-wise covariance
+  workspace.Empty(false);
+  workspace.AlignWith(input0);
+  El::Zeros(workspace, 1, width);
+  if (!local_input0.IsEmpty()) {
+    constexpr El::Int block_size = 256;
+    dim3 block_dims, grid_dims;
+    block_dims.x = block_size;
+    grid_dims.x = (local_height + block_size - 1) / block_size;
+    grid_dims.y = local_width;
+    const auto& scale = DataType(1) / (biased ? height : height - 1);
+    covariance_contribution_kernel<block_size>
+      <<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>(
+        local_height, local_width, scale,
+        local_input0.LockedBuffer(), local_input0.LDim(),
+        local_input1.LockedBuffer(), local_input1.LDim(),
+        local_means.LockedBuffer(),
+        local_workspace.Buffer());
+  }
+  El::AllReduce(workspace, workspace.RedundantComm());
+  El::Copy(workspace, output);
+
+}
+
+/** GPU backprop implementation.
+ *  Means have already been computed in forward prop.
+ */
+void bp_gpu(const AbsDistMat& input0,
+            const AbsDistMat& input1,
+            const AbsDistMat& gradient_wrt_output,
+            AbsDistMat& gradient_wrt_input0,
+            AbsDistMat& gradient_wrt_input1,
+            const AbsDistMat& means,
+            AbsDistMat& workspace,
+            bool biased) {
+
+  // Local matrices
+  const auto& local_input0 = static_cast<const GPUMat&>(input0.LockedMatrix());
+  const auto& local_input1 = static_cast<const GPUMat&>(input1.LockedMatrix());
+  auto& local_gradient_wrt_input0 = static_cast<GPUMat&>(gradient_wrt_input0.Matrix());
+  auto& local_gradient_wrt_input1 = static_cast<GPUMat&>(gradient_wrt_input1.Matrix());
+  const auto& local_means = static_cast<const GPUMat&>(means.LockedMatrix());
+  auto& local_workspace = static_cast<GPUMat&>(workspace.Matrix());
+
+  // Dimensions
+  const auto& height = input0.Height();
+  const auto& local_height = local_input0.Height();
+  const auto& local_width = local_input0.Width();
+
+  // Initialize workspace with gradients w.r.t. output
+  El::Copy(gradient_wrt_output, workspace);
+
+  // Compute gradients w.r.t. input
+  const DataType scale = DataType(1) / (biased ? height : height - 1);
+  constexpr El::Int block_size = 256;
+  El::Int grid_size = (local_height * local_width + block_size - 1) / block_size;
+  if (grid_size > 0) {
+    covariance_backprop_kernel
+      <<<grid_size, block_size, 0, El::GPUManager::Stream()>>>(
+        local_height, local_width, scale,
+        local_workspace.LockedBuffer(),
+        local_input0.LockedBuffer(), local_input0.LDim(),
+        local_input1.LockedBuffer(), local_input1.LDim(),
+        local_means.LockedBuffer(),
+        local_gradient_wrt_input0.Buffer(), local_gradient_wrt_input0.LDim(),
+        local_gradient_wrt_input1.Buffer(), local_gradient_wrt_input1.LDim());
+  }
+
+}
+
+} // namespace
+
+template <>
+void covariance_layer<data_layout::DATA_PARALLEL, El::Device::GPU>
+     ::fp_compute() {
+  fp_gpu(get_prev_activations(0),
+         get_prev_activations(1),
+         get_activations(),
+         *m_means,
+         *m_workspace,
+         m_biased);
+}
+
+template <>
+void covariance_layer<data_layout::DATA_PARALLEL, El::Device::GPU>
+     ::bp_compute() {
+  bp_gpu(get_prev_activations(0),
+         get_prev_activations(1),
+         get_prev_error_signals(),
+         get_error_signals(0),
+         get_error_signals(1),
+         *m_means,
+         *m_workspace,
+         m_biased);
+}
+
+template <>
+void covariance_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>
+     ::fp_compute() {
+  fp_gpu(get_prev_activations(0),
+         get_prev_activations(1),
+         get_activations(),
+         *m_means,
+         *m_workspace,
+         m_biased);
+}
+
+template <>
+void covariance_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>
+     ::bp_compute() {
+  bp_gpu(get_prev_activations(0),
+         get_prev_activations(1),
+         get_prev_error_signals(),
+         get_error_signals(0),
+         get_error_signals(1),
+         *m_means,
+         *m_workspace,
+         m_biased);
+}
+
+} // namespace lbann
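
The "two-pass ... more numerically stable" comments in these files are doing real work: the one-pass formula E[x^2] - E[x]^2 subtracts two large, nearly equal numbers whenever the mean dominates the spread, and in single precision the result can collapse to zero or even go negative. A small standalone demonstration (a sketch; biased 1/n normalization is used for brevity):

#include <cstdio>
#include <vector>

int main() {
  // Data with a large mean and a tiny spread
  const std::vector<float> x{10000.0f, 10000.5f, 9999.5f, 10000.25f};
  const float n = static_cast<float>(x.size());

  float sum = 0, sqsum = 0;
  for (float v : x) { sum += v; sqsum += v * v; }
  const float mean = sum / n;

  // One-pass: catastrophic cancellation between sqsum/n and mean^2
  const float var1 = sqsum / n - mean * mean;

  // Two-pass: subtract the mean before squaring
  float ss = 0;
  for (float v : x) { ss += (v - mean) * (v - mean); }
  const float var2 = ss / n;

  // On typical IEEE float hardware the one-pass result collapses to 0,
  // while the two-pass result is close to the true 0.13671875
  std::printf("one-pass: %g  two-pass: %g\n", var1, var2);
  return 0;
}
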
diff --git a/src/layers/misc/variance.cpp b/src/layers/misc/variance.cpp
new file mode 100644
index 00000000000..12afb8204ef
--- /dev/null
+++ b/src/layers/misc/variance.cpp
@@ -0,0 +1,170 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/layers/misc/variance.hpp"
+
+namespace lbann {
+
+namespace {
+
+/** CPU forward prop implementation.
+ *  We use a two-pass algorithm since it is more numerically stable
+ *  than the naive single-pass algorithm.
+ */
+void fp_cpu(const AbsDistMat& input,
+            AbsDistMat& output,
+            AbsDistMat& means,
+            AbsDistMat& workspace,
+            bool biased) {
+
+  // Local matrices
+  const auto& local_input = static_cast<const CPUMat&>(input.LockedMatrix());
+  auto& local_means = static_cast<CPUMat&>(means.Matrix());
+  auto& local_workspace = static_cast<CPUMat&>(workspace.Matrix());
+
+  // Dimensions
+  const auto& height = input.Height();
+  const auto& width = input.Width();
+  const auto& local_height = local_input.Height();
+  const auto& local_width = local_input.Width();
+
+  // Compute column-wise mean
+  means.Empty(false);
+  means.AlignWith(input);
+  means.Resize(1, width);
+#pragma omp parallel for
+  for (El::Int col = 0; col < local_width; ++col) {
+    DataType sum = 0;
+    for (El::Int row = 0; row < local_height; ++row) {
+      sum += local_input(row, col);
+    }
+    local_means(0, col) = sum / height;
+  }
+  El::AllReduce(means, means.RedundantComm());
+
+  // Compute column-wise variance
+  workspace.Empty(false);
+  workspace.AlignWith(input);
+  workspace.Resize(1, width);
+#pragma omp parallel for
+  for (El::Int col = 0; col < local_width; ++col) {
+    const auto& mean = local_means(0, col);
+    DataType sum = 0;
+    for (El::Int row = 0; row < local_height; ++row) {
+      const auto& diff = local_input(row, col) - mean;
+      sum += diff * diff;
+    }
+    local_workspace(0, col) = sum / (biased ? height : height - 1);
+  }
+  El::AllReduce(workspace, workspace.RedundantComm());
+  El::Copy(workspace, output);
+
+}
+
+/** CPU backprop implementation.
+ *  Means have already been computed in forward prop.
+ */
+void bp_cpu(const AbsDistMat& input,
+            const AbsDistMat& gradient_wrt_output,
+            AbsDistMat& gradient_wrt_input,
+            const AbsDistMat& means,
+            AbsDistMat& workspace,
+            bool biased) {
+
+  // Local matrices
+  const auto& local_input = static_cast<const CPUMat&>(input.LockedMatrix());
+  auto& local_gradient_wrt_input = static_cast<CPUMat&>(gradient_wrt_input.Matrix());
+  const auto& local_means = static_cast<const CPUMat&>(means.LockedMatrix());
+  auto& local_workspace = static_cast<CPUMat&>(workspace.Matrix());
+
+  // Dimensions
+  const auto& height = input.Height();
+  const auto& local_height = local_input.Height();
+  const auto& local_width = local_input.Width();
+
+  // Initialize workspace with gradients w.r.t. output
+  El::Copy(gradient_wrt_output, workspace);
+
+  // Compute gradients w.r.t. input
+  const DataType scale = DataType(2) / (biased ? height : height - 1);
+#pragma omp parallel for collapse(2)
+  for (El::Int col = 0; col < local_width; ++col) {
+    for (El::Int row = 0; row < local_height; ++row) {
+      const auto& dy = local_workspace(0, col);
+      const auto& x = local_input(row, col);
+      const auto& mean = local_means(0, col);
+      auto& dx = local_gradient_wrt_input(row, col);
+      dx = dy * scale * (x - mean);
+    }
+  }
+
+}
+
+} // namespace
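
The variance gradient enjoys the same mean-cancellation as the covariance gradient earlier in this patch: d/dx_j [sum_i (x_i - m)^2 / (n-1)] = 2(x_j - m)/(n-1) exactly, because the correction term from m's dependence on x_j is proportional to sum_i (x_i - m) = 0. That is the scale = DataType(2) / (height - 1) used in bp_cpu above. A self-contained spot check (a sketch, not part of the patch):

#include <cstdio>
#include <vector>

// Unbiased variance of a single column
double variance_ref(const std::vector<double>& x) {
  double m = 0;
  for (double v : x) { m += v; }
  m /= x.size();
  double ss = 0;
  for (double v : x) { ss += (v - m) * (v - m); }
  return ss / (x.size() - 1);
}

int main() {
  const std::vector<double> x{1.0, 2.5, -0.5, 4.0};
  const std::size_t j = 1;
  const double h = 1e-6;

  // Central finite difference w.r.t. x[j]
  auto xp = x, xm = x;
  xp[j] += h; xm[j] -= h;
  const double numeric = (variance_ref(xp) - variance_ref(xm)) / (2 * h);

  // Analytic gradient used by bp_cpu (with dy = 1): 2 (x[j] - mean) / (n - 1)
  double m = 0;
  for (double v : x) { m += v; }
  m /= x.size();
  const double analytic = 2.0 * (x[j] - m) / (x.size() - 1);

  std::printf("numeric = %.8f, analytic = %.8f\n", numeric, analytic);
  return 0;
}
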
+
+template <>
+void variance_layer<data_layout::DATA_PARALLEL, El::Device::CPU>
+     ::fp_compute() {
+  fp_cpu(get_prev_activations(),
+         get_activations(),
+         *m_means,
+         *m_workspace,
+         m_biased);
+}
+
+template <>
+void variance_layer<data_layout::DATA_PARALLEL, El::Device::CPU>
+     ::bp_compute() {
+  bp_cpu(get_prev_activations(),
+         get_prev_error_signals(),
+         get_error_signals(),
+         *m_means,
+         *m_workspace,
+         m_biased);
+}
+
+template <>
+void variance_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>
+     ::fp_compute() {
+  fp_cpu(get_prev_activations(),
+         get_activations(),
+         *m_means,
+         *m_workspace,
+         m_biased);
+}
+
+template <>
+void variance_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>
+     ::bp_compute() {
+  bp_cpu(get_prev_activations(),
+         get_prev_error_signals(),
+         get_error_signals(),
+         *m_means,
+         *m_workspace,
+         m_biased);
+}
+
+} // namespace lbann
diff --git a/src/layers/misc/variance.cu b/src/layers/misc/variance.cu
new file mode 100644
index 00000000000..f30192737c7
--- /dev/null
+++ b/src/layers/misc/variance.cu
@@ -0,0 +1,245 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/misc/variance.hpp" + +namespace lbann { + +namespace { + +template +__global__ void variance_contribution_kernel(El::Int height, + El::Int width, + DataType scale, + const DataType* __restrict__ input, + El::Int input_ldim, + const DataType* __restrict__ means, + DataType* __restrict__ contribution) { + + // Indices + const El::Int tid = threadIdx.x; + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + + // Compute local contribution for each matrix column + for (El::Int col = bidy; col < width; col += gridDim.y) { + const auto& mean = means[col]; + + // Compute contributions for each thread + DataType private_contribution = 0; + for (El::Int row = gidx; row < height; row += nthreadsx) { + const auto& diff = input[row + col * input_ldim] - mean; + private_contribution += diff * diff; + } + + // Shared memory reduction to get contribution for each block + /// @todo unroll loops + __shared__ DataType shared_contribution[block_size]; + shared_contribution[tid] = private_contribution; + for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { + __syncthreads(); + if (tid < stride) { + shared_contribution[tid] += shared_contribution[tid + stride]; + } + } + if (tid == 0) { + cuda::atomic_add(&contribution[col], + scale * shared_contribution[0]); + } + + } + +} + +__global__ +void variance_backprop_kernel(El::Int height, + El::Int width, + DataType scale, + const DataType* __restrict__ gradient_wrt_output, + const DataType* __restrict__ input, + El::Int input_ldim, + const DataType* __restrict__ means, + DataType* __restrict__ gradient_wrt_input, + El::Int gradient_wrt_input_ldim) { + const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int size = height * width; + const El::Int nthreads = blockDim.x * gridDim.x; + for (El::Int pos = gid; pos < size; pos += nthreads) { + const auto& row = pos % height; + const auto& col = pos / height; + const auto& dy = gradient_wrt_output[col]; + const auto& x = input[row + col * input_ldim]; + const auto& mean = means[col]; + auto& dx = gradient_wrt_input[row + col * gradient_wrt_input_ldim]; + dx = dy * scale * (x - mean); + } +} + +/** GPU forward prop implementation. + * We use a two-pass algorithm since it is more numerically stable + * than the naive single-pass algorithm. 
+ */ +void fp_gpu(const AbsDistMat& input, + AbsDistMat& output, + AbsDistMat& means, + AbsDistMat& workspace, + bool biased) { + + // Local matrices + const auto& local_input = static_cast(input.LockedMatrix()); + auto& local_means = static_cast(means.Matrix()); + auto& local_workspace = static_cast(workspace.Matrix()); + + // Dimensions + const auto& height = input.Height(); + const auto& width = input.Width(); + const auto& local_height = local_input.Height(); + const auto& local_width = local_input.Width(); + + // Compute column-wise mean + means.Empty(false); + means.AlignWith(input); + means.Resize(1, width); + GPUMat ones; +#ifdef HYDROGEN_HAVE_CUB + ones.SetMemoryMode(1); // Use CUB GPU memory pool +#endif // HYDROGEN_HAVE_CUB + ones.Resize(local_height, 1); + El::Fill(ones, DataType(1)); + El::Gemv(El::TRANSPOSE, + DataType(1) / height, local_input, ones, + DataType(0), local_means); + El::AllReduce(means, means.RedundantComm()); + + // Compute column-wise variance + workspace.Empty(false); + workspace.AlignWith(input); + El::Zeros(workspace, 1, width); + if (!local_input.IsEmpty()) { + constexpr El::Int block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.x = (local_height + block_size - 1) / block_size; + grid_dims.y = local_width; + const auto& scale = DataType(1) / (biased ? height : height - 1); + variance_contribution_kernel + <<>>( + local_height, local_width, scale, + local_input.LockedBuffer(), local_input.LDim(), + local_means.LockedBuffer(), + local_workspace.Buffer()); + } + El::AllReduce(workspace, workspace.RedundantComm()); + El::Copy(workspace, output); + +} + +/** GPU backprop implementation. + * Means have already been computed in forward prop. + */ +void bp_gpu(const AbsDistMat& input, + const AbsDistMat& gradient_wrt_output, + AbsDistMat& gradient_wrt_input, + const AbsDistMat& means, + AbsDistMat& workspace, + bool biased) { + + // Local matrices + const auto& local_input = static_cast(input.LockedMatrix()); + auto& local_gradient_wrt_input = static_cast(gradient_wrt_input.Matrix()); + const auto& local_means = static_cast(means.LockedMatrix()); + auto& local_workspace = static_cast(workspace.Matrix()); + + // Dimensions + const auto& height = input.Height(); + const auto& local_height = local_input.Height(); + const auto& local_width = local_input.Width(); + + // Initialize workspace with gradients w.r.t. output + El::Copy(gradient_wrt_output, workspace); + + // Compute gradients w.r.t. input + const DataType scale = DataType(2) / (biased ? 
height : height - 1); + constexpr El::Int block_size = 256; + El::Int grid_size = (local_height * local_width + block_size - 1) / block_size; + if (grid_size > 0) { + variance_backprop_kernel + <<>>( + local_height, local_width, scale, + local_workspace.LockedBuffer(), + local_input.LockedBuffer(), local_input.LDim(), + local_means.LockedBuffer(), + local_gradient_wrt_input.Buffer(), local_gradient_wrt_input.LDim()); + } + +} + +} // namespace + +template <> +void variance_layer + ::fp_compute() { + fp_gpu(get_prev_activations(), + get_activations(), + *m_means, + *m_workspace, + m_biased); +} + +template <> +void variance_layer + ::bp_compute() { + bp_gpu(get_prev_activations(), + get_prev_error_signals(), + get_error_signals(), + *m_means, + *m_workspace, + m_biased); +} + +template <> +void variance_layer + ::fp_compute() { + fp_gpu(get_prev_activations(), + get_activations(), + *m_means, + *m_workspace, + m_biased); +} + +template <> +void variance_layer + ::bp_compute() { + bp_gpu(get_prev_activations(), + get_prev_error_signals(), + get_error_signals(), + *m_means, + *m_workspace, + m_biased); +} + +} // namespace lbann diff --git a/src/layers/regularizers/CMakeLists.txt b/src/layers/regularizers/CMakeLists.txt index e8f3b06612f..a85f2e3d740 100644 --- a/src/layers/regularizers/CMakeLists.txt +++ b/src/layers/regularizers/CMakeLists.txt @@ -1,3 +1,8 @@ +# Add the source files for this directory +set_full_path(THIS_DIR_SOURCES + batch_normalization.cpp + ) + if (LBANN_HAS_CUDA) # Add the CUDA source files for this directory set_full_path(THIS_DIR_CU_SOURCES @@ -6,4 +11,5 @@ if (LBANN_HAS_CUDA) endif () # Propagate the files up the tree +set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE) set(CUDA_SOURCES "${CUDA_SOURCES}" "${THIS_DIR_CU_SOURCES}" PARENT_SCOPE) diff --git a/src/layers/regularizers/batch_normalization.cpp b/src/layers/regularizers/batch_normalization.cpp new file mode 100644 index 00000000000..98f28d00dca --- /dev/null +++ b/src/layers/regularizers/batch_normalization.cpp @@ -0,0 +1,274 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/regularizers/batch_normalization.hpp" + +namespace lbann { + +template <> +void batch_normalization_layer::fp_compute() { + constexpr DataType zero = 0; + constexpr DataType one = 1; + const bool is_training = this->m_model->get_execution_mode() == execution_mode::training; + + // Matrices + const auto& input = get_prev_activations(); + const auto& local_input = input.LockedMatrix(); + auto& local_output = get_local_activations(); + + // Matrix parameters + const auto& width = input.Width(); + const auto& local_width = local_input.Width(); + const auto& output_dims = get_output_dims(); + const auto& num_channels = output_dims[0]; + const auto& channel_size = get_output_size() / num_channels; + + // Compute statistics + if (is_training) { + + // Local matrices + auto& local_mean = m_mean->Matrix(); + auto& local_var = m_var->Matrix(); + auto& local_running_mean = this->m_weights[2]->get_values().Matrix(); + auto& local_running_var = this->m_weights[3]->get_values().Matrix(); + + // Compute sums and sums of squares +#pragma omp parallel for + for (El::Int channel = 0; channel < num_channels; ++channel) { + DataType sum = zero; + DataType sqsum = zero; + const auto& row_start = channel * channel_size; + const auto& row_end = (channel+1) * channel_size; + for (El::Int col = 0; col < local_width; ++col) { + for (El::Int row = row_start; row < row_end; ++row) { + const auto& x = local_input(row, col); + sum += x; + sqsum += x * x; + } + } + local_mean(channel, 0) = sum; + local_var(channel, 0) = sqsum; + } + El::Int num_per_sum; + if (m_use_global_stats) { + m_comm->allreduce(*m_mean, m_mean->RedundantComm(), El::mpi::SUM); + m_comm->allreduce(*m_var, m_var->RedundantComm(), El::mpi::SUM); + num_per_sum = channel_size * width; + } else { + num_per_sum = channel_size * local_width; + } + + // Compute minibatch statistics + if (num_per_sum <= 1) { + El::Fill(local_var, one); + } else { +#pragma omp parallel for + for (El::Int channel = 0; channel < num_channels; ++channel) { + const auto& mean = local_mean(channel, 0) / num_per_sum; + const auto& sqmean = local_var(channel, 0) / num_per_sum; + auto var = num_per_sum * (sqmean - mean * mean) / (num_per_sum - 1); + var = std::max(var, m_epsilon); + local_mean(channel, 0) = mean; + local_var(channel, 0) = var; + auto& running_mean = local_running_mean(channel, 0); + auto& running_var = local_running_var(channel, 0); + running_mean = m_decay * running_mean + (one - m_decay) * mean; + running_var = m_decay * running_var + (one - m_decay) * var; + } + } + + } + + // Get matrices + const auto& local_scale = this->m_weights[0]->get_values().LockedMatrix(); + const auto& local_bias = this->m_weights[1]->get_values().LockedMatrix(); + const auto& local_mean = (is_training ? + m_mean->LockedMatrix() : + this->m_weights[2]->get_values().LockedMatrix()); + const auto& local_var = (is_training ? 
+ m_var->LockedMatrix() : + this->m_weights[3]->get_values().LockedMatrix()); + + // Iterate through channels +#pragma omp parallel for + for (El::Int channel = 0; channel < num_channels; ++channel) { + + // Get channel parameters + const auto& mean = local_mean(channel, 0); + const auto& var = local_var(channel, 0); + const DataType inv_stdev = 1 / std::sqrt(var + m_epsilon); + const auto& scale = local_scale(channel, 0); + const auto& bias = local_bias(channel, 0); + + // Apply batch normalization to inputs in channel + const auto& row_start = channel * channel_size; + const auto& row_end = (channel+1) * channel_size; + for (El::Int col = 0; col < local_width; ++col) { + for (El::Int row = row_start; row < row_end; ++row) { + const auto& x = local_input(row, col); + const auto& xhat = (x - mean) * inv_stdev; + auto& y = local_output(row, col); + y = scale * xhat + bias; + } + } + + } + +} + +template <> +void batch_normalization_layer::bp_compute() { + constexpr DataType one = 1; + const bool is_training = this->m_model->get_execution_mode() == execution_mode::training; + + // Matrices + const auto& local_scale = this->m_weights[0]->get_values().LockedMatrix(); + const auto& local_mean = (is_training ? + m_mean->LockedMatrix() : + this->m_weights[2]->get_values().LockedMatrix()); + const auto& local_var = (is_training ? + m_var->LockedMatrix() : + this->m_weights[3]->get_values().LockedMatrix()); + const auto& input = get_prev_activations(); + const auto& local_input = input.LockedMatrix(); + const auto& local_gradient_wrt_output = get_local_prev_error_signals(); + auto& local_gradient_wrt_input = get_local_error_signals(); + auto& local_mean_gradient = m_mean_gradient->Matrix(); + auto& local_var_gradient = m_var_gradient->Matrix(); + auto& local_scale_gradient = m_scale_gradient->Matrix(); + auto& local_bias_gradient = m_bias_gradient->Matrix(); + + // Matrix parameters + const El::Int effective_mini_batch_size = this->m_model->get_effective_mini_batch_size(); + const auto& width = input.Width(); + const auto& local_width = local_input.Width(); + const auto& output_dims = get_output_dims(); + const auto& num_channels = output_dims[0]; + const auto& channel_size = get_output_size() / num_channels; + + // Compute local gradients +#pragma omp parallel for + for (El::Int channel = 0; channel < num_channels; ++channel) { + + // Initialize channel parameters and gradients + const auto& mean = local_mean(channel, 0); + const auto& var = local_var(channel, 0); + const auto& scale = local_scale(channel, 0); + const DataType inv_stdev = 1 / std::sqrt(var + m_epsilon); + const auto& dvar_factor = inv_stdev * inv_stdev * inv_stdev / 2; + DataType dmean = 0; + DataType dvar = 0; + DataType dscale = 0; + DataType dbias = 0; + + // Compute gradient contributions from local entries + const auto& row_start = channel * channel_size; + const auto& row_end = (channel+1) * channel_size; + for (El::Int col = 0; col < local_width; ++col) { + for (El::Int row = row_start; row < row_end; ++row) { + const auto& x = local_input(row, col); + const auto& xhat = (x - mean) * inv_stdev; + const auto& dy = local_gradient_wrt_output(row, col); + dscale += dy * xhat; + dbias += dy; + const auto& dxhat = dy * scale; + dmean += - dxhat * inv_stdev; + dvar += - dxhat * (x - mean) * dvar_factor; + } + } + local_mean_gradient(channel, 0) = dmean; + local_var_gradient(channel, 0) = dvar; + local_scale_gradient(channel, 0) = dscale; + local_bias_gradient(channel, 0) = dbias; + + } + + // Accumulate gradients + if 
(is_training) { + if (m_use_global_stats) { + m_comm->allreduce(*m_mean_gradient, + m_mean_gradient->RedundantComm(), + El::mpi::SUM); + m_comm->allreduce(*m_var_gradient, + m_var_gradient->RedundantComm(), + El::mpi::SUM); + } + } else { + El::Zero(*m_mean_gradient); + El::Zero(*m_var_gradient); + } + optimizer* scale_optimizer = m_weights[0]->get_optimizer(); + if (scale_optimizer != nullptr) { + scale_optimizer->add_to_gradient_staging(*m_scale_gradient, + one / effective_mini_batch_size); + } + optimizer* bias_optimizer = m_weights[1]->get_optimizer(); + if (bias_optimizer != nullptr) { + bias_optimizer->add_to_gradient_staging(*m_bias_gradient, + one / effective_mini_batch_size); + } + + // Compute error signal + const auto& num_per_sum = (m_use_global_stats ? + width * channel_size : + local_width * channel_size); + if (num_per_sum <= 1) { + El::Zero(local_gradient_wrt_input); + } else { +#pragma omp parallel for + for (El::Int channel = 0; channel < num_channels; ++channel) { + + // Initialize channel parameters and gradients + const auto& mean = local_mean(channel, 0); + const auto& var = local_var(channel, 0); + const auto& scale = local_scale(channel, 0); + const auto& dmean = local_mean_gradient(channel, 0); + const auto& dvar = local_var_gradient(channel, 0); + + // Compute useful constants + const DataType inv_stdev = 1 / std::sqrt(var + m_epsilon); + const auto& dmean_term = dmean / num_per_sum; + const auto& dvar_term = dvar * 2 / (num_per_sum - 1); + + // Compute error signal for current channel + const auto& row_start = channel * channel_size; + const auto& row_end = (channel+1) * channel_size; + for (El::Int col = 0; col < local_width; ++col) { + for (El::Int row = row_start; row < row_end; ++row) { + const auto& x = local_input(row, col); + const auto& dy = local_gradient_wrt_output(row, col); + const auto& dxhat = dy * scale; + auto& dx = local_gradient_wrt_input(row, col); + dx = dxhat * inv_stdev + dmean_term + dvar_term * (x - mean); + } + } + + } + } + +} + +} // namespace lbann diff --git a/src/layers/regularizers/batch_normalization.cu b/src/layers/regularizers/batch_normalization.cu index 495c504911d..5b221e68a4b 100644 --- a/src/layers/regularizers/batch_normalization.cu +++ b/src/layers/regularizers/batch_normalization.cu @@ -24,96 +24,39 @@ // permissions and limitations under the license. 
//////////////////////////////////////////////////////////////////////////////// -#include "math.h" -#include "lbann/base.hpp" -#include "lbann/utils/exception.hpp" +#include "lbann/layers/regularizers/batch_normalization.hpp" #include "lbann/utils/cuda.hpp" namespace lbann { namespace { -// Atomic add functions -#if __CUDA_ARCH__ >= 530 -__device__ inline __half atomic_add(__half* address, __half val) { -#if 0 // TODO: replace this once Nvidia implements atomicAdd for __half - return atomicAdd(address, val); -#else - unsigned int* address_as_uint = (unsigned int*) address; - unsigned int old = *address_as_uint; - __half* old_as_half = (__half*) &old; - unsigned int assumed; - unsigned int updated; - __half* updated_as_half = (__half*) &updated; - do { - assumed = old; - updated = old; - *updated_as_half += value; - old = atomicCAS(address_as_uint, assumed, updated); - } while (assumed != old); - return *old_as_half; -#endif // 0 -} -#endif // __CUDA_ARCH__ >= 530 -__device__ inline float atomic_add(float* address, float val) { - return atomicAdd(address, val); -} -__device__ inline double atomic_add(double* address, double val) { -#if __CUDA_ARCH__ >= 600 - return atomicAdd(address, val); -#else - unsigned long long int* address_as_ull = - (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + - __longlong_as_double(assumed))); - } while (assumed != old); - return __longlong_as_double(old); -#endif // __CUDA_ARCH__ < 600 -} - -// Reciprocal square root functions -#if __CUDA_ARCH__ >= 530 -__device__ inline float rsqrt_(__half x) { - return hrsqrt(x); -} -#endif // __CUDA_ARCH__ >= 530 -__device__ inline float rsqrt_(float x) { - return rsqrtf(x); -} -__device__ inline double rsqrt_(double x) { - return rsqrt(x); -} - /** CUDA kernel to compute channel sums. * Sums and squares of sums are used to compute mean and variance. 
*/ -template +template __global__ void channel_sums_kernel( - int channel_height, - int width, - const DataType * __restrict__ data, int data_ldim, + El::Int channel_height, + El::Int width, + const DataType * __restrict__ data, El::Int data_ldim, DataType * __restrict__ sums, DataType * __restrict__ sqsums) { // Indices - const int tid = threadIdx.x; - const int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const int bidy = blockIdx.y; + const El::Int tid = threadIdx.x; + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; // Initialize shared memory __shared__ DataType shared_sums[block_size]; - __shared__ DataType shared_sqsums[block_size]; + __shared__ DataType shared_sqsums[block_size]; // Compute row sums in shared memory - DataType private_sum = DataType(0); - DataType private_sqsum = DataType(0); + DataType private_sum = 0; + DataType private_sqsum = 0; if (gidx < channel_height) { - const int row = gidx + bidy * channel_height; - for (int col = 0; col < width; ++col) { + const auto& row = gidx + bidy * channel_height; + for (El::Int col = 0; col < width; ++col) { const auto& x = data[row + col * data_ldim]; private_sum += x; private_sqsum += x * x; @@ -124,7 +67,7 @@ __global__ void channel_sums_kernel( // Compute channel sum with shared memory reduction /// @todo unroll loops - for (int stride = block_size / 2; stride > 0; stride /= 2) { + for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { __syncthreads(); if(tid < stride) { shared_sums[tid] += shared_sums[tid + stride]; @@ -134,8 +77,8 @@ __global__ void channel_sums_kernel( // Output channel sum to global memory if (tid == 0) { - atomic_add(&sums[bidy], shared_sums[0]); - atomic_add(&sqsums[bidy], shared_sqsums[0]); + cuda::atomic_add(&sums[bidy], shared_sums[0]); + cuda::atomic_add(&sqsums[bidy], shared_sqsums[0]); } } @@ -145,18 +88,18 @@ __global__ void channel_sums_kernel( * and squares of sums, respectively. */ __global__ void compute_statistics_kernel( - int num_sums, - int num_per_sum, + El::Int num_sums, + El::Int num_per_sum, DataType epsilon, DataType decay, DataType * __restrict__ global_mean, DataType * __restrict__ global_var, DataType * __restrict__ global_running_mean, DataType * __restrict__ global_running_var) { - const DataType one = DataType(1); - const int gid = threadIdx.x + blockIdx.x * blockDim.x; - const int num_threads = blockDim.x * gridDim.x; - for (int i = gid; i < num_sums; i += num_threads) { + constexpr DataType one = 1; + const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int num_threads = blockDim.x * gridDim.x; + for (El::Int i = gid; i < num_sums; i += num_threads) { // Compute mean and variance const auto& mean = global_mean[i] / num_per_sum; @@ -177,21 +120,21 @@ __global__ void compute_statistics_kernel( } /** CUDA kernel to apply batch normalization. 
*/ -template +template __global__ void batch_normalization_kernel( - int channel_height, - int width, - const DataType * __restrict__ global_input, int input_ldim, + El::Int channel_height, + El::Int width, + const DataType * __restrict__ global_input, El::Int input_ldim, const DataType * __restrict__ global_mean, const DataType * __restrict__ global_var, DataType epsilon, const DataType * __restrict__ global_scale, const DataType * __restrict__ global_bias, - DataType * __restrict__ global_output, int output_ldim) { + DataType * __restrict__ global_output, El::Int output_ldim) { // Indices - const int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const int bidy = blockIdx.y; + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; // Copy batch normalization parameters to private memory const auto& mean = global_mean[bidy]; @@ -200,12 +143,12 @@ __global__ void batch_normalization_kernel( const auto& bias = global_bias[bidy]; // Get reciprocal of standard deviation - const auto& inv_stdev = rsqrt_(var + epsilon); + const auto& inv_stdev = cuda::rsqrt(var + epsilon); // Apply batch normalization if (gidx < channel_height) { - const int row = gidx + bidy * channel_height; - for (int col = 0; col < width; ++col) { + const auto& row = gidx + bidy * channel_height; + for (El::Int col = 0; col < width; ++col) { const auto& x = global_input[row + col * input_ldim]; const auto& xhat = (x - mean) * inv_stdev; const auto& y = scale * xhat + bias; @@ -216,14 +159,14 @@ __global__ void batch_normalization_kernel( } /** CUDA kernel to compute gradients w.r.t. batch norm parameters. */ -template +template __global__ void backprop1_kernel( - int channel_height, - int width, + El::Int channel_height, + El::Int width, const DataType * __restrict__ global_input, - int input_ldim, + El::Int input_ldim, const DataType * __restrict__ global_gradient_wrt_output, - int gradient_wrt_output_ldim, + El::Int gradient_wrt_output_ldim, const DataType * __restrict__ global_mean, const DataType * __restrict__ global_var, DataType epsilon, @@ -234,9 +177,9 @@ __global__ void backprop1_kernel( DataType * __restrict__ global_dvar) { // Indices - const int tid = threadIdx.x; - const int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const int bidy = blockIdx.y; + const El::Int tid = threadIdx.x; + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; // Initialize shared memory __shared__ DataType shared_dscale[block_size]; @@ -250,8 +193,8 @@ __global__ void backprop1_kernel( const auto& scale = global_scale[bidy]; // Compute useful constants - const DataType zero = DataType(0); - const auto& inv_stdev = rsqrt_(var + epsilon); + constexpr DataType zero = 0; + const auto& inv_stdev = cuda::rsqrt(var + epsilon); const auto& dvar_factor = inv_stdev * inv_stdev * inv_stdev / 2; // Compute row-wise gradient contributions in shared memory @@ -260,8 +203,8 @@ __global__ void backprop1_kernel( auto dmean = zero; auto dvar = zero; if (gidx < channel_height) { - const int row = gidx + bidy * channel_height; - for(int col = 0; col < width; ++col) { + const auto& row = gidx + bidy * channel_height; + for(El::Int col = 0; col < width; ++col) { const auto& x = global_input[row + col * input_ldim]; const auto& xhat = (x - mean) * inv_stdev; const auto& dy = global_gradient_wrt_output[row + col * gradient_wrt_output_ldim]; @@ -279,7 +222,7 @@ __global__ void backprop1_kernel( // Compute gradients with shared memory reduction // @todo unroll loops - for 
(int stride = block_size / 2; stride > 0; stride /= 2) { + for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { __syncthreads(); if (tid < stride) { shared_dscale[tid] += shared_dscale[tid + stride]; @@ -291,24 +234,24 @@ __global__ void backprop1_kernel( // Output channel sum to global memory if (tid == 0) { - atomic_add(&global_dscale[bidy], shared_dscale[0]); - atomic_add(&global_dbias[bidy], shared_dbias[0]); - atomic_add(&global_dmean[bidy], shared_dmean[0]); - atomic_add(&global_dvar[bidy], shared_dvar[0]); + cuda::atomic_add(&global_dscale[bidy], shared_dscale[0]); + cuda::atomic_add(&global_dbias[bidy], shared_dbias[0]); + cuda::atomic_add(&global_dmean[bidy], shared_dmean[0]); + cuda::atomic_add(&global_dvar[bidy], shared_dvar[0]); } } /** CUDA kernel to compute gradients w.r.t. input. */ -template +template __global__ void backprop2_kernel( - int channel_height, - int local_width, - int global_width, + El::Int channel_height, + El::Int local_width, + El::Int num_per_sum, const DataType * __restrict__ global_input, - int input_ldim, + El::Int input_ldim, const DataType * __restrict__ global_gradient_wrt_output, - int gradient_wrt_output_ldim, + El::Int gradient_wrt_output_ldim, const DataType * __restrict__ global_mean, const DataType * __restrict__ global_var, DataType epsilon, @@ -316,11 +259,11 @@ __global__ void backprop2_kernel( const DataType * __restrict__ global_dmean, const DataType * __restrict__ global_dvar, DataType * __restrict__ global_gradient_wrt_input, - int gradient_wrt_input_ldim) { + El::Int gradient_wrt_input_ldim) { // Indices - const int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const int bidy = blockIdx.y; + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int bidy = blockIdx.y; // Copy batch normalization parameters to private memory const auto& mean = global_mean[bidy]; @@ -330,291 +273,226 @@ __global__ void backprop2_kernel( const auto& dvar = global_dvar[bidy]; // Compute useful constants - const auto& inv_stdev = rsqrt_(var + epsilon); - const auto& dmean_term = dmean / (global_width * channel_height); - const auto& dvar_term = dvar * 2 / (global_width * channel_height - 1); + const auto& inv_stdev = cuda::rsqrt(var + epsilon); + const auto& dmean_term = dmean / num_per_sum; + const auto& dvar_term = dvar * 2 / (num_per_sum - 1); // Apply batch normalization if (gidx < channel_height) { - const int row = gidx + bidy * channel_height; - for (int col = 0; col < local_width; ++col) { + const auto& row = gidx + bidy * channel_height; + for (El::Int col = 0; col < local_width; ++col) { const auto& x = global_input[row + col * input_ldim]; const auto& dy = global_gradient_wrt_output[row + col * gradient_wrt_output_ldim]; const auto& dxhat = dy * scale; - auto dx = dxhat * inv_stdev; - dx += dmean_term; - dx += dvar_term * (x - mean); - global_gradient_wrt_input[row + col * gradient_wrt_input_ldim] = dx; + auto& dx = global_gradient_wrt_input[row + col * gradient_wrt_input_ldim]; + dx = dxhat * inv_stdev + dmean_term + dvar_term * (x - mean); } } } } // namespace + +template <> +void batch_normalization_layer::fp_compute() { + constexpr DataType one = 1; + const bool is_training = this->m_model->get_execution_mode() == execution_mode::training; + + // CUDA objects + CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + auto&& stream = El::GPUManager::Stream(); + + // Matrices + const auto& input = get_prev_activations(); + const auto& local_input = input.LockedMatrix(); + auto& local_output = get_local_activations(); 
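
The code that follows first fills m_mean and m_var with per-channel sums and sums of squares (channel_sums_kernel), then compute_statistics_kernel converts them in place into the mean and unbiased variance and folds them into the running statistics. The same per-channel arithmetic written out on the host, as an illustrative sketch (the names here are hypothetical, not the LBANN API):

#include <algorithm>

// Host-side sketch of the per-channel statistics update performed on the GPU
// by compute_statistics_kernel; the sum/sqsum buffers are converted in place.
void update_statistics(double& sum_to_mean, double& sqsum_to_var,
                       double& running_mean, double& running_var,
                       long n, double epsilon, double decay) {
  const double mean = sum_to_mean / n;
  const double sqmean = sqsum_to_var / n;
  double var = n * (sqmean - mean * mean) / (n - 1);  // unbiased estimate
  var = std::max(var, epsilon);                       // numerical floor
  sum_to_mean = mean;
  sqsum_to_var = var;
  // Running statistics decay exponentially toward the minibatch values
  running_mean = decay * running_mean + (1 - decay) * mean;
  running_var = decay * running_var + (1 - decay) * var;
}
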
+ + // Matrix parameters + const auto& width = input.Width(); + const auto& local_width = local_input.Width(); + const auto& output_dims = get_output_dims(); + const auto& num_channels = output_dims[0]; + const auto& channel_size = get_output_size() / num_channels; + + // Compute statistics + if (is_training) { + + // Local matrices + auto& local_mean = m_mean->Matrix(); + auto& local_var = m_var->Matrix(); + auto& local_running_mean = this->m_weights[2]->get_values().Matrix(); + auto& local_running_var = this->m_weights[3]->get_values().Matrix(); + + // Compute sums and sums of squares + El::Zero(local_mean); + El::Zero(local_var); + if (!local_input.IsEmpty()) { + const El::Int block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.x = (channel_size + block_size - 1) / block_size; + grid_dims.y = num_channels; + channel_sums_kernel + <<>>( + channel_size, local_width, + local_input.LockedBuffer(), local_input.LDim(), + local_mean.Buffer(), local_var.Buffer()); + } + El::Int num_per_sum; + if (m_use_global_stats) { + m_comm->allreduce(*m_mean, m_mean->RedundantComm(), El::mpi::SUM); + m_comm->allreduce(*m_var, m_var->RedundantComm(), El::mpi::SUM); + num_per_sum = channel_size * width; + } else { + num_per_sum = channel_size * local_width; + } -namespace batch_normalization_cuda { - -void channel_sums(int num_channels, - const AbsMat& data, - AbsMat& sums, - AbsMat& sqsums) { - -#ifdef LBANN_DEBUG - // Check that inputs are valid - if (num_channels < 1) { LBANN_ERROR("non-positive number of channels"); } - if (data.Height() % num_channels != 0) { - LBANN_ERROR("number of channels does not divide input matrix height"); - } - if (data.GetDevice() != El::Device::GPU - || sums.GetDevice() != El::Device::GPU - || sqsums.GetDevice() != El::Device::GPU) { - LBANN_ERROR("matrices do not reside on GPU"); - } -#endif // LBANN_DEBUG - - // Compute channel sums and squares of sums - El::Zeros(sums, num_channels, 1); - El::Zeros(sqsums, num_channels, 1); - if (data.Height() > 0 && data.Width() > 0) { - const int channel_height = data.Height() / num_channels; - const int block_size = 256; - dim3 block_dims, grid_dims; - block_dims.x = block_size; - grid_dims.x = (channel_height + block_size - 1) / block_size; - grid_dims.y = num_channels; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); - channel_sums_kernel - <<>>( - channel_height, data.Width(), - data.LockedBuffer(), data.LDim(), - sums.Buffer(), sqsums.Buffer()); - } -} - -void compute_statistics(int num_per_sum, - DataType epsilon, - DataType decay, - AbsMat& mean, - AbsMat& var, - AbsMat& running_mean, - AbsMat& running_var) { - -#ifdef LBANN_DEBUG - // Check that inputs are valid - if (mean.Height() != var.Height() - || mean.Height() != running_mean.Height() - || mean.Height() != running_var.Height() - || mean.Width() != 1 || var.Width() != 1 - || running_mean.Width() != 1 || running_var.Width() != 1) { - LBANN_ERROR("invalid matrix dimensions"); - } - if (mean.GetDevice() != El::Device::GPU - || var.GetDevice() != El::Device::GPU - || running_mean.GetDevice() != El::Device::GPU - || running_var.GetDevice() != El::Device::GPU) { - LBANN_ERROR("matrices do not reside on GPU"); - } -#endif // LBANN_DEBUG - - // Compute statistics from sums - const int block_dim = 256; - const int grid_dim = (mean.Height() + block_dim - 1) / block_dim; - if (num_per_sum > 1) { - if (grid_dim > 0) { - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + // Compute minibatch statistics + if (num_per_sum <= 1) { + 
El::Fill(local_var, one); + } else if (num_channels > 0) { + const El::Int block_dim = 256; + const El::Int grid_dim = (num_channels + block_dim - 1) / block_dim; compute_statistics_kernel - <<>>( - mean.Height(), num_per_sum, epsilon, decay, - mean.Buffer(), var.Buffer(), - running_mean.Buffer(), running_var.Buffer()); + <<>>( + num_channels, num_per_sum, m_epsilon, m_decay, + local_mean.Buffer(), local_var.Buffer(), + local_running_mean.Buffer(), local_running_var.Buffer()); } - } else { - El::Fill(var, DataType(1)); - } - -} -void batch_normalization(const AbsMat& input, - const AbsMat& mean, - const AbsMat& var, - DataType epsilon, - const AbsMat& scale, - const AbsMat& bias, - AbsMat& output) { - const int num_channels = mean.Height(); - -#ifdef LBANN_DEBUG - // Check that inputs are valid - if (num_channels < 1) { LBANN_ERROR("non-positive number of channels"); } - if (input.Height() % num_channels != 0) { - LBANN_ERROR("number of channels does not divide input matrix height"); } - if (mean.Height() != num_channels || var.Height() != num_channels - || scale.Height() != num_channels || bias.Height() != num_channels - || mean.Width() != 1 || var.Width() != 1 - || scale.Width() != 1 || bias.Width() != 1 - || input.Height() != output.Height() - || input.Width() != output.Width()) { - LBANN_ERROR("invalid matrix dimensions"); - } - if (input.GetDevice() != El::Device::GPU - || mean.GetDevice() != El::Device::GPU - || var.GetDevice() != El::Device::GPU - || scale.GetDevice() != El::Device::GPU - || bias.GetDevice() != El::Device::GPU - || output.GetDevice() != El::Device::GPU) { - LBANN_ERROR("matrices do not reside on GPU"); - } -#endif // LBANN_DEBUG // Apply batch normalization - if (input.Height() > 0 && input.Width() > 0) { - const int channel_height = input.Height() / num_channels; - const int block_size = 256; + const auto& local_scale = this->m_weights[0]->get_values().LockedMatrix(); + const auto& local_bias = this->m_weights[1]->get_values().LockedMatrix(); + const auto& local_mean = (is_training ? + m_mean->LockedMatrix() : + this->m_weights[2]->get_values().LockedMatrix()); + const auto& local_var = (is_training ? 
+ m_var->LockedMatrix() : + this->m_weights[3]->get_values().LockedMatrix()); + if (!local_input.IsEmpty()) { + const El::Int block_size = 256; dim3 block_dims, grid_dims; block_dims.x = block_size; - grid_dims.x = (channel_height + block_size - 1) / block_size; + grid_dims.x = (channel_size + block_size - 1) / block_size; grid_dims.y = num_channels; - CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); batch_normalization_kernel - <<>>( - channel_height, input.Width(), - input.LockedBuffer(), input.LDim(), - mean.LockedBuffer(), var.LockedBuffer(), epsilon, - scale.LockedBuffer(), bias.LockedBuffer(), - output.Buffer(), output.LDim()); + <<>>( + channel_size, local_width, + local_input.LockedBuffer(), local_input.LDim(), + local_mean.LockedBuffer(), local_var.LockedBuffer(), m_epsilon, + local_scale.LockedBuffer(), local_bias.LockedBuffer(), + local_output.Buffer(), local_output.LDim()); } - + } -void backprop1(const AbsMat& input, - const AbsMat& gradient_wrt_output, - const AbsMat& mean, - const AbsMat& var, - DataType epsilon, - const AbsMat& scale, - AbsMat& dscale, - AbsMat& dbias, - AbsMat& dmean, - AbsMat& dvar) { - const int num_channels = mean.Height(); - -#ifdef LBANN_DEBUG - // Check that inputs are valid - if (num_channels < 1) { LBANN_ERROR("non-positive number of channels"); } - if (input.Height() % num_channels != 0) { - LBANN_ERROR("number of channels does not divide input matrix height"); - } - if (mean.Height() != num_channels || var.Height() != num_channels - || scale.Height() != num_channels - || mean.Width() != 1 || var.Width() != 1 || scale.Width() != 1 - || input.Height() != gradient_wrt_output.Height() - || input.Width() != gradient_wrt_output.Width()) { - LBANN_ERROR("invalid matrix dimensions"); - } - if (input.GetDevice() != El::Device::GPU - || gradient_wrt_output.GetDevice() != El::Device::GPU - || mean.GetDevice() != El::Device::GPU - || var.GetDevice() != El::Device::GPU - || scale.GetDevice() != El::Device::GPU - || dscale.GetDevice() != El::Device::GPU - || dbias.GetDevice() != El::Device::GPU - || dmean.GetDevice() != El::Device::GPU - || dvar.GetDevice() != El::Device::GPU) { - LBANN_ERROR("matrices do not reside on GPU"); - } -#endif // LBANN_DEBUG - +template <> +void batch_normalization_layer::bp_compute() { + constexpr DataType one = 1; + const bool is_training = this->m_model->get_execution_mode() == execution_mode::training; + + // CUDA objects + CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); + auto&& stream = El::GPUManager::Stream(); + + // Matrices + const auto& local_scale = this->m_weights[0]->get_values().LockedMatrix(); + const auto& local_mean = (is_training ? + m_mean->LockedMatrix() : + this->m_weights[2]->get_values().LockedMatrix()); + const auto& local_var = (is_training ? 
+  const auto& local_scale = this->m_weights[0]->get_values().LockedMatrix();
+  const auto& local_mean = (is_training ?
+                              m_mean->LockedMatrix() :
+                              this->m_weights[2]->get_values().LockedMatrix());
+  const auto& local_var = (is_training ?
+                             m_var->LockedMatrix() :
+                             this->m_weights[3]->get_values().LockedMatrix());
+  const auto& input = get_prev_activations();
+  const auto& local_input = input.LockedMatrix();
+  const auto& local_gradient_wrt_output = get_local_prev_error_signals();
+  auto& local_gradient_wrt_input = get_local_error_signals();
+  auto& local_mean_gradient = m_mean_gradient->Matrix();
+  auto& local_var_gradient = m_var_gradient->Matrix();
+  auto& local_scale_gradient = m_scale_gradient->Matrix();
+  auto& local_bias_gradient = m_bias_gradient->Matrix();
+
+  // Matrix parameters
+  const El::Int effective_mini_batch_size = this->m_model->get_effective_mini_batch_size();
+  const auto& width = input.Width();
+  const auto& local_width = local_input.Width();
+  const auto& output_dims = get_output_dims();
+  const auto& num_channels = output_dims[0];
+  const auto& channel_size = get_output_size() / num_channels;
+
+  // Compute local gradients
   // Compute gradients w.r.t. batch norm parameters
-  El::Zeros(dscale, num_channels, 1);
-  El::Zeros(dbias, num_channels, 1);
-  El::Zeros(dmean, num_channels, 1);
-  El::Zeros(dvar, num_channels, 1);
-  if (input.Height() > 0 && input.Width() > 0) {
-    const int channel_height = input.Height() / num_channels;
-    const int block_size = 256;
+  El::Zero(local_scale_gradient);
+  El::Zero(local_bias_gradient);
+  El::Zero(local_mean_gradient);
+  El::Zero(local_var_gradient);
+  if (!local_input.IsEmpty()) {
+    const El::Int block_size = 256;
     dim3 block_dims, grid_dims;
     block_dims.x = block_size;
-    grid_dims.x = (channel_height + block_size - 1) / block_size;
+    grid_dims.x = (channel_size + block_size - 1) / block_size;
     grid_dims.y = num_channels;
-    CHECK_CUDA(cudaSetDevice(El::GPUManager::Device()));
     backprop1_kernel
-      <<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>(
-        channel_height, input.Width(),
-        input.LockedBuffer(), input.LDim(),
-        gradient_wrt_output.LockedBuffer(), gradient_wrt_output.LDim(),
-        mean.LockedBuffer(), var.LockedBuffer(), epsilon,
-        scale.LockedBuffer(), dscale.Buffer(), dbias.Buffer(),
-        dmean.Buffer(), dvar.Buffer());
+      <<<grid_dims, block_dims, 0, stream>>>(
+        channel_size, local_width,
+        local_input.LockedBuffer(), local_input.LDim(),
+        local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(),
+        local_mean.LockedBuffer(), local_var.LockedBuffer(), m_epsilon,
+        local_scale.LockedBuffer(),
+        local_scale_gradient.Buffer(), local_bias_gradient.Buffer(),
+        local_mean_gradient.Buffer(), local_var_gradient.Buffer());
   }
-}
-
-void backprop2(int global_width,
-               const AbsMat& input,
-               const AbsMat& gradient_wrt_output,
-               const AbsMat& mean,
-               const AbsMat& var,
-               DataType epsilon,
-               const AbsMat& scale,
-               const AbsMat& dmean,
-               const AbsMat& dvar,
-               AbsMat& gradient_wrt_input) {
-  const int num_channels = mean.Height();
-
-#ifdef LBANN_DEBUG
-  // Check that inputs are valid
-  if (num_channels < 1) { LBANN_ERROR("non-positive number of channels"); }
-  if (input.Height() % num_channels != 0) {
-    LBANN_ERROR("number of channels does not divide input matrix height");
+  // Accumulate gradients
+  if (is_training) {
+    if (m_use_global_stats) {
+      m_comm->allreduce(*m_mean_gradient,
+                        m_mean_gradient->RedundantComm(),
+                        El::mpi::SUM);
+      m_comm->allreduce(*m_var_gradient,
+                        m_var_gradient->RedundantComm(),
+                        El::mpi::SUM);
+    }
+  } else {
+    El::Zero(*m_mean_gradient);
+    El::Zero(*m_var_gradient);
   }
-  if (mean.Height() != num_channels || var.Height() != num_channels
-      || scale.Height() != num_channels
-      || dmean.Height() != num_channels || dvar.Height() != num_channels
-      || mean.Width() != 1 || var.Width() != 1 || scale.Width() != 1
-      || dmean.Width() != 1 || dvar.Width() != 1
-      || input.Height() != gradient_wrt_output.Height()
-      || input.Height() != gradient_wrt_input.Height()
-      || input.Width() != gradient_wrt_output.Width()
-      || input.Width() != gradient_wrt_input.Width()) {
-    LBANN_ERROR("invalid matrix dimensions");
+  optimizer* scale_optimizer = m_weights[0]->get_optimizer();
+  if (scale_optimizer != nullptr) {
+    scale_optimizer->add_to_gradient_staging(*m_scale_gradient,
+                                             one / effective_mini_batch_size);
   }
-  if (input.GetDevice() != El::Device::GPU
-      || gradient_wrt_output.GetDevice() != El::Device::GPU
-      || mean.GetDevice() != El::Device::GPU
-      || var.GetDevice() != El::Device::GPU
-      || scale.GetDevice() != El::Device::GPU
-      || dmean.GetDevice() != El::Device::GPU
-      || dvar.GetDevice() != El::Device::GPU
-      || gradient_wrt_input.GetDevice() != El::Device::GPU) {
-    LBANN_ERROR("matrices do not reside on GPU");
+  optimizer* bias_optimizer = m_weights[1]->get_optimizer();
+  if (bias_optimizer != nullptr) {
+    bias_optimizer->add_to_gradient_staging(*m_bias_gradient,
+                                            one / effective_mini_batch_size);
   }
-#endif // LBANN_DEBUG
-
-  // Compute gradient w.r.t. input
-  const int channel_height = input.Height() / num_channels;
-  if (channel_height * global_width <= 1) {
-    // El::Zero(gradient_wrt_input);
-  } else {
-    if (input.Height() > 0 && input.Width() > 0) {
-      const int block_size = 256;
-      dim3 block_dims, grid_dims;
-      block_dims.x = block_size;
-      grid_dims.x = (channel_height + block_size - 1) / block_size;
-      grid_dims.y = num_channels;
-      CHECK_CUDA(cudaSetDevice(El::GPUManager::Device()));
-      backprop2_kernel
-        <<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>(
-          channel_height, input.Width(), global_width,
-          input.LockedBuffer(), input.LDim(),
-          gradient_wrt_output.LockedBuffer(), gradient_wrt_output.LDim(),
-          mean.LockedBuffer(), var.LockedBuffer(), epsilon,
-          scale.LockedBuffer(), dmean.LockedBuffer(), dvar.LockedBuffer(),
-          gradient_wrt_input.Buffer(), gradient_wrt_input.LDim());
-    }
+  // Compute error signal
+  const auto& num_per_sum = (m_use_global_stats ?
+                               width * channel_size :
+                               local_width * channel_size);
+  if (num_per_sum <= 1) {
+    El::Zero(local_gradient_wrt_input);
+  } else if (!local_input.IsEmpty()) {
+    const El::Int block_size = 256;
+    dim3 block_dims, grid_dims;
+    block_dims.x = block_size;
+    grid_dims.x = (channel_size + block_size - 1) / block_size;
+    grid_dims.y = num_channels;
+    backprop2_kernel
+      <<<grid_dims, block_dims, 0, stream>>>(
+        channel_size, local_width, num_per_sum,
+        local_input.LockedBuffer(), local_input.LDim(),
+        local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(),
+        local_mean.LockedBuffer(), local_var.LockedBuffer(), m_epsilon,
+        local_scale.LockedBuffer(),
+        local_mean_gradient.LockedBuffer(), local_var_gradient.LockedBuffer(),
+        local_gradient_wrt_input.Buffer(), local_gradient_wrt_input.LDim());
   }
-
+
 }
-
-} // namespace batch_normalization
+
 } // namespace lbann
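For reference, a minimal host-side sketch of the per-channel update that compute_statistics_kernel performs. The names and the sums-in/statistics-out convention are assumptions inferred from the kernel's argument list, and the epsilon argument is omitted; this is a reference sketch, not LBANN code:

    #include <cstddef>

    // Assumed convention: on entry 'mean' and 'var' hold sum(x) and sum(x^2)
    // for one channel; on exit they hold the mini-batch mean and unbiased
    // variance, and the running statistics decay toward them.
    template <typename T>
    void update_channel_statistics(std::size_t num_per_sum, T decay,
                                   T& mean, T& var,
                                   T& running_mean, T& running_var) {
      const T sample_mean = mean / num_per_sum;
      const T sample_sqmean = var / num_per_sum;
      T sample_var = sample_sqmean - sample_mean * sample_mean;
      // Rescale to the unbiased estimate (valid for num_per_sum > 1).
      sample_var *= static_cast<T>(num_per_sum) / static_cast<T>(num_per_sum - 1);
      mean = sample_mean;
      var = sample_var;
      running_mean = decay * running_mean + (T(1) - decay) * sample_mean;
      running_var = decay * running_var + (T(1) - decay) * sample_var;
    }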
diff --git a/src/layers/transform/evaluation.cpp b/src/layers/transform/evaluation.cpp
index 7f73b0da4ca..10c9b7383fa 100644
--- a/src/layers/transform/evaluation.cpp
+++ b/src/layers/transform/evaluation.cpp
@@ -43,7 +43,7 @@ void fp_cpu(lbann_comm& comm,
   const auto& local_height = local_input.Height();
   const auto& local_width = local_input.Width();
   const auto& mini_batch_size = input.Width();
-  value = DataType(0);
+  value = 0;
 #pragma omp parallel for reduction(+:value) collapse(2)
   for (El::Int col = 0; col < local_width; ++col) {
     for (El::Int row = 0; row < local_height; ++row) {
@@ -59,8 +59,10 @@ void fp_cpu(lbann_comm& comm,
 void fp_gpu(lbann_comm& comm,
             const AbsDistMat& input,
             DataType& value,
-            Al::request& req) {
-
+            cuda::event_wrapper& copy_event) {
+  constexpr DataType zero = 0;
+  constexpr DataType one = 1;
+
   // Local matrix
   const auto& local_input = input.LockedMatrix();
   const auto& local_height = local_input.Height();
@@ -70,57 +72,63 @@ void fp_gpu(lbann_comm& comm,
   // GPU objects
   GPUMat sum_d, ones_d;
 #ifdef HYDROGEN_HAVE_CUB
-  sum_d.SetMemoryMode(1);  // Use CUB GPU memory pool if possible
-  ones_d.SetMemoryMode(1); // Use CUB GPU memory pool if possible
+  sum_d.SetMemoryMode(1);  // Use CUB GPU memory pool
+  ones_d.SetMemoryMode(1); // Use CUB GPU memory pool
 #endif // HYDROGEN_HAVE_CUB
+  sum_d.Resize(1, 1);
   auto&& handle = El::GPUManager::cuBLASHandle();
+  auto&& stream = El::GPUManager::Stream();
   CHECK_CUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE));

   // Compute sum of local input matrix entries
-  if (local_height < 1 || local_width < 1) {
-    El::Zeros(sum_d, 1, 1);
-  } else if (local_height == local_input.LDim() || local_width == 1) {
-    sum_d.Resize(1, 1);
+  if (local_input.IsEmpty()) {
+    El::Zero(sum_d);
+  } else if (local_input.Contiguous()) {
     ones_d.Resize(local_height * local_width, 1);
-    El::Fill(ones_d, DataType(1));
+    El::Fill(ones_d, one);
     cublas::dot(handle,
                 local_height * local_width,
                 local_input.LockedBuffer(), 1,
                 ones_d.LockedBuffer(), 1,
                 sum_d.Buffer());
   } else if (local_height == 1) {
-    sum_d.Resize(1, 1);
     ones_d.Resize(local_width, 1);
-    El::Fill(ones_d, DataType(1));
+    El::Fill(ones_d, one);
     cublas::dot(handle,
                 local_width,
                 local_input.LockedBuffer(), local_input.LDim(),
                 ones_d.LockedBuffer(), 1,
                 sum_d.Buffer());
   } else {
-    sum_d.Resize(local_width + 1, 1);
-    ones_d.Resize(std::max(local_height, local_width), 1);
-    El::Fill(ones_d, DataType(1));
-    for (El::Int col = 0; col < local_width; ++col) {
-      cublas::dot(handle,
-                  local_height,
-                  local_input.LockedBuffer(0, col), 1,
-                  ones_d.LockedBuffer(), 1,
-                  sum_d.Buffer(col+1, 0));
+    GPUMat col_sums_d;
+#ifdef HYDROGEN_HAVE_CUB
+    col_sums_d.SetMemoryMode(1); // Use CUB GPU memory pool
+#endif // HYDROGEN_HAVE_CUB
+    col_sums_d.Resize(local_width, 1);
+    ones_d.Resize(local_height, 1);
+    El::Fill(ones_d, one);
+    El::Gemv(El::TRANSPOSE, one, local_input, ones_d, zero, col_sums_d);
+    if (local_width > local_height) {
+      ones_d.Resize(local_width, 1);
+      El::Fill(ones_d, one);
     }
     cublas::dot(handle,
                 local_width,
-                sum_d.LockedBuffer(1, 0), 1,
+                col_sums_d.LockedBuffer(), 1,
                 ones_d.LockedBuffer(), 1,
-                sum_d.Buffer(0, 0));
+                sum_d.Buffer());
   }
   CHECK_CUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));

   // Compute average value across mini-batch
-  CHECK_CUDA(cudaMemcpy(&value, sum_d.LockedBuffer(), sizeof(DataType),
-                        cudaMemcpyDeviceToHost));
-  value = value / mini_batch_size;
-  comm.nb_allreduce(&value, 1, input.DistComm(), req);
+  El::Scale(one / mini_batch_size, sum_d);
+  comm.allreduce(static_cast<AbsMat&>(sum_d), input.DistComm());
+  CHECK_CUDA(cudaMemcpyAsync(&value,
+                             sum_d.LockedBuffer(),
+                             sizeof(DataType),
+                             cudaMemcpyDeviceToHost,
+                             stream));
+  copy_event.record(stream);
 }
 #endif // LBANN_HAS_GPU

@@ -128,27 +136,40 @@ void fp_gpu(lbann_comm& comm,
 } // namespace

 EvalType abstract_evaluation_layer::get_value(bool scaled) {
-  get_comm()->wait(m_allreduce_req);
-  if (scaled) { return m_scale * m_value; }
-  else { return m_value; }
+  switch (get_device_allocation()) {
+  case El::Device::CPU: get_comm()->wait(m_allreduce_req); break;
+#ifdef LBANN_HAS_GPU
+  case El::Device::GPU: m_copy_event.synchronize(); break;
+#endif // LBANN_HAS_GPU
+  default: LBANN_ERROR("invalid device");
+  }
+  if (scaled) { return m_scale * m_value(0, 0); }
+  else { return m_value(0, 0); }
 }

 abstract_evaluation_layer::abstract_evaluation_layer(lbann_comm *comm)
-  : transform_layer(comm), m_scale(0), m_value(0) {
-
-  // Evaluation layer has no children
+  : transform_layer(comm) {
   m_expected_num_child_layers = 0;
-
 }
-
+
+void abstract_evaluation_layer::setup_data() {
+  transform_layer::setup_data();
+#ifdef LBANN_HAS_GPU
+  m_value.SetMemoryMode(1); // Use pinned memory on host
+#endif // LBANN_HAS_GPU
+  El::Zeros(m_value, 1, 1);
+}
+
 void abstract_evaluation_layer::fp_compute() {
   switch (get_device_allocation()) {
   case El::Device::CPU:
-    fp_cpu(*get_comm(), get_prev_activations(), m_value, m_allreduce_req);
+    fp_cpu(*get_comm(), get_prev_activations(), m_value(0, 0),
+           m_allreduce_req);
     break;
 #ifdef LBANN_HAS_GPU
   case El::Device::GPU:
-    fp_gpu(*get_comm(), get_prev_activations(), m_value, m_allreduce_req);
+    fp_gpu(*get_comm(), get_prev_activations(), m_value(0, 0),
+           m_copy_event);
     break;
 #endif // LBANN_HAS_GPU
   default: LBANN_ERROR("invalid device");
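The non-contiguous branch in fp_gpu above exploits the identity sum(A) = ones^T * A * ones: one Gemv with a ones vector produces the per-column sums, and a single cublas::dot reduces them, replacing the old per-column dot loop. A plain C++ sketch of the same identity:

    #include <vector>

    // Sum over all entries == dot(ones, A^T * ones): first accumulate the
    // column sums (the Gemv step), then reduce them (the cublas::dot step).
    double matrix_sum(const std::vector<double>& A,
                      int height, int width, int ldim) {
      std::vector<double> col_sums(width, 0.0);
      for (int col = 0; col < width; ++col) {
        for (int row = 0; row < height; ++row) {
          col_sums[col] += A[row + col * ldim];
        }
      }
      double sum = 0.0;
      for (double s : col_sums) { sum += s; }
      return sum;
    }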
diff --git a/src/layers/transform/in_top_k.cpp b/src/layers/transform/in_top_k.cpp
index 09c4d3af67a..8b4f6e1796a 100644
--- a/src/layers/transform/in_top_k.cpp
+++ b/src/layers/transform/in_top_k.cpp
@@ -83,7 +83,7 @@ void fp_cpu(lbann_comm& comm,
   // Column communicator
   auto&& col_comm = input.ColComm();
   const auto& col_comm_size = El::mpi::Size(col_comm);
-  
+
   // Find top-k entries in each column of local input matrix
   std::vector<entry> top_entries(local_width * k);
 #pragma omp parallel for
@@ -99,7 +99,7 @@ void fp_cpu(lbann_comm& comm,
                       &top_entries[col*k] + k,
                       entry::compare);
   }
-  
+
   // Find top-k entries in each column of global input matrix
   if (col_comm_size > 1) {
     std::vector<entry> global_top_entries(col_comm_size * local_width * k);
@@ -135,7 +135,7 @@ void fp_cpu(lbann_comm& comm,
       }
     }
   }
-  
+
 }

 } // namespace
diff --git a/src/layers/transform/in_top_k.cu b/src/layers/transform/in_top_k.cu
index 1452b2482f9..fdc9aa54ad8 100644
--- a/src/layers/transform/in_top_k.cu
+++ b/src/layers/transform/in_top_k.cu
@@ -28,7 +28,6 @@
 #include "lbann/utils/cuda.hpp"
 #include "lbann/utils/exception.hpp"

-#include <limits>
 #include
 #include

@@ -38,24 +37,17 @@ namespace {

 /** Sparse vector entry. */
 struct entry {
-
   /** Vector entry value. */
   DataType value;
   /** Vector entry index. */
   El::Int index;
-
-  /** Minimum possible value. */
-  static constexpr DataType min_value = -std::numeric_limits<DataType>::infinity();
-  /** Maximum possible index. */
-  static constexpr El::Int max_index = std::numeric_limits<El::Int>::max();
-
 };

 /** Comparison operation to sort sparse vector entries.
  *  Entries are sorted by value in decreasing order, with ties broken
  *  in favor of entries with smaller indices.
  */
-struct entry_compare : thrust::binary_function<entry,entry,bool> {
+struct entry_compare : ::thrust::binary_function<entry,entry,bool> {
   __host__ __device__ bool operator()(const entry& a, const entry& b) const {
     return a.value > b.value || (a.value == b.value && a.index < b.index);
   }
@@ -89,7 +81,7 @@ __global__ void dense_matrix_to_sparse_vectors(El::Int local_vector_size,
       current_entry.value = local_matrix[local_row + local_col * local_matrix_ldim];
       current_entry.index = global_row;
     } else {
-      current_entry.value = entry::min_value;
+      current_entry.value = -cuda::infinity<DataType>();
       current_entry.index = global_matrix_height;
     }
   }
@@ -114,7 +106,7 @@ __global__ void fill_with_tensor_index(El::Int tensor_size,
   const El::Int num_threads = blockDim.x * gridDim.x;
   for (El::Int i = gid; i < tensor_size; i += num_threads) {
     tensor[i] = (i / dim_stride) % dim;
-  } 
+  }
 }

 /** Set selected matrix entries to one.
@@ -152,7 +144,7 @@ __global__ void indicate_matrix_entries(El::Int k,
     }
     local_matrix[local_row + local_col * local_matrix_ldim] = DataType(1);
-  } 
+  }
 }

 /** GPU implementation of in_top_k layer forward prop. */
@@ -166,9 +158,9 @@ void fp_gpu(lbann_comm& comm,
   // Local matrices
   const auto& local_input = input.LockedMatrix();
   auto& local_output = output.Matrix();
-  const El::Int height = input.Height();
-  const El::Int local_height = local_input.Height();
-  const El::Int local_width = local_input.Width();
+  const auto& height = input.Height();
+  const auto& local_height = local_input.Height();
+  const auto& local_width = local_input.Width();

   // Trivial cases
   if (k < 1) {
@@ -180,7 +172,7 @@ void fp_gpu(lbann_comm& comm,
   } else if (local_width < 1) {
     return;
   }
-  
+
   // Column communicator
   auto&& col_comm = input.ColComm();
   const auto& col_comm_rank = El::mpi::Rank(col_comm);
@@ -188,19 +180,18 @@ void fp_gpu(lbann_comm& comm,

   // GPU objects
   auto&& stream = El::GPUManager::Stream();
+  auto&& event = El::GPUManager::Event();
   cuda::thrust::allocator<> alloc(stream);
-  using entry_array = thrust::device_vector<entry, cuda::thrust::allocator<entry>>;
-  using index_array = thrust::device_vector<El::Int, cuda::thrust::allocator<El::Int>>;

   // Find top-k entries in each column of local prediction matrix
-  entry_array top_entries(local_width * k);
+  cuda::thrust::vector<entry> top_entries(local_width * k);
   {
     const auto& num_local_entries_per_col = std::max(local_height, k);
     const auto& num_local_entries = local_width * num_local_entries_per_col;
     const auto& block_dim = 256;
     const auto& grid_dim = (num_local_entries + block_dim - 1) / block_dim;
-    entry_array local_entries(num_local_entries);
-    index_array local_entries_cols(num_local_entries);
+    cuda::thrust::vector<entry> local_entries(num_local_entries);
+    cuda::thrust::vector<El::Int> local_entries_cols(num_local_entries);
     dense_matrix_to_sparse_vectors<<<grid_dim, block_dim, 0, stream>>>(
       num_local_entries_per_col, local_height, local_width, height,
       input.ColShift(), input.ColStride(),
@@ -209,15 +200,15 @@ void fp_gpu(lbann_comm& comm,
     fill_with_tensor_index<<<grid_dim, block_dim, 0, stream>>>(
       num_local_entries, local_width, num_local_entries_per_col,
       local_entries_cols.data().get());
-    thrust::sort_by_key(thrust::cuda::par(alloc).on(stream),
-                        local_entries.begin(),
-                        local_entries.end(),
-                        local_entries_cols.begin(),
-                        entry_compare());
-    thrust::stable_sort_by_key(thrust::cuda::par(alloc).on(stream),
-                               local_entries_cols.begin(),
-                               local_entries_cols.end(),
-                               local_entries.begin());
+    ::thrust::sort_by_key(alloc.system(),
+                          local_entries.begin(),
+                          local_entries.end(),
+                          local_entries_cols.begin(),
+                          entry_compare());
+    ::thrust::stable_sort_by_key(alloc.system(),
+                                 local_entries_cols.begin(),
+                                 local_entries_cols.end(),
+                                 local_entries.begin());
     CHECK_CUDA(cudaMemcpy2DAsync(top_entries.data().get(),
                                  k * sizeof(entry),
                                  local_entries.data().get(),
@@ -234,24 +225,24 @@ void fp_gpu(lbann_comm& comm,
     const auto& num_entries = col_comm_size * num_entries_per_rank;
     const auto& block_dim = 256;
     const auto& grid_dim = (num_entries + block_dim - 1) / block_dim;
-    entry_array global_top_entries(num_entries);
-    index_array global_top_entries_cols(num_entries);
+    cuda::thrust::vector<entry> global_top_entries(num_entries);
+    cuda::thrust::vector<El::Int> global_top_entries_cols(num_entries);
     comm.all_gather(reinterpret_cast<El::byte*>(top_entries.data().get()),
                     top_entries.size() * sizeof(entry),
                     reinterpret_cast<El::byte*>(global_top_entries.data().get()),
                     top_entries.size() * sizeof(entry),
-                    col_comm);
+                    col_comm, El::SyncInfo<El::Device::GPU>{stream, event});
     fill_with_tensor_index<<<grid_dim, block_dim, 0, stream>>>(
       num_entries, local_width, k, global_top_entries_cols.data().get());
-    thrust::sort_by_key(thrust::cuda::par(alloc).on(stream),
-                        global_top_entries.begin(),
-                        global_top_entries.end(),
-                        global_top_entries_cols.begin(),
-                        entry_compare());
-    thrust::stable_sort_by_key(thrust::cuda::par(alloc).on(stream),
-                               global_top_entries_cols.begin(),
-                               global_top_entries_cols.end(),
-                               global_top_entries.begin());
+    ::thrust::sort_by_key(alloc.system(),
+                          global_top_entries.begin(),
+                          global_top_entries.end(),
+                          global_top_entries_cols.begin(),
+                          entry_compare());
+    ::thrust::stable_sort_by_key(alloc.system(),
+                                 global_top_entries_cols.begin(),
+                                 global_top_entries_cols.end(),
+                                 global_top_entries.begin());
     CHECK_CUDA(cudaMemcpy2DAsync(top_entries.data().get(),
                                  k * sizeof(entry),
                                  global_top_entries.data().get(),
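The sort_by_key/stable_sort_by_key pair above is the usual two-pass segmented sort: the first pass orders all entries by the comparator, and the stable second pass regroups them by column index while preserving that order, so each column's entries end up contiguous and internally sorted. A minimal sketch of the idiom on host vectors (ascending order for brevity; the patch uses a descending comparator and a device execution policy):

    #include <thrust/host_vector.h>
    #include <thrust/sort.h>

    // Sort 'values' within each segment given by 'segment_ids'.
    void segmented_sort(thrust::host_vector<float>& values,
                        thrust::host_vector<int>& segment_ids) {
      // Pass 1: global sort by value, carrying the segment ids along.
      thrust::sort_by_key(values.begin(), values.end(), segment_ids.begin());
      // Pass 2: stable regroup by segment id, preserving per-segment order.
      thrust::stable_sort_by_key(segment_ids.begin(), segment_ids.end(),
                                 values.begin());
    }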
diff --git a/src/metrics/r2.cpp b/src/metrics/r2.cpp
index c3b2c70724a..c5c6fff2faa 100644
--- a/src/metrics/r2.cpp
+++ b/src/metrics/r2.cpp
@@ -36,14 +36,14 @@ EvalType r2_metric::evaluate_compute(const AbsDistMat& prediction,
   const int local_height = prediction.LocalHeight();
   const int local_width = prediction.LocalWidth();
   const int width = prediction.Width();
-  
+
   // Get local matrices
   const Mat& prediction_local = prediction.LockedMatrix();
   const Mat& ground_truth_local = ground_truth.LockedMatrix();

   DataType gt_mean, gt_std;
   // Entry-wise mean of ground truth
-  //@todo fix stat class not to compute stdev if not needed 
+  //@todo fix stat class not to compute stdev if not needed
   entrywise_mean_and_stdev(ground_truth, gt_mean, gt_std);

   // Compute residual sum of squares ss_res
@@ -63,12 +63,13 @@ EvalType r2_metric::evaluate_compute(const AbsDistMat& prediction,
   }

   EvalType res_tot[2] = {ss_res, ss_tot}; // Pack to do one allreduce.
-  El::mpi::AllReduce(res_tot, 2, prediction.DistComm());
+  El::mpi::AllReduce(res_tot, 2, prediction.DistComm(),
+                     El::SyncInfo<El::Device::CPU>{});

   //Keras and TF add epsilon (1e-07) to denominator to avoid inf score
   //We might actually need to do this here and other places too
   EvalType ss_tot_eps = res_tot[1] + 0.0000001;
   //Multiply by width because base class divide by mini-batch size
-  return ((1-(res_tot[0]/ss_tot_eps))*width); 
+  return ((1-(res_tot[0]/ss_tot_eps))*width);
 }

 } // namespace lbann
diff --git a/src/metrics/top_k_categorical_accuracy.cpp b/src/metrics/top_k_categorical_accuracy.cpp
index 30e94af0330..4d6656e2818 100644
--- a/src/metrics/top_k_categorical_accuracy.cpp
+++ b/src/metrics/top_k_categorical_accuracy.cpp
@@ -29,7 +29,7 @@ namespace lbann {

 top_k_categorical_accuracy_metric::top_k_categorical_accuracy_metric(int top_k,
-                                                                     lbann_comm *comm) 
+                                                                     lbann_comm *comm)
   : metric(comm), m_top_k(top_k) {}

 EvalType top_k_categorical_accuracy_metric::evaluate_compute(const AbsDistMat& prediction,
@@ -74,7 +74,8 @@ EvalType top_k_categorical_accuracy_metric::evaluate_compute(const AbsDistMat& p
     std::vector global_top_k( m_top_k * local_width * col_comm_size);
     get_comm().gather((DataType*) local_top_k.data(), 2*local_top_k.size(),
-                      (DataType*) global_top_k.data(), col_comm);
+                      (DataType*) global_top_k.data(), col_comm,
+                      El::SyncInfo<El::Device::CPU>{});

     // Compute the global top k elements in each column.
     std::vector global_indices(m_top_k * col_comm_size);
     std::iota(global_indices.begin(), global_indices.end(), 0);
@@ -107,7 +108,7 @@ EvalType top_k_categorical_accuracy_metric::evaluate_compute(const AbsDistMat& p
     }
   } else {
     get_comm().gather((DataType*) local_top_k.data(), 2*local_top_k.size(), 0,
-                      col_comm);
+                      col_comm, El::SyncInfo<El::Device::CPU>{});
   }
   num_errors = get_comm().model_allreduce(num_errors);
   const int mini_batch_size = prediction.Width();
diff --git a/src/models/model.cpp b/src/models/model.cpp
index c6cd1834c8e..787c1a2f75b 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -67,7 +67,13 @@ model::model(lbann_comm *comm,
     m_effective_mini_batch_size(mini_batch_size),
     m_current_phase(0),
     m_comm(comm),
-    m_default_optimizer(default_optimizer) {}
+    m_default_optimizer(default_optimizer) {
+
+  static int num_models = 0;
+  m_name = "Model" + std::to_string(num_models);
+  num_models++;
+
+}

 model::model(const model& other)
   : m_execution_mode(other.m_execution_mode),
@@ -206,6 +212,10 @@ void model::add_metric(metric *m) {
   m_metrics.push_back(m);
 }

+void model::set_name(std::string name) {
+  m_name = name;
+}
+
 void model::set_layers(std::vector<Layer*>& layers) {

   // Delete old layers
diff --git a/src/models/sequential.cpp b/src/models/sequential.cpp
index 23413dc2f75..5f90bb6e7e4 100644
--- a/src/models/sequential.cpp
+++ b/src/models/sequential.cpp
@@ -70,7 +70,7 @@ void sequential_model::write_proto(lbann_data::Model* proto) {
   model::write_proto(proto);
   //Add layers
   if (m_comm->am_world_master()) {
-    proto->set_name(name());
+    proto->set_name(get_type());
     for(size_t l = 0; l < m_layers.size(); l++) {
       auto layer_proto = proto->add_layer();
       m_layers[l]->write_proto(layer_proto);
diff --git a/src/objective_functions/CMakeLists.txt b/src/objective_functions/CMakeLists.txt
index e58f1082b55..0387f887397 100644
--- a/src/objective_functions/CMakeLists.txt
+++ b/src/objective_functions/CMakeLists.txt
@@ -11,3 +11,4 @@ add_subdirectory(weight_regularization)

 # Propagate the files up the tree
 set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE)
+set(CUDA_SOURCES "${CUDA_SOURCES}" "${THIS_DIR_CU_SOURCES}" PARENT_SCOPE)
diff --git a/src/objective_functions/objective_function.cpp b/src/objective_functions/objective_function.cpp
index 91744350be8..3507de264b7 100644
--- a/src/objective_functions/objective_function.cpp
+++ b/src/objective_functions/objective_function.cpp
@@ -26,6 +26,7 @@

 #include "lbann/objective_functions/objective_function.hpp"
 #include "lbann/utils/timer.hpp"
+#include "lbann/utils/profiling.hpp"
 #include

 namespace lbann {

@@ -75,9 +76,13 @@ void objective_function::setup(model& m) {
 void objective_function::start_evaluation(execution_mode mode,
                                           int mini_batch_size) {
   const auto start_time = get_time();
+  prof_region_begin("obj-start-eval", prof_colors[0], false);
   for (const auto& term : m_terms) {
+    prof_region_begin(("obj-start-eval-" + term->name()).c_str(), prof_colors[1], false);
     term->start_evaluation();
+    prof_region_end(("obj-start-eval-" + term->name()).c_str(), false);
   }
+  prof_region_end("obj-start-eval", false);
   m_evaluation_time += get_time() - start_time;
 }

@@ -85,9 +90,13 @@ EvalType objective_function::finish_evaluation(execution_mode mode,
                                                int mini_batch_size) {
   const auto start_time = get_time();
   EvalType value = EvalType(0);
+  prof_region_begin("obj-finish-eval", prof_colors[0], false);
   for (const auto& term : m_terms) {
+    prof_region_begin(("obj-finish-eval-" + term->name()).c_str(), prof_colors[1], false);
     value += term->finish_evaluation();
+    prof_region_end(("obj-finish-eval-" + term->name()).c_str(), false);
   }
+  prof_region_end("obj-finish-eval", false);
   m_statistics[mode].add_value(mini_batch_size * value,
                                mini_batch_size);
   m_evaluation_time += get_time() - start_time;
@@ -96,17 +105,25 @@ EvalType objective_function::finish_evaluation(execution_mode mode,

 void objective_function::differentiate() {
   const auto start_time = get_time();
+  prof_region_begin("obj-differentiate", prof_colors[0], false);
   for (const auto& term : m_terms) {
+    prof_region_begin(("obj-differentiate-" + term->name()).c_str(), prof_colors[1], false);
     term->differentiate();
+    prof_region_end(("obj-differentiate-" + term->name()).c_str(), false);
   }
+  prof_region_end("obj-differentiate", false);
   m_differentiation_time += get_time() - start_time;
 }

 void objective_function::compute_weight_regularization() {
   const auto start_time = get_time();
+  prof_region_begin("obj-weight-regularization", prof_colors[0], false);
   for (const auto& term : m_terms) {
+    prof_region_begin(("obj-weight-regularization-" + term->name()).c_str(), prof_colors[1], false);
     term->compute_weight_regularization();
+    prof_region_end(("obj-weight-regularization-" + term->name()).c_str(), false);
   }
+  prof_region_end("obj-weight-regularization", false);
   m_differentiation_time += get_time() - start_time;
 }
diff --git a/src/objective_functions/weight_regularization/CMakeLists.txt b/src/objective_functions/weight_regularization/CMakeLists.txt
index 362d42730be..1cda4b2c22c 100644
--- a/src/objective_functions/weight_regularization/CMakeLists.txt
+++ b/src/objective_functions/weight_regularization/CMakeLists.txt
@@ -5,5 +5,13 @@ set_full_path(THIS_DIR_SOURCES
   l2.cpp
   )

+if (LBANN_HAS_CUDA)
+  # Add the CUDA source files for this directory
+  set_full_path(THIS_DIR_CU_SOURCES
+    l2.cu
+    )
+endif ()
+
 # Propagate the files up the tree
 set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE)
+set(CUDA_SOURCES "${CUDA_SOURCES}" "${THIS_DIR_CU_SOURCES}" PARENT_SCOPE)
diff --git a/src/objective_functions/weight_regularization/l2.cpp b/src/objective_functions/weight_regularization/l2.cpp
index f6df88ebb4a..e256ab71ca0 100644
--- a/src/objective_functions/weight_regularization/l2.cpp
+++ b/src/objective_functions/weight_regularization/l2.cpp
@@ -30,44 +30,36 @@
 #include "lbann/utils/cublas.hpp"
 #endif // LBANN_HAS_GPU

-namespace {
+namespace lbann {

-  /** Compute the entry-wise sum of squares of a local matrix. */
-  EvalType sum_of_squares(const Mat& mat) {
-    const El::Int height = mat.Height();
-    const El::Int width = mat.Width();
-    const El::Int ldim = mat.LDim();
-    const auto& __restrict__ buf = mat.LockedBuffer();
-    EvalType sqsum = EvalType(0);
-    if (ldim == height) {
-      // Parallelize single loop if data is contiguous
-      const El::Int size = height*width;
-      #pragma omp parallel for reduction(+:sqsum)
-      for (El::Int i = 0; i < size; ++i) {
-        const EvalType val = buf[i];
+template <>
+void l2_weight_regularization::accumulate_contribution<El::Device::CPU>(const CPUMat& vals,
+                                                                        CPUMat& contribution) {
+  auto& sqsum = contribution(0, 0);
+  if (vals.IsEmpty()) {
+  } else if (vals.Contiguous()) {
+    const size_t size = vals.Height() * vals.Width();
+    const auto& __restrict__ vals_buf = vals.LockedBuffer();
+#pragma omp parallel for reduction(+:sqsum)
+    for (size_t i = 0; i < size; ++i) {
+      const auto& val = vals_buf[i];
+      sqsum += val * val;
+    }
+  } else {
+    const El::Int height = vals.Height();
+    const El::Int width = vals.Width();
+#pragma omp parallel for reduction(+:sqsum) collapse(2)
+    for (El::Int col = 0; col < width; ++col) {
+      for (El::Int row = 0; row < height; ++row) {
+        const EvalType val = vals(row, col);
         sqsum += val * val;
       }
-    } else {
-      // Parallelize double loop if data is not contiguous
-      #pragma omp parallel for reduction(+:sqsum) collapse(2)
-      for (El::Int j = 0; j < width; ++j) {
-        for (El::Int i = 0; i < height; ++i) {
-          const EvalType val = buf[i + j*ldim];
-          sqsum += val * val;
-        }
-      }
     }
-    return sqsum;
   }
-
-} // namespace
-
-namespace lbann {
-
+}
+
 l2_weight_regularization::l2_weight_regularization(EvalType scale_factor)
-  : objective_function_term(scale_factor),
-    m_sqsum(0),
-    m_allreduce_started(false) {}
+  : objective_function_term(scale_factor) {}

 void l2_weight_regularization::setup(model& m) {
   objective_function_term::setup(m);
@@ -79,113 +71,95 @@ void l2_weight_regularization::setup(model& m) {

   // Add all weights in model if no weights pointers are provided
   if (m_weights.empty()) {
-    for (weights* w : m.get_weights()) {
+    for (auto* w : m.get_weights()) {
       if (w->get_optimizer() != nullptr) {
         m_weights.push_back(w);
       }
     }
   }

+  // Construct accumulation variables for each device
+  for (auto* w : m_weights) {
+    const auto& device = w->get_values().GetLocalDevice();
+    if (m_contributions.count(device) == 0) {
+#ifdef LBANN_HAS_GPU
+      m_contributions[device].SetMemoryMode(1); // Pinned memory
+#endif // LBANN_HAS_GPU
+      m_contributions[device].Resize(1, 1);
+    }
+  }
+
 }

 void l2_weight_regularization::start_evaluation() {
   if (m_scale_factor == EvalType(0)) { return; }
-  const int num_weights = m_weights.size();
-
-  // Each weights' local contribution to L2 regularization term
-  CPUMat sqsums;
-  El::Zeros(sqsums, num_weights, 1);
-
-#ifdef LBANN_HAS_GPU
+  const El::Int num_weights = m_weights.size();

-  // Check whether any weights are on GPU
-  bool using_gpus = false;
-  for (const auto& w : m_weights) {
-    if (w->get_values().GetLocalDevice() == El::Device::GPU) {
-      using_gpus = true;
-      break;
+  // Compute contributions from CPU weights
+  if (m_contributions.count(El::Device::CPU) > 0) {
+    auto& contribution = m_contributions[El::Device::CPU];
+    contribution(0, 0) = DataType(0);
+    for (El::Int i = 0; i < num_weights; ++i) {
+      const auto& vals = m_weights[i]->get_values();
+      if (vals.GetLocalDevice() == El::Device::CPU
+          && vals.Participating()
+          && vals.RedundantRank() == i % vals.RedundantSize()) {
+        accumulate_contribution<El::Device::CPU>(
+          static_cast<const CPUMat&>(vals.LockedMatrix()),
+          contribution);
+      }
     }
+    get_comm().nb_allreduce(static_cast<AbsMat&>(contribution),
+                            get_comm().get_model_comm(),
+                            m_allreduce_req);
   }

-  // Compute L2 regularization term for weights on GPU
-  // Note: cuBLAS is set to device pointer mode to pipeline GPU
-  // kernels. Local contributions are only computed on one process in
-  // each matrix's redundant communicator.
-  if (using_gpus) {
-    auto&& handle = El::GPUManager::cuBLASHandle();
-    CHECK_CUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE));
-
-    // Initialize workspace
-    GPUMat sqsums_d;
+#ifdef LBANN_HAS_GPU
+  // Compute contributions from GPU weights
+  if (m_contributions.count(El::Device::GPU) > 0) {
+    auto&& stream = El::GPUManager::Stream();
+    GPUMat contribution;
 #ifdef HYDROGEN_HAVE_CUB
-    sqsums_d.SetMemoryMode(1); // CUB memory pool
-#endif
-    El::Zeros(sqsums_d, num_weights, 1);
-
-    // Compute local contributions
-    for (int i = 0; i < num_weights; ++i) {
+    contribution.SetMemoryMode(1); // CUB GPU memory pool
+#endif // HYDROGEN_HAVE_CUB
+    El::Zeros(contribution, 1, 1);
+    for (El::Int i = 0; i < num_weights; ++i) {
       const auto& vals = m_weights[i]->get_values();
-      if (vals.Participating()
-          && vals.GetLocalDevice() == El::Device::GPU
-          && vals.RedundantRank() == i % vals.RedundantSize()
-          && vals.LocalWidth() > 0 && vals.LocalHeight() > 0) {
-        if (vals.LocalWidth() == 1 || vals.LDim() == vals.LocalHeight()) {
-          cublas::dot(handle,
-                      vals.LocalHeight() * vals.LocalWidth(),
-                      vals.LockedBuffer(), 1,
-                      vals.LockedBuffer(), 1,
-                      sqsums_d.Buffer(i, 0));
-        } else {
-          /// @todo Support non-contiguous data
-          std::stringstream err;
-          err << "weights \"" << m_weights[i]->get_name() << "\" "
-              << "has a non-contiguous weight matrix "
-              << "(local height = " << vals.LocalHeight() << ", "
-              << "local width = " << vals.LocalWidth() << ", "
-              << "leading dim = " << vals.LDim() << "), "
-              << "but L2 regularization currently only supports "
-              << "contiguous weight data";
-          LBANN_ERROR(err.str());
-        }
+      if (vals.GetLocalDevice() == El::Device::GPU
+          && vals.Participating()
+          && vals.RedundantRank() == i % vals.RedundantSize()) {
+        accumulate_contribution<El::Device::GPU>(
+          static_cast<const GPUMat&>(vals.LockedMatrix()),
+          contribution);
       }
     }
-
-    CHECK_CUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
-    El::Copy(sqsums_d, sqsums);
+    get_comm().allreduce(static_cast<AbsMat&>(contribution),
+                         get_comm().get_model_comm());
+    CHECK_CUDA(cudaMemcpyAsync(m_contributions[El::Device::GPU].Buffer(),
+                               contribution.LockedBuffer(),
+                               sizeof(DataType),
+                               cudaMemcpyDeviceToHost,
+                               stream));
+    m_copy_event.record(stream);
   }
-
 #endif // LBANN_HAS_GPU

-  // Compute local contributions on CPU
-  // Note: Only compute local contribution on one process in each
-  // redundant communicator.
-  m_sqsum = EvalType(0);
-  for (int i = 0; i < num_weights; ++i) {
-    const auto& vals = m_weights[i]->get_values();
-    if (vals.Participating()
-        && vals.GetLocalDevice() == El::Device::CPU
-        && vals.RedundantRank() == i % vals.RedundantSize()) {
-      sqsums(i, 0) = sum_of_squares(vals.LockedMatrix());
-    }
-    m_sqsum += sqsums(i, 0);
-  }
-
-  // Start aggregating local contributions
-  get_comm().nb_allreduce(&m_sqsum,
-                          1,
-                          get_comm().get_model_comm(),
-                          m_allreduce_req);
-  m_allreduce_started = true;
-
 }

 EvalType l2_weight_regularization::finish_evaluation() {
   if (m_scale_factor == EvalType(0)) { return EvalType(0); }
-  if (m_allreduce_started) {
+  EvalType sqsum = 0;
+  if (m_contributions.count(El::Device::CPU) > 0) {
     get_comm().wait(m_allreduce_req);
+    sqsum += m_contributions[El::Device::CPU](0, 0);
   }
-  m_allreduce_started = false;
-  return m_scale_factor * m_sqsum / 2;
+#ifdef LBANN_HAS_GPU
+  if (m_contributions.count(El::Device::GPU) > 0) {
+    m_copy_event.synchronize();
+    sqsum += m_contributions[El::Device::GPU](0, 0);
+  }
+#endif // LBANN_HAS_GPU
+  return m_scale_factor * sqsum / 2;
 }

 void l2_weight_regularization::compute_weight_regularization() {
@@ -197,5 +171,5 @@ void l2_weight_regularization::compute_weight_regularization() {
   }
 }
-  
+
 } // namespace lbann
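l2_weight_regularization now evaluates in two phases so the reductions can overlap other work: start_evaluation() launches a non-blocking allreduce for the CPU contribution and an asynchronous device-to-host copy for the GPU contribution, and finish_evaluation() waits on both before combining them. A stripped-down sketch of the CPU-side pattern, using a hypothetical term type with the comm calls shown above:

    // Hypothetical minimal objective term using the start/finish split.
    struct squared_sum_term {
      EvalType m_local_sum = 0;
      Al::request m_allreduce_req;

      void start(lbann_comm& comm) {
        // ... accumulate m_local_sum from local data ...
        comm.nb_allreduce(&m_local_sum, 1, comm.get_model_comm(),
                          m_allreduce_req);  // returns immediately
      }

      EvalType finish(lbann_comm& comm) {
        comm.wait(m_allreduce_req);          // reduction complete after this
        return m_local_sum;
      }
    };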
diff --git a/src/objective_functions/weight_regularization/l2.cu b/src/objective_functions/weight_regularization/l2.cu
new file mode 100644
index 00000000000..7d632504a71
--- /dev/null
+++ b/src/objective_functions/weight_regularization/l2.cu
@@ -0,0 +1,94 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/objective_functions/weight_regularization/l2.hpp"
+#include "lbann/models/model.hpp"
+#ifdef LBANN_HAS_GPU
+#include "lbann/utils/cublas.hpp"
+#endif // LBANN_HAS_GPU
+
+namespace lbann {
+
+namespace {
+
+template <El::Int block_size>
+__global__ void accumulate_contribution_kernel(El::Int height,
+                                               El::Int width,
+                                               const DataType * __restrict__ vals,
+                                               El::Int vals_ldim,
+                                               DataType * __restrict__ contribution) {
+
+  // Indices
+  const El::Int tid = threadIdx.x;
+  const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x;
+  const El::Int nthreads = blockDim.x * gridDim.x;
+
+  // Compute contributions for each thread
+  DataType private_contribution = 0;
+  const auto& size = height * width;
+  for (El::Int i = gid; i < size; i += nthreads) {
+    const auto& row = i % height;
+    const auto& col = i / height;
+    const auto& val = vals[row + col * vals_ldim];
+    private_contribution += val * val;
+  }
+
+  // Shared memory reduction to get contribution for each block
+  /// @todo unroll loops
+  __shared__ DataType shared_contribution[block_size];
+  shared_contribution[tid] = private_contribution;
+  for (El::Int stride = block_size / 2; stride > 0; stride /= 2) {
+    __syncthreads();
+    if (tid < stride) {
+      shared_contribution[tid] += shared_contribution[tid + stride];
+    }
+  }
+  if (tid == 0) {
+    cuda::atomic_add(contribution, shared_contribution[0]);
+  }
+
+}
+
+} // namespace
+
+template <>
+void l2_weight_regularization::accumulate_contribution<El::Device::GPU>(const GPUMat& vals,
+                                                                        GPUMat& contribution) {
+  if (!vals.IsEmpty()) {
+    const auto& size = vals.Height() * vals.Width();
+    const El::Int block_size = 256;
+    const auto& grid_size = (size + block_size - 1) / block_size;
+    auto&& stream = El::GPUManager::Stream();
+    CHECK_CUDA(cudaSetDevice(El::GPUManager::Device()));
+    accumulate_contribution_kernel<block_size>
+      <<<grid_size, block_size, 0, stream>>>(
+        vals.Height(), vals.Width(),
+        vals.LockedBuffer(), vals.LDim(),
+        contribution.Buffer());
+  }
+}
+
+} // namespace lbann
diff --git a/src/optimizers/adagrad.cu b/src/optimizers/adagrad.cu
index 8fea4e81c01..65abf7dbdc3 100644
--- a/src/optimizers/adagrad.cu
+++ b/src/optimizers/adagrad.cu
@@ -30,19 +30,6 @@ namespace lbann {

 namespace {

-// Square root functions
-#if __CUDA_ARCH__ >= 530
-__device__ inline float sqrt_(__half x) {
-  return hsqrt(x);
-}
-#endif // __CUDA_ARCH__ >= 530
-__device__ inline float sqrt_(float x) {
-  return sqrtf(x);
-}
-__device__ inline double sqrt_(double x) {
-  return sqrt(x);
-}
-
 __global__ void adagrad_kernel(int height,
                                int width,
                                DataType learning_rate,
@@ -62,7 +49,7 @@ __global__ void adagrad_kernel(int height,
     const auto& g = gradient[i + j * gradient_ldim];
     auto& c = cache[i + j * cache_ldim];
     c += g * g;
-    x -= learning_rate * g / (sqrt_(c) + eps);
+    x -= learning_rate * g / (cuda::sqrt(c) + eps);
   }
 }
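The kernel above applies the standard AdaGrad rule element-wise; as a scalar restatement (reference sketch, not patch code):

    #include <cmath>

    // AdaGrad: grow the per-parameter cache by g^2 and damp the step
    // by its square root.
    void adagrad_step(float& x, float& cache, float g,
                      float learning_rate, float eps) {
      cache += g * g;
      x -= learning_rate * g / (std::sqrt(cache) + eps);
    }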
diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp
index 6fbe6c5f4df..0b3efdc11d2 100644
--- a/src/proto/factories/callback_factory.cpp
+++ b/src/proto/factories/callback_factory.cpp
@@ -80,12 +80,9 @@ lbann_callback* construct_callback(lbann_comm* comm,
   }
   if (proto_cb.has_save_images()) {
     const auto& params = proto_cb.save_images();
-    auto&& reader = lbann::peek_map(data_readers, execution_mode::training);
-    const auto& layer_names = parse_list<>(params.layer_names());
-    return new lbann_callback_save_images(reader,
-                                          params.image_dir(),
-                                          layer_names,
-                                          params.extension());
+    return new lbann_callback_save_images(parse_list<>(params.layers()),
+                                          params.image_format(),
+                                          params.image_prefix());
   }

   //////////////////////////////////////////////////////////////////
diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp
index 992fc45b338..927c77fb199 100644
--- a/src/proto/factories/layer_factory.cpp
+++ b/src/proto/factories/layer_factory.cpp
@@ -30,6 +30,10 @@ namespace lbann {
 namespace proto {

+std::vector<El::Int> get_slice_points_from_reader(const generic_data_reader* dr,
+                                                  const std::string& var_category,
+                                                  bool& is_supported);
+
 template <data_layout layout, El::Device Dev>
 Layer* construct_layer(lbann_comm* comm,
                        const std::map<execution_mode, generic_data_reader*>& data_readers,
@@ -37,6 +41,14 @@ Layer* construct_layer(lbann_comm* comm,
                        const lbann_data::Layer& proto_layer) {
   std::stringstream err;

+  // Convenience macro to construct layers with no parameters
+#define CONSTRUCT_LAYER(name)                           \
+  do {                                                  \
+    if (proto_layer.has_##name()) {                     \
+      return new name##_layer<layout, Dev>(comm);       \
+    }                                                   \
+  } while (false)
+
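+  // For example, CONSTRUCT_LAYER(relu) expands to:
+  //   do {
+  //     if (proto_layer.has_relu()) {
+  //       return new relu_layer<layout, Dev>(comm);
+  //     }
+  //   } while (false);
+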
   // Input layers
   if (proto_layer.has_input()) {
     const auto& params = proto_layer.input();
@@ -74,13 +86,67 @@ Layer* construct_layer(lbann_comm* comm,
   // Fully connected layer
   if (proto_layer.has_fully_connected()) {
     const auto& params = proto_layer.fully_connected();
-    int num_neurons = params.num_neurons();
-    if (proto_layer.num_neurons_from_data_reader()) {
-      const auto dr = lbann::peek_map(data_readers, execution_mode::training);
-      if (!dr) {
-        LBANN_ERROR("training data reader does not exist!");
+    int num_neurons = 0;
+    std::string num_neurons_method_name;
+
+    if (params.get_input_dimension_from_reader()
+        || params.get_image_dimension_from_reader()
+        || params.get_scalar_dimension_from_reader()
+        || params.get_image_and_scalar_dimension_from_reader()) {
+      num_neurons_method_name = "get_*_dimension_from_reader";
+      #if defined(LBANN_HAS_CONDUIT)
+      const auto dr_generic = lbann::peek_map(data_readers, execution_mode::training);
+      const auto dr = dynamic_cast<const data_reader_jag_conduit_hdf5*>(dr_generic);
+      if (dr != nullptr) {
+        size_t input_dim = dr->get_linearized_input_size();
+        size_t scalar_dim = dr->get_linearized_scalar_size();
+        size_t image_dim = dr->get_linearized_channel_size() * dr->get_num_channels();
+        size_t num_images = dr->get_num_img_srcs();
+
+        if (params.get_input_dimension_from_reader()) {
+          num_neurons += input_dim;
+        }
+        if (params.get_image_dimension_from_reader()) {
+          num_neurons += (num_images * image_dim);
+        }
+        if (params.get_scalar_dimension_from_reader()) {
+          num_neurons += scalar_dim;
+        }
+        if (params.get_image_and_scalar_dimension_from_reader()) {
+          num_neurons += (num_images * image_dim + scalar_dim);
+        }
+      }
+      #endif // defined(LBANN_HAS_CONDUIT)
+    } else if (params.get_num_neurons_of_slice_from_reader_size() > 0) {
+      num_neurons_method_name = "get_num_neurons_of_slice_from_reader";
+      #if defined(LBANN_HAS_CONDUIT)
+      const auto dr_generic = lbann::peek_map(data_readers, execution_mode::training);
+      const int num_slice_indices = params.get_num_neurons_of_slice_from_reader_size();
+      if (dynamic_cast<const data_reader_jag_conduit_hdf5*>(dr_generic) != nullptr) {
+        const std::string& var = params.get_slice_points_from_reader();
+        bool is_supported = false; /// @todo Remove unneeded function parameter
+        const auto slice_points = get_slice_points_from_reader(dr_generic, var, is_supported);
+        for (int i = 0; i < num_slice_indices; ++i) {
+          const size_t idx = static_cast<size_t>(params.get_num_neurons_of_slice_from_reader(i));
+          if ((idx == 0u) || (idx >= slice_points.size())) {
+            err << "invalid slice index from get_num_neurons_of_slice_from_reader";
+            LBANN_ERROR(err.str());
+          }
+          const int diff = static_cast<int>(slice_points[idx] - slice_points[idx-1]);
+          num_neurons += diff;
+        }
+      }
+      #endif // defined(LBANN_HAS_CONDUIT)
+    } else {
+      num_neurons_method_name = "num_neurons";
+      num_neurons = params.num_neurons();
+      if (proto_layer.num_neurons_from_data_reader()) {
+        const auto dr = lbann::peek_map(data_readers, execution_mode::training);
+        if (!dr) {
+          LBANN_ERROR("training data reader does not exist!");
+        }
+        num_neurons = dr->get_linearized_data_size();
       }
-      num_neurons = dr->get_linearized_data_size();
     }
     return new fully_connected_layer<layout, Dev>(comm,
                                                   num_neurons,
@@ -94,14 +160,22 @@ Layer* construct_layer(lbann_comm* comm,
     const auto& params = proto_layer.convolution();
     const auto& num_output_channels = params.num_output_channels();
     const auto& bias = params.has_bias();
+    int num_groups = params.num_groups();
+    if (num_groups == 0) {
+      num_groups = 1;
+    }
     if (params.has_vectors()) {
       const auto& dims = parse_list<int>(params.conv_dims());
       const auto& pads = parse_list<int>(params.conv_pads());
      const auto& strides = parse_list<int>(params.conv_strides());
+      std::vector<int> dilations = parse_list<int>(params.conv_dilations());
+      if (dilations.empty()) {
+        dilations.resize(dims.size(), 1);
+      }
       if (layout == data_layout::DATA_PARALLEL) {
         return new convolution_layer<data_layout::DATA_PARALLEL, Dev>(
                  comm, dims.size(), num_output_channels,
-                 dims, pads, strides, bias
+                 dims, pads, strides, dilations, num_groups, bias
                );
       }
     } else {
@@ -109,10 +183,14 @@ Layer* construct_layer(lbann_comm* comm,
       const auto& dim = params.conv_dims_i();
       const auto& pad = params.conv_pads_i();
       const auto& stride = params.conv_strides_i();
+      int dilation = params.conv_dilations_i();
+      if (dilation == 0) {
+        dilation = 1;
+      }
       if (layout == data_layout::DATA_PARALLEL) {
         return new convolution_layer<data_layout::DATA_PARALLEL, Dev>(
                  comm, num_dims, num_output_channels,
-                 dim, pad, stride, bias
+                 dim, pad, stride, dilation, num_groups, bias
               );
       }
     }
@@ -121,6 +199,10 @@ Layer* construct_layer(lbann_comm* comm,
     const auto& params = proto_layer.deconvolution();
     const auto& bias = params.has_bias();
     int num_output_channels = params.num_output_channels();
+    int num_groups = params.num_groups();
+    if (num_groups == 0) {
+      num_groups = 1;
+    }
     if (proto_layer.num_neurons_from_data_reader()) {
       const auto dr = lbann::peek_map(data_readers, execution_mode::training);
       if (!dr) {
@@ -132,10 +214,14 @@ Layer* construct_layer(lbann_comm* comm,
       const auto& dims = parse_list<int>(params.conv_dims());
       const auto& pads = parse_list<int>(params.conv_pads());
       const auto& strides = parse_list<int>(params.conv_strides());
+      std::vector<int> dilations = parse_list<int>(params.conv_dilations());
+      if (dilations.empty()) {
+        dilations.resize(dims.size(), 1);
+      }
       if (layout == data_layout::DATA_PARALLEL) {
         return new deconvolution_layer<data_layout::DATA_PARALLEL, Dev>(
                  comm, dims.size(), num_output_channels,
-                 dims, pads, strides, bias
+                 dims, pads, strides, dilations, num_groups, bias
               );
       }
     } else {
@@ -143,10 +229,14 @@ Layer* construct_layer(lbann_comm* comm,
       const auto& dim = params.conv_dims_i();
       const auto& pad = params.conv_pads_i();
       const auto& stride = params.conv_strides_i();
+      int dilation = params.conv_dilations_i();
+      if (dilation == 0) {
+        dilation = 1;
+      }
       if (layout == data_layout::DATA_PARALLEL) {
         return new deconvolution_layer<data_layout::DATA_PARALLEL, Dev>(
                  comm, num_dims, num_output_channels,
-                 dim, pad, stride, bias
+                 dim, pad, stride, dilation, num_groups, bias
               );
       }
     }
@@ -186,10 +276,51 @@ Layer* construct_layer(lbann_comm* comm,
   }
   if (proto_layer.has_slice()) {
     const auto& params = proto_layer.slice();
-    const auto& slice_points = parse_list<El::Int>(params.slice_points());
-    return new slice_layer<layout, Dev>(comm,
-                                        params.slice_axis(),
-                                        slice_points);
+    std::vector<El::Int> slice_points;
+    bool is_supported = false;
+    std::string slice_point_method_name;
+
+    if (params.get_slice_points_from_reader_bool()) {
+      slice_point_method_name = "'get_slice_points_from_reader_bool'";
+      #if defined(LBANN_HAS_CONDUIT)
+      size_t total = 0;
+      slice_points.push_back(total);
+      const auto dr_generic = lbann::peek_map(data_readers, execution_mode::training);
+      if (dynamic_cast<const data_reader_jag_conduit_hdf5*>(dr_generic) != nullptr) {
+        is_supported = true;
+        const auto dr1 = lbann::peek_map(data_readers, execution_mode::training);
+        lbann::data_reader_jag_conduit_hdf5 *dr = dynamic_cast<lbann::data_reader_jag_conduit_hdf5*>(dr1);
+        total += dr->get_num_img_srcs() * dr->get_linearized_channel_size() * dr->get_num_channels()
+                 + dr->get_linearized_scalar_size();
+        slice_points.push_back(total);
+        total += dr->get_linearized_input_size();
+        slice_points.push_back(total);
+      }
+      #endif // defined(LBANN_HAS_CONDUIT)
+    } else if (params.get_slice_points_from_reader() != "") {
+      slice_point_method_name = "'get_slice_points_from_reader'";
+      #if defined(LBANN_HAS_CONDUIT)
+      const auto dr_generic = lbann::peek_map(data_readers, execution_mode::training);
+      const std::string& var = params.get_slice_points_from_reader();
+      slice_points = get_slice_points_from_reader(dr_generic, var, is_supported);
+      #endif // defined(LBANN_HAS_CONDUIT)
+    } else {
+      slice_point_method_name = "'slice_points'";
+      slice_points = parse_list<El::Int>(params.slice_points());
+      is_supported = true;
+    }
+    if (slice_points.size() < 2u) {
+      if (is_supported) {
+        err << "Failed to get slice points via " << slice_point_method_name << '.';
+      } else {
+        err << slice_point_method_name << " is not supported by the reader.";
+      }
+      LBANN_ERROR(err.str());
+      return nullptr;
+    }
+    return new slice_layer<layout, Dev>(comm,
+                                        params.slice_axis(),
+                                        slice_points);
   }
   if (proto_layer.has_hadamard()) {
     return new hadamard_layer<layout, Dev>(comm);
   }
@@ -306,16 +437,6 @@ Layer* construct_layer(lbann_comm* comm,
   if (proto_layer.has_stop_gradient()) {
     return new stop_gradient_layer<layout, Dev>(comm);
   }
-  if (proto_layer.has_max()) {
-    if (Dev == El::Device::CPU) {
-      return new max_layer<layout, El::Device::CPU>(comm);
-    }
-  }
-  if (proto_layer.has_min()) {
-    if (Dev == El::Device::CPU) {
-      return new min_layer<layout, El::Device::CPU>(comm);
-    }
-  }
   if (proto_layer.has_in_top_k()) {
     const auto& params = proto_layer.in_top_k();
     return new in_top_k_layer<layout, Dev>(comm, params.k());
   }
@@ -326,15 +447,22 @@ Layer* construct_layer(lbann_comm* comm,
       return new sort_layer<layout, Dev>(comm, params.descending());
     }
   }
+  if (proto_layer.has_weights_layer()) {
+    const auto& params = proto_layer.weights_layer();
+    const auto& dims = parse_list<El::Int>(params.dims());
+    return new weights_layer<layout, Dev>(comm, dims);
+  }

   // Regularizer layers
   if (proto_layer.has_batch_normalization()) {
     const auto& params = proto_layer.batch_normalization();
     if (layout == data_layout::DATA_PARALLEL) {
-      return new batch_normalization<data_layout::DATA_PARALLEL, Dev>(comm,
-                                                                      params.decay(),
-                                                                      params.epsilon(),
-                                                                      params.global_stats());
+      return new batch_normalization_layer<data_layout::DATA_PARALLEL, Dev>(comm,
+                                                                            params.decay(),
+                                                                            params.epsilon(),
+                                                                            params.global_stats());
+    } else {
+      LBANN_ERROR("batch normalization is only supported in a data-parallel layout");
     }
   }
   if (proto_layer.has_dropout()) {
@@ -363,43 +491,65 @@ Layer* construct_layer(lbann_comm* comm,
     }
   }

+  // Math layers
+  if (proto_layer.has_not_()) { return new not_layer<layout, Dev>(comm); }
+  CONSTRUCT_LAYER(abs);
+  CONSTRUCT_LAYER(negative);
+  CONSTRUCT_LAYER(sign);
+  CONSTRUCT_LAYER(round);
+  CONSTRUCT_LAYER(ceil);
+  CONSTRUCT_LAYER(floor);
+  CONSTRUCT_LAYER(reciprocal);
+  CONSTRUCT_LAYER(square);
+  CONSTRUCT_LAYER(sqrt);
+  CONSTRUCT_LAYER(rsqrt);
+  CONSTRUCT_LAYER(safe_reciprocal);
+  CONSTRUCT_LAYER(exp);
+  CONSTRUCT_LAYER(expm1);
+  CONSTRUCT_LAYER(log);
+  CONSTRUCT_LAYER(log1p);
+  CONSTRUCT_LAYER(cos);
+  CONSTRUCT_LAYER(sin);
+  CONSTRUCT_LAYER(tan);
+  CONSTRUCT_LAYER(acos);
+  CONSTRUCT_LAYER(asin);
+  CONSTRUCT_LAYER(atan);
+  CONSTRUCT_LAYER(cosh);
+  CONSTRUCT_LAYER(sinh);
+  CONSTRUCT_LAYER(tanh);
+  CONSTRUCT_LAYER(acosh);
+  CONSTRUCT_LAYER(asinh);
+  CONSTRUCT_LAYER(atanh);
+  CONSTRUCT_LAYER(add);
+  CONSTRUCT_LAYER(subtract);
+  CONSTRUCT_LAYER(multiply);
+  CONSTRUCT_LAYER(divide);
+  CONSTRUCT_LAYER(mod);
+  CONSTRUCT_LAYER(pow);
+  CONSTRUCT_LAYER(safe_divide);
+  CONSTRUCT_LAYER(max);
+  CONSTRUCT_LAYER(min);
+  CONSTRUCT_LAYER(equal);
+  CONSTRUCT_LAYER(not_equal);
+  CONSTRUCT_LAYER(less);
+  CONSTRUCT_LAYER(less_equal);
+  CONSTRUCT_LAYER(greater);
+  CONSTRUCT_LAYER(greater_equal);
+  if (proto_layer.has_and_()) { return new and_layer<layout, Dev>(comm); }
+  if (proto_layer.has_or_())  { return new or_layer<layout, Dev>(comm); }
+  if (proto_layer.has_xor_()) { return new xor_layer<layout, Dev>(comm); }
+
   // Activation layers
-  if (proto_layer.has_softmax()) {
-    return new softmax_layer<layout, Dev>(comm);
-  }
-  if (proto_layer.has_relu()) {
-    return new relu_layer<layout, Dev>(comm);
-  }
-  if (proto_layer.has_sigmoid()) {
-    return new sigmoid_layer<layout, Dev>(comm);
-  }
-  if (proto_layer.has_tanh()) {
-    return new tanh_layer<layout, Dev>(comm);
-  }
-  if (proto_layer.has_atan()) {
-    return new atan_layer<layout, Dev>(comm);
-  }
-  if (proto_layer.has_exponential()) {
-    return new exponential_layer<layout, Dev>(comm);
-  }
-  if (proto_layer.has_identity()) {
-    return new identity_layer<layout, Dev>(comm);
-  }
-  if (proto_layer.has_bent_identity()) {
-    return new bent_identity_layer<layout, Dev>(comm);
-  }
-  if (proto_layer.has_softplus()) {
-    return new softplus_layer<layout, Dev>(comm);
-  }
-  if (proto_layer.has_smooth_relu()) {
-    return new smooth_relu_layer<layout, Dev>(comm);
-  }
-  if (proto_layer.has_leaky_relu()) {
-    return new leaky_relu_layer<layout, Dev>(comm);
-  }
-  if (proto_layer.has_swish()) {
-    return new swish_layer<layout, Dev>(comm);
-  }
+  CONSTRUCT_LAYER(softmax);
+  CONSTRUCT_LAYER(log_softmax);
+  CONSTRUCT_LAYER(relu);
+  CONSTRUCT_LAYER(sigmoid);
+  CONSTRUCT_LAYER(identity);
+  CONSTRUCT_LAYER(bent_identity);
+  CONSTRUCT_LAYER(softplus);
+  CONSTRUCT_LAYER(smooth_relu);
+  CONSTRUCT_LAYER(leaky_relu);
+  CONSTRUCT_LAYER(swish);
   if (proto_layer.has_elu()) {
     const auto& params = proto_layer.elu();
     return new elu_layer<layout, Dev>(comm, params.alpha());
@@ -414,42 +564,40 @@ Layer* construct_layer(lbann_comm* comm,
       return new selu_layer<layout, Dev>(comm);
     }
   }
-  if (proto_layer.has_power()) {
-    const auto& params = proto_layer.power();
-    return new power_layer<layout, Dev>(comm, params.exponent());
-  }
-  if (proto_layer.has_log()) {
-    const auto& params = proto_layer.log();
-    const auto& base = params.base();
-    if (base != 0.0) {
-      return new log_layer<layout, Dev>(comm, base);
-    } else {
-      return new log_layer<layout, Dev>(comm);
-    }
-  }
-
-  if (proto_layer.has_abs()) {
-    return new abs_layer<layout, Dev>(comm);
-  }
-  if (proto_layer.has_l2_loss()) {
-    return new l2_loss_layer<layout, Dev>(comm);
-  }

   // Loss layers
-  if (proto_layer.has_cross_entropy()) {
-    return new cross_entropy_layer<layout, Dev>(comm);
-  }
-  if (proto_layer.has_mean_squared_error()) {
-    return new mean_squared_error_layer<layout, Dev>(comm);
-  }
+  CONSTRUCT_LAYER(categorical_accuracy);
+  CONSTRUCT_LAYER(cross_entropy);
+  CONSTRUCT_LAYER(mean_squared_error);
   if (proto_layer.has_top_k_categorical_accuracy()) {
     const auto& params = proto_layer.top_k_categorical_accuracy();
     return new top_k_categorical_accuracy_layer<layout, Dev>(comm, params.k());
   }
+  CONSTRUCT_LAYER(l2_norm2);
+  CONSTRUCT_LAYER(binary_cross_entropy);
+  CONSTRUCT_LAYER(sigmoid_binary_cross_entropy);
+  CONSTRUCT_LAYER(boolean_accuracy);
+  CONSTRUCT_LAYER(boolean_false_negative);
+  CONSTRUCT_LAYER(boolean_false_positive);

-  if (proto_layer.has_bce_with_logits()) {
-    const auto& params = proto_layer.bce_with_logits();
-    return new sigmoid_bce_with_logits_layer<layout, Dev>(comm, params.true_label());
+  // Image layers
+  if (proto_layer.has_bilinear_resize()) {
+    const auto& params = proto_layer.bilinear_resize();
+    if (layout == data_layout::DATA_PARALLEL) {
+      return new bilinear_resize_layer<data_layout::DATA_PARALLEL, Dev>(comm,
+                                                                        params.height(),
+                                                                        params.width());
+    }
+  }
+
+  // Miscellaneous layers
+  if (proto_layer.has_covariance()) {
+    const auto& params = proto_layer.covariance();
+    return new covariance_layer<layout, Dev>(comm, params.biased());
+  }
+  if (proto_layer.has_variance()) {
+    const auto& params = proto_layer.variance();
+    return new variance_layer<layout, Dev>(comm, params.biased());
   }

   // Throw exception if layer has not been constructed
@@ -487,5 +635,30 @@ template Layer* construct_layer(
 );
 #endif // LBANN_HAS_GPU

+/// Obtain the slice points from the data reader
+std::vector<El::Int> get_slice_points_from_reader(const generic_data_reader* dr_generic,
+                                                  const std::string& var_category,
+                                                  bool& is_supported) {
+  std::vector<El::Int> slice_points;
+  is_supported = false;
+#if defined(LBANN_HAS_CONDUIT)
+  // TODO: remove the dynamic cast when this feature gets merged into the base class
+  const auto dr = dynamic_cast<const data_reader_jag_conduit_hdf5*>(dr_generic);
+
+  if (dr != nullptr) {
+    is_supported = true;
+    if (var_category == "independent") {
+      slice_points = dr->get_slice_points_independent();
+    } else if (var_category == "dependent") {
+      slice_points = dr->get_slice_points_dependent();
+    } else {
+      LBANN_ERROR("Unknown variable category \"" + var_category \
+                  + "\". Must be either \"independent\" or \"dependent\".");
+    }
+  }
+#endif
+  return slice_points;
+}
+
 } // namespace proto
 } // namespace lbann
diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp
index 21782d17832..2d9b1f1435a 100644
--- a/src/proto/factories/model_factory.cpp
+++ b/src/proto/factories/model_factory.cpp
@@ -45,12 +45,12 @@ model* instantiate_model(lbann_comm* comm,
   // Construct model
   const auto& type = proto_model.name();
   const auto& mini_batch_size = proto_model.mini_batch_size();
-  if (type == "sequential_model" || type == "") {
-    return new sequential_model(comm, mini_batch_size, obj, opt);
-  }
-  if (type == "directed_acyclic_graph_model") {
+  if (type.empty() || type == "directed_acyclic_graph_model") {
     return new directed_acyclic_graph_model(comm, mini_batch_size, obj, opt);
   }
+  if (type == "sequential_model") {
+    return new sequential_model(comm, mini_batch_size, obj, opt);
+  }
   if (type == "siamese_model") {
     const auto& params = proto_model.siamese();
     return new siamese_model(comm,
@@ -240,6 +240,10 @@ model* construct_model(lbann_comm* comm,
   for (auto&& w   : weights_list ) { m->add_weights(w); }
   for (auto&& met : metric_list  ) { m->add_metric(met); }
   for (auto&& cb  : callback_list) { m->add_callback(cb); }
+  m->set_model_id(proto_model.model_id());
+  for (auto t : data_readers) {
+    t.second->set_model(m);
+  }
   return m;
 }
diff --git a/src/proto/factories/weights_factory.cpp b/src/proto/factories/weights_factory.cpp
index 8a2c1c99c83..a5e19ffb5c1 100644
--- a/src/proto/factories/weights_factory.cpp
+++ b/src/proto/factories/weights_factory.cpp
@@ -39,6 +39,12 @@ weights_initializer* construct_initializer(const lbann_data::Weights& proto_weig
     const auto& params = proto_weights.constant_initializer();
     return new constant_initializer(params.value());
   }
+
+  // Value initialization
initialization + if (proto_weights.has_value_initializer()) { + const auto& params = proto_weights.value_initializer(); + return new value_initializer(parse_list(params.values())); + } // Random initialization if (proto_weights.has_uniform_initializer()) { diff --git a/src/proto/init_image_data_readers.cpp b/src/proto/init_image_data_readers.cpp index 5a82eff4b9a..fb9d12cf5d2 100644 --- a/src/proto/init_image_data_readers.cpp +++ b/src/proto/init_image_data_readers.cpp @@ -251,6 +251,9 @@ void init_image_preprocessor(const lbann_data::Reader& pb_readme, const bool mas // final size of image width = pb_preprocessor.raw_width(); height = pb_preprocessor.raw_height(); + if (pb_preprocessor.raw_num_channels() > 0) { + channels = pb_preprocessor.raw_num_channels(); + } if (pb_preprocessor.has_subtractor() && !has_channel_wise_subtractor(pb_preprocessor)) { // decolorizer and colorizer are exclusive @@ -316,7 +319,8 @@ void init_image_data_reader(const lbann_data::Reader& pb_readme, const bool mast std::shared_ptr pp; // set up the image preprocessor if ((name == "imagenet") || (name == "jag_conduit") || (name == "jag_conduit_hdf5") || - (name == "triplet") || (name == "mnist_siamese") || (name == "multi_images")) { + (name == "triplet") || (name == "mnist_siamese") || (name == "multi_images") || + (name == "moving_mnist")) { pp = std::make_shared(); } else if (name == "imagenet_patches") { pp = std::make_shared(); @@ -331,7 +335,7 @@ void init_image_data_reader(const lbann_data::Reader& pb_readme, const bool mast // final size of image int width = 0, height = 0; - int channels = 3; + int channels = 0; // setup preprocessor init_image_preprocessor(pb_readme, master, pp, width, height, channels); @@ -347,36 +351,72 @@ void init_image_data_reader(const lbann_data::Reader& pb_readme, const bool mast reader = new data_reader_mnist_siamese(pp, shuffle); } else if (name == "multi_images") { reader = new data_reader_multi_images(pp, shuffle); + } else if (name == "moving_mnist") { + reader = new moving_mnist_reader(7, 40, 40, 2); #ifdef LBANN_HAS_CONDUIT } else if (name =="jag_conduit_hdf5") { data_reader_jag_conduit_hdf5* reader_jag = new data_reader_jag_conduit_hdf5(pp, shuffle); reader_jag->set_image_dims(width, height); - reader_jag->set_use_images(pb_readme.use_images()); - reader_jag->set_use_scalars(pb_readme.use_scalars()); - reader_jag->set_use_inputs(pb_readme.use_inputs()); + reader_jag->set_scalar_keys(pb_readme.scalar_keys()); + reader_jag->set_input_keys(pb_readme.input_keys()); + reader_jag->set_image_views(pb_readme.image_views()); + reader_jag->set_image_channels(pb_readme.image_channels()); reader = reader_jag; if (master) std::cout << reader->get_type() << " is set" << std::endl; return; } else if (name =="jag_conduit") { data_reader_jag_conduit* reader_jag = new data_reader_jag_conduit(pp, shuffle); - reader_jag->set_image_dims(width, height); + if (channels == 0) { + channels = 1; + } + reader_jag->set_image_dims(width, height, channels); + + // Whether to split channels of an image before preprocessing + if (pb_readme.split_jag_image_channels()) { + reader_jag->set_split_image_channels(); + } else { + reader_jag->unset_split_image_channels(); + } + + // declare the set of images to use + std::vector image_keys(pb_readme.jag_image_keys_size()); + + for (int i=0; i < pb_readme.jag_image_keys_size(); ++i) { + image_keys[i] = pb_readme.jag_image_keys(i); + } + + reader_jag->set_image_choices(image_keys); + using var_t = data_reader_jag_conduit::variable_t; + // composite independent 
variable - std::vector independent_type(pb_readme.independent_size()); + std::vector< std::vector > independent_type(pb_readme.independent_size()); for (int i=0; i < pb_readme.independent_size(); ++i) { - independent_type[i] = static_cast(pb_readme.independent(i)); + const lbann_data::Reader::JAGDataSlice& slice = pb_readme.independent(i); + const int slice_size = slice.pieces_size(); + for (int j=0; j < slice_size; ++j) { + // TODO: instead of using cast, use proper conversion function + const auto var_type = static_cast(slice.pieces(j)); + independent_type[i].push_back(var_type); + } } reader_jag->set_independent_variable_type(independent_type); // composite dependent variable - std::vector dependent_type(pb_readme.dependent_size()); + std::vector< std::vector > dependent_type(pb_readme.dependent_size()); for (int i=0; i < pb_readme.dependent_size(); ++i) { - dependent_type[i] = static_cast(pb_readme.dependent(i)); + const lbann_data::Reader::JAGDataSlice& slice = pb_readme.dependent(i); + const int slice_size = slice.pieces_size(); + for (int j=0; j < slice_size; ++j) { + // TODO: instead of using cast, use proper conversion function + const auto var_type = static_cast(slice.pieces(j)); + dependent_type[i].push_back(var_type); + } } reader_jag->set_dependent_variable_type(dependent_type); @@ -433,12 +473,43 @@ void init_image_data_reader(const lbann_data::Reader& pb_readme, const bool mast reader_jag->add_input_prefix_filter(pf); } + // add image normalization parameters + const int num_image_normalization_params = pb_readme.jag_image_normalization_params_size(); + for (int i=0; i < num_image_normalization_params; ++i) { + using linear_transform_t = lbann::data_reader_jag_conduit::linear_transform_t; + const linear_transform_t np = std::make_pair(pb_readme.jag_image_normalization_params(i).scale(), + pb_readme.jag_image_normalization_params(i).bias()); + reader_jag->add_image_normalization_param(np); + } + + // add scalar normalization parameters + const int num_scalar_normalization_params = pb_readme.jag_scalar_normalization_params_size(); + for (int i=0; i < num_scalar_normalization_params; ++i) { + using linear_transform_t = lbann::data_reader_jag_conduit::linear_transform_t; + const linear_transform_t np = std::make_pair(pb_readme.jag_scalar_normalization_params(i).scale(), + pb_readme.jag_scalar_normalization_params(i).bias()); + reader_jag->add_scalar_normalization_param(np); + } + + // add input normalization parameters + const int num_input_normalization_params = pb_readme.jag_input_normalization_params_size(); + for (int i=0; i < num_input_normalization_params; ++i) { + using linear_transform_t = lbann::data_reader_jag_conduit::linear_transform_t; + const linear_transform_t np = std::make_pair(pb_readme.jag_input_normalization_params(i).scale(), + pb_readme.jag_input_normalization_params(i).bias()); + reader_jag->add_input_normalization_param(np); + } + reader = reader_jag; if (master) std::cout << reader->get_type() << " is set" << std::endl; return; #endif // LBANN_HAS_CONDUIT } + if (channels == 0) { + channels = 3; + } + auto* image_data_reader_ptr = dynamic_cast(reader); if (!image_data_reader_ptr && master) { std::stringstream err; @@ -530,6 +601,8 @@ void init_org_image_data_reader(const lbann_data::Reader& pb_readme, const bool } else if (name == "cifar10") { reader = new cifar10_reader(shuffle); if (master) std::cout << "cifar10_reader is set" << std::endl; + } else if (name == "moving_mnist") { + reader = new moving_mnist_reader(7, 40, 40, 2); } else { if (master) { 
std::stringstream err; diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index c31a90385c3..10ae0e8dd7f 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -35,6 +35,17 @@ message Reader { ImagePreprocessor image_preprocessor = 13; //------------------ start of only for jag_conduit ----------------------- + message JagLinearNormalizationParams { + double scale = 1; + double bias = 2; + } + + repeated JagLinearNormalizationParams jag_image_normalization_params = 86; + repeated JagLinearNormalizationParams jag_scalar_normalization_params = 87; + repeated JagLinearNormalizationParams jag_input_normalization_params = 88; + + bool split_jag_image_channels = 89; + repeated string jag_image_keys = 90; repeated string jag_scalar_keys = 91; repeated string jag_input_keys = 92; message JagKeyPrefixFilter { @@ -45,19 +56,32 @@ message Reader { repeated JagKeyPrefixFilter jag_scalar_prefix_filters = 94; repeated string jag_input_filters = 95; repeated JagKeyPrefixFilter jag_input_prefix_filters = 96; - repeated int32 independent = 97; - repeated int32 dependent = 98; + + enum JAG_Data { + Undefined = 0; + JAG_Image = 1; + JAG_Scalar = 2; + JAG_Input = 3; + } + message JAGDataSlice { + repeated JAG_Data pieces = 1; + } + repeated JAGDataSlice independent = 97; + repeated JAGDataSlice dependent = 98; + int32 max_files_to_load = 1000; // for jag_conduit_hdf5 - bool use_scalars = 1001; - bool use_images = 1002; - bool use_inputs = 1003; + string scalar_keys = 1004; + string input_keys = 1005; + string image_views = 1006; + string image_channels = 1007; //------------------ end of only for jag_conduit ----------------------- int32 num_labels = 99; //for imagenet and synthetic int64 num_samples = 100; //only for synthetic string synth_dimensions = 101; //only for synthetic + string synth_response_dimensions = 115; //only for synthetic //csv attributes string separator = 102; int32 skip_cols = 103; @@ -74,9 +98,9 @@ message Reader { int32 num_image_srcs = 114; // data_reader_multi_images //------------- start of only for partitioned data sets ------------------ - bool is_partitioned = 300; + bool is_partitioned = 300; double partition_overlap = 301; - int32 partition_mode = 302; + int32 partition_mode = 302; // 1 - share a portion of your data with two neighbors; // 2 - there's a set of overlap indices that are common to all models //------------- end of only for partitioned data sets ------------------ @@ -87,6 +111,7 @@ message ImagePreprocessor { bool disable = 2; int32 raw_width = 3; int32 raw_height = 4; + int32 raw_num_channels = 5; message Cropper { string name = 1; @@ -164,15 +189,15 @@ message ImagePreprocessor { float factor = 3; } - Cropper cropper = 5; - Resizer resizer = 34; - Augmenter augmenter = 6; - Decolorizer decolorizer = 7; - Colorizer colorizer = 8; - Subtractor subtractor = 9; - Normalizer normalizer = 10; - Noiser noiser = 11; - PatchExtractor patch_extractor = 12; + Cropper cropper = 6; + Resizer resizer = 7; + Augmenter augmenter = 8; + Decolorizer decolorizer = 9; + Colorizer colorizer = 10; + Subtractor subtractor = 11; + Normalizer normalizer = 12; + Noiser noiser = 13; + PatchExtractor patch_extractor = 14; int32 early_normalization = 33; // for data_reader_jag only } @@ -199,7 +224,8 @@ message GenericPreprocessor { //======================================================================== message Model { - string name = 1; //sequential_model, dag_model, greedy_layerwise_autoencoder, siamese_model + string model_id = 1000; //arbitrary identifier + string 
name = 1; //deprecated ObjectiveFunction objective_function = 2; repeated Metric metric = 5; string data_layout = 6; @@ -450,8 +476,8 @@ message Callback { } message CallbackLTFB { - int64 round_size = 1; - bool increasing_metric_mode = 2; //Expectation for a good tournament metric: increasing (true) is default + int64 round_size = 1; + bool increasing_metric_mode = 2; //Expectation for a good tournament metric: increasing (true) is default string eval_metrics = 3; //eval metrics to use for tournament, at least 1 metric has to be provided string weights_tosend = 4; //list of weights to transfer between model, default is all weights (classic LTFB) } @@ -473,9 +499,9 @@ message CallbackAdaptiveLearningRate { } message CallbackSaveImages { - string image_dir = 1; - string layer_names = 2; //layer(s) at which to save images e.g., "input, reconstruction" - string extension = 3; + string layers = 1; // Layer outputs to save as images + string image_format = 2; // Image format (e.g. jpg, png, pgm) + string image_prefix = 3; // Prefix for saved image files } message CallbackPrint { @@ -669,14 +695,15 @@ message Weights { Optimizer optimizer = 2; ConstantInitializer constant_initializer = 20; - UniformInitializer uniform_initializer = 21; - NormalInitializer normal_initializer = 22; - GlorotNormalInitializer glorot_normal_initializer = 23; - GlorotUniformInitializer glorot_uniform_initializer = 24; - HeNormalInitializer he_normal_initializer = 25; - HeUniformInitializer he_uniform_initializer = 26; - LeCunNormalInitializer lecun_normal_initializer = 27; - LeCunUniformInitializer lecun_uniform_initializer = 28; + ValueInitializer value_initializer = 21; + UniformInitializer uniform_initializer = 22; + NormalInitializer normal_initializer = 23; + GlorotNormalInitializer glorot_normal_initializer = 24; + GlorotUniformInitializer glorot_uniform_initializer = 25; + HeNormalInitializer he_normal_initializer = 26; + HeUniformInitializer he_uniform_initializer = 27; + LeCunNormalInitializer lecun_normal_initializer = 28; + LeCunUniformInitializer lecun_uniform_initializer = 29; } @@ -684,6 +711,9 @@ message Weights { message ConstantInitializer { double value = 1; } +message ValueInitializer { + string values = 1; +} message UniformInitializer { double min = 1; double max = 2; @@ -772,10 +802,10 @@ message Layer { // motif layer MotifLayer motif_layer = 4; - // input Layers + // Input layers Input input = 2; - // transform Layers + // Transform layers Reshape reshape = 306; Pooling pooling = 12; Concatenation concatenation = 300; @@ -797,33 +827,88 @@ message Layer { DiscreteRandom discrete_random = 318; Dummy dummy = 319; StopGradient stop_gradient = 320; - Max max = 321; - Min min = 322; InTopK in_top_k = 324; Sort sort = 325; + WeightsLayer weights_layer = 326; - // learning Layers + // Learning layers FullyConnected fully_connected = 11; Convolution convolution = 13; Deconvolution deconvolution = 305; - // loss layers - CrossEntropy cross_entropy = 60; - MeanSquaredError mean_squared_error = 61; - TopKCategoricalAccuracy top_k_categorical_accuracy = 62; - - // target Layers + // Loss layers + CategoricalAccuracy categorical_accuracy = 60; + CrossEntropy cross_entropy = 61; + MeanSquaredError mean_squared_error = 62; + TopKCategoricalAccuracy top_k_categorical_accuracy = 63; + L2Norm2 l2_norm2 = 64; + BinaryCrossEntropy binary_cross_entropy = 65; + SigmoidBinaryCrossEntropy sigmoid_binary_cross_entropy = 66; + BooleanAccuracy boolean_accuracy = 67; + BooleanFalseNegative boolean_false_negative = 68; + 
BooleanFalsePositive boolean_false_positive = 69; + + // Math layers + Not not = 401; + Abs abs = 402; + Negative negative = 403; + Sign sign = 404; + Round round = 405; + Ceil ceil = 406; + Floor floor = 407; + Reciprocal reciprocal = 408; + Square square = 409; + Sqrt sqrt = 410; + Rsqrt rsqrt = 411; + SafeReciprocal safe_reciprocal = 412; + Exp exp = 413; + Expm1 expm1 = 414; + Log log = 415; + Log1p log1p = 416; + Cos cos = 417; + Sin sin = 418; + Tan tan = 419; + Acos acos = 420; + Asin asin = 421; + Atan atan = 422; + Cosh cosh = 423; + Sinh sinh = 424; + Tanh tanh = 425; + Acosh acosh = 426; + Asinh asinh = 427; + Atanh atanh = 428; + Add add = 450; + Subtract subtract = 451; + Multiply multiply = 452; + Divide divide = 453; + Mod mod = 454; + Pow pow = 455; + SafeDivide safe_divide = 456; + Max max = 457; + Min min = 458; + Equal equal = 459; + NotEqual not_equal = 460; + Less less = 461; + LessEqual less_equal = 462; + Greater greater = 463; + GreaterEqual greater_equal = 464; + And and = 465; + Or or = 466; + Xor xor = 467; + + // Target Layers Target target = 18; TargetReconstruction reconstruction = 22; - // regularization Layers + // Regularization Layers BatchNormalization batch_normalization = 19; LocalResponseNormalization local_response_normalization = 20; Dropout dropout = 21; SeluDropout selu_dropout = 229; - // activation Layers + // Activation Layers Softmax softmax = 200; + LogSoftmax log_softmax = 203; ELU elu = 30; Identity identity = 31; LeakyRelu leaky_relu = 32; @@ -832,16 +917,16 @@ message Layer { SmoothRelu smooth_relu = 35; Softplus softplus = 36; Selu selu = 37; - Tanh tanh = 38; - Atan atan = 39; BentIdentity bent_identity = 40; - Exponential exponential = 41; Swish swish = 42; - Power power = 43; - Abs abs = 44; - L2Loss l2_loss = 45; - Log log = 46; - Sigmoid_Binary_Cross_Entropy_With_Logits bce_with_logits = 47; + + // Image layers + BilinearResize bilinear_resize = 500; + + // Miscellaneous layers + Covariance covariance = 600; + Variance variance = 601; + } /////////////////////// // MotifLayer // @@ -851,6 +936,55 @@ message MotifLayer { repeated string variable = 2; } +/////////////////////// +// Math Layers // +/////////////////////// +message Not {} +message Abs {} +message Negative {} +message Sign {} +message Round {} +message Ceil {} +message Floor {} +message Reciprocal {} +message Square {} +message Sqrt {} +message Rsqrt {} +message SafeReciprocal {} +message Exp {} +message Expm1 {} +message Log {} +message Log1p {} +message Cos {} +message Sin {} +message Tan {} +message Acos {} +message Asin {} +message Atan {} +message Cosh {} +message Sinh {} +message Tanh {} +message Acosh {} +message Asinh {} +message Atanh {} +message Add {} +message Subtract {} +message Multiply {} +message Divide {} +message Mod {} +message Pow {} +message SafeDivide {} +message Max {} +message Min {} +message Equal {} +message NotEqual {} +message Less {} +message LessEqual {} +message Greater {} +message GreaterEqual {} +message And {} +message Or {} +message Xor {} /////////////////////// // Activation Layers // @@ -878,18 +1012,9 @@ message SmoothRelu { message Softplus { } -message Tanh { -} - -message Atan { -} - message BentIdentity { } -message Exponential { -} - message Swish { } @@ -901,23 +1026,16 @@ message Selu { message Softmax { } -message Power { - double exponent = 1; -} - -message Abs { -} - -message L2Loss { +message LogSoftmax { } -message Log { - double base = 1; -} - -message Sigmoid_Binary_Cross_Entropy_With_Logits { - int32 true_label = 1; -} 
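Note: each parameter-free layer message added in this file (for example, message Exp {}) is matched one-to-one by a CONSTRUCT_LAYER(exp); call in the layer factory earlier in this patch; the and/or/xor layers are dispatched by hand there because those names are C++ keywords. The macro's definition is not part of this diff, so the following is only a minimal sketch of the expansion it presumably performs, modeled on the has_*() dispatch pattern visible in the removed factory code; any layout/device template arguments the real factory forwards are omitted.

    // Hypothetical sketch of CONSTRUCT_LAYER (assumed, not from this patch):
    // dispatch on the corresponding proto message and construct the matching
    // layer with no extra parameters.
    #define CONSTRUCT_LAYER(name)           \
      if (proto_layer.has_##name()) {       \
        return new name##_layer(comm);      \
      }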
+/////////////////////// +// Loss Layers // +/////////////////////// +message L2Norm2 {} +message SigmoidBinaryCrossEntropy {} +message BooleanFalseNegative {} +message BooleanFalsePositive {} /////////////////////////// // Regularization Layers // @@ -1006,6 +1124,9 @@ message Concatenation { message Slice { int64 slice_axis = 2; string slice_points = 3; //should be space-separated list of ints, e.g, "2 6 7" + //the following is for jag_conduit_hdf5; + string get_slice_points_from_reader = 4; + bool get_slice_points_from_reader_bool = 5; } message Split { @@ -1075,12 +1196,6 @@ message Dummy { message StopGradient { } -message Max { -} - -message Min { -} - message InTopK { int64 k = 1; } @@ -1089,6 +1204,10 @@ message Sort { bool descending = 1; } +message WeightsLayer { + string dims = 1; +} + ///////////////////// // learning Layers // ///////////////////// @@ -1101,11 +1220,19 @@ message FullyConnected { double group_lasso_regularization_factor = 6; //default: 0 bool transpose = 7; bool num_neurons_is_num_labels = 8; + + bool get_input_dimension_from_reader = 9; + bool get_image_and_scalar_dimension_from_reader = 10; + bool get_image_dimension_from_reader = 11; + bool get_scalar_dimension_from_reader = 12; + repeated uint32 get_num_neurons_of_slice_from_reader = 13; + string get_slice_points_from_reader = 14; } message Convolution { int64 num_dims = 1; int64 num_output_channels = 4; + int64 num_groups = 3; bool has_vectors = 2; @@ -1113,11 +1240,13 @@ message Convolution { string conv_dims = 5; //should be space-separated list, e.g, "2 2 3" string conv_pads = 6; //should be space-separated list, e.g, "2 2 3" string conv_strides = 7; //should be space-separated list, e.g, "2 2 3" + string conv_dilations = 8; //should be space-separated list, e.g. "2 3 3" // these are used if has_vector = false int64 conv_dims_i = 50; int64 conv_pads_i = 60; int64 conv_strides_i = 70; + int64 conv_dilations_i = 80; string weight_initialization = 9; //DEPRECATED bool has_bias = 10; //default: true @@ -1128,6 +1257,7 @@ message Convolution { message Deconvolution { int64 num_dims = 1; int64 num_output_channels = 4; + int64 num_groups = 3; bool has_vectors = 2; @@ -1135,11 +1265,13 @@ message Deconvolution { string conv_dims = 5; //should be space-separated list, e.g, "2 2 3" string conv_pads = 6; //should be space-separated list, e.g, "2 2 3" string conv_strides = 7; //should be space-separated list, e.g, "2 2 3" + string conv_dilations = 8; //should be space-separated list, e.g. 
"2 3 3" // these are used if has_vector = false int64 conv_dims_i = 50; int64 conv_pads_i = 60; int64 conv_strides_i = 70; + int64 conv_dilations_i = 80; string weight_initialization = 9; //DEPRECATED bool has_bias = 10; //default: true @@ -1159,3 +1291,21 @@ message Target { message TargetReconstruction { } + +////////////////// +// Image Layers // +////////////////// +message BilinearResize { + int64 height = 1; + int64 width = 2; +} + +////////////////////////// +// Miscellaneous Layers // +////////////////////////// +message Covariance { + bool biased = 1; //Whether to use a biased covariance estimate +} +message Variance { + bool biased = 1; //Whether to use a biased variance estimate +} diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index 1966e61a0dc..219089763b1 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -38,8 +38,13 @@ void expand_motifs(lbann_comm *comm, lbann_data::LbannPB& pb) { } } +int get_requested_num_parallel_readers(const lbann::lbann_comm *comm, const lbann_data::LbannPB& p); + void init_data_readers(lbann::lbann_comm *comm, const lbann_data::LbannPB& p, std::map& data_readers) { +#ifdef LBANN_HAS_CONDUIT + static std::unordered_map leading_reader_jag_conduit; +#endif bool master = comm->am_world_master(); std::stringstream err; @@ -73,7 +78,7 @@ void init_data_readers(lbann::lbann_comm *comm, const lbann_data::LbannPB& p, st generic_data_reader *reader = nullptr; generic_data_reader *reader_validation = nullptr; - if ((name == "mnist") || (name == "cifar10")) { + if ((name == "mnist") || (name == "cifar10") || (name == "moving_mnist")) { init_org_image_data_reader(readme, master, reader); set_up_generic_preprocessor = false; } else if ((name == "imagenet") || (name == "imagenet_patches") || @@ -84,18 +89,31 @@ void init_data_readers(lbann::lbann_comm *comm, const lbann_data::LbannPB& p, st auto* reader_jag = new data_reader_jag(shuffle); using var_t = data_reader_jag::variable_t; - std::vector independent_type(readme.independent_size()); + + // composite independent variable + std::vector< std::vector > independent_type(readme.independent_size()); for (int i=0; i < readme.independent_size(); ++i) { - independent_type[i] = static_cast(readme.independent(i)); + const lbann_data::Reader::JAGDataSlice& slice = readme.independent(i); + const int slice_size = slice.pieces_size(); + for (int k=0; k < slice_size; ++k) { + const auto var_type = static_cast(slice.pieces(k)); + independent_type[i].push_back(var_type); + } } reader_jag->set_independent_variable_type(independent_type); - std::vector dependent_type(readme.dependent_size()); + // composite dependent variable + std::vector< std::vector > dependent_type(readme.dependent_size()); for (int i=0; i < readme.dependent_size(); ++i) { - dependent_type[i] = static_cast(readme.dependent(i)); + const lbann_data::Reader::JAGDataSlice& slice = readme.dependent(i); + const int slice_size = slice.pieces_size(); + for (int k=0; k < slice_size; ++k) { + const auto var_type = static_cast(slice.pieces(k)); + dependent_type[i].push_back(var_type); + } } reader_jag->set_dependent_variable_type(dependent_type); @@ -108,6 +126,30 @@ void init_data_readers(lbann::lbann_comm *comm, const lbann_data::LbannPB& p, st #ifdef LBANN_HAS_CONDUIT } else if (name == "jag_conduit") { init_image_data_reader(readme, master, reader); + auto reader_jag_conduit = dynamic_cast(reader); + const lbann_data::Model& pb_model = p.model(); + reader->set_mini_batch_size(static_cast(pb_model.mini_batch_size())); + 
+ if (!peek_map(leading_reader_jag_conduit, readme.role())) { + leading_reader_jag_conduit[readme.role()] = reader_jag_conduit; + } else { + const auto leader = peek_map(leading_reader_jag_conduit, readme.role()); + *reader_jag_conduit = *leader; + reader_jag_conduit->set_leading_reader(leader); + } + + for (int i=0; i < pb_model.layer_size(); ++i) { + const auto& proto_layer = pb_model.layer(i); + if (proto_layer.has_input()) { + const auto& params = proto_layer.input(); + const auto& io_buffer = params.io_buffer(); + reader_jag_conduit->set_io_buffer_type(io_buffer); + const auto num_readers = get_requested_num_parallel_readers(comm, p); + reader_jag_conduit->set_num_parallel_readers(num_readers); + reader_jag_conduit->set_local_id(readme.role()); + break; + } + } set_up_generic_preprocessor = false; } else if (name == "jag_conduit_hdf5") { init_image_data_reader(readme, master, reader); @@ -225,11 +267,23 @@ void init_data_readers(lbann::lbann_comm *comm, const lbann_data::LbannPB& p, st } } else if (name == "synthetic") { - reader = new data_reader_synthetic( - readme.num_samples(), proto::parse_list(readme.synth_dimensions()), - readme.num_labels(), shuffle); + if (readme.num_labels() != 0) { + reader = new data_reader_synthetic( + readme.num_samples(), + proto::parse_list(readme.synth_dimensions()), + readme.num_labels(), + shuffle); + } else { + reader = new data_reader_synthetic( + readme.num_samples(), + proto::parse_list(readme.synth_dimensions()), + proto::parse_list(readme.synth_response_dimensions()), + shuffle); + } } else if (name == "mesh") { reader = new mesh_reader(shuffle); + } else if (name == "moving_mnist") { + reader = new moving_mnist_reader(7, 40, 40, 2); } else { if (master) { err << __FILE__ << " " << __LINE__ << " :: unknown name for data reader: " @@ -329,7 +383,19 @@ void init_data_readers(lbann::lbann_comm *comm, const lbann_data::LbannPB& p, st *dynamic_cast(reader_validation) = *dynamic_cast(reader); #ifdef LBANN_HAS_CONDUIT } else if (name == "jag_conduit") { - reader_validation = new data_reader_jag_conduit(*dynamic_cast(reader)); + const std::string role = "validate"; + if (!peek_map(leading_reader_jag_conduit, role)) { + reader_validation = new data_reader_jag_conduit(*dynamic_cast(reader)); + auto reader_jag_conduit = dynamic_cast(reader_validation); + reader_jag_conduit->set_leading_reader(reader_jag_conduit); + reader_jag_conduit->set_role(role); + leading_reader_jag_conduit[role] = reader_jag_conduit; + } else { + const auto leader = peek_map(leading_reader_jag_conduit, role); + reader_validation = new data_reader_jag_conduit(*leader); + auto reader_jag_conduit = dynamic_cast(reader_validation); + reader_jag_conduit->set_leading_reader(leader); + } #endif // LBANN_HAS_CONDUIT } else if (name == "nci") { reader_validation = new data_reader_nci(shuffle); @@ -353,6 +419,8 @@ void init_data_readers(lbann::lbann_comm *comm, const lbann_data::LbannPB& p, st } else if (name == "mesh") { reader_validation = new mesh_reader(shuffle); (*(mesh_reader *)reader_validation) = (*(mesh_reader *)reader); + } else if (name == "moving_mnist") { + reader_validation = new moving_mnist_reader(7, 40, 40, 2); } reader_validation->set_role("validate"); @@ -433,27 +501,46 @@ bool write_prototext_file(const char *fn, lbann_data::LbannPB& pb) return true; } -void set_num_parallel_readers(lbann::lbann_comm *comm, lbann_data::LbannPB& p) +bool check_if_num_parallel_readers_set(const lbann::lbann_comm *comm, const lbann_data::Model& model) { - bool master = 
comm->am_world_master(); + const bool master = comm->am_world_master(); + const int parallel_io = model.num_parallel_readers(); - lbann_data::Model *model = p.mutable_model(); - - int parallel_io = model->num_parallel_readers(); if (parallel_io == 0) { if (master) { std::cout << "\tMax Parallel I/O Fetch: " << comm->get_procs_per_model() << " (Limited to # Processes)" << std::endl; } - parallel_io = comm->get_procs_per_model(); + return false; + } + if (master) { + std::cout << "\tMax Parallel I/O Fetch: " << parallel_io << std::endl; + } + return true; +} + +void set_num_parallel_readers(const lbann::lbann_comm *comm, lbann_data::LbannPB& p) +{ + lbann_data::Model *model = p.mutable_model(); + const bool is_set = check_if_num_parallel_readers_set(comm, *model); + + if (!is_set) { + const int parallel_io = comm->get_procs_per_model(); model->set_num_parallel_readers(parallel_io); //adjust the prototext - } else { - if (master) { - std::cout << "\tMax Parallel I/O Fetch: " << parallel_io << std::endl; - } } } +int get_requested_num_parallel_readers(const lbann::lbann_comm *comm, const lbann_data::LbannPB& p) +{ + const lbann_data::Model& model = p.model(); + const bool is_set = check_if_num_parallel_readers_set(comm, model); + + if (!is_set) { + return comm->get_procs_per_model(); + } + return model.num_parallel_readers(); +} + void set_data_readers_filenames(std::string which, lbann_data::LbannPB& p) { options *opts = options::get(); @@ -556,17 +643,6 @@ void get_cmdline_overrides(lbann::lbann_comm *comm, lbann_data::LbannPB& p) if (opts->has_string("data_reader_percent")) { set_data_readers_percent(p); } - - if (opts->has_string("image_dir")) { - int sz = model->callback_size(); - for (int j=0; j<sz; j++) { - lbann_data::Callback *c = model->mutable_callback(j); - if (c->has_save_images()) { - lbann_data::CallbackSaveImages *i = c->mutable_save_images(); - i->set_image_dir(opts->get_string("image_dir")); - } - } - } if (opts->has_bool("no_im_comm") and opts->get_bool("no_im_comm")) { int sz = model->callback_size(); for (int j=0; j<sz; j++) { ... } } diff --git a/src/utils/cuda.cpp b/src/utils/cuda.cpp new file mode 100644 --- /dev/null +++ b/src/utils/cuda.cpp +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. <lbann-dev@llnl.gov> +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license.
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/utils/cuda.hpp" + +#ifdef LBANN_HAS_GPU + +namespace lbann { +namespace cuda { + +//////////////////////////////////////////////////////////// +// CUDA event wrapper +//////////////////////////////////////////////////////////// + +event_wrapper::event_wrapper() : m_event(nullptr), m_stream(0) { + CHECK_CUDA(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming)); +} + +event_wrapper::event_wrapper(const event_wrapper& other) + : m_event(nullptr), m_stream(other.m_stream) { + CHECK_CUDA(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming)); + if (!other.query()) { record(m_stream); } +} + +event_wrapper& event_wrapper::operator=(const event_wrapper& other) { + m_stream = other.m_stream; + if (!other.query()) { record(m_stream); } + return *this; +} + +event_wrapper::~event_wrapper() { + cudaEventDestroy(m_event); +} + +void event_wrapper::record(cudaStream_t stream) { + m_stream = stream; + CHECK_CUDA(cudaEventRecord(m_event, m_stream)); +} + +bool event_wrapper::query() const { + const auto& status = cudaEventQuery(m_event); + switch (status) { + case cudaSuccess: return true; + case cudaErrorNotReady: return false; + default: + CHECK_CUDA(status); + return false; + } +} + +void event_wrapper::synchronize() { + CHECK_CUDA(cudaEventSynchronize(m_event)); +} + +cudaEvent_t& event_wrapper::get_event() { return m_event; } + +} // namespace cuda +} // namespace lbann + +#endif // LBANN_HAS_GPU diff --git a/src/utils/cudnn.cpp b/src/utils/cudnn.cpp index 9ab1fac2cbd..a4628f32d61 100644 --- a/src/utils/cudnn.cpp +++ b/src/utils/cudnn.cpp @@ -48,7 +48,7 @@ struct handle_wrapper { cudnnHandle_t handle; handle_wrapper() : handle(nullptr) { CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); - if (handle == nullptr) { FORCE_CHECK_CUDNN(cudnnCreate(&handle)); } + if (handle == nullptr) { CHECK_CUDNN(cudnnCreate(&handle)); } if (handle == nullptr) { LBANN_ERROR("failed to create cuDNN handle"); } CHECK_CUDNN(cudnnSetStream(handle, El::GPUManager::Stream())); } diff --git a/src/utils/statistics.cpp b/src/utils/statistics.cpp index 3fae28f5f81..5006fe33288 100644 --- a/src/utils/statistics.cpp +++ b/src/utils/statistics.cpp @@ -88,7 +88,8 @@ void entrywise_mean_and_stdev(const AbsDistMat& data, } } DataType sum_sqsum[2] = {sum, sqsum}; // Pack to do one allreduce. - El::mpi::AllReduce(sum_sqsum, 2, data.DistComm()); + El::mpi::AllReduce(sum_sqsum, 2, data.DistComm(), + El::SyncInfo{}); // Compute mean and standard deviation mean = sum_sqsum[0] / size; diff --git a/src/weights/initializer.cpp b/src/weights/initializer.cpp index 5259e3b6ef8..415a6067036 100644 --- a/src/weights/initializer.cpp +++ b/src/weights/initializer.cpp @@ -22,11 +22,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or // implied. See the License for the specific language governing // permissions and limitations under the license. 
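The initializer change below introduces value_initializer, which fills a weights matrix from a flat list of values in column-major order (global_pos = global_row + Height * global_col in the fill loop). A small standalone sketch of that index mapping, using hypothetical names; only the arithmetic is taken from the patch:

    // Column-major flattening assumed by value_initializer::fill: for a
    // Height x Width matrix, flat index g maps to row g % Height and column
    // g / Height. E.g. a 2 x 3 matrix filled from {1,2,3,4,5,6} receives
    // columns (1,2), (3,4) and (5,6).
    #include <cstddef>
    inline std::size_t flat_index(std::size_t row, std::size_t col,
                                  std::size_t height) {
      return row + height * col;
    }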
-// -// weights_initializer .hpp .cpp - Weights initializer classes //////////////////////////////////////////////////////////////////////////////// #include "lbann/weights/initializer.hpp" +#include "lbann/utils/exception.hpp" #include "lbann/utils/random.hpp" namespace lbann { @@ -39,13 +38,55 @@ void constant_initializer::fill(AbsDistMat& matrix) { } } +void value_initializer::fill(AbsDistMat& matrix) { + + // Check that number of values matches weights matrix + if (matrix.Height() * matrix.Width() != (El::Int) m_values.size()) { + std::stringstream err; + err << "a value initializer with " << m_values.size() << " values " + << "attempted to initialize a " + << matrix.Height() << " x " << matrix.Width() << " " + << "weights matrix"; + LBANN_ERROR(err.str()); + } + + // Copy values to a CPU matrix + // Note: If the weights matrix is on CPU, the CPU matrix is a matrix + // view. Otherwise, the CPU matrix values are copied to the weights + // matrix. + CPUMat matrix_cpu; + if (matrix.GetLocalDevice() == El::Device::CPU) { + El::View(matrix_cpu, matrix.Matrix()); + } else { + matrix_cpu.Resize(matrix.LocalHeight(), matrix.LocalWidth()); + } + auto const width = matrix.LocalWidth(); + auto const height = matrix.LocalHeight(); +#pragma omp parallel for collapse(2) + for (El::Int local_col = 0; local_col < width; ++local_col) { + for (El::Int local_row = 0; local_row < height; ++local_row) { + const auto& global_row = matrix.GlobalRow(local_row); + const auto& global_col = matrix.GlobalCol(local_col); + const auto& global_pos = global_row + matrix.Height() * global_col; + matrix_cpu(local_row, local_col) = m_values[global_pos]; + } + } + if (matrix.GetLocalDevice() != El::Device::CPU) { + El::Copy(matrix_cpu, matrix.Matrix()); +#ifdef HYDROGEN_HAVE_CUDA + El::GPUManager::SynchronizeStream(); /// @todo Use new Hydrogen synchronization semantics when available +#endif // HYDROGEN_HAVE_CUDA + } + +} + void uniform_initializer::fill(AbsDistMat& matrix) { - uniform_fill(matrix, matrix.Height(), matrix.Width(), + uniform_fill(matrix, matrix.Height(), matrix.Width(), (m_max + m_min) / 2, (m_max - m_min) / 2); } void normal_initializer::fill(AbsDistMat& matrix) { - gaussian_fill(matrix, matrix.Height(), matrix.Width(), + gaussian_fill(matrix, matrix.Height(), matrix.Width(), m_mean, m_standard_deviation); } diff --git a/superbuild/CMakeLists.txt b/superbuild/CMakeLists.txt index cee35edde4d..7afbf03794c 100644 --- a/superbuild/CMakeLists.txt +++ b/superbuild/CMakeLists.txt @@ -1,16 +1,11 @@ cmake_minimum_required(VERSION 3.9) message("\nWelcome to the LBANN SuperBuild system.\n\n" - "This attempts to fill the shoes of a real package manager by building " - "LBANN's dependencies, as well as LBANN itself.\n\n" - "A few notes before we begin:\n" - " 1. Read the README.md file.\n" - " 2. Actually read the README.md file.\n" - " 3. See 1 and 2.\n" - " 4. 
You should probably use a real package manager instead (e.g., Spack)\n\n" - "Good luck!\n\n") + "Please report issues on https://github.com/llnl/lbann/issues\n\n" + "Good luck!\n") -project(LBANN_SuperBuild NONE) +# CXX is always required +project(LBANN_SuperBuild CXX) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") @@ -106,3 +101,41 @@ foreach (pkg ${_BUILD_PKGS}) endif () endforeach () message("\n-----------------------------------------------------------------\n") + +# Add a custom target for bundling all things up +if (UNIX) + find_program(__FIND_EXE find) + set(__WORKING_DIR "${CMAKE_BINARY_DIR}") + if (__FIND_EXE) + set(__cmd "${__FIND_EXE};.;\(;-ipath;*/stamp/*.log;-o;-ipath;*/CMakeFiles/CMake*.log;-o;-name;CMakeCache.txt;\);-exec;${CMAKE_COMMAND};-E;tar;czf;all_output_logs.tar.gz;--;{};+") + add_custom_target(gather-logs + COMMAND "${__cmd}" + BYPRODUCTS "${__WORKING_DIR}/all_output_logs.tar.gz" + WORKING_DIRECTORY "${__WORKING_DIR}" + COMMENT "Gathering all output logs." + VERBATIM + COMMAND_EXPAND_LISTS + USES_TERMINAL) + + add_custom_target(gather-all) + add_dependencies(gather-all gather-logs) + if (CMAKE_GENERATOR STREQUAL "Ninja") + set(__cmd "${__FIND_EXE};.;-name;*.ninja;-exec;${CMAKE_COMMAND};-E;tar;czf;all_build_files.tar.gz;{};+") + elseif (CMAKE_GENERATOR STREQUAL "Unix Makefiles") + set(__cmd "${__FIND_EXE};.;\(;-name;link.txt;-o;-name;build.make;-o;-name;flags.make;\);-exec;${CMAKE_COMMAND};-E;tar;czf;all_build_files.tar.gz;{};+") + else () + set(__cmd) + endif () + if (__cmd) + add_custom_target(gather-build + COMMAND "${__cmd}" + BYPRODUCTS "${__WORKING_DIR}/all_build_files.tar.gz" + WORKING_DIRECTORY "${__WORKING_DIR}" + COMMENT "Gathering all build files." + VERBATIM + COMMAND_EXPAND_LISTS + USES_TERMINAL) + add_dependencies(gather-all gather-build) + endif () + endif (__FIND_EXE) +endif (UNIX) diff --git a/superbuild/aluminum/CMakeLists.txt b/superbuild/aluminum/CMakeLists.txt index 0e4bb0fdcc3..7503a6ae7d5 100644 --- a/superbuild/aluminum/CMakeLists.txt +++ b/superbuild/aluminum/CMakeLists.txt @@ -1,4 +1,6 @@ -set(Aluminum_URL https://github.com/ndryden/Aluminum +enable_language(CXX) + +set(Aluminum_URL https://github.com/llnl/Aluminum CACHE STRING "The URL from which to clone Aluminum") set(Aluminum_TAG "master" @@ -16,10 +18,14 @@ option(ALUMINUM_ENABLE_CUDA "Enable CUDA support." OFF) option(ALUMINUM_ENABLE_MPI_CUDA "Enable MPI-CUDA support." OFF) option(ALUMINUM_ENABLE_NCCL "Enable NCCL support." 
OFF) +if (ALUMINUM_ENABLE_CUDA OR ALUMINUM_ENABLE_MPI_CUDA OR ALUMINUM_ENABLE_NCCL) + enable_language(CUDA) +endif () + # Get the list of ALUMINUM variables get_property(ALUMINUM_VARIABLES DIRECTORY PROPERTY VARIABLES) list(FILTER ALUMINUM_VARIABLES INCLUDE REGEX - "^ALUMINUM_.*\|^Aluminum_.*\|^LBANN_SB_FWD_ALUMINUM_.*\|^LBANN_SB_FWD_Aluminum_.*\|CMAKE_\(CXX\|CUDA\)_COMPILER\|CMAKE_BUILD_TYPE") + "^ALUMINUM_.*\|^Aluminum_.*\|^LBANN_SB_FWD_ALUMINUM_.*\|^LBANN_SB_FWD_Aluminum_.*") list(FILTER ALUMINUM_VARIABLES EXCLUDE REGEX "Aluminum_URL\|Aluminum_TAG") create_cmake_arguments( @@ -40,8 +46,19 @@ ExternalProject_Add(Aluminum BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build INSTALL_DIR ${Aluminum_CMAKE_INSTALL_PREFIX} USES_TERMINAL_BUILD 1 + LOG_DOWNLOAD 1 + LOG_UPDATE 1 + LOG_CONFIGURE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + LOG_TEST 1 LIST_SEPARATOR | CMAKE_ARGS + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER} + -DCMAKE_CUDA_FLAGS=${CMAKE_CUDA_FLAGS} + -DCMAKE_CUDA_HOST_COMPILER=${CMAKE_CXX_COMPILER} ${ALUMINUM_CMAKE_ARGS} ) diff --git a/superbuild/cnpy/CMakeLists.txt b/superbuild/cnpy/CMakeLists.txt index 9d946aa3534..c9155275cdb 100644 --- a/superbuild/cnpy/CMakeLists.txt +++ b/superbuild/cnpy/CMakeLists.txt @@ -36,6 +36,12 @@ ExternalProject_Add(CNPY BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build INSTALL_DIR ${CNPY_CMAKE_INSTALL_PREFIX} USES_TERMINAL_BUILD 1 + LOG_DOWNLOAD 1 + LOG_UPDATE 1 + LOG_CONFIGURE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + LOG_TEST 1 CMAKE_ARGS -G${CMAKE_GENERATOR} -DCMAKE_INSTALL_PREFIX=${CNPY_CMAKE_INSTALL_PREFIX} diff --git a/superbuild/cub/CMakeLists.txt b/superbuild/cub/CMakeLists.txt index 49c472bdb75..b552b2c45fd 100644 --- a/superbuild/cub/CMakeLists.txt +++ b/superbuild/cub/CMakeLists.txt @@ -23,6 +23,13 @@ ExternalProject_Add(CUB ${CMAKE_CURRENT_SOURCE_DIR}/CUBCMakeLists.txt ${CMAKE_CURRENT_BINARY_DIR}/src/CMakeLists.txt INSTALL_DIR ${CUB_CMAKE_INSTALL_PREFIX} + USES_TERMINAL_BUILD 1 + LOG_DOWNLOAD 1 + LOG_UPDATE 1 + LOG_CONFIGURE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + LOG_TEST 1 CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CUB_CMAKE_INSTALL_PREFIX} ) diff --git a/superbuild/hydrogen/CMakeLists.txt b/superbuild/hydrogen/CMakeLists.txt index adacae3dd62..a0555ed124c 100644 --- a/superbuild/hydrogen/CMakeLists.txt +++ b/superbuild/hydrogen/CMakeLists.txt @@ -1,6 +1,7 @@ -enable_language(C) enable_language(CXX) +option(Hydrogen_ENABLE_CUDA "Enable CUDA support in Hydrogen" OFF) + option(Hydrogen_ENABLE_OPENMP "Hydrogen use OpenMP threading." ON) option(Hydrogen_USE_64BIT_INTS @@ -44,9 +45,17 @@ if (TARGET Aluminum) set(_hydrogen_depends_tag DEPENDS) list(APPEND _HYDROGEN_DEPENDS Aluminum) + + if (ALUMINUM_ENABLE_CUDA OR ALUMINUM_ENABLE_MPI_CUDA OR ALUMINUM_ENABLE_NCCL) + set(Hydrogen_ENABLE_CUDA ON) + endif () endif (Hydrogen_ENABLE_ALUMINUM) endif (TARGET Aluminum) +if (Hydrogen_ENABLE_CUDA) + enable_language(CUDA) +endif () + # Get the list of HYDROGEN variables get_property(HYDROGEN_VARIABLES DIRECTORY PROPERTY VARIABLES) list(FILTER HYDROGEN_VARIABLES INCLUDE REGEX @@ -61,10 +70,6 @@ create_cmake_arguments( EXTRA_REMOVE_PREFIXES "LBANN_SB_FWD_HYDROGEN" "LBANN_SB_FWD_Hydrogen" VARIABLES ${HYDROGEN_VARIABLES}) -# Set the generator -set(Hydrogen_CMAKE_GENERATOR "${CMAKE_GENERATOR}" - CACHE STRING "The generator used by CMake for Hydrogen.") - # Handle the clone mechanism. 
First URL set(Hydrogen_URL "https://github.com/LLNL/Elemental.git" CACHE STRING "The URL from which to clone Hydrogen") @@ -73,7 +78,6 @@ set(Hydrogen_URL "https://github.com/LLNL/Elemental.git" set(Hydrogen_TAG "hydrogen" CACHE STRING "The git tag or hash to checkout for Hydrogen") - include(ExternalProject) ExternalProject_Add(HYDROGEN PREFIX ${CMAKE_CURRENT_BINARY_DIR} @@ -86,11 +90,17 @@ ExternalProject_Add(HYDROGEN BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build INSTALL_DIR ${Hydrogen_CMAKE_INSTALL_PREFIX} USES_TERMINAL_BUILD 1 + LOG_DOWNLOAD 1 + LOG_UPDATE 1 + LOG_CONFIGURE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + LOG_TEST 1 LIST_SEPARATOR | CMAKE_ARGS - -G${Hydrogen_CMAKE_GENERATOR} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER} + -DCMAKE_CUDA_HOST_COMPILER=${CMAKE_CXX_COMPILER} ${HYDROGEN_CMAKE_ARGS} ) diff --git a/superbuild/jpeg-turbo/CMakeLists.txt b/superbuild/jpeg-turbo/CMakeLists.txt index 34c25adbc30..e221a99618d 100644 --- a/superbuild/jpeg-turbo/CMakeLists.txt +++ b/superbuild/jpeg-turbo/CMakeLists.txt @@ -1,5 +1,6 @@ enable_language(C) enable_language(CXX) +enable_language(ASM_NASM) # Match the jpeg-turbo default option(JPEG-TURBO_ENABLE_STATIC "Enable the jpeg-turbo static linkage." ON) @@ -26,6 +27,12 @@ ExternalProject_Add(JPEG-TURBO BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build INSTALL_DIR ${JPEG-TURBO_CMAKE_INSTALL_PREFIX} USES_TERMINAL_BUILD 1 + LOG_DOWNLOAD 1 + LOG_UPDATE 1 + LOG_CONFIGURE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + LOG_TEST 1 CMAKE_ARGS -G${CMAKE_GENERATOR} -DCMAKE_INSTALL_PREFIX=${JPEG-TURBO_CMAKE_INSTALL_PREFIX} diff --git a/superbuild/lbann/CMakeLists.txt b/superbuild/lbann/CMakeLists.txt index b65eadd206c..93e41358d55 100644 --- a/superbuild/lbann/CMakeLists.txt +++ b/superbuild/lbann/CMakeLists.txt @@ -11,6 +11,10 @@ set(LBANN_CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" set(LBANN_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING "The build type for LBANN.") +if (LBANN_ENABLE_CUDA) + enable_language(CUDA) +endif () + # Fixes an RPATH issue with LBANN in which all packages get installed # to the same prefix # @@ -81,6 +85,14 @@ if (TARGET PROTOBUF) set(LBANN_SB_FWD_LBANN_PROTOBUF_DIR "${PROTOBUF_DIR}") set(LBANN_SB_FWD_LBANN_protobuf_MODULE_COMPATIBLE ON) set(LBANN_SB_FWD_LBANN_protobuf_BUILD_SHARED_LIBS ON) + + if (TARGET HOST-PROTOBUF) + list(APPEND _LBANN_DEPENDS HOST-PROTOBUF) + set(LBANN_USE_PROTOBUF_MODULE ON) + set(LBANN_SB_FWD_LBANN_Protobuf_PROTOC_EXECUTABLE + "${HOST_PROTOBUF_protoc_EXE}") + # This will exist by the time LBANN configures. 
+ endif () endif () # Get the list of LBANN variables @@ -132,10 +144,18 @@ ExternalProject_Add(LBANN BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build INSTALL_DIR ${LBANN_CMAKE_INSTALL_PREFIX} USES_TERMINAL_BUILD 1 + LOG_DOWNLOAD 1 + LOG_UPDATE 1 + LOG_CONFIGURE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + LOG_TEST 1 LIST_SEPARATOR | CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER} + -DCMAKE_CUDA_HOST_COMPILER=${CMAKE_CXX_COMPILER} ${LBANN_CMAKE_ARGS} ) diff --git a/superbuild/openblas/CMakeLists.txt b/superbuild/openblas/CMakeLists.txt index e598a9ce7ed..e5768150c95 100644 --- a/superbuild/openblas/CMakeLists.txt +++ b/superbuild/openblas/CMakeLists.txt @@ -66,6 +66,12 @@ ExternalProject_Add(OPENBLAS CONFIGURE_COMMAND "" UPDATE_COMMAND "" USES_TERMINAL_BUILD 1 + LOG_DOWNLOAD 1 + LOG_UPDATE 1 + LOG_CONFIGURE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + LOG_TEST 1 BUILD_COMMAND ${GNU_MAKE_PROGRAM} -j${OPENBLAS_MAX_MAKE_JOBS} CC=${CMAKE_C_COMPILER} FC=${CMAKE_Fortran_COMPILER} diff --git a/superbuild/opencv/CMakeLists.txt b/superbuild/opencv/CMakeLists.txt index 1547a5dc6b2..9adfeea6f88 100644 --- a/superbuild/opencv/CMakeLists.txt +++ b/superbuild/opencv/CMakeLists.txt @@ -12,18 +12,14 @@ set(OPENCV_CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" set(OPENCV_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING "The build type for OpenCV.") -set(OPENCV_CMAKE_GENERATOR "${CMAKE_GENERATOR}" - CACHE STRING "The generator used by CMake for OpenCV.") - if (TARGET JPEG-TURBO) set(_opencv_depends_tag DEPENDS) list(APPEND _OPENCV_DEPENDS JPEG-TURBO) set(OPENCV_WITH_LIBJPEG_TURBO ON) set(OPENCV_BUILD_JPEG OFF) - set(OPENCV_CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} "${JPEG-TURBO_CMAKE_INSTALL_PREFIX}") + set(OPENCV_CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} "${JPEG-TURBO_CMAKE_INSTALL_PREFIX}") endif () - # Get the list of opencv variables get_property(OPENCV_VARIABLES DIRECTORY PROPERTY VARIABLES) list(FILTER OPENCV_VARIABLES INCLUDE REGEX "^OPENCV_.*") @@ -52,11 +48,21 @@ ExternalProject_Add(OPENCV BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build INSTALL_DIR ${OPENCV_CMAKE_INSTALL_PREFIX} USES_TERMINAL_BUILD 1 + LOG_DOWNLOAD 1 + LOG_UPDATE 1 + LOG_CONFIGURE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + LOG_TEST 1 LIST_SEPARATOR | CMAKE_ARGS - -G${OPENCV_CMAKE_GENERATOR} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS} + -DCMAKE_BUILD_TYPE=${OPENCV_CMAKE_BUILD_TYPE} ${OPENCV_CMAKE_ARGS} ) diff --git a/superbuild/protobuf/CMakeLists.txt b/superbuild/protobuf/CMakeLists.txt index a5cdaed22a3..5642c0db95c 100644 --- a/superbuild/protobuf/CMakeLists.txt +++ b/superbuild/protobuf/CMakeLists.txt @@ -1,6 +1,23 @@ enable_language(C) enable_language(CXX) +# Option to use different compilers to, e.g., build a host version on +# a cross-compilation system (e.g., theta). You must specify +# +# HOST_PROTOBUF_C_COMPILER +# HOST_PROTOBUF_CXX_COMPILER +# +# and may optionally specify +# +# HOST_PROTOBUF_CMAKE_INSTALL_PREFIX (def: CMAKE_INSTALL_PREFIX/host-protobuf) +# HOST_PROTOBUF_CMAKE_BUILD_TYPE (def: CMAKE_BUILD_TYPE) +# HOST_PROTOBUF_CMAKE_C_FLAGS (def: ) +# HOST_PROTOBUF_CMAKE_CXX_FLAGS (def: ) +# +option(LBANN_SB_BUILD_PROTOBUF_HOST_VERSION + "Build a protobuf that will run on the login/host node." + OFF) + # Handle the clone mechanism. 
First URL set(PROTOBUF_URL "https://github.com/google/protobuf.git" CACHE STRING "The URL from which to clone PROTOBUF") @@ -13,29 +30,102 @@ set(PROTOBUF_TAG "v3.6.1" set(PROTOBUF_CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" CACHE PATH "The installation location of PROTOBUF.") -set(PROTOBUF_CMAKE_BUILD_TYPE "${PROTOBUF_CMAKE_BUILD_TYPE}" +set(PROTOBUF_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING "The build type for PROTOBUF.") include(ExternalProject) ExternalProject_Add(PROTOBUF - PREFIX ${CMAKE_CURRENT_BINARY_DIR} - TMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp - STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/stamp + PREFIX "${CMAKE_CURRENT_BINARY_DIR}" + TMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/tmp" + STAMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/stamp" GIT_REPOSITORY ${PROTOBUF_URL} GIT_TAG ${PROTOBUF_TAG} - SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src + SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src" SOURCE_SUBDIR cmake - BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build - INSTALL_DIR ${PROTOBUF_CMAKE_INSTALL_PREFIX} + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/build" + INSTALL_DIR "${PROTOBUF_CMAKE_INSTALL_PREFIX}" + STEP_TARGETS download USES_TERMINAL_BUILD 1 + LOG_DOWNLOAD 1 + LOG_UPDATE 1 + LOG_CONFIGURE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + LOG_TEST 1 CMAKE_ARGS -G${CMAKE_GENERATOR} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_CMAKE_INSTALL_PREFIX} -DCMAKE_BUILD_TYPE=${PROTOBUF_CMAKE_BUILD_TYPE} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_MACOSX_RPATH=ON - -DCMAKE_CXX_FLAGS="-fPIC" + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -Dprotobuf_BUILD_TESTS=OFF ) set(PROTOBUF_DIR ${PROTOBUF_CMAKE_INSTALL_PREFIX} CACHE INTERNAL "The install prefix of Protobuf.") + +# Build the host-compatible version if necessary +if (LBANN_SB_BUILD_PROTOBUF_HOST_VERSION) + set(HOST_PROTOBUF_CMAKE_INSTALL_PREFIX + "${CMAKE_INSTALL_PREFIX}/host-protobuf" + CACHE PATH + "The installation location of host-compatible PROTOBUF.") + + set(HOST_PROTOBUF_CMAKE_BUILD_TYPE + ${CMAKE_BUILD_TYPE} + CACHE STRING + "The build type for the host-compatible protobuf.") + + if (NOT HOST_PROTOBUF_CMAKE_C_COMPILER) + message(FATAL_ERROR + "Requested host protobuf build but did not specify a compiler. " + "Please specify HOST_PROTOBUF_CMAKE_C_COMPILER and try again.") + endif () + if (NOT HOST_PROTOBUF_CMAKE_CXX_COMPILER) + message(FATAL_ERROR + "Requested host protobuf build but did not specify a compiler. 
" + "Please specify HOST_PROTOBUF_CMAKE_CXX_COMPILER and try again.") + endif () + + ExternalProject_Get_Property(PROTOBUF SOURCE_DIR) + set(__host_protobuf_source_dir "${SOURCE_DIR}") + set(SOURCE_DIR) + + ExternalProject_Add(HOST-PROTOBUF + DEPENDS PROTOBUF-download + PREFIX "${CMAKE_CURRENT_BINARY_DIR}" + TMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/tmp" + STAMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/stamp" + DOWNLOAD_COMMAND "" + SOURCE_DIR "${__host_protobuf_source_dir}" + SOURCE_SUBDIR cmake + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/host-build" + INSTALL_DIR "${HOST_PROTOBUF_CMAKE_INSTALL_PREFIX}" + USES_TERMINAL_BUILD 1 + LOG_DOWNLOAD 1 + LOG_UPDATE 1 + LOG_CONFIGURE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + LOG_TEST 1 + CMAKE_ARGS + -DCMAKE_C_COMPILER=${HOST_PROTOBUF_CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER=${HOST_PROTOBUF_CMAKE_CXX_COMPILER} + -DCMAKE_C_FLAGS=${HOST_PROTOBUF_CMAKE_C_FLAGS} + -DCMAKE_CXX_FLAGS=${HOST_PROTOBUF_CMAKE_CXX_FLAGS} + -DCMAKE_INSTALL_PREFIX=${HOST_PROTOBUF_CMAKE_INSTALL_PREFIX} + -DCMAKE_BUILD_TYPE=${PROTOBUF_CMAKE_BUILD_TYPE} + -DCMAKE_MACOSX_RPATH=ON + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -Dprotobuf_BUILD_TESTS=OFF + ) + + set(HOST_PROTOBUF_DIR "${HOST_PROTOBUF_CMAKE_INSTALL_PREFIX}") + set(HOST_PROTOBUF_protoc_EXE + "${HOST_PROTOBUF_CMAKE_INSTALL_PREFIX}/bin/protoc" + CACHE INTERNAL + "Path to the host-compatible protoc compiler.") +endif () diff --git a/tools/compute_mean/Mat.hpp b/tools/compute_mean/Mat.hpp index a32f0495b7a..d5513acbc2f 100644 --- a/tools/compute_mean/Mat.hpp +++ b/tools/compute_mean/Mat.hpp @@ -46,8 +46,8 @@ class ElMatLike { protected: Int m_width; Int m_height; - IR row_range; - IR col_range; + IR m_row_range; + IR m_col_range; std::shared_ptr< std::vector > m_buf; @@ -62,11 +62,11 @@ class ElMatLike { T& operator()(const Int r, const Int c) const; Int Width() const { - return col_range.IsInitialized() ? (col_range.End() - col_range.Beg()) : m_width; + return m_col_range.IsInitialized() ? (m_col_range.End() - m_col_range.Beg()) : m_width; } Int Height() const { - return row_range.IsInitialized() ? (row_range.End() - row_range.Beg()) : m_height; + return m_row_range.IsInitialized() ? (m_row_range.End() - m_row_range.Beg()) : m_height; } Int LDim() const { @@ -80,6 +80,8 @@ class ElMatLike { void Set(const Int r, const Int c, const T d); T Get(const Int r, const Int c) const; + + ElMatLike& Copy(const ElMatLike& src); }; @@ -89,8 +91,8 @@ inline Int ElMatLike::Offset(const Int r, const Int c) const { throw lbann::lbann_exception("invalid point : (" + std::to_string(r) + ',' + std::to_string(c) + ')'); } - const Int rv = r + row_range.Beg(); - const Int cv = c + col_range.Beg(); + const Int rv = r + m_row_range.Beg(); + const Int cv = c + m_col_range.Beg(); return (LDim()*cv + rv); } @@ -107,11 +109,11 @@ inline ElMatLike ElMatLike::operator()(const IR& rr, const IR& cr) const { ("(cols End " + to_string(cr.End()) + " < Width " + to_string(Width()) + ")"))); } - Int r_beg = row_range.IsInitialized()? row_range.Beg() : 0; - Int c_beg = col_range.IsInitialized()? col_range.Beg() : 0; + Int r_beg = m_row_range.IsInitialized()? m_row_range.Beg() : 0; + Int c_beg = m_col_range.IsInitialized()? 
m_col_range.Beg() : 0; - view.row_range = rr + r_beg; - view.col_range = cr + c_beg; + view.m_row_range = rr + r_beg; + view.m_col_range = cr + c_beg; return view; } @@ -137,8 +139,8 @@ inline void ElMatLike::Resize(const Int h, const Int w) { m_buf->resize(static_cast(m_width*m_height)); } - row_range.Init(); - col_range.Init(); + m_row_range.Init(); + m_col_range.Init(); } @@ -174,11 +176,25 @@ inline T ElMatLike::Get(const Int r, const Int c) const { } +template +inline ElMatLike& ElMatLike::Copy(const ElMatLike& src) { + m_buf = nullptr; + Resize(src.m_height, src.m_width); + m_row_range = src.m_row_range; + m_col_range = src.m_col_range; + return (*this); +} + template inline void View(ElMatLike& V, const ElMatLike& X, const IR& r, const IR& c) { V = X(r, c); } +template +inline void Copy(const ElMatLike& S, ElMatLike& D) { + D.Copy(S); +} + } // end of namespace using Mat = El::ElMatLike; diff --git a/tools/compute_mean/lbann/utils/glob.hpp b/tools/compute_mean/lbann/utils/glob.hpp new file mode 120000 index 00000000000..3eddf0ddf67 --- /dev/null +++ b/tools/compute_mean/lbann/utils/glob.hpp @@ -0,0 +1 @@ +../../../../include/lbann/utils/glob.hpp \ No newline at end of file diff --git a/tools/compute_mean/lbann/utils/omp_pragma.hpp b/tools/compute_mean/lbann/utils/omp_pragma.hpp new file mode 120000 index 00000000000..6b5eab4380d --- /dev/null +++ b/tools/compute_mean/lbann/utils/omp_pragma.hpp @@ -0,0 +1 @@ +../../../../include/lbann/utils/omp_pragma.hpp \ No newline at end of file diff --git a/tools/compute_mean/lbann/utils/peek_map.hpp b/tools/compute_mean/lbann/utils/peek_map.hpp new file mode 120000 index 00000000000..f7f840c5959 --- /dev/null +++ b/tools/compute_mean/lbann/utils/peek_map.hpp @@ -0,0 +1 @@ +../../../../include/lbann/utils/peek_map.hpp \ No newline at end of file diff --git a/tools/compute_mean/lbann/utils/timer.hpp b/tools/compute_mean/lbann/utils/timer.hpp new file mode 120000 index 00000000000..d270e1e6e61 --- /dev/null +++ b/tools/compute_mean/lbann/utils/timer.hpp @@ -0,0 +1 @@ +../../../../include/lbann/utils/timer.hpp \ No newline at end of file diff --git a/tools/siamese_patch_list/CMakeLists.txt b/tools/siamese_patch_list/CMakeLists.txt index 3f75a7cb003..73a5d9c57d3 100644 --- a/tools/siamese_patch_list/CMakeLists.txt +++ b/tools/siamese_patch_list/CMakeLists.txt @@ -1,13 +1,15 @@ project(siamese_patches) -cmake_minimum_required(VERSION 2.8) +cmake_minimum_required(VERSION 3.8) cmake_policy(SET CMP0015 NEW) set(COMPILER "gnu") -set(CLUSTER "catalyst") +#set(CLUSTER "catalyst") +set(CLUSTER "pascal") #set(CLUSTER "surface") #set(CLUSTER "quartz") set(LBANN_DIR ../..) 
set(LBANN_INSTALL_DIR ${LBANN_DIR}/build/${COMPILER}.Release.${CLUSTER}.llnl.gov/install) +include(${LBANN_DIR}/cmake/modules/FindCNPY.cmake) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) set(SIAMESE_PATCHES_EXE siamese_patches) @@ -21,20 +23,20 @@ add_definitions(-std=c++11) add_definitions(-D_OFFLINE_PATCHES_NPZ_OFFLINE_TOOL_MODE_) -list(APPEND CNPY_DIR /usr) -find_package(CNPY QUIET HINTS ${CNPY_DIR}) -message(STATUS "CNPY_DIR: ${CNPY_DIR}") - if(NOT CNPY_FOUND) + list(APPEND CNPY_DIR /usr) + find_package(CNPY QUIET HINTS ${CNPY_DIR}) + message(STATUS "CNPY_DIR: ${CNPY_DIR}") + set(CNPY_DIR ${LBANN_INSTALL_DIR}) - set(CNPY_LIBS "libcnpy.so;libz.so") + set(CNPY_LIBRARY "libcnpy.so;libz.so") set(CNPY_INCLUDE_DIRS "${CNPY_DIR}/include") set(CNPY_LIB_DIR "${CNPY_DIR}/lib") message(STATUS "CNPY_DIR: ${CNPY_DIR}") + link_directories(${CNPY_LIB_DIR}) endif() include_directories(SYSTEM ${CNPY_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}) -link_directories(${CNPY_LIB_DIR}) @@ -44,4 +46,4 @@ file(GLOB SIAMESE_PATCHES_DEPEND_SRCS ${LBANN_DIR}/src/data_readers/offline_patches_npz.cpp) add_executable(${SIAMESE_PATCHES_EXE} ${SIAMESE_PATCHES_SRCS} ${SIAMESE_PATCHES_DEPEND_SRCS}) -target_link_libraries(${SIAMESE_PATCHES_EXE} ${CNPY_LIBS}) +target_link_libraries(${SIAMESE_PATCHES_EXE} ${CNPY_LIBRARY})
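Taken together, the new Slice fields (get_slice_points_from_reader and get_slice_points_from_reader_bool in lbann.proto) pair with the proto::get_slice_points_from_reader() helper added to the layer factory, letting a slice layer obtain its slice points from a jag_conduit_hdf5 data reader instead of a hand-written prototext list. A minimal calling sketch under the signature shown in this patch; the surrounding factory plumbing and the parse_list fallback are assumptions, not code from the patch:

    // Hypothetical caller: query the reader for the "independent" slice
    // points and fall back to parsing the prototext string when the reader
    // does not support the query.
    bool is_supported = false;
    std::vector<El::Int> points =
        proto::get_slice_points_from_reader(reader, "independent", is_supported);
    if (!is_supported) {
      points = parse_list<El::Int>(params.slice_points());
    }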