Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: async GPU force/torque back-transfer, launch kernels earlier #2637

Open
wants to merge 1 commit into
base: python
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 37 additions & 27 deletions src/core/cuda_common_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,20 +49,17 @@ static CUDA_particle_seed *particle_seeds_device = nullptr;
static CUDA_energy *energy_device = nullptr;

CUDA_particle_data *particle_data_host = nullptr;
std::vector<float> particle_forces_host;
CUDA_energy energy_host;

std::vector<float> particle_torques_host;
#ifdef ENGINE
std::vector<CUDA_v_cs> host_v_cs;
#endif

/**cuda streams for parallel computing on cpu and gpu */
cudaStream_t stream[1];

cudaError_t _err;
cudaError_t CU_err;

cudaStream_t stream[1];

void _cuda_safe_mem(cudaError_t CU_err, const char *file, unsigned int line) {
if (cudaSuccess != CU_err) {
fprintf(stderr, "Cuda Memory error at %s:%u.\n", file, line);
Expand Down Expand Up @@ -185,8 +182,6 @@ void gpu_change_number_of_part_to_comm() {
&global_part_vars_host,
sizeof(CUDA_global_part_vars)));

// if the arrays exists free them to prevent memory leaks
particle_forces_host.clear();
if (particle_data_host) {
cuda_safe_mem(cudaFreeHost(particle_data_host));
particle_data_host = nullptr;
Expand All @@ -206,9 +201,6 @@ void gpu_change_number_of_part_to_comm() {
#ifdef ENGINE
host_v_cs.clear();
#endif
#ifdef ROTATION
particle_torques_host.clear();
#endif

#ifdef ROTATION
if (particle_torques_device) {
Expand All @@ -224,15 +216,9 @@ void gpu_change_number_of_part_to_comm() {
global_part_vars_host.number_of_particles *
sizeof(CUDA_particle_data),
cudaHostAllocWriteCombined));
particle_forces_host.resize(3 *
global_part_vars_host.number_of_particles);
#ifdef ENGINE
host_v_cs.resize(global_part_vars_host.number_of_particles);
#endif
#if (defined DIPOLES || defined ROTATION)
particle_torques_host.resize(3 *
global_part_vars_host.number_of_particles);
#endif

cuda_safe_mem(cudaMalloc((void **)&particle_forces_device,
3 * global_part_vars_host.number_of_particles *
Expand Down Expand Up @@ -267,6 +253,8 @@ void gpu_change_number_of_part_to_comm() {
}
}

cudaEvent_t forces_torques_dtoh;

/** setup and call particle reallocation from the host
* Note that in addition to calling this function the parameters must be
* broadcast with either:
Expand All @@ -277,6 +265,8 @@ void gpu_change_number_of_part_to_comm() {
* nodes)
*/
void gpu_init_particle_comm() {
cuda_safe_mem(
cudaEventCreateWithFlags(&forces_torques_dtoh, cudaEventDisableTiming));
if (this_node == 0 && global_part_vars_host.communication_enabled == 0) {
if (cuda_get_n_gpus() == -1) {
runtimeErrorMsg()
Expand Down Expand Up @@ -336,26 +326,45 @@ void copy_part_data_to_gpu(ParticleRange particles) {
}
}

std::unique_ptr<PinnedVectorHost<float>> particle_forces_host{
Copy link
Contributor

@fweik fweik Apr 2, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really don't like this construction. In a way this is worse than manual memory management, because it pretends to be RAII, but really isn't. As far as I can see there is no reason for these buffers to exist statically; couldn't you just create them e.g. when the integration starts, and release them after? Depending on the cost of the allocation, it may even be feasible to create them in the force calculation.

std::make_unique<PinnedVectorHost<float>>(PinnedVectorHost<float>{})};
std::unique_ptr<PinnedVectorHost<float>> particle_torques_host{
std::make_unique<PinnedVectorHost<float>>(PinnedVectorHost<float>{})};

/** setup and call kernel to copy particle forces to host
*/
void copy_forces_from_GPU(ParticleRange particles) {

if (global_part_vars_host.communication_enabled == 1 &&
global_part_vars_host.number_of_particles) {

/** Copy result from device memory to host memory*/
if (this_node == 0) {
cuda_safe_mem(cudaMemcpy(
&(particle_forces_host[0]), particle_forces_device,
particle_forces_host->resize(3 *
global_part_vars_host.number_of_particles);
particle_torques_host->resize(3 *
global_part_vars_host.number_of_particles);
cuda_safe_mem(cudaMemcpyAsync(
&((*particle_forces_host)[0]), particle_forces_device,
3 * global_part_vars_host.number_of_particles * sizeof(float),
cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost, stream[0]));
#ifdef ROTATION
cuda_safe_mem(cudaMemcpy(
&(particle_torques_host[0]), particle_torques_device,
cuda_safe_mem(cudaMemcpyAsync(
&((*particle_torques_host)[0]), particle_torques_device,
global_part_vars_host.number_of_particles * 3 * sizeof(float),
cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost, stream[0]));
#endif
cudaEventRecord(forces_torques_dtoh, stream[0]);
}
}
}

void distribute_gpu_forces(ParticleRange particles) {
if (global_part_vars_host.communication_enabled == 1 &&
global_part_vars_host.number_of_particles) {
cudaEventSynchronize(forces_torques_dtoh);
cuda_mpi_send_forces(particles, *particle_forces_host,
*particle_torques_host);
if (this_node == 0) {
/** values for the particle kernel */
int threads_per_block_particles = 64;
int blocks_per_grid_particles_y = 4;
Expand All @@ -371,14 +380,15 @@ void copy_forces_from_GPU(ParticleRange particles) {
KERNELCALL(reset_particle_force, dim_grid_particles,
threads_per_block_particles, particle_forces_device,
particle_torques_device);
cudaDeviceSynchronize();
}

cuda_mpi_send_forces(particles, particle_forces_host,
particle_torques_host);
}
}

/** @brief Release the pinned host-side force/torque transfer buffers.
 *
 *  NOTE(review): presumably this must run while the CUDA runtime is still
 *  loaded, since the pinned allocator needs a live CUDA context to free
 *  page-locked memory (cf. the on_unload() hook) -- confirm.
 */
void free_cuda_buffers() {
  // reset() with no argument is the idiomatic way to drop ownership;
  // reset(nullptr) is equivalent but noisier.
  particle_forces_host.reset();
  particle_torques_host.reset();
}

#if defined(ENGINE) && defined(LB_GPU)
// setup and call kernel to copy v_cs to host
void copy_v_cs_from_GPU(ParticleRange particles) {
Expand Down
6 changes: 4 additions & 2 deletions src/core/cuda_init_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ static const int computeCapabilityMinMinor = 0;

const char *cuda_error;

void cuda_init() { cudaStreamCreate(&stream[0]); }
/** @brief Create the CUDA stream stream[0] used for asynchronous
 *  transfers and kernel launches; aborts via cuda_safe_mem on failure.
 *  cudaStreamDefault gives a blocking stream (synchronizes with the
 *  legacy default stream).
 */
void cuda_init() {
cuda_safe_mem(cudaStreamCreateWithFlags(&stream[0], cudaStreamDefault));
}

/// get the number of CUDA devices.
int cuda_get_n_gpus() {
Expand Down Expand Up @@ -100,7 +102,7 @@ int cuda_get_device_props(const int dev, EspressoGpuDevice &d) {
int cuda_set_device(int dev) {
cudaSetDevice(dev);
cudaStreamDestroy(stream[0]);
cudaError_t error = cudaStreamCreate(&stream[0]);
cudaError_t error = cudaStreamCreateWithFlags(&stream[0], cudaStreamDefault);

if (error != cudaSuccess) {
cuda_error = cudaGetErrorString(error);
Expand Down
10 changes: 6 additions & 4 deletions src/core/cuda_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "nonbonded_interactions/nonbonded_interaction_data.hpp"
#include "serialization/CUDA_particle_data.hpp"

#include "thrust/system/cuda/experimental/pinned_allocator.h"
#include "utils/mpi/gather_buffer.hpp"
#include "utils/mpi/scatter_buffer.hpp"

Expand Down Expand Up @@ -147,9 +148,10 @@ void cuda_mpi_get_particles(ParticleRange particles,
* @param torques The torques as flat array of size 3 * particles.size(),
* this is only touched if ROTATION is active.
*/
template <typename FloatContainer>
static void add_forces_and_torques(ParticleRange particles,
const std::vector<float> &forces,
const std::vector<float> &torques) {
const FloatContainer &forces,
const FloatContainer &torques) {
int i = 0;
for (auto &part : particles) {
for (int j = 0; j < 3; j++) {
Expand All @@ -163,8 +165,8 @@ static void add_forces_and_torques(ParticleRange particles,
}

void cuda_mpi_send_forces(ParticleRange particles,
std::vector<float> &host_forces,
std::vector<float> &host_torques) {
PinnedVectorHost<float> &host_forces,
PinnedVectorHost<float> &host_torques) {
auto const n_elements = 3 * particles.size();

if (this_node > 0) {
Expand Down
13 changes: 11 additions & 2 deletions src/core/cuda_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@
#include "ParticleRange.hpp"
#include "SystemInterface.hpp"

#include "thrust/host_vector.h"
#include "thrust/system/cuda/experimental/pinned_allocator.h"
template <class T>
using PinnedVectorHost = thrust::host_vector<
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be called PinnedHostVector to keep it closer to thrust's name.

T, thrust::system::cuda::experimental::pinned_allocator<T>>;

#ifdef ENGINE
// velocities which need to be copied from the GPU to the CPU to calculate a
// torque
Expand Down Expand Up @@ -122,6 +128,7 @@ typedef struct {
} CUDA_global_part_vars;

void copy_forces_from_GPU(ParticleRange particles);
void distribute_gpu_forces(ParticleRange particles);
void copy_energy_from_GPU();
void copy_CUDA_energy_to_energy(CUDA_energy energy_host);
void clear_energy_on_GPU();
Expand Down Expand Up @@ -158,8 +165,8 @@ void copy_part_data_to_gpu(ParticleRange particles);
* This is a collective call.
*/
void cuda_mpi_send_forces(ParticleRange particles,
std::vector<float> &host_forces,
std::vector<float> &host_torques);
PinnedVectorHost<float> &host_forces,
PinnedVectorHost<float> &host_torques);
void cuda_bcast_global_part_params();
void cuda_copy_to_device(void *host_data, void *device_data, size_t n);
void cuda_copy_to_host(void *host_device, void *device_host, size_t n);
Expand All @@ -170,6 +177,8 @@ void cuda_mpi_send_v_cs(ParticleRange particles,
std::vector<CUDA_v_cs> host_v_cs);
#endif

void free_cuda_buffers();

#endif /* ifdef CUDA */

#endif /* ifdef CUDA_INTERFACE_HPP */
2 changes: 2 additions & 0 deletions src/core/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -651,3 +651,5 @@ void on_ghost_flags_change() {
}
#endif
}

void on_unload() { free_cuda_buffers(); }
4 changes: 4 additions & 0 deletions src/core/event.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ void on_ghost_flags_change();
/** called every time the walls for the lb fluid are changed */
void on_lbboundary_change();

/** @brief Cleanup that needs to happen before library dependencies are unloaded
*/
void on_unload();

/*@}*/

#endif
40 changes: 29 additions & 11 deletions src/core/forces.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,22 @@ void check_forces() {
void force_calc() {
ESPRESSO_PROFILER_CXX_MARK_FUNCTION;

// GPU methods can be launched before the forces on the particles are
// cleared in init_forces()
espressoSystemInterface.update();
#ifdef LB_GPU
if (lattice_switch == ActiveLB::GPU) {
lb_lbcoupling_calc_particle_lattice_ia(thermo_virtual);
}
#endif

for (ActorList::iterator actor = forceActors.begin();
actor != forceActors.end(); ++actor) {
(*actor)->computeForces(espressoSystemInterface);
#ifdef ROTATION
(*actor)->computeTorques(espressoSystemInterface);
#endif
}

#ifdef COLLISION_DETECTION
prepare_local_collision_queue();
Expand All @@ -105,14 +120,21 @@ void force_calc() {
#endif
init_forces();

for (auto actor = forceActors.begin(); actor != forceActors.end(); ++actor) {
(*actor)->computeForces(espressoSystemInterface);
#ifdef ROTATION
(*actor)->computeTorques(espressoSystemInterface);
calc_long_range_forces();
#ifdef LB
if (lattice_switch == ActiveLB::CPU) {
#ifdef ENGINE
ghost_communicator(&cell_structure.exchange_ghosts_comm,
GHOSTTRANS_SWIMMING);
#endif
lb_lbcoupling_calc_particle_lattice_ia(thermo_virtual);
}
#endif

calc_long_range_forces();
// Do not launch GPU kernels after this
#ifdef CUDA
copy_forces_from_GPU(local_cells.particles());
#endif

// Only calculate pair forces if the maximum cutoff is >0
if (max_cut > 0) {
Expand Down Expand Up @@ -151,19 +173,15 @@ void force_calc() {
immersed_boundaries.volume_conservation();
#endif

#if defined(LB_GPU) || defined(LB)
lb_lbcoupling_calc_particle_lattice_ia(thermo_virtual);
#ifdef CUDA
distribute_gpu_forces(local_cells.particles());
#endif

#ifdef METADYNAMICS
/* Metadynamics main function */
meta_perform();
#endif

#ifdef CUDA
copy_forces_from_GPU(local_cells.particles());
#endif

// VIRTUAL_SITES distribute forces
#ifdef VIRTUAL_SITES
if (virtual_sites()->need_ghost_comm_before_back_transfer()) {
Expand Down
32 changes: 4 additions & 28 deletions src/python/espressomd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,32 +21,8 @@

# Initialize MPI, start the main loop on the slaves
import espressomd._init
import atexit
from espressomd.features import features, has_features, missing_features, assert_features
from .system import System, _unload

from espressomd.system import System
from espressomd.code_info import features


def has_features(*args):
    """Tests whether a list of features is a subset of the compiled-in features.

    Accepts either a single iterable of feature names or the names as
    separate positional arguments; a lone string argument is treated as
    one feature name.
    """
    if len(args) == 1 and not isinstance(args[0], str) and hasattr(args[0], "__iter__"):
        requested = set(args[0])
    else:
        requested = set(args)
    # Use non-strict subset (<=): requesting exactly the full compiled-in
    # feature set is still "has features". The strict `<` operator would
    # wrongly return False in that case.
    return requested <= set(features())


def missing_features(*args):
    """Returns a list of the missing features in the argument"""
    # Accept either a single iterable of feature names or the names as
    # separate positional arguments (a lone string counts as one name).
    if len(args) == 1 and not isinstance(args[0], str) and hasattr(args[0], "__iter__"):
        requested = set(args[0])
    else:
        requested = set(args)
    return requested - set(features())


def assert_features(*args):
    """Raises an exception when a list of features is not a subset of the compiled-in features"""
    if has_features(*args):
        return
    # Report exactly which requested features are unavailable.
    missing = ", ".join(missing_features(*args))
    raise Exception("Missing features " + missing)
atexit.register(_unload)
3 changes: 3 additions & 0 deletions src/python/espressomd/system.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,6 @@ cdef extern from "particle_data.hpp":
int init_type_map(int type) except +
int get_random_p_id(int type) except +
int number_of_particles_with_type(int type) except +

cdef extern from "event.hpp":
void on_unload()
6 changes: 5 additions & 1 deletion src/python/espressomd/system.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ cdef class System(object):
odict['minimize_energy'] = System.__getattribute__(
self, "minimize_energy")
odict['thermostat'] = System.__getattribute__(self, "thermostat")
IF COLLISION_DETECTION:
IF COLLISION_DETECTION:
odict['collision_detection'] = System.__getattribute__(
self, "collision_detection")
return odict
Expand Down Expand Up @@ -528,3 +528,7 @@ cdef class System(object):
self.check_valid_type(type)
pid = get_random_p_id(type)
return int(pid)


def _unload():
    # Module-level teardown hook: registered with atexit in
    # espressomd/__init__.py so it runs at interpreter shutdown.
    # Forwards to the C++ on_unload() event, which performs cleanup that
    # must happen before library dependencies (e.g. CUDA) are unloaded.
    on_unload()
Loading