Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: async GPU force/torque back-transfer, launch kernels earlier #2637

Open
wants to merge 1 commit into
base: python
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 37 additions & 27 deletions src/core/cuda_common_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,20 +49,17 @@ static CUDA_particle_seed *particle_seeds_device = nullptr;
static CUDA_energy *energy_device = nullptr;

CUDA_particle_data *particle_data_host = nullptr;
std::vector<float> particle_forces_host;
CUDA_energy energy_host;

std::vector<float> particle_torques_host;
#ifdef ENGINE
std::vector<CUDA_v_cs> host_v_cs;
#endif

/**cuda streams for parallel computing on cpu and gpu */
cudaStream_t stream[1];

cudaError_t _err;
cudaError_t CU_err;

cudaStream_t stream[1];

void _cuda_safe_mem(cudaError_t CU_err, const char *file, unsigned int line) {
if (cudaSuccess != CU_err) {
fprintf(stderr, "Cuda Memory error at %s:%u.\n", file, line);
Expand Down Expand Up @@ -185,8 +182,6 @@ void gpu_change_number_of_part_to_comm() {
&global_part_vars_host,
sizeof(CUDA_global_part_vars)));

// if the arrays exists free them to prevent memory leaks
particle_forces_host.clear();
if (particle_data_host) {
cuda_safe_mem(cudaFreeHost(particle_data_host));
particle_data_host = nullptr;
Expand All @@ -206,9 +201,6 @@ void gpu_change_number_of_part_to_comm() {
#ifdef ENGINE
host_v_cs.clear();
#endif
#ifdef ROTATION
particle_torques_host.clear();
#endif

#ifdef ROTATION
if (particle_torques_device) {
Expand All @@ -224,15 +216,9 @@ void gpu_change_number_of_part_to_comm() {
global_part_vars_host.number_of_particles *
sizeof(CUDA_particle_data),
cudaHostAllocWriteCombined));
particle_forces_host.resize(3 *
global_part_vars_host.number_of_particles);
#ifdef ENGINE
host_v_cs.resize(global_part_vars_host.number_of_particles);
#endif
#if (defined DIPOLES || defined ROTATION)
particle_torques_host.resize(3 *
global_part_vars_host.number_of_particles);
#endif

cuda_safe_mem(cudaMalloc((void **)&particle_forces_device,
3 * global_part_vars_host.number_of_particles *
Expand Down Expand Up @@ -267,6 +253,8 @@ void gpu_change_number_of_part_to_comm() {
}
}

cudaEvent_t forces_torques_dtoh;

/** setup and call particle reallocation from the host
* Note that in addition to calling this function the parameters must be
* broadcast with either:
Expand All @@ -277,6 +265,8 @@ void gpu_change_number_of_part_to_comm() {
* nodes)
*/
void gpu_init_particle_comm() {
cuda_safe_mem(
cudaEventCreateWithFlags(&forces_torques_dtoh, cudaEventDisableTiming));
if (this_node == 0 && global_part_vars_host.communication_enabled == 0) {
if (cuda_get_n_gpus() == -1) {
runtimeErrorMsg()
Expand Down Expand Up @@ -336,26 +326,45 @@ void copy_part_data_to_gpu(ParticleRange particles) {
}
}

std::unique_ptr<PinnedVectorHost<float>> particle_forces_host{
Copy link
Contributor

@fweik fweik Apr 2, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really don't like this construction. In a way this is worse than manual memory management, because it pretends to be RAII, but really isn't. As far as I can see there is no reason for these buffers to exist statically; couldn't you just create them e.g. when the integration starts, and release them after? Depending on the cost of the allocation, it may even be feasible to create them in the force calculation.

std::make_unique<PinnedVectorHost<float>>(PinnedVectorHost<float>{})};
std::unique_ptr<PinnedVectorHost<float>> particle_torques_host{
std::make_unique<PinnedVectorHost<float>>(PinnedVectorHost<float>{})};

/** setup and call kernel to copy particle forces to host
*/
void copy_forces_from_GPU(ParticleRange particles) {

if (global_part_vars_host.communication_enabled == 1 &&
global_part_vars_host.number_of_particles) {

/** Copy result from device memory to host memory*/
if (this_node == 0) {
cuda_safe_mem(cudaMemcpy(
&(particle_forces_host[0]), particle_forces_device,
particle_forces_host->resize(3 *
global_part_vars_host.number_of_particles);
particle_torques_host->resize(3 *
global_part_vars_host.number_of_particles);
cuda_safe_mem(cudaMemcpyAsync(
&((*particle_forces_host)[0]), particle_forces_device,
3 * global_part_vars_host.number_of_particles * sizeof(float),
cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost, stream[0]));
#ifdef ROTATION
cuda_safe_mem(cudaMemcpy(
&(particle_torques_host[0]), particle_torques_device,
cuda_safe_mem(cudaMemcpyAsync(
&((*particle_torques_host)[0]), particle_torques_device,
global_part_vars_host.number_of_particles * 3 * sizeof(float),
cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost, stream[0]));
#endif
cudaEventRecord(forces_torques_dtoh, stream[0]);
}
}
}

void distribute_gpu_forces(ParticleRange particles) {
if (global_part_vars_host.communication_enabled == 1 &&
global_part_vars_host.number_of_particles) {
cudaEventSynchronize(forces_torques_dtoh);
cuda_mpi_send_forces(particles, *particle_forces_host,
*particle_torques_host);
if (this_node == 0) {
/** values for the particle kernel */
int threads_per_block_particles = 64;
int blocks_per_grid_particles_y = 4;
Expand All @@ -371,14 +380,15 @@ void copy_forces_from_GPU(ParticleRange particles) {
KERNELCALL(reset_particle_force, dim_grid_particles,
threads_per_block_particles, particle_forces_device,
particle_torques_device);
cudaDeviceSynchronize();
}

cuda_mpi_send_forces(particles, particle_forces_host,
particle_torques_host);
}
}

/** @brief Release the pinned host-side force/torque transfer buffers.
 *
 *  NOTE(review): presumably this must run while the CUDA runtime is still
 *  loaded, since the pinned allocator needs a live CUDA context to free
 *  page-locked memory (cf. the on_unload() hook) -- confirm.
 */
void free_cuda_buffers() {
  // reset() with no argument is the idiomatic way to drop ownership;
  // reset(nullptr) is equivalent but noisier.
  particle_forces_host.reset();
  particle_torques_host.reset();
}

#if defined(ENGINE) && defined(LB_GPU)
// setup and call kernel to copy v_cs to host
void copy_v_cs_from_GPU(ParticleRange particles) {
Expand Down
6 changes: 4 additions & 2 deletions src/core/cuda_init_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ static const int computeCapabilityMinMinor = 0;

const char *cuda_error;

void cuda_init() { cudaStreamCreate(&stream[0]); }
/** @brief Create the CUDA stream stream[0] used for asynchronous
 *  transfers and kernel launches; aborts via cuda_safe_mem on failure.
 *  cudaStreamDefault gives a blocking stream (synchronizes with the
 *  legacy default stream).
 */
void cuda_init() {
cuda_safe_mem(cudaStreamCreateWithFlags(&stream[0], cudaStreamDefault));
}

/// get the number of CUDA devices.
int cuda_get_n_gpus() {
Expand Down Expand Up @@ -100,7 +102,7 @@ int cuda_get_device_props(const int dev, EspressoGpuDevice &d) {
int cuda_set_device(int dev) {
cudaSetDevice(dev);
cudaStreamDestroy(stream[0]);
cudaError_t error = cudaStreamCreate(&stream[0]);
cudaError_t error = cudaStreamCreateWithFlags(&stream[0], cudaStreamDefault);

if (error != cudaSuccess) {
cuda_error = cudaGetErrorString(error);
Expand Down
10 changes: 6 additions & 4 deletions src/core/cuda_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "nonbonded_interactions/nonbonded_interaction_data.hpp"
#include "serialization/CUDA_particle_data.hpp"

#include "thrust/system/cuda/experimental/pinned_allocator.h"
#include "utils/mpi/gather_buffer.hpp"
#include "utils/mpi/scatter_buffer.hpp"

Expand Down Expand Up @@ -147,9 +148,10 @@ void cuda_mpi_get_particles(ParticleRange particles,
* @param torques The torques as flat array of size 3 * particles.size(),
* this is only touched if ROTATION is active.
*/
template <typename FloatContainer>
static void add_forces_and_torques(ParticleRange particles,
const std::vector<float> &forces,
const std::vector<float> &torques) {
const FloatContainer &forces,
const FloatContainer &torques) {
int i = 0;
for (auto &part : particles) {
for (int j = 0; j < 3; j++) {
Expand All @@ -163,8 +165,8 @@ static void add_forces_and_torques(ParticleRange particles,
}

void cuda_mpi_send_forces(ParticleRange particles,
std::vector<float> &host_forces,
std::vector<float> &host_torques) {
PinnedVectorHost<float> &host_forces,
PinnedVectorHost<float> &host_torques) {
auto const n_elements = 3 * particles.size();

if (this_node > 0) {
Expand Down
13 changes: 11 additions & 2 deletions src/core/cuda_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@
#include "ParticleRange.hpp"
#include "SystemInterface.hpp"

#include "thrust/host_vector.h"
#include "thrust/system/cuda/experimental/pinned_allocator.h"
template <class T>
using PinnedVectorHost = thrust::host_vector<
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be called PinnedHostVector to keep it closer to thrust's name.

T, thrust::system::cuda::experimental::pinned_allocator<T>>;

#ifdef ENGINE
// velocities which need to be copied from the GPU to the CPU to calculate a
// torque
Expand Down Expand Up @@ -122,6 +128,7 @@ typedef struct {
} CUDA_global_part_vars;

void copy_forces_from_GPU(ParticleRange particles);
void distribute_gpu_forces(ParticleRange particles);
void copy_energy_from_GPU();
void copy_CUDA_energy_to_energy(CUDA_energy energy_host);
void clear_energy_on_GPU();
Expand Down Expand Up @@ -158,8 +165,8 @@ void copy_part_data_to_gpu(ParticleRange particles);
* This is a collective call.
*/
void cuda_mpi_send_forces(ParticleRange particles,
std::vector<float> &host_forces,
std::vector<float> &host_torques);
PinnedVectorHost<float> &host_forces,
PinnedVectorHost<float> &host_torques);
void cuda_bcast_global_part_params();
void cuda_copy_to_device(void *host_data, void *device_data, size_t n);
void cuda_copy_to_host(void *host_device, void *device_host, size_t n);
Expand All @@ -170,6 +177,8 @@ void cuda_mpi_send_v_cs(ParticleRange particles,
std::vector<CUDA_v_cs> host_v_cs);
#endif

void free_cuda_buffers();

#endif /* ifdef CUDA */

#endif /* ifdef CUDA_INTERFACE_HPP */
2 changes: 2 additions & 0 deletions src/core/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -651,3 +651,5 @@ void on_ghost_flags_change() {
}
#endif
}

void on_unload() { free_cuda_buffers(); }
4 changes: 4 additions & 0 deletions src/core/event.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ void on_ghost_flags_change();
/** called every time the walls for the lb fluid are changed */
void on_lbboundary_change();

/** @brief Cleanup that needs to happen before library dependencies are unloaded
*/
void on_unload();

/*@}*/

#endif
40 changes: 29 additions & 11 deletions src/core/forces.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,22 @@ void check_forces() {
void force_calc() {
ESPRESSO_PROFILER_CXX_MARK_FUNCTION;

// GPU methods can be launched before the forces on the particles are
// cleared in init_forces()
espressoSystemInterface.update();
#ifdef LB_GPU
if (lattice_switch == ActiveLB::GPU) {
lb_lbcoupling_calc_particle_lattice_ia(thermo_virtual);
}
#endif

for (ActorList::iterator actor = forceActors.begin();
actor != forceActors.end(); ++actor) {
(*actor)->computeForces(espressoSystemInterface);
#ifdef ROTATION
(*actor)->computeTorques(espressoSystemInterface);
#endif
}

#ifdef COLLISION_DETECTION
prepare_local_collision_queue();
Expand All @@ -105,14 +120,21 @@ void force_calc() {
#endif
init_forces();

for (auto actor = forceActors.begin(); actor != forceActors.end(); ++actor) {
(*actor)->computeForces(espressoSystemInterface);
#ifdef ROTATION
(*actor)->computeTorques(espressoSystemInterface);
calc_long_range_forces();
#ifdef LB
if (lattice_switch == ActiveLB::CPU) {
#ifdef ENGINE
ghost_communicator(&cell_structure.exchange_ghosts_comm,
GHOSTTRANS_SWIMMING);
#endif
lb_lbcoupling_calc_particle_lattice_ia(thermo_virtual);
}
#endif

calc_long_range_forces();
// Do not launch GPU kernels after this
#ifdef CUDA
copy_forces_from_GPU(local_cells.particles());
#endif

// Only calculate pair forces if the maximum cutoff is >0
if (max_cut > 0) {
Expand Down Expand Up @@ -151,19 +173,15 @@ void force_calc() {
immersed_boundaries.volume_conservation();
#endif

#if defined(LB_GPU) || defined(LB)
lb_lbcoupling_calc_particle_lattice_ia(thermo_virtual);
#ifdef CUDA
distribute_gpu_forces(local_cells.particles());
#endif

#ifdef METADYNAMICS
/* Metadynamics main function */
meta_perform();
#endif

#ifdef CUDA
copy_forces_from_GPU(local_cells.particles());
#endif

// VIRTUAL_SITES distribute forces
#ifdef VIRTUAL_SITES
if (virtual_sites()->need_ghost_comm_before_back_transfer()) {
Expand Down
32 changes: 4 additions & 28 deletions src/python/espressomd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,32 +21,8 @@

# Initialize MPI, start the main loop on the slaves
import espressomd._init
import atexit
from espressomd.features import features, has_features, missing_features, assert_features
from .system import System, _unload

from espressomd.system import System
from espressomd.code_info import features


def has_features(*args):
    """Tests whether a list of features is a subset of the compiled-in features.

    Accepts either a single iterable of feature names or the names as
    separate positional arguments; a lone string argument is treated as
    one feature name.
    """
    if len(args) == 1 and not isinstance(args[0], str) and hasattr(args[0], "__iter__"):
        requested = set(args[0])
    else:
        requested = set(args)
    # Use non-strict subset (<=): requesting exactly the full compiled-in
    # feature set is still "has features". The strict `<` operator would
    # wrongly return False in that case.
    return requested <= set(features())


def missing_features(*args):
    """Returns a list of the missing features in the argument"""
    # Accept either a single iterable of feature names or the names as
    # separate positional arguments (a lone string counts as one name).
    if len(args) == 1 and not isinstance(args[0], str) and hasattr(args[0], "__iter__"):
        requested = set(args[0])
    else:
        requested = set(args)
    return requested - set(features())


def assert_features(*args):
    """Raises an exception when a list of features is not a subset of the compiled-in features"""
    if has_features(*args):
        return
    # Report exactly which requested features are unavailable.
    missing = ", ".join(missing_features(*args))
    raise Exception("Missing features " + missing)
atexit.register(_unload)
3 changes: 3 additions & 0 deletions src/python/espressomd/system.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,6 @@ cdef extern from "particle_data.hpp":
int init_type_map(int type) except +
int get_random_p_id(int type) except +
int number_of_particles_with_type(int type) except +

cdef extern from "event.hpp":
void on_unload()
6 changes: 5 additions & 1 deletion src/python/espressomd/system.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ cdef class System(object):
odict['minimize_energy'] = System.__getattribute__(
self, "minimize_energy")
odict['thermostat'] = System.__getattribute__(self, "thermostat")
IF COLLISION_DETECTION:
IF COLLISION_DETECTION:
odict['collision_detection'] = System.__getattribute__(
self, "collision_detection")
return odict
Expand Down Expand Up @@ -528,3 +528,7 @@ cdef class System(object):
self.check_valid_type(type)
pid = get_random_p_id(type)
return int(pid)


def _unload():
    # Module-level teardown hook: registered with atexit in
    # espressomd/__init__.py so it runs at interpreter shutdown.
    # Forwards to the C++ on_unload() event, which performs cleanup that
    # must happen before library dependencies (e.g. CUDA) are unloaded.
    on_unload()
Loading