Skip to content

Commit

Permalink
Add advice device support in dplasma
Browse files Browse the repository at this point in the history
  • Loading branch information
Qinglei Cao authored and Qinglei Cao committed Oct 18, 2024
1 parent fee69e6 commit fcbf2d6
Show file tree
Hide file tree
Showing 4 changed files with 174 additions and 1 deletion.
107 changes: 107 additions & 0 deletions src/dplasmaaux.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#include <string.h>
#include "dplasmaaux.h"
#include "parsec/utils/show_help.h"
#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"

#if defined(PARSEC_HAVE_MPI)
/*
Expand Down Expand Up @@ -110,3 +112,108 @@ dplasma_aux_getGEMMLookahead( parsec_tiled_matrix_t *A )
}
}

#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)

/** Find all GPUs
* Size of dev_index: at least parsec_nb_devices
*/
void dplasma_find_nb_gpus(int *dev_index, int *nb) {
*nb = 0;
for(int i = 0; i < (int)parsec_nb_devices; i++) {
parsec_device_module_t *device = parsec_mca_device_get(i);
if( PARSEC_DEV_CUDA & device->type || PARSEC_DEV_HIP & device->type ) {
dev_index[(*nb)++] = device->device_index;
}
}

#if defined(DPLASMA_DEBUG)
if((*nb) == 0) {
char hostname[256];
gethostname(hostname, 256);
parsec_warning(stderr, "No CUDA device found on rank %d on %s\n",
parsec->my_rank, hostname);
}
#endif
}

/** Get the most suitable process/gpu grid */
int dplasma_grid_calculation( int nb_process ) {
int P;
for( P = (int)(sqrt(nb_process + 1.0)); P > 0; P-- ) {
if( 0 == nb_process % P ) break;
}
return P;
}

/* Operator 2D */
int dplasma_advise_data_on_device_ops_2D(parsec_execution_stream_t *es,
const parsec_tiled_matrix_t *A,
void *_A, parsec_matrix_uplo_t uplo,
int m, int n, void *op_args) {
dplasma_advise_data_on_device_t *args = (dplasma_advise_data_on_device_t *)op_args;

if( args->nb_gpu_devices > 0 ) {
/* Nested 2D grid on GPU */
int g = (m / args->grid_rows % args->gpu_rows) * args->gpu_cols + n / args->grid_cols % args->gpu_cols;
parsec_advise_data_on_device(A->super.data_of((parsec_data_collection_t*)A, m, n),
args->gpu_device_index[g],
PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
}

(void)es; (void)uplo;
return 0;
}

/* Set advise data on device
*
* If op_args == NULL, use dplasma_advise_data_on_device_t by default
*/
int dplasma_advise_data_on_device(parsec_context_t *parsec,
parsec_matrix_uplo_t uplo,
parsec_tiled_matrix_t *A,
parsec_tiled_matrix_unary_op_t operation,
void *op_args) {

if(NULL != op_args) {
parsec_apply(parsec, uplo, A, operation, op_args);
} else {
/* Find the number of GPUs */
dplasma_advise_data_on_device_t *args = (dplasma_advise_data_on_device_t *)malloc(sizeof(dplasma_advise_data_on_device_t));
args->gpu_device_index = (int *)malloc(parsec_nb_devices * sizeof(int));
dplasma_find_nb_gpus(args->gpu_device_index, &args->nb_gpu_devices);

/* Calculate the nested grid for the multiple GPUs on one process
* gpu_rows >= gpu_cols and as square as possible */
if(dplasmaUpper == uplo) {
args->gpu_rows = dplasma_grid_calculation(args->nb_gpu_devices);
args->gpu_cols = args->nb_gpu_devices/args->gpu_rows;
} else {
args->gpu_cols = dplasma_grid_calculation(args->nb_gpu_devices);
args->gpu_rows = args->nb_gpu_devices/args->gpu_cols;
}

if(dplasmaUpper == uplo || dplasmaLower == uplo) {
args->grid_rows = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.rows;
args->grid_cols = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.cols;
} else if(dplasmaUpperLower == uplo) {
args->grid_rows = ((parsec_matrix_block_cyclic_t *)A)->grid.rows;
args->grid_cols = ((parsec_matrix_block_cyclic_t *)A)->grid.cols;
} else {
dplasma_error("dplasma_advise_data_on_device", "illegal value of uplo");
}

#if defined(DPLASMA_DEBUG)
parsec_warning("nb_gpu_devices %d gpu_rows %d gpu_cols %d grid_rows %d grid_cols %d\n",
args->nb_gpu_devices, args->gpu_rows, args->gpu_cols, args->grid_rows, args->grid_cols);
#endif

parsec_apply(parsec, uplo, A, operation, (void *)args);

free(args->gpu_device_index);
free(args);
}

return 0;
}

#endif
37 changes: 37 additions & 0 deletions src/dplasmaaux.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,4 +115,41 @@ extern void *dplasma_pcomm;
#if defined(DPLASMA_HAVE_HIP)
#include "dplasmaaux_hip.h"
#endif

#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
/* Advise data on device arguments */
typedef struct dplasma_advise_data_on_device_s {
int nb_gpu_devices;
int *gpu_device_index;
int gpu_rows;
int gpu_cols;
int grid_rows;
int grid_cols;
} dplasma_advise_data_on_device_t;

/* Find all GPUs
* Size of dev_index: at least parsec_nb_devices
* */
void dplasma_find_nb_gpus(int *dev_index, int *nb);

/* Get the most suitable process/gpu grid */
int dplasma_grid_calculation( int nb_process );

/* Operator 2D */
int dplasma_advise_data_on_device_ops_2D(parsec_execution_stream_t *es,
const parsec_tiled_matrix_t *descA,
void *_A, parsec_matrix_uplo_t uplo,
int m, int n, void *args);

/* Set advise data on device
*
* If op_args == NULL, use dplasma_advise_data_on_device_t by default
*/
int dplasma_advise_data_on_device( parsec_context_t *parsec,
parsec_matrix_uplo_t uplo,
parsec_tiled_matrix_t *A,
parsec_tiled_matrix_unary_op_t operation,
void *op_args );
#endif

#endif /* _DPLASMAAUX_H_INCLUDED */
21 changes: 21 additions & 0 deletions tests/testing_zgemm.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
*/

#include "common.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"

static int check_solution( parsec_context_t *parsec, int loud,
Expand Down Expand Up @@ -76,6 +77,16 @@ int main(int argc, char ** argv)
dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcC, Cseed);
if(loud > 2) printf("Done\n");

/* Advice data on device */
#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
#endif

int t;
for(t = 0; t < nruns; t++) {
parsec_devices_release_memory();
Expand Down Expand Up @@ -142,6 +153,16 @@ int main(int argc, char ** argv)
parsec_devices_release_memory();
parsec_devices_reset_load(parsec);

/* Advice data on device */
#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
#endif

/* Create GEMM PaRSEC */
if(loud) printf("Compute ... ... ");
PASTE_CODE_ENQUEUE_PROGRESS_DESTRUCT_KERNEL(parsec, zgemm,
Expand Down
10 changes: 9 additions & 1 deletion tests/testing_zpotrf.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include "common.h"
#include "flops.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"

Expand All @@ -18,7 +19,7 @@ int main(int argc, char ** argv)
{
parsec_context_t* parsec;
int iparam[IPARAM_SIZEOF];
dplasma_enum_t uplo = dplasmaUpper;
dplasma_enum_t uplo = dplasmaLower;
int info = 0;
int ret = 0;

Expand All @@ -43,6 +44,13 @@ int main(int argc, char ** argv)
parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE,
rank, MB, NB, LDA, N, 0, 0,
N, N, P, nodes/P, uplo));

/* Advice data on device */
#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
dplasma_advise_data_on_device(parsec, uplo, (parsec_tiled_matrix_t*)&dcA,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
#endif

int t;
for(t = 0; t < nruns; t++) {
/* matrix (re)generation */
Expand Down

0 comments on commit fcbf2d6

Please sign in to comment.