Merge pull request #118 from QingleiCao/qinglei/potrf_gpu_advice
Add parsec_advise_data_on_device for zpotrf_L
abouteiller authored Oct 25, 2024
2 parents 82c4afc + 5cf5a0b commit 42f4773
Showing 12 changed files with 199 additions and 4 deletions.
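
The new entry point is meant to be called once per matrix, before the compute taskpool is enqueued, so every tile already carries a preferred-device hint when the scheduler starts placing tasks. Below is a minimal sketch of the call pattern, mirroring the testing_zpotrf.c hunk at the end of this diff; parsec, uplo and the dcA descriptor are assumed to be set up as in that tester, and run_potrf_with_advice is just an illustrative wrapper name.

#include "dplasma.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"

/* Sketch: advise tile placement for an already-initialized symmetric
 * block-cyclic descriptor, then run the Cholesky factorization. */
static int run_potrf_with_advice(parsec_context_t *parsec, dplasma_enum_t uplo,
                                 parsec_matrix_sym_block_cyclic_t *dcA)
{
#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
    /* NULL op_args: a default nested GPU grid is built internally */
    dplasma_advise_data_on_device(parsec, uplo, (parsec_tiled_matrix_t*)dcA,
            (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
#endif
    return dplasma_zpotrf(parsec, uplo, (parsec_tiled_matrix_t*)dcA);
}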
2 changes: 1 addition & 1 deletion parsec
Submodule parsec updated 100 files
107 changes: 107 additions & 0 deletions src/dplasmaaux.c
@@ -13,6 +13,8 @@
#include <string.h>
#include "dplasmaaux.h"
#include "parsec/utils/show_help.h"
#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"

#if defined(PARSEC_HAVE_MPI)
/*
@@ -109,3 +111,108 @@ dplasma_aux_getGEMMLookahead( parsec_tiled_matrix_t *A )
}
}

#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)

/** Find all CUDA/HIP GPU devices.
* dev_index must hold at least parsec_nb_devices entries.
*/
void dplasma_find_nb_gpus(int *dev_index, int *nb) {
*nb = 0;
for(int i = 0; i < (int)parsec_nb_devices; i++) {
parsec_device_module_t *device = parsec_mca_device_get(i);
if( PARSEC_DEV_CUDA & device->type || PARSEC_DEV_HIP & device->type ) {
dev_index[(*nb)++] = device->device_index;
}
}

#if defined(DPLASMA_DEBUG)
if((*nb) == 0) {
char hostname[256];
gethostname(hostname, 256);
parsec_warning("No CUDA or HIP device found on %s\n", hostname);
}
#endif
}

/** Compute the most nearly square process/GPU grid: returns the smaller dimension P, the other being nb_process / P */
int dplasma_grid_calculation( int nb_process ) {
int P;
for( P = (int)(sqrt(nb_process + 1.0)); P > 0; P-- ) {
if( 0 == nb_process % P ) break;
}
return P;
}

/* 2D advice operator: map tile (m, n) to a preferred GPU in a nested GPU grid */
int dplasma_advise_data_on_device_ops_2D(parsec_execution_stream_t *es,
const parsec_tiled_matrix_t *A,
void *_A, parsec_matrix_uplo_t uplo,
int m, int n, void *op_args) {
dplasma_advise_data_on_device_t *args = (dplasma_advise_data_on_device_t *)op_args;

if( args->nb_gpu_devices > 0 ) {
/* Nested 2D grid on GPU */
int g = (m / args->grid_rows % args->gpu_rows) * args->gpu_cols + n / args->grid_cols % args->gpu_cols;
parsec_advise_data_on_device(A->super.data_of((parsec_data_collection_t*)A, m, n),
args->gpu_device_index[g],
PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
}

(void)es; (void)uplo;
return 0;
}

/* Advise a preferred device for each tile of A.
*
* If op_args == NULL, a default dplasma_advise_data_on_device_t is built from
* the process grid of A and the GPUs found on this process.
*/
int dplasma_advise_data_on_device(parsec_context_t *parsec,
parsec_matrix_uplo_t uplo,
parsec_tiled_matrix_t *A,
parsec_tiled_matrix_unary_op_t operation,
void *op_args) {

if(NULL != op_args) {
parsec_apply(parsec, uplo, A, operation, op_args);
} else {
/* Find the number of GPUs */
dplasma_advise_data_on_device_t *args = (dplasma_advise_data_on_device_t *)malloc(sizeof(dplasma_advise_data_on_device_t));
args->gpu_device_index = (int *)malloc(parsec_nb_devices * sizeof(int));
dplasma_find_nb_gpus(args->gpu_device_index, &args->nb_gpu_devices);

/* Compute the nested grid for the GPUs of one process, as square as
* possible; the longer dimension follows uplo (more columns for Upper,
* more rows for Lower) */
if(dplasmaUpper == uplo) {
args->gpu_rows = dplasma_grid_calculation(args->nb_gpu_devices);
args->gpu_cols = args->nb_gpu_devices/args->gpu_rows;
} else {
args->gpu_cols = dplasma_grid_calculation(args->nb_gpu_devices);
args->gpu_rows = args->nb_gpu_devices/args->gpu_cols;
}

if(dplasmaUpper == uplo || dplasmaLower == uplo) {
args->grid_rows = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.rows;
args->grid_cols = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.cols;
} else if(dplasmaUpperLower == uplo) {
args->grid_rows = ((parsec_matrix_block_cyclic_t *)A)->grid.rows;
args->grid_cols = ((parsec_matrix_block_cyclic_t *)A)->grid.cols;
} else {
dplasma_error("dplasma_advise_data_on_device", "illegal value of uplo");
free(args->gpu_device_index);
free(args);
return -1;
}

#if defined(DPLASMA_DEBUG)
parsec_warning("nb_gpu_devices %d gpu_rows %d gpu_cols %d grid_rows %d grid_cols %d\n",
args->nb_gpu_devices, args->gpu_rows, args->gpu_cols, args->grid_rows, args->grid_cols);
#endif

parsec_apply(parsec, uplo, A, operation, (void *)args);

free(args->gpu_device_index);
free(args);
}

return 0;
}

#endif
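
For reference, the tile-to-GPU mapping used by dplasma_advise_data_on_device_ops_2D nests a gpu_rows x gpu_cols grid inside the grid_rows x grid_cols process grid: the process-local tile coordinates m / grid_rows and n / grid_cols are wrapped onto the GPU grid block-cyclically. The following standalone snippet (not part of the commit; device_of_tile and the hard-coded sizes are illustrative only) reproduces that arithmetic for one example.

#include <stdio.h>

/* Same index arithmetic as dplasma_advise_data_on_device_ops_2D, with the
 * args struct flattened into plain parameters. Returns the local GPU slot. */
static int device_of_tile(int m, int n, int grid_rows, int grid_cols,
                          int gpu_rows, int gpu_cols)
{
    return (m / grid_rows % gpu_rows) * gpu_cols
         + (n / grid_cols % gpu_cols);
}

int main(void)
{
    /* 2x3 process grid; 6 GPUs per process arranged 3x2, i.e. the Lower
     * case: dplasma_grid_calculation(6) = 2 columns, hence 3 rows. */
    printf("tile (4,5) -> local GPU slot %d\n",
           device_of_tile(4, 5, 2, 3, 3, 2)); /* (4/2 % 3)*2 + (5/3 % 2) = 5 */
    return 0;
}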
37 changes: 37 additions & 0 deletions src/dplasmaaux.h
@@ -114,4 +114,41 @@ extern void *dplasma_pcomm;
#if defined(DPLASMA_HAVE_HIP)
#include "dplasmaaux_hip.h"
#endif

#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
/* Advise data on device arguments */
typedef struct dplasma_advise_data_on_device_s {
int nb_gpu_devices;
int *gpu_device_index;
int gpu_rows;
int gpu_cols;
int grid_rows;
int grid_cols;
} dplasma_advise_data_on_device_t;

/* Find all CUDA/HIP GPU devices.
* dev_index must hold at least parsec_nb_devices entries.
*/
void dplasma_find_nb_gpus(int *dev_index, int *nb);

/* Compute the most nearly square process/GPU grid: returns the smaller dimension */
int dplasma_grid_calculation( int nb_process );

/* 2D advice operator: map tile (m, n) to a preferred GPU in a nested GPU grid */
int dplasma_advise_data_on_device_ops_2D(parsec_execution_stream_t *es,
const parsec_tiled_matrix_t *descA,
void *_A, parsec_matrix_uplo_t uplo,
int m, int n, void *args);

/* Advise a preferred device for each tile of A.
*
* If op_args == NULL, a default dplasma_advise_data_on_device_t is built from
* the process grid of A and the GPUs found on this process.
*/
int dplasma_advise_data_on_device( parsec_context_t *parsec,
parsec_matrix_uplo_t uplo,
parsec_tiled_matrix_t *A,
parsec_tiled_matrix_unary_op_t operation,
void *op_args );
#endif

#endif /* _DPLASMAAUX_H_INCLUDED */
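
The op_args == NULL branch builds the default arguments, but a caller can also fill a dplasma_advise_data_on_device_t itself, e.g. to force a particular nested GPU grid. A sketch of that path follows; advise_with_custom_grid is an illustrative name, P and Q stand for the process grid that A is actually distributed on, and the PaRSEC device symbols such as parsec_nb_devices are assumed to be visible through the included headers.

#include <stdlib.h>
#include "dplasmaaux.h"

/* Sketch: hand-built advice arguments, using the helpers declared above. */
static void advise_with_custom_grid(parsec_context_t *parsec,
                                    parsec_tiled_matrix_t *A, int P, int Q)
{
    dplasma_advise_data_on_device_t args;
    args.gpu_device_index = (int *)malloc(parsec_nb_devices * sizeof(int));
    dplasma_find_nb_gpus(args.gpu_device_index, &args.nb_gpu_devices);
    /* Near-square GPU grid with more rows than columns (Lower convention) */
    args.gpu_cols  = dplasma_grid_calculation(args.nb_gpu_devices);
    args.gpu_rows  = args.nb_gpu_devices / args.gpu_cols;
    args.grid_rows = P;   /* rows of the process grid A is distributed on */
    args.grid_cols = Q;   /* columns of that process grid */

    dplasma_advise_data_on_device(parsec, dplasmaLower, A,
            (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D,
            &args);

    free(args.gpu_device_index);
}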
4 changes: 4 additions & 0 deletions src/zlascal_wrapper.c
@@ -12,6 +12,7 @@
#include "dplasma.h"
#include "dplasma/types.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/apply.h"
#include "cores/core_blas.h"


@@ -142,6 +143,9 @@ dplasma_zlascal_New( dplasma_enum_t uplo,
void
dplasma_zlascal_Destruct( parsec_taskpool_t *tp )
{
if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) {
free( ((parsec_apply_taskpool_t *)tp)->_g_op_args );
}
parsec_apply_Destruct(tp);
}

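The same three-line cleanup recurs in the zlaset, zplghe, zplgsy and zplrnt wrappers below: these wrappers pass heap-allocated op_args to the apply taskpool, and their Destruct now releases that allocation through the taskpool's _g_op_args field before tearing the taskpool down. Schematically (dplasma_zXXXXX_Destruct is a placeholder name, not a real symbol):

void dplasma_zXXXXX_Destruct( parsec_taskpool_t *tp )
{
    parsec_apply_taskpool_t *atp = (parsec_apply_taskpool_t *)tp;
    if( NULL != atp->_g_op_args ) {
        free( atp->_g_op_args );   /* allocated by the matching _New() */
    }
    parsec_apply_Destruct( tp );
}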
4 changes: 4 additions & 0 deletions src/zlaset_wrapper.c
@@ -12,6 +12,7 @@
#include "dplasma.h"
#include "dplasma/types.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/apply.h"


static int
@@ -126,6 +127,9 @@ dplasma_zlaset_New( dplasma_enum_t uplo,
void
dplasma_zlaset_Destruct( parsec_taskpool_t *tp )
{
if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) {
free( ((parsec_apply_taskpool_t *)tp)->_g_op_args );
}
parsec_apply_Destruct(tp);
}

2 changes: 2 additions & 0 deletions src/zlatms_wrapper.c
@@ -125,8 +125,10 @@ dplasma_zlatms( parsec_context_t *parsec,
parsec_context_start( parsec );
parsec_context_wait( parsec );
parsec_apply_Destruct(tp);
free(condptr);
}
else {
free(condptr);
return -1;
}
}
4 changes: 4 additions & 0 deletions src/zplghe_wrapper.c
@@ -11,6 +11,7 @@
#include "dplasma.h"
#include "dplasma/types.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/apply.h"
#include "cores/core_blas.h"


@@ -127,6 +128,9 @@ dplasma_zplghe_New( double bump, dplasma_enum_t uplo,
void
dplasma_zplghe_Destruct( parsec_taskpool_t *tp )
{
if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) {
free( ((parsec_apply_taskpool_t *)tp)->_g_op_args );
}
parsec_apply_Destruct(tp);
}

5 changes: 4 additions & 1 deletion src/zplgsy_wrapper.c
@@ -11,7 +11,7 @@
#include "dplasma.h"
#include "dplasma/types.h"
#include "dplasmaaux.h"

#include "parsec/data_dist/matrix/apply.h"
#include "cores/core_blas.h"

struct zplgsy_args_s {
@@ -129,6 +129,9 @@ dplasma_zplgsy_New( dplasma_complex64_t bump, dplasma_enum_t uplo,
void
dplasma_zplgsy_Destruct( parsec_taskpool_t *tp )
{
if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) {
free( ((parsec_apply_taskpool_t *)tp)->_g_op_args );
}
parsec_apply_Destruct(tp);
}

5 changes: 4 additions & 1 deletion src/zplrnt_wrapper.c
@@ -11,7 +11,7 @@
#include "dplasma.h"
#include "dplasma/types.h"
#include "dplasmaaux.h"

#include "parsec/data_dist/matrix/apply.h"
#include "cores/core_blas.h"


@@ -143,6 +143,9 @@ dplasma_zplrnt_New( int diagdom,
void
dplasma_zplrnt_Destruct( parsec_taskpool_t *tp )
{
if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) {
free( ((parsec_apply_taskpool_t *)tp)->_g_op_args );
}
parsec_apply_Destruct(tp);
}

2 changes: 2 additions & 0 deletions src/zpltmg_wrapper.c
@@ -125,9 +125,11 @@ dplasma_zpltmg_generic( parsec_context_t *parsec,
{
parsec_context_add_taskpool(parsec, (parsec_taskpool_t*)parsec_zpltmg);
dplasma_wait_until_completion(parsec);
free(params);
parsec_apply_Destruct( parsec_zpltmg );
return 0;
}
free(params);
return -101;
}

21 changes: 21 additions & 0 deletions tests/testing_zgemm.c
@@ -8,6 +8,7 @@
*/

#include "common.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"

static int check_solution( parsec_context_t *parsec, int loud,
@@ -76,6 +77,16 @@ int main(int argc, char ** argv)
dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcC, Cseed);
if(loud > 2) printf("Done\n");

/* Advise data placement on device */
#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
#endif

int t;
for(t = 0; t < nruns; t++) {
parsec_devices_release_memory();
@@ -142,6 +153,16 @@ int main(int argc, char ** argv)
parsec_devices_release_memory();
parsec_devices_reset_load(parsec);

/* Advise data placement on device */
#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
#endif

/* Create GEMM PaRSEC */
if(loud) printf("Compute ... ... ");
PASTE_CODE_ENQUEUE_PROGRESS_DESTRUCT_KERNEL(parsec, zgemm,
10 changes: 9 additions & 1 deletion tests/testing_zpotrf.c
@@ -9,6 +9,7 @@

#include "common.h"
#include "flops.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"

@@ -18,7 +19,7 @@
{
parsec_context_t* parsec;
int iparam[IPARAM_SIZEOF];
dplasma_enum_t uplo = dplasmaUpper;
dplasma_enum_t uplo = dplasmaLower;
int info = 0;
int ret = 0;

@@ -43,6 +44,13 @@
parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE,
rank, MB, NB, LDA, N, 0, 0,
N, N, P, nodes/P, uplo));

/* Advise data placement on device */
#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
dplasma_advise_data_on_device(parsec, uplo, (parsec_tiled_matrix_t*)&dcA,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
#endif

int t;
for(t = 0; t < nruns; t++) {
/* matrix (re)generation */
