From d44d0680b96a4250984d1b9065afbb4438851f19 Mon Sep 17 00:00:00 2001 From: Qinglei Cao Date: Fri, 27 Sep 2024 12:13:10 +0000 Subject: [PATCH 1/3] Update PaRSEC --- parsec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsec b/parsec index adabbd4d..2f417e91 160000 --- a/parsec +++ b/parsec @@ -1 +1 @@ -Subproject commit adabbd4d1fb580358a32d489df19fa9c05a316e1 +Subproject commit 2f417e9141e2f0e603388bd553211b6b3ff9933e From 30f5cd948d70d4886118f9a01471ba96cac93990 Mon Sep 17 00:00:00 2001 From: Qinglei Cao Date: Fri, 17 May 2024 02:52:29 +0000 Subject: [PATCH 2/3] Add advice device support in dplasma --- src/dplasmaaux.c | 107 +++++++++++++++++++++++++++++++++++++++++ src/dplasmaaux.h | 37 ++++++++++++++ tests/testing_zgemm.c | 21 ++++++++ tests/testing_zpotrf.c | 10 +++- 4 files changed, 174 insertions(+), 1 deletion(-) diff --git a/src/dplasmaaux.c b/src/dplasmaaux.c index 86a6b189..cebeabb6 100644 --- a/src/dplasmaaux.c +++ b/src/dplasmaaux.c @@ -14,6 +14,8 @@ #include #include "dplasmaaux.h" #include "parsec/utils/show_help.h" +#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h" +#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #if defined(PARSEC_HAVE_MPI) /* @@ -110,3 +112,108 @@ dplasma_aux_getGEMMLookahead( parsec_tiled_matrix_t *A ) } } +#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) + +/** Find all GPUs + * Size of dev_index: at least parsec_nb_devices + */ +void dplasma_find_nb_gpus(int *dev_index, int *nb) { + *nb = 0; + for(int i = 0; i < (int)parsec_nb_devices; i++) { + parsec_device_module_t *device = parsec_mca_device_get(i); + if( PARSEC_DEV_CUDA & device->type || PARSEC_DEV_HIP & device->type ) { + dev_index[(*nb)++] = device->device_index; + } + } + +#if defined(DPLASMA_DEBUG) + if((*nb) == 0) { + char hostname[256]; + gethostname(hostname, 256); + parsec_warning("No GPU device found on %s\n", + hostname); + } +#endif +} + +/** Get the most suitable 
process/gpu grid */ +int dplasma_grid_calculation( int nb_process ) { + int P; + for( P = (int)(sqrt(nb_process + 1.0)); P > 0; P-- ) { + if( 0 == nb_process % P ) break; + } + return P; +} + +/* Operator 2D */ +int dplasma_advise_data_on_device_ops_2D(parsec_execution_stream_t *es, + const parsec_tiled_matrix_t *A, + void *_A, parsec_matrix_uplo_t uplo, + int m, int n, void *op_args) { + dplasma_advise_data_on_device_t *args = (dplasma_advise_data_on_device_t *)op_args; + + if( args->nb_gpu_devices > 0 ) { + /* Nested 2D grid on GPU */ + int g = (m / args->grid_rows % args->gpu_rows) * args->gpu_cols + n / args->grid_cols % args->gpu_cols; + parsec_advise_data_on_device(A->super.data_of((parsec_data_collection_t*)A, m, n), + args->gpu_device_index[g], + PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + + (void)es; (void)uplo; + return 0; +} + +/* Set advise data on device + * + * If op_args == NULL, use dplasma_advise_data_on_device_t by default + */ +int dplasma_advise_data_on_device(parsec_context_t *parsec, + parsec_matrix_uplo_t uplo, + parsec_tiled_matrix_t *A, + parsec_tiled_matrix_unary_op_t operation, + void *op_args) { + + if(NULL != op_args) { + parsec_apply(parsec, uplo, A, operation, op_args); + } else { + /* Find the number of GPUs */ + dplasma_advise_data_on_device_t *args = (dplasma_advise_data_on_device_t *)malloc(sizeof(dplasma_advise_data_on_device_t)); + args->gpu_device_index = (int *)malloc(parsec_nb_devices * sizeof(int)); + dplasma_find_nb_gpus(args->gpu_device_index, &args->nb_gpu_devices); + + /* Calculate the nested grid for the multiple GPUs on one process + * gpu_rows >= gpu_cols and as square as possible */ + if(dplasmaUpper == uplo) { + args->gpu_rows = dplasma_grid_calculation(args->nb_gpu_devices); + args->gpu_cols = args->nb_gpu_devices/args->gpu_rows; + } else { + args->gpu_cols = dplasma_grid_calculation(args->nb_gpu_devices); + args->gpu_rows = args->nb_gpu_devices/args->gpu_cols; + } + + if(dplasmaUpper == uplo || dplasmaLower 
== uplo) { + args->grid_rows = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.rows; + args->grid_cols = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.cols; + } else if(dplasmaUpperLower == uplo) { + args->grid_rows = ((parsec_matrix_block_cyclic_t *)A)->grid.rows; + args->grid_cols = ((parsec_matrix_block_cyclic_t *)A)->grid.cols; + } else { + dplasma_error("dplasma_advise_data_on_device", "illegal value of uplo"); + } + +#if defined(DPLASMA_DEBUG) + parsec_warning("nb_gpu_devices %d gpu_rows %d gpu_cols %d grid_rows %d grid_cols %d\n", + args->nb_gpu_devices, args->gpu_rows, args->gpu_cols, args->grid_rows, args->grid_cols); +#endif + + parsec_apply(parsec, uplo, A, operation, (void *)args); + + free(args->gpu_device_index); + free(args); + } + + return 0; +} + +#endif diff --git a/src/dplasmaaux.h b/src/dplasmaaux.h index 28ae2039..4146b5b4 100644 --- a/src/dplasmaaux.h +++ b/src/dplasmaaux.h @@ -115,4 +115,41 @@ extern void *dplasma_pcomm; #if defined(DPLASMA_HAVE_HIP) #include "dplasmaaux_hip.h" #endif + +#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) +/* Advise data on device arguments */ +typedef struct dplasma_advise_data_on_device_s { + int nb_gpu_devices; + int *gpu_device_index; + int gpu_rows; + int gpu_cols; + int grid_rows; + int grid_cols; +} dplasma_advise_data_on_device_t; + +/* Find all GPUs + * Size of dev_index: at least parsec_nb_devices + * */ +void dplasma_find_nb_gpus(int *dev_index, int *nb); + +/* Get the most suitable process/gpu grid */ +int dplasma_grid_calculation( int nb_process ); + +/* Operator 2D */ +int dplasma_advise_data_on_device_ops_2D(parsec_execution_stream_t *es, + const parsec_tiled_matrix_t *descA, + void *_A, parsec_matrix_uplo_t uplo, + int m, int n, void *args); + +/* Set advise data on device + * + * If op_args == NULL, use dplasma_advise_data_on_device_t by default + */ +int dplasma_advise_data_on_device( parsec_context_t *parsec, + parsec_matrix_uplo_t uplo, + parsec_tiled_matrix_t *A, + 
parsec_tiled_matrix_unary_op_t operation, + void *op_args ); +#endif + #endif /* _DPLASMAAUX_H_INCLUDED */ diff --git a/tests/testing_zgemm.c b/tests/testing_zgemm.c index c3f08647..cf6829df 100644 --- a/tests/testing_zgemm.c +++ b/tests/testing_zgemm.c @@ -8,6 +8,7 @@ */ #include "common.h" +#include "dplasmaaux.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" static int check_solution( parsec_context_t *parsec, int loud, @@ -76,6 +77,16 @@ int main(int argc, char ** argv) dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcC, Cseed); if(loud > 2) printf("Done\n"); + /* Advice data on device */ +#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) + dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL); + dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB, + (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL); + dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC, + (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL); +#endif + int t; for(t = 0; t < nruns; t++) { parsec_devices_release_memory(); @@ -142,6 +153,16 @@ int main(int argc, char ** argv) parsec_devices_release_memory(); parsec_devices_reset_load(parsec); + /* Advice data on device */ +#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) + dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL); + dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB, + (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL); + dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC, + (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL); +#endif + /* 
Create GEMM PaRSEC */ if(loud) printf("Compute ... ... "); PASTE_CODE_ENQUEUE_PROGRESS_DESTRUCT_KERNEL(parsec, zgemm, diff --git a/tests/testing_zpotrf.c b/tests/testing_zpotrf.c index 7d0fa3b7..2f327978 100644 --- a/tests/testing_zpotrf.c +++ b/tests/testing_zpotrf.c @@ -9,6 +9,7 @@ #include "common.h" #include "flops.h" +#include "dplasmaaux.h" #include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" @@ -18,7 +19,7 @@ int main(int argc, char ** argv) { parsec_context_t* parsec; int iparam[IPARAM_SIZEOF]; - dplasma_enum_t uplo = dplasmaUpper; + dplasma_enum_t uplo = dplasmaLower; int info = 0; int ret = 0; @@ -43,6 +44,13 @@ int main(int argc, char ** argv) parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, rank, MB, NB, LDA, N, 0, 0, N, N, P, nodes/P, uplo)); + + /* Advice data on device */ +#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) + dplasma_advise_data_on_device(parsec, uplo, (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL); +#endif + int t; for(t = 0; t < nruns; t++) { /* matrix (re)generation */ From 5cf5a0bd6fa133a3eb96350790c7c7c992f3bcbb Mon Sep 17 00:00:00 2001 From: Qinglei Cao Date: Fri, 11 Oct 2024 20:06:35 +0000 Subject: [PATCH 3/3] Update the parsec_apply because of the changes in PaRSEC PR#676 --- src/zlascal_wrapper.c | 4 ++++ src/zlaset_wrapper.c | 4 ++++ src/zlatms_wrapper.c | 2 ++ src/zplghe_wrapper.c | 4 ++++ src/zplgsy_wrapper.c | 5 ++++- src/zplrnt_wrapper.c | 5 ++++- src/zpltmg_wrapper.c | 2 ++ 7 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/zlascal_wrapper.c b/src/zlascal_wrapper.c index 955717a2..3b42214e 100644 --- a/src/zlascal_wrapper.c +++ b/src/zlascal_wrapper.c @@ -12,6 +12,7 @@ #include "dplasma.h" #include "dplasma/types.h" #include "dplasmaaux.h" +#include "parsec/data_dist/matrix/apply.h" #include "cores/core_blas.h" @@ -142,6 +143,9 @@ 
dplasma_zlascal_New( dplasma_enum_t uplo, void dplasma_zlascal_Destruct( parsec_taskpool_t *tp ) { + if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) { + free( ((parsec_apply_taskpool_t *)tp)->_g_op_args ); + } parsec_apply_Destruct(tp); } diff --git a/src/zlaset_wrapper.c b/src/zlaset_wrapper.c index aee6e782..dc0cc896 100644 --- a/src/zlaset_wrapper.c +++ b/src/zlaset_wrapper.c @@ -12,6 +12,7 @@ #include "dplasma.h" #include "dplasma/types.h" #include "dplasmaaux.h" +#include "parsec/data_dist/matrix/apply.h" static int @@ -126,6 +127,9 @@ dplasma_zlaset_New( dplasma_enum_t uplo, void dplasma_zlaset_Destruct( parsec_taskpool_t *tp ) { + if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) { + free( ((parsec_apply_taskpool_t *)tp)->_g_op_args ); + } parsec_apply_Destruct(tp); } diff --git a/src/zlatms_wrapper.c b/src/zlatms_wrapper.c index d5dab702..138fa4dd 100644 --- a/src/zlatms_wrapper.c +++ b/src/zlatms_wrapper.c @@ -125,8 +125,10 @@ dplasma_zlatms( parsec_context_t *parsec, parsec_context_start( parsec ); parsec_context_wait( parsec ); parsec_apply_Destruct(tp); + free(condptr); } else { + free(condptr); return -1; } } diff --git a/src/zplghe_wrapper.c b/src/zplghe_wrapper.c index 6e10f335..e42ac913 100644 --- a/src/zplghe_wrapper.c +++ b/src/zplghe_wrapper.c @@ -11,6 +11,7 @@ #include "dplasma.h" #include "dplasma/types.h" #include "dplasmaaux.h" +#include "parsec/data_dist/matrix/apply.h" #include "cores/core_blas.h" @@ -127,6 +128,9 @@ dplasma_zplghe_New( double bump, dplasma_enum_t uplo, void dplasma_zplghe_Destruct( parsec_taskpool_t *tp ) { + if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) { + free( ((parsec_apply_taskpool_t *)tp)->_g_op_args ); + } parsec_apply_Destruct(tp); } diff --git a/src/zplgsy_wrapper.c b/src/zplgsy_wrapper.c index 37a407bb..819f1a0f 100644 --- a/src/zplgsy_wrapper.c +++ b/src/zplgsy_wrapper.c @@ -11,7 +11,7 @@ #include "dplasma.h" #include "dplasma/types.h" #include "dplasmaaux.h" - +#include "parsec/data_dist/matrix/apply.h" 
#include "cores/core_blas.h" struct zplgsy_args_s { @@ -129,6 +129,9 @@ dplasma_zplgsy_New( dplasma_complex64_t bump, dplasma_enum_t uplo, void dplasma_zplgsy_Destruct( parsec_taskpool_t *tp ) { + if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) { + free( ((parsec_apply_taskpool_t *)tp)->_g_op_args ); + } parsec_apply_Destruct(tp); } diff --git a/src/zplrnt_wrapper.c b/src/zplrnt_wrapper.c index 770ec063..e110f3db 100644 --- a/src/zplrnt_wrapper.c +++ b/src/zplrnt_wrapper.c @@ -11,7 +11,7 @@ #include "dplasma.h" #include "dplasma/types.h" #include "dplasmaaux.h" - +#include "parsec/data_dist/matrix/apply.h" #include "cores/core_blas.h" @@ -143,6 +143,9 @@ dplasma_zplrnt_New( int diagdom, void dplasma_zplrnt_Destruct( parsec_taskpool_t *tp ) { + if( ((parsec_apply_taskpool_t *)tp)->_g_op_args ) { + free( ((parsec_apply_taskpool_t *)tp)->_g_op_args ); + } parsec_apply_Destruct(tp); } diff --git a/src/zpltmg_wrapper.c b/src/zpltmg_wrapper.c index 3597dd2a..33d5a058 100644 --- a/src/zpltmg_wrapper.c +++ b/src/zpltmg_wrapper.c @@ -125,9 +125,11 @@ dplasma_zpltmg_generic( parsec_context_t *parsec, { parsec_context_add_taskpool(parsec, (parsec_taskpool_t*)parsec_zpltmg); dplasma_wait_until_completion(parsec); + free(params); parsec_apply_Destruct( parsec_zpltmg ); return 0; } + free(params); return -101; }