Add parsec_advise_data_on_device for zpotrf_L
Qinglei Cao committed Sep 5, 2024
1 parent 5fa144b commit c67b4b5
Showing 4 changed files with 101 additions and 11 deletions.
2 changes: 1 addition & 1 deletion parsec
Submodule parsec updated 78 files
+6 −0 .github/workflows/build_cmake.yml
+13 −3 CMakeLists.txt
+19 −11 contrib/platforms/macosx
+1 −1 parsec/class/parsec_hash_table.c
+6 −0 parsec/data.c
+1 −1 parsec/data_dist/matrix/map_operator.c
+6 −5 parsec/data_dist/matrix/matrix.c
+1 −1 parsec/data_dist/matrix/matrix.h
+8 −1 parsec/data_internal.h
+64 −92 parsec/interfaces/dtd/insert_function.c
+0 −4 parsec/interfaces/dtd/insert_function_internal.h
+176 −207 parsec/interfaces/ptg/ptg-compiler/jdf2c.c
+1 −1 parsec/mca/device/CMakeLists.txt
+1 −1 parsec/mca/device/cuda/device_cuda.h
+11 −41 parsec/mca/device/cuda/device_cuda_component.c
+1 −1 parsec/mca/device/cuda/device_cuda_internal.h
+64 −30 parsec/mca/device/cuda/device_cuda_module.c
+200 −130 parsec/mca/device/device.c
+23 −5 parsec/mca/device/device.h
+303 −301 parsec/mca/device/device_gpu.c
+0 −2 parsec/mca/device/device_gpu.h
+16 −14 parsec/mca/device/transfer_gpu.c
+63 −6 parsec/parsec.c
+5 −1 parsec/parsec_internal.h
+4 −4 parsec/parsec_reshape.c
+2 −2 parsec/remote_dep.c
+6 −6 parsec/remote_dep.h
+185 −65 parsec/remote_dep_mpi.c
+28 −0 parsec/runtime.h
+38 −71 parsec/scheduling.c
+5 −3 parsec/utils/debug.h
+1 −1 parsec/vpmap.c
+6 −0 tests/CMakeLists.txt
+3 −1 tests/api/init_fini.c
+4 −3 tests/api/touch_exf.F90
+2 −2 tests/apps/merge_sort/Testings.cmake
+26 −26 tests/apps/stencil/stencil_1D.jdf
+1 −1 tests/apps/stencil/stencil_internal.c
+1 −1 tests/apps/stencil/stencil_internal.h
+2 −2 tests/apps/stencil/testing_stencil_1D.c
+11 −0 tests/class/atomics.c
+1 −2 tests/class/future.c
+1 −3 tests/class/future_datacopy.c
+1 −2 tests/class/lifo.c
+1 −2 tests/class/list.c
+2 −2 tests/collections/reshape/avoidable_reshape.jdf
+2 −2 tests/collections/reshape/local_no_reshape.jdf
+1 −1 tests/collections/reshape/remote_multiple_outs_same_pred_flow.jdf
+1 −1 tests/collections/reshape/remote_multiple_outs_same_pred_flow_multiple_deps.jdf
+2 −2 tests/collections/reshape/testing_avoidable_reshape.c
+1 −1 tests/collections/reshape/testing_input_dep_reshape_single_copy.c
+4 −4 tests/collections/reshape/testing_remote_multiple_outs_same_pred_flow.c
+3 −3 tests/collections/reshape/testing_reshape.c
+1 −0 tests/dsl/dtd/CMakeLists.txt
+2 −0 tests/dsl/dtd/Testings.cmake
+26 −0 tests/dsl/dtd/dtd_test_empty.c
+3 −3 tests/dsl/dtd/dtd_test_simple_gemm.c
+2 −1 tests/dsl/dtd/dtd_test_task_insertion.c
+4 −3 tests/dsl/ptg/CMakeLists.txt
+1 −0 tests/dsl/ptg/Testings.cmake
+6 −0 tests/dsl/ptg/multisize_bcast/CMakeLists.txt
+4 −0 tests/dsl/ptg/multisize_bcast/Testings.cmake
+92 −0 tests/dsl/ptg/multisize_bcast/check_multisize_bcast.jdf
+43 −0 tests/dsl/ptg/multisize_bcast/check_multisize_bcast_wrapper.c
+5 −0 tests/dsl/ptg/multisize_bcast/check_multisize_bcast_wrapper.h
+41 −0 tests/dsl/ptg/multisize_bcast/data_gen.c
+10 −0 tests/dsl/ptg/multisize_bcast/data_gen.h
+74 −0 tests/dsl/ptg/multisize_bcast/main.c
+1 −1 tests/profiling/check-comms.py
+3 −2 tests/runtime/cuda/nvlink.jdf
+9 −1 tests/runtime/cuda/nvlink_main.c
+13 −25 tests/runtime/cuda/nvlink_wrapper.c
+9 −8 tests/runtime/cuda/stage_custom.jdf
+12 −5 tests/runtime/cuda/stage_main.c
+3 −2 tests/runtime/cuda/stress.jdf
+1 −1 tests/runtime/cuda/stress_main.c
+4 −8 tests/runtime/cuda/stress_wrapper.c
+2 −2 tests/runtime/cuda/testing_get_best_device.c
57 changes: 48 additions & 9 deletions src/zpotrf_L.jdf
@@ -87,6 +87,49 @@ cuda_workspaces_infokey [type = "int" hidden = on default = -1 ]
hip_handles_infokey [type = "int" hidden = on default = -1 ]
hip_workspaces_infokey [type = "int" hidden = on default = -1 ]

nb_gpu_devices [ type = "int" hidden = on default = 0 ]
gpu_device_index [ type = "int *" hidden = on default = "NULL"]
gpu_rows [ type = "int" hidden = on default = 1]
gpu_cols [ type = "int" hidden = on default = 1]
grid_rows [ type = "int" hidden = on default = 1]
grid_cols [ type = "int" hidden = on default = 1]


/**************************************************
* potrf_bind_A *
**************************************************/
potrf_bind_A(m, n)

// Execution space
m = 0 .. descA->mt-1
n = 0 .. m

loc_A = %{ return LOC(descA, m, n); %}

// Parallel partitioning
:descA(m, n)

READ A <- ddescA(m, n) [ type = %{ return ADTT_READ(ddescA, loc_A, DEFAULT, TILED); %}
type_data = %{ return ADTT_READ(ddescA, loc_A, DEFAULT, LAPACK); %} ]
-> (m == 0 && n == 0) ? T potrf_zpotrf(0)
-> (m != n && n == 0) ? C potrf_ztrsm(m, n)
-> (m == n && n > 0) ? T potrf_zherk(0, m)
-> (m != n && n > 0) ? C potrf_zgemm(m, n, 0)

BODY
{
#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT)
if( nb_gpu_devices > 0 ) {
int g = (m / grid_rows % gpu_rows) * gpu_cols + n / grid_cols % gpu_cols;
parsec_advise_data_on_device( _f_A->original,
gpu_device_index[g],
PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
}
#endif
}
END
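
The BODY above computes the preferred GPU for tile (m, n) by folding the tile indices first over the process grid (grid_rows x grid_cols) and then block-cyclically over the local GPU grid (gpu_rows x gpu_cols). Below is a minimal standalone sketch of the same index arithmetic; the helper name and the concrete grid sizes are illustrative assumptions, not part of the commit.

#include <assert.h>

/* Sketch of the potrf_bind_A mapping: project the tile coordinates onto the
 * process grid, then distribute block-cyclically over the GPU grid. */
static int tile_to_gpu(int m, int n,
                       int grid_rows, int grid_cols,
                       int gpu_rows, int gpu_cols)
{
    return (m / grid_rows % gpu_rows) * gpu_cols + n / grid_cols % gpu_cols;
}

int main(void)
{
    /* Assumed 2 x 2 process grid and a 2 x 2 GPU grid (4 GPUs per node). */
    assert(tile_to_gpu(0, 0, 2, 2, 2, 2) == 0);
    assert(tile_to_gpu(5, 3, 2, 2, 2, 2) == 1); /* (5/2 % 2)*2 + 3/2 % 2 = 0 + 1 */
    return 0;
}

The returned index g selects an entry of gpu_device_index, and parsec_advise_data_on_device records that device as the preferred location for the tile's data copy, per the PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE flag used above.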


/**************************************************
* potrf_zpotrf *
**************************************************/
@@ -106,8 +149,7 @@ loc_T = %{ return LOC(descA, k, k); %}

// Parameters

RW T <- (k == 0) ? ddescA(k, k) [ type = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, TILED); %}
type_data = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, LAPACK); %} ]
RW T <- (k == 0) ? A potrf_bind_A(k, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
<- (k != 0) ? T potrf_zherk(k-1, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
-> T potrf_ztrsm(k+1..descA->mt-1, k) /* dep OUT: rely on datacopy dtt for sending */
-> ddescA(k, k) [ type = %{ return ADTT_CP(_f_T, ddescA, loc_T, DEFAULT); %}
@@ -235,8 +277,7 @@ loc_C = %{ return LOC(descA, m, k); %}

// Parameters
READ T <- T potrf_zpotrf(k) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
RW C <- (k == 0) ? ddescA(m, k) [ type = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, TILED); %}
type_data = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, LAPACK); %} ]
RW C <- (k == 0) ? A potrf_bind_A(m, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
<- (k != 0) ? C potrf_zgemm(m, k, k-1) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
-> A potrf_zherk(k, m) /* dep OUT: rely on datacopy dtt for sending */
-> A potrf_zgemm(m, k+1..m-1, k) /* dep OUT: rely on datacopy dtt for sending */
@@ -370,9 +411,8 @@ loc_T = %{ return LOC(descA, m, m); %}

//Parameters
READ A <- C potrf_ztrsm(m, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_A, DEFAULT, TILED); %} ]
RW T <- (k == 0) ? ddescA(m, m) [ type = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, TILED); %}
type_data = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, LAPACK); %} ]
<- (k != 0) ? T potrf_zherk(k-1, m) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
RW T <- (k == 0) ? A potrf_bind_A(m, m) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
<- (k != 0) ? T potrf_zherk(k-1, m) /* dep OUT: rely on datacopy dtt for sending */
-> (m == k+1) ? T potrf_zpotrf(m) : T potrf_zherk(k+1, m) /* dep OUT: rely on datacopy dtt for sending */

; (m >= (descA->mt - PRI_CHANGE)) ? (descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * (m - k) : PRI_MAX
@@ -493,8 +533,7 @@ loc_C = %{ return LOC(descA, m, n); %}
// Parameters
READ A <- C potrf_ztrsm(m, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_A, DEFAULT, TILED); %} ]
READ B <- C potrf_ztrsm(n, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_B, DEFAULT, TILED); %} ]
RW C <- (k == 0) ? ddescA(m, n) [ type = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, TILED); %}
type_data = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, LAPACK); %} ]
RW C <- (k == 0) ? A potrf_bind_A(m, n) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
<- (k != 0) ? C potrf_zgemm(m, n, k-1) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
-> (n == k+1) ? C potrf_ztrsm(m, n) : C potrf_zgemm(m, n, k+1) /* dep OUT: rely on datacopy dtt for sending */

50 changes: 50 additions & 0 deletions src/zpotrf_wrapper.c
@@ -19,6 +19,7 @@
#include "zpotrf_U.h"
#include "zpotrf_L.h"
#include "cores/dplasma_plasmatypes.h"
#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"

#define MAX_SHAPES 1

@@ -129,6 +130,42 @@ static void zpotrf_destroy_hip_workspace(void *_ws, void *_n)
free(ws);
(void)_n;
}

/* Find all devices */
static void parsec_find_nb_devices(int **dev_index, int *nb) {
for(int i = 0; i < (int)parsec_nb_devices; i++) {
parsec_device_module_t *device = parsec_mca_device_get(i);
if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) {
(*nb)++;
}
}
#if defined(DPLASMA_DEBUG)
if((*nb) == 0) {
char hostname[256];
gethostname(hostname, 256);
fprintf(stderr, "No CUDA device found on rank %d on %s\n",
parsec->my_rank, hostname);
}
#endif
*dev_index = (int *)malloc((*nb) * sizeof(int));
*nb = 0;
for(int i = 0; i < (int)parsec_nb_devices; i++) {
parsec_device_module_t *device = parsec_mca_device_get(i);
if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) {
(*dev_index)[(*nb)++] = device->device_index;
}
}
}

/* Get the most suitable process/gpu grid */
static int parsec_grid_calculation( int nb_process ) {
int P;
for( P = (int)(sqrt(nb_process + 1.0)); P > 0; P-- ) {
if( 0 == nb_process % P ) break;
}
return P;
}

#endif
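
parsec_grid_calculation() scans downward from sqrt(nb_process + 1) and returns the first divisor of nb_process it finds, i.e. the largest divisor not exceeding that bound, so the GPU grid chosen in dplasma_zpotrf_New() below (gpu_cols = P, gpu_rows = nb / P) is as close to square as the device count allows. A few hand-checked values, for illustration only:

/* parsec_grid_calculation(4) == 2  ->  2 x 2 GPU grid
 * parsec_grid_calculation(6) == 2  ->  3 x 2 GPU grid (gpu_rows = 6 / 2)
 * parsec_grid_calculation(3) == 1  ->  3 x 1 GPU grid
 * parsec_grid_calculation(1) == 1  ->  1 x 1 grid (single-GPU runs) */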

/**
@@ -237,6 +274,19 @@ dplasma_zpotrf_New( dplasma_enum_t uplo,
zpotrf_destroy_cuda_workspace, NULL,
zpotrf_create_cuda_workspace, parsec_zpotrf,
NULL);
int nb = 0, *dev_index;
parsec_find_nb_devices(&dev_index, &nb);
parsec_zpotrf->_g_nb_gpu_devices = nb;
parsec_zpotrf->_g_gpu_device_index = dev_index;
parsec_zpotrf->_g_gpu_cols = parsec_grid_calculation(nb);
parsec_zpotrf->_g_gpu_rows = nb/parsec_zpotrf->_g_gpu_cols;
parsec_zpotrf->_g_grid_rows = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.rows;
parsec_zpotrf->_g_grid_cols = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.cols;
#if defined(DPLASMA_DEBUG)
printf("nb_gpu_devices %d gpu_rows %d gpu_cols %d grid_rows %d grid_cols %d\n",
parsec_zpotrf->_g_nb_gpu_devices, parsec_zpotrf->_g_gpu_rows,
parsec_zpotrf->_g_gpu_cols, parsec_zpotrf->_g_grid_rows, parsec_zpotrf->_g_grid_cols);
#endif
#else
parsec_zpotrf->_g_cuda_handles_infokey = PARSEC_INFO_ID_UNDEFINED;
parsec_zpotrf->_g_cuda_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED;
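
Taken together, dplasma_zpotrf_New() feeds the GPU grid and the matrix's process grid into the hidden JDF globals, and potrf_bind_A then maps every tile onto that grid before issuing the data advice. A hedged worked example, with numbers that are assumptions rather than part of the commit:

/* Single process: grid_rows = grid_cols = 1 (1 x 1 process grid).
 * Two GPUs: gpu_cols = parsec_grid_calculation(2) = 1, gpu_rows = 2.
 * potrf_bind_A(m, n) then computes g = (m / 1 % 2) * 1 + n / 1 % 1 = m % 2,
 * so even tile rows are advised onto gpu_device_index[0] and odd rows
 * onto gpu_device_index[1]. */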
3 changes: 2 additions & 1 deletion tests/testing_zpotrf.c
@@ -18,7 +18,8 @@ int main(int argc, char ** argv)
{
parsec_context_t* parsec;
int iparam[IPARAM_SIZEOF];
dplasma_enum_t uplo = dplasmaUpper;
//dplasma_enum_t uplo = dplasmaUpper;
dplasma_enum_t uplo = dplasmaLower;
int info = 0;
int ret = 0;

