diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf index 79ff1a2d..3a07e349 100644 --- a/src/zgemm_NN_gpu.jdf +++ b/src/zgemm_NN_gpu.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2017-2020 The University of Tennessee and The University + * Copyright (c) 2017-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -18,7 +18,7 @@ extern "C" %{ #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ - + static void succ(int *x, int *y, int *z, int xMax, int yMax, int zMax, int l) { int xn = *x, yn = *y, zn = *z; @@ -41,7 +41,7 @@ static void succ(int *x, int *y, int *z, int xMax, int yMax, int zMax, int l) } else { zn = zn+1; } - + l--; } *x = xn; @@ -58,13 +58,13 @@ static int succ_x(int x, int y, int z, int xMax, int yMax, int zMax, int l) static int succ_y(int x, int y, int z, int xMax, int yMax, int zMax, int l) { succ(&x, &y, &z, xMax, yMax, zMax, l); - return y; + return y; } static int succ_z(int x, int y, int z, int xMax, int yMax, int zMax, int l) { succ(&x, &y, &z, xMax, yMax, zMax, l); - return z; + return z; } static void pred(int *x, int *y, int *z, int xMax, int yMax, int zMax, int l) @@ -226,9 +226,11 @@ CTL Z <- Z LOCAL_BARRIER( m/(tB*tP), n/(tC*tQ), 0, u, v ) BODY if( nb_cuda_devices > 0 ) { int g = (n / tQ) % nb_cuda_devices; - parsec_advise_data_on_device( _f_C->original, - cuda_device_index[g], - PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + if( _f_C->original->preferred_device <= 0 ) { + parsec_advise_data_on_device( _f_C->original, + cuda_device_index[g], + PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } } END @@ -354,7 +356,7 @@ GEMM(m, n, k) READ A <- A READ_A(m, k, x, y, z) READ B <- B READ_B(k, n, x, y, z) -RW C <- k == 0 ? C READ_C(m, n) +RW C <- k == 0 ? C READ_C(m, n) : C GEMM(m, n, k-1 ) -> k + 1 == descB->mt ? descC(m, n) : C GEMM(m, n, k+1) diff --git a/src/zgeqrf.jdf b/src/zgeqrf.jdf index 2a7ea982..103362b0 100644 --- a/src/zgeqrf.jdf +++ b/src/zgeqrf.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. @@ -495,6 +495,8 @@ BODY [type=CUDA device=%{ return n; %} WORK, ib, WORKC, descA->mb, parsec_body.stream ); + + parsec_gpu_push_workspace(gpu_device, gpu_stream); } END diff --git a/tests/common.h b/tests/common.h index 07c1f3fd..6c3d23e9 100644 --- a/tests/common.h +++ b/tests/common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2020 The University of Tennessee and The University + * Copyright (c) 2009-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -283,6 +283,7 @@ static inline int min(int a, int b) { return a < b ? a : b; } "\n", \ gflops); \ } \ + if(rank==0) fflush(stdout); \ (void)gflops; #endif /* _TESTSCOMMON_H */ diff --git a/tests/testing_zgebrd_ge2gb.c b/tests/testing_zgebrd_ge2gb.c index f189cc91..25a1fbbc 100644 --- a/tests/testing_zgebrd_ge2gb.c +++ b/tests/testing_zgebrd_ge2gb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2020 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2015-2016 Inria, CNRS (LaBRI - UMR 5800), University of @@ -95,9 +95,20 @@ int GD_cpQR( int p, int q ) { } } +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) +{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + int RunOneTest( parsec_context_t *parsec, int nodes, int cores, int rank, int loud, int M, int N, int LDA, int MB, int NB, int IB, int P, int Q, int hmb, - int ltre0, int htre0, int ltree, int htree, int ts, int domino, int rbidiag ) + int ltre0, int htre0, int ltree, int htree, int ts, int domino, int rbidiag, int nbrun ) { int ret = 0; dplasma_qrtree_t qrtre0, qrtree, lqtree; @@ -106,7 +117,7 @@ int RunOneTest( parsec_context_t *parsec, int nodes, int cores, int rank, int lo int MT = (M%MB==0) ? (M/MB) : (M/MB+1); int NT = (N%NB==0) ? (N/NB) : (N/NB+1); int cp = -1; - int i, nbrun = 3; + int i; int rc; //PASTE_CODE_FLOPS(FLOPS_ZGEBRD, ((DagDouble_t)M, (DagDouble_t)N)); @@ -151,6 +162,21 @@ int RunOneTest( parsec_context_t *parsec, int nodes, int cores, int rank, int lo parsec_matrix_block_cyclic, (&dcBand, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_LAPACK, rank, MB+1, NB, MB+1, minMN, 0, 0, MB+1, minMN, 1, 1, 1, 1, 0, 0)); + if(rank > 0 && nodes == 1 && loud == -1) { + /* Fix distributions for local-only testing */ + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + dcTS0.super.super.rank_of = always_local_rank_of; + dcTS0.super.super.rank_of_key = always_local_rank_of_key; + dcTT0.super.super.rank_of = always_local_rank_of; + dcTT0.super.super.rank_of_key = always_local_rank_of_key; + dcTS.super.super.rank_of = always_local_rank_of; + dcTS.super.super.rank_of_key = always_local_rank_of_key; + dcTT.super.super.rank_of = always_local_rank_of; + dcTT.super.super.rank_of_key = always_local_rank_of_key; + dcBand.super.super.rank_of = always_local_rank_of; + dcBand.super.super.rank_of_key = always_local_rank_of_key; + } /* Initialize the matrix */ if(loud > 3) printf("+++ Generate matrices ... "); @@ -313,7 +339,7 @@ int RunOneTest( parsec_context_t *parsec, int nodes, int cores, int rank, int lo time_avg += sync_time_elapsed; gflops = (flops/1.e9)/(sync_time_elapsed); - if (rank == 0){ + if (rank == 0 && loud >= 0){ fprintf(stdout, "zgebrd_ge2gb M= %2d N= %2d NP= %2d NC= %2d P= %2d Q= %2d NB= %2d IB= %2d R-bidiag= %2d treeh= %2d treel_rb= %2d qr_a= %2d QR(domino= %2d treel_qr= %2d ) : %.2f s %f gflops\n", M, N, nodes, cores, P, Q, NB, IB, @@ -401,6 +427,13 @@ int main(int argc, char ** argv) int ltree = iparam[IPARAM_LOWLVL_TREE] == DPLASMA_GREEDY_TREE ? DPLASMA_GREEDY1P_TREE : iparam[IPARAM_LOWLVL_TREE]; ltree = iparam[IPARAM_ASYNC] ? ltree : 9; + /* Warmup run */ + RunOneTest(parsec, 1, iparam[IPARAM_NCORES], rank, -1, 1000, 1000, 1000, 100, 100, 10, 1, 1, + iparam[IPARAM_HMB], iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], + ltree, iparam[IPARAM_HIGHLVL_TREE], iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO], + iparam[IPARAM_QR_TSRR], 1); + + /** * Test for varying matrix sizes m-by-n where: * 1) m = M .. N .. 
K, and n = m (square) @@ -421,7 +454,7 @@ int main(int argc, char ** argv) m, m, LDA, MB, NB, IB, P, Q, iparam[IPARAM_HMB], iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], ltree, iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); + iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR], iparam[IPARAM_NRUNS] ); } for (m=N; m<=M; m+=K ) { @@ -429,7 +462,7 @@ int main(int argc, char ** argv) m, N, LDA, MB, NB, IB, P, Q, iparam[IPARAM_HMB], iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], ltree, iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); + iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR], iparam[IPARAM_NRUNS] ); } cleanup_parsec(parsec, iparam); diff --git a/tests/testing_zgelqf.c b/tests/testing_zgelqf.c index 03658e13..4e592cde 100644 --- a/tests/testing_zgelqf.c +++ b/tests/testing_zgelqf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2021 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -21,6 +21,8 @@ static int check_solution( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcB, parsec_tiled_matrix_t *dcX ); +static void warmup_zgelqf(int rank, int random_seed, parsec_context_t *parsec); + int main(int argc, char ** argv) { parsec_context_t* parsec; @@ -41,6 +43,8 @@ int main(int argc, char ** argv) PASTE_CODE_FLOPS(FLOPS_ZGELQF, ((DagDouble_t)M, (DagDouble_t)N)); LDA = max(M, LDA); + warmup_zgelqf(rank, random_seed, parsec); + /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, @@ -362,3 +366,88 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) +{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zgelqf(int rank, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int IB = 40; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int matrix_init = dplasmaMatrixRandom; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcT, 1, + parsec_matrix_block_cyclic, (&dcT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcT.super.super.rank_of = always_local_rank_of; + dcT.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zplrnt( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t*)&dcT ); + parsec_taskpool_t *zgelqf = dplasma_zgelqf_New((parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcT); + zgelqf->devices_index_mask = 1<<0; /* Only CPU ! 
*/ + parsec_context_add_taskpool(parsec, zgelqf); + parsec_context_start(parsec); + parsec_context_wait(parsec); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zgelqf->nb_task_classes; i++) { + for(int j = 0; NULL != zgelqf->task_classes_array[i]->incarnations[j].hook; j++) { + if( zgelqf->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < NT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcT.super.super.data_of(&dcT.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zplrnt( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t*)&dcT ); + parsec_taskpool_t *zgelqf_device = dplasma_zgelqf_New((parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcT); + parsec_context_add_taskpool(parsec, zgelqf_device); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zgelqf_Destruct(zgelqf_device); + } + } + + dplasma_zgelqf_Destruct(zgelqf); + +} diff --git a/tests/testing_zgelqf_hqr.c b/tests/testing_zgelqf_hqr.c index ee528a80..147d6bcc 100644 --- a/tests/testing_zgelqf_hqr.c +++ b/tests/testing_zgelqf_hqr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2020 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -20,6 +20,7 @@ static int check_solution( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcA, parsec_tiled_matrix_t *dcB, parsec_tiled_matrix_t *dcX ); +static void warmup_hqr(parsec_context_t *parsec, int *iparam); int main(int argc, char ** argv) { @@ -47,6 +48,8 @@ int main(int argc, char ** argv) PASTE_CODE_IPARAM_LOCALS(iparam); PASTE_CODE_FLOPS(FLOPS_ZGELQF, ((DagDouble_t)M, (DagDouble_t)N)); + warmup_hqr(parsec, iparam); + LDA = max(M, LDA); /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, @@ -81,79 +84,81 @@ int main(int argc, char ** argv) rank, MB, NB, LDB, NRHS, 0, 0, N, NRHS, P, nodes/P, KP, KQ, IP, JQ)); - /* matrix generation */ - if(loud > 3) printf("+++ Generate matrices ... 
"); - dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); - if( check ) - dplasma_zlacpy( parsec, dplasmaUpperLower, - (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); - if(loud > 3) printf("Done\n"); - dplasma_hqr_init( &qrtree, - dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, - iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE], - iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); - - /* Create PaRSEC */ - PASTE_CODE_ENQUEUE_KERNEL(parsec, zgelqf_param, - (&qrtree, - (parsec_tiled_matrix_t*)&dcA, - (parsec_tiled_matrix_t*)&dcTS, - (parsec_tiled_matrix_t*)&dcTT)); - - /* lets rock! This code should be copy the PASTE_CODE_PROGRESS_KERNEL macro */ - SYNC_TIME_START(); - parsec_context_start(parsec); - TIME_START(); - parsec_context_wait(parsec); + dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); + + for(int t = 0; t < iparam[IPARAM_NRUNS]; t++) { + /* matrix generation */ + if(loud > 3) printf("+++ Generate matrices ... "); + dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + if( check ) + dplasma_zlacpy( parsec, dplasmaUpperLower, + (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + if(loud > 3) printf("Done\n"); + + /* Create PaRSEC */ + PASTE_CODE_ENQUEUE_KERNEL(parsec, zgelqf_param, + (&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT)); + + /* lets rock! 
This code should be a copy of the PASTE_CODE_PROGRESS_KERNEL macro */ + SYNC_TIME_START(); + parsec_context_start(parsec); + TIME_START(); + parsec_context_wait(parsec); + + SYNC_TIME_PRINT(rank, + ("zgelqf HQR computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d M= %d N= %d : %f gflops\n", + iparam[IPARAM_NNODES], + iparam[IPARAM_NCORES], + iparam[IPARAM_P], + iparam[IPARAM_IB], + iparam[IPARAM_MB], + iparam[IPARAM_NB], + iparam[IPARAM_QR_TS_SZE], + iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_LOWLVL_TREE], + iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_DOMINO], + iparam[IPARAM_QR_TSRR], + iparam[IPARAM_M], + iparam[IPARAM_N], + gflops = (flops/1e9)/(sync_time_elapsed))); + if(loud >= 5 && rank == 0) { + printf("\n" + "%g\n" + "\n", + gflops); + } - SYNC_TIME_PRINT(rank, - ("zgelqf HQR computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d M= %d N= %d : %f gflops\n", - iparam[IPARAM_NNODES], - iparam[IPARAM_NCORES], - iparam[IPARAM_P], - iparam[IPARAM_IB], - iparam[IPARAM_MB], - iparam[IPARAM_NB], - iparam[IPARAM_QR_TS_SZE], - iparam[IPARAM_QR_HLVL_SZE], - iparam[IPARAM_LOWLVL_TREE], - iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_DOMINO], - iparam[IPARAM_QR_TSRR], - iparam[IPARAM_M], - iparam[IPARAM_N], - gflops = (flops/1e9)/(sync_time_elapsed))); - if(loud >= 5 && rank == 0) { - printf("\n" - "%g\n" - "\n", - gflops); + dplasma_zgelqf_param_Destruct( PARSEC_zgelqf_param ); } #if defined(PARSEC_SIM) if ( rank == 0 ) { printf("zgelqf HQR simulation NP= %d NC= %d P= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d MT= %d NT= %d : %d \n", - iparam[IPARAM_NNODES], - iparam[IPARAM_NCORES], - iparam[IPARAM_P], - iparam[IPARAM_QR_TS_SZE], - iparam[IPARAM_QR_HLVL_SZE], - iparam[IPARAM_LOWLVL_TREE], - iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_DOMINO], - iparam[IPARAM_QR_TSRR], - MT, NT, - parsec_getsimulationdate( parsec )); + iparam[IPARAM_NNODES], + iparam[IPARAM_NCORES], + iparam[IPARAM_P], + iparam[IPARAM_QR_TS_SZE], + iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_LOWLVL_TREE], + iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_DOMINO], + iparam[IPARAM_QR_TSRR], + MT, NT, + parsec_getsimulationdate( parsec )); } #endif - dplasma_zgelqf_param_Destruct( PARSEC_zgelqf_param ); - if( check ) { if (N >= M) { if(loud > 2) printf("+++ Generate the Q ..."); @@ -397,3 +402,127 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...)
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_hqr(parsec_context_t *parsec, int *iparam) +{ + dplasma_qrtree_t qrtree; + int M, N, LDA, MB, NB, IB, MT; + /* Fixed problem size */ + M = 1000; + N = 1000; + LDA = 1000; + MB = 100; + NB = 100; + IB = 10; + MT = M/MB; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + iparam[IPARAM_RANK], MB, NB, LDA, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + PASTE_CODE_ALLOCATE_MATRIX(dcTS, 1, + parsec_matrix_block_cyclic, (&dcTS, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + iparam[IPARAM_RANK], IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + PASTE_CODE_ALLOCATE_MATRIX(dcTT, 1, + parsec_matrix_block_cyclic, (&dcTT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + iparam[IPARAM_RANK], IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + dcTS.super.super.rank_of = always_local_rank_of; + dcTS.super.super.rank_of_key = always_local_rank_of_key; + dcTT.super.super.rank_of = always_local_rank_of; + dcTT.super.super.rank_of_key = always_local_rank_of_key; + + dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, iparam[IPARAM_RANDOM_SEED] ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + + dplasma_hqr_init( &qrtree, + dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); + + /* Create PaRSEC */ + parsec_taskpool_t *zgelqf_param_tp = dplasma_zgelqf_param_New(&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT); + zgelqf_param_tp->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zgelqf_param_tp); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_hqr_finalize( &qrtree ); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zgelqf_param_tp->nb_task_classes; i++) { + for(int j = 0; NULL != zgelqf_param_tp->task_classes_array[i]->incarnations[j].hook; j++) { + if( zgelqf_param_tp->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < MT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTS.super.super.data_of(&dcTS.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTT.super.super.data_of(&dcTT.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, iparam[IPARAM_RANDOM_SEED] ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + + dplasma_hqr_init( &qrtree, + dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); + + /* Create PaRSEC */ + parsec_taskpool_t *zgelqf_device = dplasma_zgelqf_param_New(&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT); + parsec_context_add_taskpool(parsec, zgelqf_device); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_hqr_finalize( &qrtree ); + + parsec_taskpool_free(zgelqf_device); + } + } + + parsec_taskpool_free(zgelqf_param_tp); + + parsec_data_free(dcA.mat); + parsec_data_free(dcTS.mat); + parsec_data_free(dcTT.mat); + parsec_tiled_matrix_destroy((parsec_tiled_matrix_t*)&dcA); + parsec_tiled_matrix_destroy((parsec_tiled_matrix_t*)&dcTS); + parsec_tiled_matrix_destroy((parsec_tiled_matrix_t*)&dcTT); +} diff --git a/tests/testing_zgelqf_systolic.c b/tests/testing_zgelqf_systolic.c index 874d9552..08b7b64e 100644 --- a/tests/testing_zgelqf_systolic.c +++ b/tests/testing_zgelqf_systolic.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2020 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -20,6 +20,7 @@ static int check_solution( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcA, parsec_tiled_matrix_t *dcB, parsec_tiled_matrix_t *dcX ); +static void warmup_hqr(parsec_context_t *parsec, int *iparam); int main(int argc, char ** argv) { @@ -46,6 +47,8 @@ int main(int argc, char ** argv) PASTE_CODE_IPARAM_LOCALS(iparam); PASTE_CODE_FLOPS(FLOPS_ZGELQF, ((DagDouble_t)M, (DagDouble_t)N)); + warmup_hqr(parsec, iparam); + LDA = max(M, LDA); /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, @@ -80,59 +83,61 @@ int main(int argc, char ** argv) rank, MB, NB, LDB, NRHS, 0, 0, N, NRHS, P, nodes/P, KP, KQ, IP, JQ)); - /* matrix generation */ - if(loud > 3) printf("+++ Generate matrices ... 
"); - dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); - if( check ) - dplasma_zlacpy( parsec, dplasmaUpperLower, - (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); - if(loud > 3) printf("Done\n"); - - dplasma_systolic_init( &qrtree, - dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, - iparam[IPARAM_P], - iparam[IPARAM_Q] ); - - /* Create PaRSEC */ - PASTE_CODE_ENQUEUE_KERNEL(parsec, zgelqf_param, - (&qrtree, - (parsec_tiled_matrix_t*)&dcA, - (parsec_tiled_matrix_t*)&dcTS, - (parsec_tiled_matrix_t*)&dcTT)); - - /* lets rock! This code should be copy the PASTE_CODE_PROGRESS_KERNEL macro */ - SYNC_TIME_START(); - parsec_context_start(parsec); - TIME_START(); - parsec_context_wait(parsec); - - SYNC_TIME_PRINT(rank, - ("zgelqf systolic computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d M= %d N= %d : %f gflops\n", - iparam[IPARAM_NNODES], - iparam[IPARAM_NCORES], - iparam[IPARAM_P], - iparam[IPARAM_IB], - iparam[IPARAM_MB], - iparam[IPARAM_NB], - iparam[IPARAM_Q], - iparam[IPARAM_P], - iparam[IPARAM_LOWLVL_TREE], - iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_DOMINO], - iparam[IPARAM_QR_TSRR], - iparam[IPARAM_M], - iparam[IPARAM_N], - gflops = (flops/1e9)/(sync_time_elapsed))); - if(loud >= 5 && rank == 0) { - printf("\n" - "%g\n" - "\n", - gflops); + for(int t = 0; t < iparam[IPARAM_NRUNS]; t++) { + /* matrix generation */ + if(loud > 3) printf("+++ Generate matrices ... "); + dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + if( check ) + dplasma_zlacpy( parsec, dplasmaUpperLower, + (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + if(loud > 3) printf("Done\n"); + + dplasma_systolic_init( &qrtree, + dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_P], + iparam[IPARAM_Q] ); + + /* Create PaRSEC */ + PASTE_CODE_ENQUEUE_KERNEL(parsec, zgelqf_param, + (&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT)); + + /* lets rock! 
This code should be a copy of the PASTE_CODE_PROGRESS_KERNEL macro */ + SYNC_TIME_START(); + parsec_context_start(parsec); + TIME_START(); + parsec_context_wait(parsec); + + SYNC_TIME_PRINT(rank, + ("zgelqf systolic computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d M= %d N= %d : %f gflops\n", + iparam[IPARAM_NNODES], + iparam[IPARAM_NCORES], + iparam[IPARAM_P], + iparam[IPARAM_IB], + iparam[IPARAM_MB], + iparam[IPARAM_NB], + iparam[IPARAM_Q], + iparam[IPARAM_P], + iparam[IPARAM_LOWLVL_TREE], + iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_DOMINO], + iparam[IPARAM_QR_TSRR], + iparam[IPARAM_M], + iparam[IPARAM_N], + gflops = (flops/1e9)/(sync_time_elapsed))); + if(loud >= 5 && rank == 0) { + printf("\n" + "%g\n" + "\n", + gflops); + } + dplasma_zgelqf_param_Destruct( PARSEC_zgelqf_param ); } - #if defined(PARSEC_SIM) if ( rank == 0 ) { printf("zgelqf systolic simulation NP= %d NC= %d P= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d MT= %d NT= %d : %d \n", @@ -150,8 +155,6 @@ int main(int argc, char ** argv) } #endif - dplasma_zgelqf_param_Destruct( PARSEC_zgelqf_param ); - if( check ) { if (N >= M) { if(loud > 2) printf("+++ Generate the Q ..."); @@ -395,3 +398,123 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) +{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_hqr(parsec_context_t *parsec, int *iparam) +{ + dplasma_qrtree_t qrtree; + int M, N, LDA, MB, NB, IB, MT; + /* Fixed problem size */ + M = 1000; + N = 1000; + LDA = 1000; + MB = 100; + NB = 100; + IB = 10; + MT = M/MB; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + iparam[IPARAM_RANK], MB, NB, LDA, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + PASTE_CODE_ALLOCATE_MATRIX(dcTS, 1, + parsec_matrix_block_cyclic, (&dcTS, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + iparam[IPARAM_RANK], IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + PASTE_CODE_ALLOCATE_MATRIX(dcTT, 1, + parsec_matrix_block_cyclic, (&dcTT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + iparam[IPARAM_RANK], IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + dcTS.super.super.rank_of = always_local_rank_of; + dcTS.super.super.rank_of_key = always_local_rank_of_key; + dcTT.super.super.rank_of = always_local_rank_of; + dcTT.super.super.rank_of_key = always_local_rank_of_key; + + dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, iparam[IPARAM_RANDOM_SEED] ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + + dplasma_systolic_init( &qrtree, + dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, + 1, 1 ); + + /* Create PaRSEC */ + parsec_taskpool_t *zgelqf_sys_tp = dplasma_zgelqf_param_New(&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT); + zgelqf_sys_tp->devices_index_mask = 1<<0; /* Only CPU !
*/ + parsec_context_add_taskpool(parsec, zgelqf_sys_tp); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_systolic_finalize( &qrtree ); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zgelqf_sys_tp->nb_task_classes; i++) { + for(int j = 0; NULL != zgelqf_sys_tp->task_classes_array[i]->incarnations[j].hook; j++) { + if( zgelqf_sys_tp->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < MT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTS.super.super.data_of(&dcTS.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTT.super.super.data_of(&dcTT.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, iparam[IPARAM_RANDOM_SEED] ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + + dplasma_systolic_init( &qrtree, + dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, + 1, 1 ); + + /* Create PaRSEC */ + parsec_taskpool_t *zgelqf_device = dplasma_zgelqf_param_New(&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT); + parsec_context_add_taskpool(parsec, zgelqf_device); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_systolic_finalize( &qrtree ); + + parsec_taskpool_free(zgelqf_device); + } + } + + parsec_taskpool_free(zgelqf_sys_tp); + + parsec_data_free(dcA.mat); + parsec_data_free(dcTS.mat); + parsec_data_free(dcTT.mat); + parsec_tiled_matrix_destroy((parsec_tiled_matrix_t*)&dcA); + parsec_tiled_matrix_destroy((parsec_tiled_matrix_t*)&dcTS); + parsec_tiled_matrix_destroy((parsec_tiled_matrix_t*)&dcTT); +} diff --git a/tests/testing_zgemm.c b/tests/testing_zgemm.c index c9f753d3..fc2dadb8 100644 --- a/tests/testing_zgemm.c +++ b/tests/testing_zgemm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2020 The University of Tennessee and The University + * Copyright (c) 2009-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -16,6 +16,7 @@ static int check_solution( parsec_context_t *parsec, int loud, int Bm, int Bn, int Bseed, dplasma_complex64_t beta, int M, int N, int Cseed, parsec_matrix_block_cyclic_t *dcCfinal ); +static void warmup_zgemm(int rank, int nodes, int random_seed, parsec_context_t *parsec); int main(int argc, char ** argv) { @@ -49,6 +50,8 @@ int main(int argc, char ** argv) LDB = max(LDB, max(K, N)); LDC = max(LDC, M); + warmup_zgemm(rank, nodes, random_seed, parsec); + PASTE_CODE_ALLOCATE_MATRIX(dcC, 1, parsec_matrix_block_cyclic, (&dcC, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, rank, MB, NB, LDC, N, 0, 0, @@ -282,3 +285,76 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static void warmup_zgemm(int rank, int nodes, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int NB = 64; + int KB = 64; + int MT = nodes; + int NT = 1; + int KT = 1; + int M = MT*MB; + int N = NT*NB; + int K = KT*KB; + int did; + unsigned int rs = (unsigned int)random_seed; + int Aseed = rand_r(&rs); + int Bseed = rand_r(&rs); + int Cseed = rand_r(&rs); + int tA = dplasmaNoTrans; + int tB = dplasmaNoTrans; + dplasma_complex64_t alpha = 0.51; + dplasma_complex64_t beta = -0.42; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, KB, M, K, 0, 0, + M, K, nodes, 1, 1, 1, 0, 0)); + + PASTE_CODE_ALLOCATE_MATRIX(dcB, 1, + parsec_matrix_block_cyclic, (&dcB, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, KB, NB, K, N, 0, 0, + K, N, 1, 1, 1, 1, 0, 0)); + + PASTE_CODE_ALLOCATE_MATRIX(dcC, 1, + parsec_matrix_block_cyclic, (&dcC, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, nodes, 1, 1, 1, 0, 0)); + + /* Do the CPU warmup first */ + dplasma_zplrnt( parsec, 0, &dcA.super, Aseed); + dplasma_zplrnt( parsec, 0, &dcB.super, Bseed); + dplasma_zplrnt( parsec, 0, &dcC.super, Cseed); + parsec_taskpool_t *zgemm = dplasma_zgemm_New(tA, tB, alpha, &dcA.super, &dcB.super, beta, &dcC.super); + zgemm->devices_index_mask = 1<<0; /* Only CPU ! 
*/ + parsec_context_add_taskpool(parsec, zgemm); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zgemm_Destruct(zgemm); + + /* Now do the other devices, skipping RECURSIVE */ + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j < NT; j++) { + if( rank == (int)dcC.super.super.rank_of(&dcC.super.super, i, j) ) { + parsec_data_t *dta = dcC.super.super.data_of(&dcC.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } + dplasma_zplrnt( parsec, 0, &dcA.super, Aseed); + dplasma_zplrnt( parsec, 0, &dcB.super, Bseed); + dplasma_zplrnt( parsec, 0, &dcC.super, Cseed); + dplasma_zgemm(parsec, tA, tB, alpha, &dcA.super, &dcB.super, beta, &dcC.super); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( &dcA.super ); + parsec_data_free(dcB.mat); dcB.mat = NULL; + parsec_tiled_matrix_destroy( &dcB.super ); + parsec_data_free(dcC.mat); dcC.mat = NULL; + parsec_tiled_matrix_destroy( &dcC.super ); +} diff --git a/tests/testing_zgemm_dtd.c b/tests/testing_zgemm_dtd.c index c54d537d..f6eb7320 100644 --- a/tests/testing_zgemm_dtd.c +++ b/tests/testing_zgemm_dtd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020 The University of Tennessee and The University + * Copyright (c) 2015-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -21,6 +21,7 @@ static int check_solution( parsec_context_t *parsec, int loud, int Bm, int Bn, int Bseed, dplasma_complex64_t beta, int M, int N, int Cseed, parsec_matrix_block_cyclic_t *dcCfinal ); +static void warmup_zgemm(int rank, int nodes, int random_seed, parsec_context_t *parsec); static int parsec_core_gemm(parsec_execution_stream_t *es, parsec_task_t *this_task) @@ -84,6 +85,8 @@ int main(int argc, char ** argv) LDB = max(LDB, max(K, N)); LDC = max(LDC, M); + warmup_zgemm(rank, nodes, random_seed, parsec); + PASTE_CODE_ALLOCATE_MATRIX(dcC, 1, parsec_matrix_block_cyclic, (&dcC, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, rank, MB, NB, LDC, N, 0, 0, @@ -669,3 +672,76 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static void warmup_zgemm(int rank, int nodes, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int NB = 64; + int KB = 64; + int MT = nodes; + int NT = 1; + int KT = 1; + int M = MT*MB; + int N = NT*NB; + int K = KT*KB; + int did; + unsigned int rs = (unsigned int)random_seed; + int Aseed = rand_r(&rs); + int Bseed = rand_r(&rs); + int Cseed = rand_r(&rs); + int tA = dplasmaNoTrans; + int tB = dplasmaNoTrans; + dplasma_complex64_t alpha = 0.51; + dplasma_complex64_t beta = -0.42; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, KB, M, K, 0, 0, + M, K, nodes, 1, 1, 1, 0, 0)); + + PASTE_CODE_ALLOCATE_MATRIX(dcB, 1, + parsec_matrix_block_cyclic, (&dcB, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, KB, NB, K, N, 0, 0, + K, N, 1, 1, 1, 1, 0, 0)); + + PASTE_CODE_ALLOCATE_MATRIX(dcC, 1, + parsec_matrix_block_cyclic, (&dcC, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, nodes, 1, 1, 1, 0, 0)); + + /* Do the CPU warmup first */ + dplasma_zplrnt( parsec, 0, &dcA.super, Aseed); + dplasma_zplrnt( parsec, 0, 
&dcB.super, Bseed); + dplasma_zplrnt( parsec, 0, &dcC.super, Cseed); + parsec_taskpool_t *zgemm = dplasma_zgemm_New(tA, tB, alpha, &dcA.super, &dcB.super, beta, &dcC.super); + zgemm->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zgemm); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zgemm_Destruct(zgemm); + + /* Now do the other devices, skipping RECURSIVE */ + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j < NT; j++) { + if( rank == (int)dcC.super.super.rank_of(&dcC.super.super, i, j) ) { + parsec_data_t *dta = dcC.super.super.data_of(&dcC.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } + dplasma_zplrnt( parsec, 0, &dcA.super, Aseed); + dplasma_zplrnt( parsec, 0, &dcB.super, Bseed); + dplasma_zplrnt( parsec, 0, &dcC.super, Cseed); + dplasma_zgemm(parsec, tA, tB, alpha, &dcA.super, &dcB.super, beta, &dcC.super); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( &dcA.super ); + parsec_data_free(dcB.mat); dcB.mat = NULL; + parsec_tiled_matrix_destroy( &dcB.super ); + parsec_data_free(dcC.mat); dcC.mat = NULL; + parsec_tiled_matrix_destroy( &dcC.super ); +} diff --git a/tests/testing_zgeqrf.c b/tests/testing_zgeqrf.c index fdf63a9f..3f3d3af1 100644 --- a/tests/testing_zgeqrf.c +++ b/tests/testing_zgeqrf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2021 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -20,6 +20,7 @@ static int check_solution( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcA, parsec_tiled_matrix_t *dcB, parsec_tiled_matrix_t *dcX ); +static void warmup_zgeqrf(int rank, int random_seed, parsec_context_t *parsec); int main(int argc, char ** argv) { @@ -43,6 +44,8 @@ int main(int argc, char ** argv) LDA = max(M, LDA); LDB = max(M, LDB); + warmup_zgeqrf(rank, random_seed, parsec); + /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, @@ -363,3 +366,71 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) 
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zgeqrf(int rank, int random_seed, parsec_context_t *parsec) +{ + int IB = 32; + int MB = 64; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int did; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcT, 1, + parsec_matrix_block_cyclic, (&dcT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcT.super.super.rank_of = always_local_rank_of; + dcT.super.super.rank_of_key = always_local_rank_of_key; + + dplasma_zpltmg( parsec, dplasmaMatrixRandom, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcT); + parsec_taskpool_t *zgeqrf = dplasma_zgeqrf_New( (parsec_tiled_matrix_t*)&dcA, (parsec_tiled_matrix_t*)&dcT); + zgeqrf->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zgeqrf); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zgeqrf_Destruct(zgeqrf); + + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j < NT; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcT.super.super.data_of(&dcT.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, dplasmaMatrixRandom, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcT); + dplasma_zgeqrf( parsec, (parsec_tiled_matrix_t*)&dcA, (parsec_tiled_matrix_t*)&dcT); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA ); + parsec_data_free(dcT.mat); dcT.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcT ); + parsec_devices_reset_load(parsec); +} diff --git a/tests/testing_zgeqrf_dtd.c b/tests/testing_zgeqrf_dtd.c index 839212ff..cd4bb238 100644 --- a/tests/testing_zgeqrf_dtd.c +++ b/tests/testing_zgeqrf_dtd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020 The University of Tennessee and The University + * Copyright (c) 2015-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -12,6 +12,18 @@ #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include "parsec/interfaces/dtd/insert_function.h" +static int check_orthogonality(parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *Q); +static int check_factorization(parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *Aorig, + parsec_tiled_matrix_t *A, + parsec_tiled_matrix_t *Q); +static int check_solution( parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *dcA, + parsec_tiled_matrix_t *dcB, + parsec_tiled_matrix_t *dcX ); +static void warmup_zgeqrf(int rank, int random_seed, parsec_context_t *parsec); + /* Global indices for the different datatypes */ static int TILE_FULL, TILE_RECTANGLE; @@ -120,17 +132,6 @@ parsec_core_tsmqr(parsec_execution_stream_t *es, parsec_task_t *this_task) return PARSEC_HOOK_RETURN_DONE; } -static int check_orthogonality(parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *Q); -static int check_factorization(parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *Aorig, - parsec_tiled_matrix_t *A, - parsec_tiled_matrix_t *Q); -static int check_solution( parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *dcA, - parsec_tiled_matrix_t *dcB, - parsec_tiled_matrix_t *dcX ); - int main(int argc, char **argv) { parsec_context_t* parsec; @@ -153,6 +154,8 @@ int main(int argc, char **argv) LDA = max(M, LDA); LDB = max(M, LDB); + warmup_zgeqrf(rank, random_seed, parsec); + /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, @@ -594,3 +597,71 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) +{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zgeqrf(int rank, int random_seed, parsec_context_t *parsec) +{ + int IB = 32; + int MB = 64; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int did; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcT, 1, + parsec_matrix_block_cyclic, (&dcT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcT.super.super.rank_of = always_local_rank_of; + dcT.super.super.rank_of_key = always_local_rank_of_key; + + dplasma_zpltmg( parsec, dplasmaMatrixRandom, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcT); + parsec_taskpool_t *zgeqrf = dplasma_zgeqrf_New( (parsec_tiled_matrix_t*)&dcA, (parsec_tiled_matrix_t*)&dcT); + zgeqrf->devices_index_mask = 1<<0; /* Only CPU ! 
*/ + parsec_context_add_taskpool(parsec, zgeqrf); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zgeqrf_Destruct(zgeqrf); + + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j < NT; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcT.super.super.data_of(&dcT.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, dplasmaMatrixRandom, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcT); + dplasma_zgeqrf( parsec, (parsec_tiled_matrix_t*)&dcA, (parsec_tiled_matrix_t*)&dcT); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA ); + parsec_data_free(dcT.mat); dcT.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcT ); + parsec_devices_reset_load(parsec); +} diff --git a/tests/testing_zgeqrf_dtd_untied.c b/tests/testing_zgeqrf_dtd_untied.c index 5e2fb5cf..7645db4e 100644 --- a/tests/testing_zgeqrf_dtd_untied.c +++ b/tests/testing_zgeqrf_dtd_untied.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020 The University of Tennessee and The University + * Copyright (c) 2015-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -12,6 +12,18 @@ #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include "parsec/interfaces/dtd/insert_function.h" +static int check_orthogonality(parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *Q); +static int check_factorization(parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *Aorig, + parsec_tiled_matrix_t *A, + parsec_tiled_matrix_t *Q); +static int check_solution( parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *dcA, + parsec_tiled_matrix_t *dcB, + parsec_tiled_matrix_t *dcX ); +static void warmup_zgeqrf(int rank, int random_seed, parsec_context_t *parsec); + /* Global indices for the different datatypes */ static int TILE_FULL, TILE_LOWER, @@ -252,17 +264,6 @@ insert_task_geqrf(parsec_execution_stream_t *es, parsec_task_t *this_task) return PARSEC_HOOK_RETURN_DONE; } -static int check_orthogonality(parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *Q); -static int check_factorization(parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *Aorig, - parsec_tiled_matrix_t *A, - parsec_tiled_matrix_t *Q); -static int check_solution( parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *dcA, - parsec_tiled_matrix_t *dcB, - parsec_tiled_matrix_t *dcX ); - int main(int argc, char ** argv) { parsec_context_t* parsec; @@ -285,6 +286,8 @@ int main(int argc, char ** argv) LDA = max(M, LDA); LDB = max(M, LDB); + warmup_zgeqrf(rank, random_seed, parsec); + /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, @@ -647,3 +650,71 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) 
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zgeqrf(int rank, int random_seed, parsec_context_t *parsec) +{ + int IB = 32; + int MB = 64; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int did; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcT, 1, + parsec_matrix_block_cyclic, (&dcT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcT.super.super.rank_of = always_local_rank_of; + dcT.super.super.rank_of_key = always_local_rank_of_key; + + dplasma_zpltmg( parsec, dplasmaMatrixRandom, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcT); + parsec_taskpool_t *zgeqrf = dplasma_zgeqrf_New( (parsec_tiled_matrix_t*)&dcA, (parsec_tiled_matrix_t*)&dcT); + zgeqrf->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zgeqrf); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zgeqrf_Destruct(zgeqrf); + + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j < NT; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcT.super.super.data_of(&dcT.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, dplasmaMatrixRandom, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcT); + dplasma_zgeqrf( parsec, (parsec_tiled_matrix_t*)&dcA, (parsec_tiled_matrix_t*)&dcT); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA ); + parsec_data_free(dcT.mat); dcT.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcT ); + parsec_devices_reset_load(parsec); +} diff --git a/tests/testing_zgeqrf_hqr.c b/tests/testing_zgeqrf_hqr.c index fce601c9..09d260a1 100644 --- a/tests/testing_zgeqrf_hqr.c +++ b/tests/testing_zgeqrf_hqr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2020 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -20,6 +20,7 @@ static int check_solution( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcA, parsec_tiled_matrix_t *dcB, parsec_tiled_matrix_t *dcX ); +static void warmup_zgeqrf_hqr(int rank, int random_seed, int *iparam, parsec_context_t *parsec); int main(int argc, char ** argv) { @@ -48,6 +49,9 @@ int main(int argc, char ** argv) PASTE_CODE_FLOPS(FLOPS_ZGEQRF, ((DagDouble_t)M, (DagDouble_t)N)); LDA = max(M, LDA); + + warmup_zgeqrf_hqr(rank, random_seed, iparam, parsec); + /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, @@ -81,58 +85,61 @@ int main(int argc, char ** argv) rank, MB, NB, LDB, NRHS, 0, 0, M, NRHS, P, nodes/P, KP, KQ, IP, JQ)); - /* matrix generation */ - if(loud > 3) printf("+++ Generate matrices ... "); - dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); - if( check ) - dplasma_zlacpy( parsec, dplasmaUpperLower, - (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); - if(loud > 3) printf("Done\n"); - dplasma_hqr_init( &qrtree, - dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA, - iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE], - iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); - - /* Create PaRSEC */ - PASTE_CODE_ENQUEUE_KERNEL(parsec, zgeqrf_param, - (&qrtree, - (parsec_tiled_matrix_t*)&dcA, - (parsec_tiled_matrix_t*)&dcTS, - (parsec_tiled_matrix_t*)&dcTT)); - - /* lets rock! This code should be copy the PASTE_CODE_PROGRESS_KERNEL macro */ - SYNC_TIME_START(); - parsec_context_start(parsec); - TIME_START(); - parsec_context_wait(parsec); - - SYNC_TIME_PRINT(rank, - ("zgeqrf HQR computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d M= %d N= %d : %f gflops\n", - iparam[IPARAM_NNODES], - iparam[IPARAM_NCORES], - iparam[IPARAM_P], - iparam[IPARAM_IB], - iparam[IPARAM_MB], - iparam[IPARAM_NB], - iparam[IPARAM_QR_TS_SZE], - iparam[IPARAM_QR_HLVL_SZE], - iparam[IPARAM_LOWLVL_TREE], - iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_DOMINO], - iparam[IPARAM_QR_TSRR], - iparam[IPARAM_M], - iparam[IPARAM_N], - gflops = (flops/1e9)/(sync_time_elapsed))); - if(loud >= 5 && rank == 0) { - printf("\n" - "%g\n" - "\n", - gflops); + dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); + + for(int t = 0; t < nruns; t++) { + /* matrix generation */ + if(loud > 3) printf("+++ Generate matrices ... "); + dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + if( check ) + dplasma_zlacpy( parsec, dplasmaUpperLower, + (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + if(loud > 3) printf("Done\n"); + + /* Create PaRSEC */ + PASTE_CODE_ENQUEUE_KERNEL(parsec, zgeqrf_param, + (&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT)); + + /* lets rock! 
This code should be a copy of the PASTE_CODE_PROGRESS_KERNEL macro */
+        SYNC_TIME_START();
+        parsec_context_start(parsec);
+        TIME_START();
+        parsec_context_wait(parsec);
+
+        SYNC_TIME_PRINT(rank,
+                        ("zgeqrf HQR computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d M= %d N= %d : %f gflops\n",
+                         iparam[IPARAM_NNODES],
+                         iparam[IPARAM_NCORES],
+                         iparam[IPARAM_P],
+                         iparam[IPARAM_IB],
+                         iparam[IPARAM_MB],
+                         iparam[IPARAM_NB],
+                         iparam[IPARAM_QR_TS_SZE],
+                         iparam[IPARAM_QR_HLVL_SZE],
+                         iparam[IPARAM_LOWLVL_TREE],
+                         iparam[IPARAM_HIGHLVL_TREE],
+                         iparam[IPARAM_QR_DOMINO],
+                         iparam[IPARAM_QR_TSRR],
+                         iparam[IPARAM_M],
+                         iparam[IPARAM_N],
+                         gflops = (flops/1e9)/(sync_time_elapsed)));
+        if(loud >= 5 && rank == 0) {
+            printf("<DartMeasurement name=\"performance\" type=\"numeric/double\">\n"
+                   "%g\n"
+                   "</DartMeasurement>\n",
+                   gflops);
+        }
+        dplasma_zgeqrf_param_Destruct( PARSEC_zgeqrf_param );
     }
 
 #if defined(PARSEC_SIM)
@@ -152,8 +159,6 @@ int main(int argc, char ** argv)
     }
 #endif
 
-    dplasma_zgeqrf_param_Destruct( PARSEC_zgeqrf_param );
-
     if( check ) {
         if (M >= N) {
             if(loud > 2) printf("+++ Generate the Q ...");
@@ -397,3 +402,118 @@ static int check_solution( parsec_context_t *parsec, int loud,
 
     return info_solution;
 }
+
+static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...)
+{
+    return desc->myrank;
+}
+
+static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key)
+{
+    (void)key;
+    return desc->myrank;
+}
+
+static void warmup_zgeqrf_hqr(int rank, int random_seed, int *iparam, parsec_context_t *parsec)
+{
+    int MB = 64;
+    int IB = 40;
+    int NB = 64;
+    int MT = 4;
+    int NT = 4;
+    int N = NB*NT;
+    int M = MB*MT;
+    int LDA = N;
+    dplasma_qrtree_t qrtree;
+
+    /* initializing matrix structure */
+    PASTE_CODE_ALLOCATE_MATRIX(dcA, 1,
+        parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
+                                     rank, MB, NB, LDA, N, 0, 0,
+                                     M, N, 1, 1, 1, 1, 0, 0));
+    dcA.super.super.rank_of = always_local_rank_of;
+    dcA.super.super.rank_of_key = always_local_rank_of_key;
+    PASTE_CODE_ALLOCATE_MATRIX(dcTS, 1,
+        parsec_matrix_block_cyclic, (&dcTS, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
+                                     rank, IB, NB, MT*IB, N, 0, 0,
+                                     MT*IB, N, 1, 1, 1, 1, 0, 0));
+    dcTS.super.super.rank_of = always_local_rank_of;
+    dcTS.super.super.rank_of_key = always_local_rank_of_key;
+    PASTE_CODE_ALLOCATE_MATRIX(dcTT, 1,
+        parsec_matrix_block_cyclic, (&dcTT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
+                                     rank, IB, NB, MT*IB, N, 0, 0,
+                                     MT*IB, N, 1, 1, 1, 1, 0, 0));
+    dcTT.super.super.rank_of = always_local_rank_of;
+    dcTT.super.super.rank_of_key = always_local_rank_of_key;
+
+    /* Do the CPU warmup first */
+    dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, random_seed );
+    dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS);
+    dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT);
+
+    dplasma_hqr_init( &qrtree,
+                      dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA,
+                      iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE],
+                      iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE],
+                      iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] );
+
+    parsec_taskpool_t *zgeqrf_hqr = dplasma_zgeqrf_param_New(&qrtree,
+                                                             (parsec_tiled_matrix_t*)&dcA,
+                                                             (parsec_tiled_matrix_t*)&dcTS,
+                                                             (parsec_tiled_matrix_t*)&dcTT);
+    zgeqrf_hqr->devices_index_mask = 1<<0; /* Only CPU !
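+                                              (bit 0 of the device mask is the CPU device, so this first
+                                              warmup pass exercises the CPU incarnations of every task
+                                              class before any accelerator is involved)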
*/ + parsec_context_add_taskpool(parsec, zgeqrf_hqr); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_hqr_finalize( &qrtree ); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zgeqrf_hqr->nb_task_classes; i++) { + for(int j = 0; NULL != zgeqrf_hqr->task_classes_array[i]->incarnations[j].hook; j++) { + if( zgeqrf_hqr->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < NT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTS.super.super.data_of(&dcTS.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTT.super.super.data_of(&dcTT.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + + dplasma_hqr_init( &qrtree, + dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); + + parsec_taskpool_t *zgeqrf_hqr_device = dplasma_zgeqrf_param_New(&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT); + parsec_context_add_taskpool(parsec, zgeqrf_hqr_device); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_hqr_finalize( &qrtree ); + dplasma_zgeqrf_param_Destruct(zgeqrf_hqr_device); + } + } + + dplasma_zgeqrf_param_Destruct(zgeqrf_hqr); +} diff --git a/tests/testing_zgeqrf_systolic.c b/tests/testing_zgeqrf_systolic.c index 06664b51..8f9db2a8 100644 --- a/tests/testing_zgeqrf_systolic.c +++ b/tests/testing_zgeqrf_systolic.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2020 The University of Tennessee and The University + * Copyright (c) 2009-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -20,6 +20,7 @@ static int check_solution( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcA, parsec_tiled_matrix_t *dcB, parsec_tiled_matrix_t *dcX ); +static void warmup_zgeqrf_systolic(int rank, int random_seed, int *iparam, parsec_context_t *parsec); int main(int argc, char ** argv) { @@ -47,6 +48,9 @@ int main(int argc, char ** argv) PASTE_CODE_FLOPS(FLOPS_ZGEQRF, ((DagDouble_t)M, (DagDouble_t)N)); LDA = max(M, LDA); + + warmup_zgeqrf_systolic(rank, random_seed, iparam, parsec); + /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, @@ -80,53 +84,56 @@ int main(int argc, char ** argv) rank, MB, NB, LDB, NRHS, 0, 0, M, NRHS, P, nodes/P, KP, KQ, IP, JQ)); - /* matrix generation */ - if(loud > 2) printf("+++ Generate matrices ... "); - dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcA, 3872); - if( check ) - dplasma_zlacpy( parsec, dplasmaUpperLower, - (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); - if(loud > 2) printf("Done\n"); - dplasma_systolic_init( &qrtree, - dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA, - iparam[IPARAM_P], - iparam[IPARAM_Q] ); - - /* Create PaRSEC */ - PASTE_CODE_ENQUEUE_KERNEL(parsec, zgeqrf_param, - (&qrtree, - (parsec_tiled_matrix_t*)&dcA, - (parsec_tiled_matrix_t*)&dcTS, - (parsec_tiled_matrix_t*)&dcTT)); - - /* lets rock! This code should be copy the PASTE_CODE_PROGRESS_KERNEL macro */ - SYNC_TIME_START(); - parsec_context_start(parsec); - TIME_START(); - parsec_context_wait(parsec); - - SYNC_TIME_PRINT(rank, - ("zgeqrf_systolic computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d M= %d N= %d : %f gflops\n", - iparam[IPARAM_NNODES], - iparam[IPARAM_NCORES], - iparam[IPARAM_P], - iparam[IPARAM_IB], - iparam[IPARAM_MB], - iparam[IPARAM_NB], - iparam[IPARAM_Q], - iparam[IPARAM_P], - iparam[IPARAM_M], - iparam[IPARAM_N], - gflops = (flops/1e9)/(sync_time_elapsed))); - if(loud >= 5 && rank == 0) { - printf("\n" - "%g\n" - "\n", - gflops); + dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_P], + iparam[IPARAM_Q] ); + + for(int t = 0; t < nruns; t++) { + /* matrix generation */ + if(loud > 2) printf("+++ Generate matrices ... "); + dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcA, 3872); + if( check ) + dplasma_zlacpy( parsec, dplasmaUpperLower, + (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + if(loud > 2) printf("Done\n"); + + /* Create PaRSEC */ + PASTE_CODE_ENQUEUE_KERNEL(parsec, zgeqrf_param, + (&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT)); + + /* lets rock! 
This code should be a copy of the PASTE_CODE_PROGRESS_KERNEL macro */
+        SYNC_TIME_START();
+        parsec_context_start(parsec);
+        TIME_START();
+        parsec_context_wait(parsec);
+
+        SYNC_TIME_PRINT(rank,
+                        ("zgeqrf_systolic computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d M= %d N= %d : %f gflops\n",
+                         iparam[IPARAM_NNODES],
+                         iparam[IPARAM_NCORES],
+                         iparam[IPARAM_P],
+                         iparam[IPARAM_IB],
+                         iparam[IPARAM_MB],
+                         iparam[IPARAM_NB],
+                         iparam[IPARAM_Q],
+                         iparam[IPARAM_P],
+                         iparam[IPARAM_M],
+                         iparam[IPARAM_N],
+                         gflops = (flops/1e9)/(sync_time_elapsed)));
+        if(loud >= 5 && rank == 0) {
+            printf("<DartMeasurement name=\"performance\" type=\"numeric/double\">\n"
+                   "%g\n"
+                   "</DartMeasurement>\n",
+                   gflops);
+        }
+        dplasma_zgeqrf_param_Destruct( PARSEC_zgeqrf_param );
     }
 
 #if defined(PARSEC_SIM)
@@ -142,8 +149,6 @@ int main(int argc, char ** argv)
     }
 #endif
 
-    dplasma_zgeqrf_param_Destruct( PARSEC_zgeqrf_param );
-
     if( check ) {
         if (M >= N) {
             if(loud > 2) printf("+++ Generate the Q ...");
@@ -384,3 +389,116 @@ static int check_solution( parsec_context_t *parsec, int loud,
 
     return info_solution;
 }
+
+static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...)
+{
+    return desc->myrank;
+}
+
+static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key)
+{
+    (void)key;
+    return desc->myrank;
+}
+
+static void warmup_zgeqrf_systolic(int rank, int random_seed, int *iparam, parsec_context_t *parsec)
+{
+    int MB = 64;
+    int IB = 40;
+    int NB = 64;
+    int MT = 4;
+    int NT = 4;
+    int N = NB*NT;
+    int M = MB*MT;
+    int LDA = N;
+    dplasma_qrtree_t qrtree;
+
+    /* initializing matrix structure */
+    PASTE_CODE_ALLOCATE_MATRIX(dcA, 1,
+        parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
+                                     rank, MB, NB, LDA, N, 0, 0,
+                                     M, N, 1, 1, 1, 1, 0, 0));
+    dcA.super.super.rank_of = always_local_rank_of;
+    dcA.super.super.rank_of_key = always_local_rank_of_key;
+    PASTE_CODE_ALLOCATE_MATRIX(dcTS, 1,
+        parsec_matrix_block_cyclic, (&dcTS, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
+                                     rank, IB, NB, MT*IB, N, 0, 0,
+                                     MT*IB, N, 1, 1, 1, 1, 0, 0));
+    dcTS.super.super.rank_of = always_local_rank_of;
+    dcTS.super.super.rank_of_key = always_local_rank_of_key;
+    PASTE_CODE_ALLOCATE_MATRIX(dcTT, 1,
+        parsec_matrix_block_cyclic, (&dcTT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
+                                     rank, IB, NB, MT*IB, N, 0, 0,
+                                     MT*IB, N, 1, 1, 1, 1, 0, 0));
+    dcTT.super.super.rank_of = always_local_rank_of;
+    dcTT.super.super.rank_of_key = always_local_rank_of_key;
+
+    /* Do the CPU warmup first */
+    dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, random_seed );
+    dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS);
+    dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT);
+
+    dplasma_systolic_init( &qrtree,
+                           dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA,
+                           iparam[IPARAM_P],
+                           iparam[IPARAM_Q] );
+
+    parsec_taskpool_t *zgeqrf_systolic = dplasma_zgeqrf_param_New(&qrtree,
+                                                                  (parsec_tiled_matrix_t*)&dcA,
+                                                                  (parsec_tiled_matrix_t*)&dcTS,
+                                                                  (parsec_tiled_matrix_t*)&dcTT);
+    zgeqrf_systolic->devices_index_mask = 1<<0; /* Only CPU !
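+                                                   (the device-type scan below then re-enqueues one fresh
+                                                   taskpool per non-CPU device that provides an incarnation,
+                                                   after advising the tiles onto that device)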
*/ + parsec_context_add_taskpool(parsec, zgeqrf_systolic); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_systolic_finalize( &qrtree ); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zgeqrf_systolic->nb_task_classes; i++) { + for(int j = 0; NULL != zgeqrf_systolic->task_classes_array[i]->incarnations[j].hook; j++) { + if( zgeqrf_systolic->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < NT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTS.super.super.data_of(&dcTS.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTT.super.super.data_of(&dcTT.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + + dplasma_systolic_init( &qrtree, + dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_P], + iparam[IPARAM_Q] ); + + parsec_taskpool_t *device_zgeqrf_systolic = dplasma_zgeqrf_param_New(&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT); + parsec_context_add_taskpool(parsec, device_zgeqrf_systolic); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_systolic_finalize( &qrtree ); + dplasma_zgeqrf_param_Destruct(device_zgeqrf_systolic); + } + } + + dplasma_zgeqrf_param_Destruct(zgeqrf_systolic); +} diff --git a/tests/testing_zgesvd.c b/tests/testing_zgesvd.c index 66b8e109..02787cd7 100644 --- a/tests/testing_zgesvd.c +++ b/tests/testing_zgesvd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2020 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2015-2016 Inria, CNRS (LaBRI - UMR 5800), University of @@ -13,6 +13,7 @@ #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" static int check_solution(int N, const double *E1, const double *E2); +static void warmup_zgesvd(int rank, int random_seed, parsec_context_t *parsec); int main(int argc, char ** argv) { @@ -20,7 +21,8 @@ int main(int argc, char ** argv) int iparam[IPARAM_SIZEOF]; int ret = 0; double *s0 = NULL; - double *s1; + double *s1 = NULL; + double *e = NULL; int minMN; int info_solution; double time_ge2gb, time_gb2bd, time_solve = -1.; @@ -52,6 +54,8 @@ int main(int argc, char ** argv) LDA = max(M, LDA); + warmup_zgesvd(rank, random_seed, parsec); + if ( M < N ) { fprintf(stderr, "This testing can only perform SVD on matrices with M >= N\n"); return EXIT_FAILURE; @@ -68,123 +72,122 @@ int main(int argc, char ** argv) rank, MB+1, NB, MB+1, minMN, 0, 0, MB+1, minMN, 1, 1, 1, 1, IP, JQ)); - /* Initialize the matrix */ - if(loud > 3) printf("+++ Generate matrices ... "); - - /* Generate the matrix on rank 0 */ - if ( check ) { + s1 = (double*)malloc( minMN * sizeof(double)); + e = (double*)malloc( minMN * sizeof(double)); - /* Generate the singular values vector as in latms routines for check purpose */ - if (rank == 0) - { - double tmp = 1. / (double)N; - double alp = ( 1. - tmp ) / ((double)( N - 1 )); - int i; - s0 = (double *) malloc(minMN * sizeof(double)); - - s0[0] = 1.; - for(i=1; i < minMN; i++){ - s0[i] = (double)(N-i-1) * alp + tmp; - } - } + for(int t = 0; t < nruns; t++) { + /* Initialize the matrix */ + if(loud > 3) printf("+++ Generate matrices ... "); - dplasma_zlatms( parsec, dplasmaGeneral, (double)N, (parsec_tiled_matrix_t *)&dcA, 3872); - } - else { - dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcA, 3872); - } + /* Generate the matrix on rank 0 */ + if ( check ) { - /* Create Parsec */ - PASTE_CODE_ENQUEUE_KERNEL(parsec, zgebrd_ge2gb, - (IB, - (parsec_tiled_matrix_t*)&dcA, - (parsec_tiled_matrix_t*)&dcBand)); + /* Generate the singular values vector as in latms routines for check purpose */ + if (rank == 0 && NULL ==s0 ) + { + double tmp = 1. / (double)N; + double alp = ( 1. - tmp ) / ((double)( N - 1 )); + int i; + s0 = (double *) malloc(minMN * sizeof(double)); - /* lets rock! */ - SYNC_TIME_START(); - rc = parsec_context_start(parsec); - PARSEC_CHECK_ERROR(rc, "parsec_context_start"); - TIME_START(); - rc = parsec_context_wait(parsec); - PARSEC_CHECK_ERROR(rc, "parsec_context_wait"); - SYNC_TIME_STOP(); - time_ge2gb = sync_time_elapsed; + s0[0] = 1.; + for(i=1; i < minMN; i++){ + s0[i] = (double)(N-i-1) * alp + tmp; + } + } - if( rank == 0 ) { - double *e; + dplasma_zlatms( parsec, dplasmaGeneral, (double)N, (parsec_tiled_matrix_t *)&dcA, 3872); + } + else { + dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcA, 3872); + } - s1 = (double*)malloc( minMN * sizeof(double)); - e = (double*)malloc( minMN * sizeof(double)); + /* Create Parsec */ + PASTE_CODE_ENQUEUE_KERNEL(parsec, zgebrd_ge2gb, + (IB, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcBand)); -/* #if defined(__ICC) || defined(__INTEL_COMPILER) */ -/* mkl_set_num_threads( iparam[IPARAM_NCORES] ); */ -/* #endif */ - /* Reduce the band */ + /* lets rock! 
*/ + SYNC_TIME_START(); + rc = parsec_context_start(parsec); + PARSEC_CHECK_ERROR(rc, "parsec_context_start"); TIME_START(); - info_solution = LAPACKE_zgbbrd( LAPACK_COL_MAJOR, - 'N', - M, N, - 0, 0, NB, - dcBand.mat, MB+1, - s1, e, - NULL, 1, - NULL, 1, - NULL, 1 ); - TIME_STOP(); - time_gb2bd = time_elapsed; - - /* Solve the bidiagonal SVD problem */ - if (info_solution == 0){ + rc = parsec_context_wait(parsec); + PARSEC_CHECK_ERROR(rc, "parsec_context_wait"); + SYNC_TIME_STOP(); + time_ge2gb = sync_time_elapsed; + + if( rank == 0 ) { + /* #if defined(__ICC) || defined(__INTEL_COMPILER) */ + /* mkl_set_num_threads( iparam[IPARAM_NCORES] ); */ + /* #endif */ + /* Reduce the band */ TIME_START(); - info_solution = LAPACKE_zbdsqr( LAPACK_COL_MAJOR, 'U', - minMN, 0, 0, 0, + info_solution = LAPACKE_zgbbrd( LAPACK_COL_MAJOR, + 'N', + M, N, + 0, 0, NB, + dcBand.mat, MB+1, s1, e, - NULL, 1, NULL, 1, NULL, 1 ); + NULL, 1, + NULL, 1, + NULL, 1 ); TIME_STOP(); - time_solve = time_elapsed; + time_gb2bd = time_elapsed; + + /* Solve the bidiagonal SVD problem */ + if (info_solution == 0){ + TIME_START(); + info_solution = LAPACKE_zbdsqr( LAPACK_COL_MAJOR, 'U', + minMN, 0, 0, 0, + s1, e, + NULL, 1, NULL, 1, NULL, 1 ); + TIME_STOP(); + time_solve = time_elapsed; + } + + /* #if defined(__ICC) || defined(__INTEL_COMPILER) */ + /* mkl_set_num_threads( 1 ); */ + /* #endif */ + fprintf(stderr, "WARNING: This code is using the non optimized Lapack zbdsqr subroutine to reduce the band to bi-diagonal form. Please replace this call by the multi-threaded PLASMA implementation in order to get performance\n"); + printf("zgeqrf GESVD computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d R-bidiag= %d M= %d N= %d : %e %e %e / %f gflops\n", + iparam[IPARAM_NNODES], + iparam[IPARAM_NCORES], + iparam[IPARAM_P], + iparam[IPARAM_IB], + iparam[IPARAM_MB], + iparam[IPARAM_NB], + iparam[IPARAM_QR_TS_SZE], + iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_LOWLVL_TREE], + iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_DOMINO], + iparam[IPARAM_QR_TSRR], + iparam[IPARAM_M], + iparam[IPARAM_N], + time_ge2gb, time_gb2bd, time_solve, + gflops = (flops/1e9)/(time_ge2gb+time_gb2bd+time_solve)); + + #if defined(PARSEC_SIM) + printf("zgeqrf GESVD simulation NP= %d NC= %d P= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d MT= %d NT= %d : %d \n", + iparam[IPARAM_NNODES], + iparam[IPARAM_NCORES], + iparam[IPARAM_P], + iparam[IPARAM_QR_TS_SZE], + iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_LOWLVL_TREE], + iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_DOMINO], + iparam[IPARAM_QR_TSRR], + MT, NT, + parsec_getsimulationdate( parsec )); + #endif } - free(e); -/* #if defined(__ICC) || defined(__INTEL_COMPILER) */ -/* mkl_set_num_threads( 1 ); */ -/* #endif */ - fprintf(stderr, "WARNING: This code is using the non optimized Lapack zbdsqr subroutine to reduce the band to bi-diagonal form. 
Please replace this call by the multi-threaded PLASMA implementation in order to get performance\n"); - printf("zgeqrf GESVD computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d R-bidiag= %d M= %d N= %d : %e %e %e / %f gflops\n", - iparam[IPARAM_NNODES], - iparam[IPARAM_NCORES], - iparam[IPARAM_P], - iparam[IPARAM_IB], - iparam[IPARAM_MB], - iparam[IPARAM_NB], - iparam[IPARAM_QR_TS_SZE], - iparam[IPARAM_QR_HLVL_SZE], - iparam[IPARAM_LOWLVL_TREE], - iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_DOMINO], - iparam[IPARAM_QR_TSRR], - iparam[IPARAM_M], - iparam[IPARAM_N], - time_ge2gb, time_gb2bd, time_solve, - gflops = (flops/1e9)/(time_ge2gb+time_gb2bd+time_solve)); - -#if defined(PARSEC_SIM) - printf("zgeqrf GESVD simulation NP= %d NC= %d P= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d MT= %d NT= %d : %d \n", - iparam[IPARAM_NNODES], - iparam[IPARAM_NCORES], - iparam[IPARAM_P], - iparam[IPARAM_QR_TS_SZE], - iparam[IPARAM_QR_HLVL_SZE], - iparam[IPARAM_LOWLVL_TREE], - iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_DOMINO], - iparam[IPARAM_QR_TSRR], - MT, NT, - parsec_getsimulationdate( parsec )); -#endif + dplasma_zgebrd_ge2gb_Destruct( PARSEC_zgebrd_ge2gb ); } - dplasma_zgebrd_ge2gb_Destruct( PARSEC_zgebrd_ge2gb ); - if( check && (rank==0) ) { if (info_solution == 0 ) { info_solution = check_solution(minMN, s0, s1); @@ -200,10 +203,12 @@ int main(int argc, char ** argv) " ---- TESTING ZGESVD .. M >= N .. FAILED !\n" "***************************************************\n"); } - free(s1); - free(s0); } + free(s1); + free(s0); + free(e); + parsec_data_free(dcA.mat); parsec_data_free(dcBand.mat); @@ -259,3 +264,111 @@ static int check_solution(int N, const double *E1, const double *E2) } return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) +{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zgesvd(int rank, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int IB = 40; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + double *s1, *e; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcBand, 1, + parsec_matrix_block_cyclic, (&dcBand, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_LAPACK, + rank, MB+1, NB, MB+1, M, 0, 0, + MB+1, M, 1, 1, 1, 1, 0, 0)); + dcBand.super.super.rank_of = always_local_rank_of; + dcBand.super.super.rank_of_key = always_local_rank_of_key; + s1 = (double*)malloc( M * sizeof(double)); + e = (double*)malloc( M * sizeof(double)); + + /* Do the CPU warmup first */ + dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcA, random_seed); + parsec_taskpool_t *zgesvd = dplasma_zgebrd_ge2gb_New(IB, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcBand); + zgesvd->devices_index_mask = 1<<0; /* Only CPU ! 
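+                                         (the ge2gb taskpool stays on the CPU here; the LAPACKE_zgbbrd and
+                                         LAPACKE_zbdsqr calls just below warm up the band-reduction and
+                                         bidiagonal-SVD stages, which always run on the CPU)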
*/
+    parsec_context_add_taskpool(parsec, zgesvd);
+    parsec_context_start(parsec);
+    parsec_context_wait(parsec);
+    (void)LAPACKE_zgbbrd( LAPACK_COL_MAJOR,
+                          'N',
+                          M, N,
+                          0, 0, NB,
+                          dcBand.mat, MB+1,
+                          s1, e,
+                          NULL, 1,
+                          NULL, 1,
+                          NULL, 1 );
+    (void)LAPACKE_zbdsqr( LAPACK_COL_MAJOR, 'U',
+                          M, 0, 0, 0,
+                          s1, e,
+                          NULL, 1, NULL, 1, NULL, 1 );
+
+    /* Check for which device types (skipping RECURSIVE) we need to warm up this operation */
+    for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
+        for(int i = 0; i < (int)zgesvd->nb_task_classes; i++) {
+            for(int j = 0; NULL != zgesvd->task_classes_array[i]->incarnations[j].hook; j++) {
+                if( zgesvd->task_classes_array[i]->incarnations[j].type == dtype ) {
+                    goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
+                }
+            }
+        }
+        continue; /* No incarnation of this device type on any task class; try another type */
+      do_run:
+        for(int did = 0; did < (int)parsec_nb_devices; did++) {
+            parsec_device_module_t *dev = parsec_mca_device_get(did);
+            if(dev->type != dtype)
+                continue;
+            /* This should work, right? Unfortunately, we can't test it until there is an implementation of this test enabled for such a device */
+            for(int m = 0; m < MT; m++) {
+                for(int n = 0; n < NT; n++) {
+                    parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n);
+                    parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
+                    if(m == 0) {
+                        dta = dcBand.super.super.data_of(&dcBand.super.super, m, n);
+                        parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
+                    }
+                }
+            }
+            dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcA, random_seed);
+            parsec_taskpool_t *zgesvd_device = dplasma_zgebrd_ge2gb_New(IB,
+                                   (parsec_tiled_matrix_t*)&dcA,
+                                   (parsec_tiled_matrix_t*)&dcBand);
+            parsec_context_add_taskpool(parsec, zgesvd_device);
+            parsec_context_start(parsec);
+            parsec_context_wait(parsec);
+            dplasma_zgebrd_ge2gb_Destruct( zgesvd_device );
+            /* No need to redo zgbbrd and zbdsqr as those are LAPACK / CPU-only */
+        }
+    }
+
+    free(e);
+    free(s1);
+    dplasma_zgebrd_ge2gb_Destruct( zgesvd );
+
+}
diff --git a/tests/testing_zgetrf_incpiv.c b/tests/testing_zgetrf_incpiv.c
index fce18b09..2eef974c 100644
--- a/tests/testing_zgetrf_incpiv.c
+++ b/tests/testing_zgetrf_incpiv.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009-2021 The University of Tennessee and The University
+ * Copyright (c) 2009-2023 The University of Tennessee and The University
  *                         of Tennessee Research Foundation. All rights
  *                         reserved.
* @@ -14,11 +14,11 @@ static int check_solution( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcA, parsec_tiled_matrix_t *dcB, parsec_tiled_matrix_t *dcX ); - static int check_inverse( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcA, parsec_tiled_matrix_t *dcInvA, parsec_tiled_matrix_t *dcI ); +static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec); int main(int argc, char ** argv) { @@ -40,6 +40,7 @@ int main(int argc, char ** argv) PASTE_CODE_FLOPS(FLOPS_ZGETRF, ((DagDouble_t)M,(DagDouble_t)N)); LDA = max(M, LDA); + warmup_zgetrf(rank, random_seed, parsec); if ( M != N && check ) { fprintf(stderr, "Check is impossible if M != N\n"); @@ -253,3 +254,100 @@ static int check_inverse( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) +{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int IB = 40; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int matrix_init = dplasmaMatrixRandom; + int info; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcL, 1, + parsec_matrix_block_cyclic, (&dcL, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcL.super.super.rank_of = always_local_rank_of; + dcL.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcIPIV, 1, + parsec_matrix_block_cyclic, (&dcIPIV, PARSEC_MATRIX_INTEGER, PARSEC_MATRIX_TILE, + rank, MB, 1, M, NT, 0, 0, + M, NT, 1, 1, 1, 1, 0, 0)); + dcIPIV.super.super.rank_of = always_local_rank_of; + dcIPIV.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + parsec_taskpool_t *zgetrf_incpiv = dplasma_zgetrf_incpiv_New((parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcL, + (parsec_tiled_matrix_t*)&dcIPIV, + &info); + zgetrf_incpiv->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zgetrf_incpiv); + parsec_context_start(parsec); + parsec_context_wait(parsec); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zgetrf_incpiv->nb_task_classes; i++) { + for(int j = 0; NULL != zgetrf_incpiv->task_classes_array[i]->incarnations[j].hook; j++) { + if( zgetrf_incpiv->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < NT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcL.super.super.data_of(&dcL.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcIPIV.super.super.data_of(&dcIPIV.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + parsec_taskpool_t *zgetrf_incpiv_device = dplasma_zgetrf_incpiv_New((parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcL, + (parsec_tiled_matrix_t*)&dcIPIV, + &info); + parsec_context_add_taskpool(parsec, zgetrf_incpiv_device); + parsec_context_start(parsec); + parsec_context_wait(parsec); + + dplasma_zgetrf_incpiv_Destruct(zgetrf_incpiv_device); + } + } + + dplasma_zgetrf_incpiv_Destruct(zgetrf_incpiv); + +} diff --git a/tests/testing_zgetrf_incpiv_dtd.c b/tests/testing_zgetrf_incpiv_dtd.c index d710ef9c..e1044774 100644 --- a/tests/testing_zgetrf_incpiv_dtd.c +++ b/tests/testing_zgetrf_incpiv_dtd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020 The University of Tennessee and The University + * Copyright (c) 2015-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -12,6 +12,17 @@ #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include "parsec/interfaces/dtd/insert_function.h" +static int check_solution( parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *dcA, + parsec_tiled_matrix_t *dcB, + parsec_tiled_matrix_t *dcX ); + +static int check_inverse( parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *dcA, + parsec_tiled_matrix_t *dcInvA, + parsec_tiled_matrix_t *dcI ); +static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec); + /* Global indices for the different datatypes */ static int TILE_FULL, TILE_RECTANGLE, @@ -124,16 +135,6 @@ parsec_core_ssssm(parsec_execution_stream_t *es, parsec_task_t * this_task) return PARSEC_HOOK_RETURN_DONE; } -static int check_solution( parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *dcA, - parsec_tiled_matrix_t *dcB, - parsec_tiled_matrix_t *dcX ); - -static int check_inverse( parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *dcA, - parsec_tiled_matrix_t *dcInvA, - parsec_tiled_matrix_t *dcI ); - int main(int argc, char ** argv) { parsec_context_t* parsec; @@ -159,6 +160,7 @@ int main(int argc, char ** argv) fprintf(stderr, "Check is impossible if M != N\n"); check = 0; } + warmup_zgetrf(rank, random_seed, parsec); /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, @@ -555,3 +557,100 @@ static int check_inverse( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) 
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int IB = 40; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int matrix_init = dplasmaMatrixRandom; + int info; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcL, 1, + parsec_matrix_block_cyclic, (&dcL, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcL.super.super.rank_of = always_local_rank_of; + dcL.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcIPIV, 1, + parsec_matrix_block_cyclic, (&dcIPIV, PARSEC_MATRIX_INTEGER, PARSEC_MATRIX_TILE, + rank, MB, 1, M, NT, 0, 0, + M, NT, 1, 1, 1, 1, 0, 0)); + dcIPIV.super.super.rank_of = always_local_rank_of; + dcIPIV.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + parsec_taskpool_t *zgetrf_incpiv = dplasma_zgetrf_incpiv_New((parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcL, + (parsec_tiled_matrix_t*)&dcIPIV, + &info); + zgetrf_incpiv->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zgetrf_incpiv); + parsec_context_start(parsec); + parsec_context_wait(parsec); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zgetrf_incpiv->nb_task_classes; i++) { + for(int j = 0; NULL != zgetrf_incpiv->task_classes_array[i]->incarnations[j].hook; j++) { + if( zgetrf_incpiv->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < NT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcL.super.super.data_of(&dcL.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcIPIV.super.super.data_of(&dcIPIV.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + parsec_taskpool_t *zgetrf_incpiv_device = dplasma_zgetrf_incpiv_New((parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcL, + (parsec_tiled_matrix_t*)&dcIPIV, + &info); + parsec_context_add_taskpool(parsec, zgetrf_incpiv_device); + parsec_context_start(parsec); + parsec_context_wait(parsec); + + dplasma_zgetrf_incpiv_Destruct(zgetrf_incpiv_device); + } + } + + dplasma_zgetrf_incpiv_Destruct(zgetrf_incpiv); + +} diff --git a/tests/testing_zheev.c b/tests/testing_zheev.c index e44dda64..c2845e50 100644 --- a/tests/testing_zheev.c +++ b/tests/testing_zheev.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2020 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -21,6 +21,7 @@ #undef PRINTF_HEAVY static int check_solution(int N, double *E1, double *E2, double eps); +static void warmup_zherbt(int rank, int random_seed, int uplo, parsec_context_t *parsec); int main(int argc, char *argv[]) { @@ -43,6 +44,8 @@ int main(int argc, char *argv[]) LDA = dplasma_imax( LDA, N ); LDB = dplasma_imax( LDB, N ); + warmup_zherbt(rank, random_seed, uplo, parsec); + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, rank, MB, NB, LDA, N, 0, 0, @@ -52,25 +55,27 @@ int main(int argc, char *argv[]) rank, IB, NB, MT*IB, N, 0, 0, MT*IB, N, P, nodes/P, KP, KP, IP, JQ)); - /* Fill A with randomness */ - dplasma_zplghe( parsec, (double)N, uplo, - (parsec_tiled_matrix_t *)&dcA, 3872); + for(int t = 0; t < nruns; t++) { + /* Fill A with randomness */ + dplasma_zplghe( parsec, (double)N, uplo, + (parsec_tiled_matrix_t *)&dcA, random_seed); #ifdef PRINTF_HEAVY - printf("########### A (initial, tile storage)\n"); - dplasma_zprint( parsec, uplo, (parsec_tiled_matrix_t *)&dcA ); + printf("########### A (initial, tile storage)\n"); + dplasma_zprint( parsec, uplo, (parsec_tiled_matrix_t *)&dcA ); #endif - /* Step 1 - Reduction A to band matrix */ - PASTE_CODE_ENQUEUE_KERNEL(parsec, zherbt, - (uplo, IB, - (parsec_tiled_matrix_t*)&dcA, - (parsec_tiled_matrix_t*)&dcT)); - PASTE_CODE_PROGRESS_KERNEL(parsec, zherbt); + /* Step 1 - Reduction A to band matrix */ + PASTE_CODE_ENQUEUE_KERNEL(parsec, zherbt, + (uplo, IB, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcT)); + PASTE_CODE_PROGRESS_KERNEL(parsec, zherbt); #ifdef PRINTF_HEAVY - printf("########### A (reduced to band form)\n"); - dplasma_zprint( parsec, uplo, &dcA); + printf("########### A (reduced to band form)\n"); + dplasma_zprint( parsec, uplo, &dcA); #endif - + dplasma_zherbt_Destruct( PARSEC_zherbt ); + } goto fin; /* Step 2 - Conversion of the tiled band to 1D band storage */ @@ -243,17 +248,16 @@ goto fin; free(W0); free(D); free(E); } - dplasma_zherbt_Destruct( PARSEC_zherbt 
); parsec_taskpool_free( &PARSEC_diag_band_to_rect->super ); dplasma_zhbrdt_Destruct( PARSEC_zhbrdt ); parsec_data_free(dcBAND.mat); + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcBAND); +fin: parsec_data_free(dcA.mat); parsec_data_free(dcT.mat); - parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcBAND); parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA); parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcT); -fin: cleanup_parsec(parsec, iparam); return EXIT_SUCCESS; @@ -297,3 +301,85 @@ static int check_solution(int N, double *E1, double *E2, double eps) return info_solution; } +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) +{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zherbt(int rank, int random_seed, int uplo, parsec_context_t *parsec) +{ + int MB = 64; + int IB = 40; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int LDA = N; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, LDA, N, 0, 0, + N, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcT, 1, + parsec_matrix_block_cyclic, (&dcT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcT.super.super.rank_of = always_local_rank_of; + dcT.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zplghe( parsec, (double)N, uplo, (parsec_tiled_matrix_t *)&dcA, random_seed); + parsec_taskpool_t *zherbt = dplasma_zherbt_New(uplo, IB, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcT); + zherbt->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zherbt); + parsec_context_start(parsec); + parsec_context_wait(parsec); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zherbt->nb_task_classes; i++) { + for(int j = 0; NULL != zherbt->task_classes_array[i]->incarnations[j].hook; j++) { + if( zherbt->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < NT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcT.super.super.data_of(&dcT.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zplghe( parsec, (double)N, uplo, (parsec_tiled_matrix_t *)&dcA, random_seed); + parsec_taskpool_t *zherbt_device = dplasma_zherbt_New(uplo, IB, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcT); + parsec_context_add_taskpool(parsec, zherbt_device); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zherbt_Destruct(zherbt_device); + } + } + + dplasma_zherbt_Destruct(zherbt); +} diff --git a/tests/testing_zpoinv.c b/tests/testing_zpoinv.c index f907a189..01675de5 100644 --- a/tests/testing_zpoinv.c +++ b/tests/testing_zpoinv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2020 The University of Tennessee and The University + * Copyright (c) 2009-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -12,6 +12,8 @@ #include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" +static void warmup_zpoinv(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec); + int main(int argc, char ** argv) { parsec_context_t* parsec; @@ -36,29 +38,33 @@ int main(int argc, char ** argv) KP = 1; KQ = 1; + warmup_zpoinv(rank, uplo, random_seed, parsec); + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, rank, MB, NB, LDA, N, 0, 0, N, N, P, nodes/P, uplo)); - /* matrix generation */ - if(loud > 3) printf("+++ Generate matrices ... "); - dplasma_zplghe( parsec, (double)(N), uplo, - (parsec_tiled_matrix_t *)&dcA, random_seed); - if(loud > 3) printf("Done\n"); - - if (async) { - PASTE_CODE_ENQUEUE_KERNEL(parsec, zpoinv, - (uplo, (parsec_tiled_matrix_t*)&dcA, &info)); - PASTE_CODE_PROGRESS_KERNEL(parsec, zpoinv); - dplasma_zpoinv_Destruct( PARSEC_zpoinv ); - } - else { - SYNC_TIME_START(); - info = dplasma_zpoinv_sync( parsec, uplo, (parsec_tiled_matrix_t*)&dcA ); - SYNC_TIME_PRINT(rank, ("zpoinv\tPxQ= %3d %-3d NB= %4d N= %7d : %14f gflops\n", - P, Q, NB, N, - gflops=(flops/1e9)/sync_time_elapsed)); + for(int t = 0; t < nruns; t++) { + /* matrix generation */ + if(loud > 3) printf("+++ Generate matrices ... "); + dplasma_zplghe( parsec, (double)(N), uplo, + (parsec_tiled_matrix_t *)&dcA, random_seed); + if(loud > 3) printf("Done\n"); + + if (async) { + PASTE_CODE_ENQUEUE_KERNEL(parsec, zpoinv, + (uplo, (parsec_tiled_matrix_t*)&dcA, &info)); + PASTE_CODE_PROGRESS_KERNEL(parsec, zpoinv); + dplasma_zpoinv_Destruct( PARSEC_zpoinv ); + } + else { + SYNC_TIME_START(); + info = dplasma_zpoinv_sync( parsec, uplo, (parsec_tiled_matrix_t*)&dcA ); + SYNC_TIME_PRINT(rank, ("zpoinv\tPxQ= %3d %-3d NB= %4d N= %7d : %14f gflops\n", + P, Q, NB, N, + gflops=(flops/1e9)/sync_time_elapsed)); + } } if( 0 == rank && info != 0 ) { @@ -96,3 +102,68 @@ int main(int argc, char ** argv) cleanup_parsec(parsec, iparam); return ret; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) 
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zpoinv(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int NB = 64; + int MT = 4; + int NT = 4; + int M = MB*MT; + int N = NB*NT; + int did; + int info; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, uplo)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zplghe(parsec, (double)(N), uplo, &dcA.super, random_seed); + parsec_taskpool_t *zpoinv = dplasma_zpoinv_New(uplo, &dcA.super, &info ); + zpoinv->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zpoinv); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zpoinv_Destruct(zpoinv); + + /* Now do the other devices, skipping RECURSIVE */ + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + if(PARSEC_MATRIX_LOWER == uplo) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j <= i; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } else { + for(int i = 0; i < MT; i++) { + for(int j = i; j < NT; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } + dplasma_zplghe(parsec, (double)(N), uplo, &dcA.super, random_seed); + dplasma_zpoinv( parsec, uplo, &dcA.super ); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA ); +} diff --git a/tests/testing_zpotrf.c b/tests/testing_zpotrf.c index 5245adb9..c59fc33d 100644 --- a/tests/testing_zpotrf.c +++ b/tests/testing_zpotrf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2021 The University of Tennessee and The University + * Copyright (c) 2009-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -12,6 +12,8 @@ #include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" +static void warmup_zpotrf(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec); + int main(int argc, char ** argv) { parsec_context_t* parsec; @@ -35,6 +37,8 @@ int main(int argc, char ** argv) KP = 1; KQ = 1; + warmup_zpotrf(rank, uplo, random_seed, parsec); + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, rank, MB, NB, LDA, N, 0, 0, @@ -67,7 +71,7 @@ int main(int argc, char ** argv) } else { - PASTE_CODE_ENQUEUE_PROGRESS_DESTRUCT_KERNEL(parsec, zpotrf, + PASTE_CODE_ENQUEUE_PROGRESS_DESTRUCT_KERNEL(parsec, zpotrf, ( uplo, (parsec_tiled_matrix_t*)&dcA, &info), dplasma_zpotrf_Destruct( PARSEC_zpotrf )); } @@ -130,3 +134,69 @@ int main(int argc, char ** argv) cleanup_parsec(parsec, iparam); return ret; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) 
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zpotrf(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int did; + int info; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, uplo)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zplghe(parsec, (double)(N), uplo, &dcA.super, random_seed); + parsec_taskpool_t *zpotrf = dplasma_zpotrf_New(uplo, &dcA.super, &info ); + zpotrf->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zpotrf); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zpotrf_Destruct(zpotrf); + + /* Now do the other devices, skipping RECURSIVE */ + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + if(PARSEC_MATRIX_LOWER == uplo) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j <= i; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } else { + for(int i = 0; i < MT; i++) { + for(int j = i; j < NT; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } + dplasma_zplghe( parsec, (double)(N), uplo, + (parsec_tiled_matrix_t *)&dcA, random_seed); + dplasma_zpotrf( parsec, uplo, &dcA.super ); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA ); +} diff --git a/tests/testing_zpotrf_dtd.c b/tests/testing_zpotrf_dtd.c index 08beebe8..32e2dd38 100644 --- a/tests/testing_zpotrf_dtd.c +++ b/tests/testing_zpotrf_dtd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2020 The University of Tennessee and The University + * Copyright (c) 2013-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -20,6 +20,8 @@ #include #endif /* defined(DPLASMA_HAVE_CUDA) */ +static void warmup_zpotrf(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec); + /* Global index for the full tile datatype */ static int TILE_FULL; @@ -226,6 +228,8 @@ int main(int argc, char **argv) KP = 1; KQ = 1; + warmup_zpotrf(rank, uplo, random_seed, parsec); + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, rank, MB, NB, LDA, N, 0, 0, @@ -544,3 +548,69 @@ int main(int argc, char **argv) cleanup_parsec(parsec, iparam); return ret; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) 
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zpotrf(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int did; + int info; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, uplo)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zplghe(parsec, (double)(N), uplo, &dcA.super, random_seed); + parsec_taskpool_t *zpotrf = dplasma_zpotrf_New(uplo, &dcA.super, &info ); + zpotrf->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zpotrf); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zpotrf_Destruct(zpotrf); + + /* Now do the other devices, skipping RECURSIVE */ + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + if(PARSEC_MATRIX_LOWER == uplo) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j <= i; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } else { + for(int i = 0; i < MT; i++) { + for(int j = i; j < NT; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } + dplasma_zplghe( parsec, (double)(N), uplo, + (parsec_tiled_matrix_t *)&dcA, random_seed); + dplasma_zpotrf( parsec, uplo, &dcA.super ); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA ); +} diff --git a/tests/testing_zpotrf_dtd_untied.c b/tests/testing_zpotrf_dtd_untied.c index 63458744..55c39026 100644 --- a/tests/testing_zpotrf_dtd_untied.c +++ b/tests/testing_zpotrf_dtd_untied.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020 The University of Tennessee and The University + * Copyright (c) 2015-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -26,6 +26,8 @@ #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include "parsec/interfaces/dtd/insert_function.h" +static void warmup_zpotrf(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec); + /* Global index for the full tile datatype */ static int TILE_FULL; @@ -351,6 +353,8 @@ int main(int argc, char **argv) KP = 1; KQ = 1; + warmup_zpotrf(rank, uplo, random_seed, parsec); + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, rank, MB, NB, LDA, N, 0, 0, @@ -481,3 +485,69 @@ int main(int argc, char **argv) cleanup_parsec(parsec, iparam); return ret; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) 
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zpotrf(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int did; + int info; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, uplo)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zplghe(parsec, (double)(N), uplo, &dcA.super, random_seed); + parsec_taskpool_t *zpotrf = dplasma_zpotrf_New(uplo, &dcA.super, &info ); + zpotrf->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zpotrf); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zpotrf_Destruct(zpotrf); + + /* Now do the other devices, skipping RECURSIVE */ + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + if(PARSEC_MATRIX_LOWER == uplo) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j <= i; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } else { + for(int i = 0; i < MT; i++) { + for(int j = i; j < NT; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } + dplasma_zplghe( parsec, (double)(N), uplo, + (parsec_tiled_matrix_t *)&dcA, random_seed); + dplasma_zpotrf( parsec, uplo, &dcA.super ); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA ); +}
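---
Every warmup function added by this patch repeats the same two-phase pattern: one run of the taskpool restricted to the CPU device (devices_index_mask = 1<<0), then one run per non-RECURSIVE device type for which some task class has an incarnation. A minimal sketch of that shared driver is given below for reference; it relies only on the PaRSEC calls already used in the patch, but the helper itself (run_warmup, the make/destruct callbacks, the header paths) is a hypothetical refactoring, not dplasma code, and the per-test matrix generation and parsec_advise_data_on_device calls would still live in the callers.

/* Hypothetical sketch of the common warmup driver, assuming the PaRSEC
 * device API used throughout this patch. */
#include <parsec.h>
#include <parsec/mca/device/device.h> /* assumed header for parsec_mca_device_get() */

typedef parsec_taskpool_t *(*warmup_make_fn)(void *arg);   /* wraps a dplasma_*_New() */
typedef void (*warmup_destruct_fn)(parsec_taskpool_t *tp); /* wraps a dplasma_*_Destruct() */

static void run_warmup(parsec_context_t *parsec, warmup_make_fn make,
                       warmup_destruct_fn destruct, void *arg)
{
    /* Phase 1: CPU-only pass; bit 0 of the device mask is the CPU device */
    parsec_taskpool_t *tp = make(arg);
    tp->devices_index_mask = 1 << 0;
    parsec_context_add_taskpool(parsec, tp);
    parsec_context_start(parsec);
    parsec_context_wait(parsec);

    /* Phase 2: one pass per device type (skipping RECURSIVE) for which
     * at least one task class has an incarnation */
    for(int dtype = PARSEC_DEV_RECURSIVE + 1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
        int found = 0;
        for(int i = 0; !found && i < (int)tp->nb_task_classes; i++)
            for(int j = 0; NULL != tp->task_classes_array[i]->incarnations[j].hook; j++)
                if( tp->task_classes_array[i]->incarnations[j].type == dtype ) {
                    found = 1;
                    break;
                }
        if( !found ) continue;
        for(int did = 0; did < (int)parsec_nb_devices; did++) {
            if( parsec_mca_device_get(did)->type != dtype ) continue;
            /* callers would regenerate input and advise tiles onto device 'did' here */
            parsec_taskpool_t *dtp = make(arg); /* fresh taskpool, all devices enabled */
            parsec_context_add_taskpool(parsec, dtp);
            parsec_context_start(parsec);
            parsec_context_wait(parsec);
            destruct(dtp);
        }
    }
    destruct(tp);
}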