DPLASMA Warmup -- 2nd try #69

Merged: 6 commits, Jun 27, 2023 (changes shown from 4 commits)
20 changes: 11 additions & 9 deletions src/zgemm_NN_gpu.jdf
@@ -1,6 +1,6 @@
extern "C" %{
/*
- * Copyright (c) 2017-2020 The University of Tennessee and The University
+ * Copyright (c) 2017-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
@@ -18,7 +18,7 @@ extern "C" %{
#if defined(DPLASMA_HAVE_CUDA)
#include <cublas.h>
#endif /* defined(DPLASMA_HAVE_CUDA) */

static void succ(int *x, int *y, int *z, int xMax, int yMax, int zMax, int l)
{
int xn = *x, yn = *y, zn = *z;
@@ -41,7 +41,7 @@ static void succ(int *x, int *y, int *z, int xMax, int yMax, int zMax, int l)
} else {
zn = zn+1;
}

l--;
}
*x = xn;
@@ -58,13 +58,13 @@ static int succ_x(int x, int y, int z, int xMax, int yMax, int zMax, int l)
static int succ_y(int x, int y, int z, int xMax, int yMax, int zMax, int l)
{
succ(&x, &y, &z, xMax, yMax, zMax, l);
return y;
}

static int succ_z(int x, int y, int z, int xMax, int yMax, int zMax, int l)
{
succ(&x, &y, &z, xMax, yMax, zMax, l);
return z;
}

static void pred(int *x, int *y, int *z, int xMax, int yMax, int zMax, int l)
@@ -226,9 +226,11 @@ CTL Z <- Z LOCAL_BARRIER( m/(tB*tP), n/(tC*tQ), 0, u, v )
BODY
if( nb_cuda_devices > 0 ) {
int g = (n / tQ) % nb_cuda_devices;
-        parsec_advise_data_on_device( _f_C->original,
-                                      cuda_device_index[g],
-                                      PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
+        if( _f_C->original->preferred_device <= 0 ) {
+            parsec_advise_data_on_device( _f_C->original,
+                                          cuda_device_index[g],
+                                          PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
+        }
}
END

@@ -354,7 +356,7 @@ GEMM(m, n, k)

READ A <- A READ_A(m, k, x, y, z)
READ B <- B READ_B(k, n, x, y, z)
RW C <- k == 0 ? C READ_C(m, n)
: C GEMM(m, n, k-1 )
-> k + 1 == descB->mt ? descC(m, n)
: C GEMM(m, n, k+1)
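The hunk above makes the device advice idempotent: a tile of C is only pinned to a CUDA device when no preferred device has been recorded yet, so a warmup pass cannot override a placement chosen earlier. A minimal sketch of the same guard as a plain C helper (the standalone helper and its name are illustrative, not part of the patch):

    /* Advise a round-robin CUDA placement for a tile, but only when no
     * preferred device was recorded yet (illustrative helper). */
    static void advise_once(parsec_data_t *dta, int n, int tQ,
                            int nb_cuda_devices, int *cuda_device_index)
    {
        if (nb_cuda_devices <= 0) return;
        int g = (n / tQ) % nb_cuda_devices;  /* spread tile columns over devices */
        if (dta->preferred_device <= 0)      /* keep any earlier placement */
            parsec_advise_data_on_device(dta, cuda_device_index[g],
                                         PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE);
    }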
2 changes: 2 additions & 0 deletions src/zgeqrf.jdf
@@ -495,6 +495,8 @@ BODY [type=CUDA device=%{ return n; %}
WORK, ib,
WORKC, descA->mb,
parsec_body.stream );

parsec_gpu_push_workspace(gpu_device, gpu_stream);
}
END

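The one added call closes the pop/push cycle on the stream's workspace pool: the CUDA BODY takes scratch space with parsec_gpu_pop_workspace and, without the matching push, the slab is never returned to the per-stream pool. A condensed sketch of the pairing (the pop signature and the single-slab layout are assumptions based on PaRSEC's GPU device API; only the push line is literally in the patch):

    /* Inside a [type=CUDA] BODY: take one scratch slab for this execution
     * (assumed pop signature: device, stream, size in bytes). */
    dplasma_complex64_t *WORK  = parsec_gpu_pop_workspace(gpu_device, gpu_stream,
            (ib + descA->mb) * descA->nb * sizeof(dplasma_complex64_t));
    dplasma_complex64_t *WORKC = WORK + ib * descA->nb; /* assumed: second buffer in same slab */

    /* ... enqueue the kernel on parsec_body.stream using WORK and WORKC ... */

    /* Return the slab to the per-stream pool once the kernel is queued;
     * this is the line the patch adds. */
    parsec_gpu_push_workspace(gpu_device, gpu_stream);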
1 change: 1 addition & 0 deletions tests/common.h
@@ -283,6 +283,7 @@ static inline int min(int a, int b) { return a < b ? a : b; }
"</DartMeasurement>\n", \
gflops); \
} \
if(rank==0) fflush(stdout); \
(void)gflops;

#endif /* _TESTSCOMMON_H */
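The added fflush matters because the measurement above is scraped by the test harness: when stdout is redirected to a pipe it is block-buffered, and a job that aborts or finalizes MPI right after printing can lose the line. A standalone illustration of the pattern (the function and the exact DartMeasurement fields are illustrative):

    #include <stdio.h>

    /* Rank 0 emits a measurement that an external harness parses; flush
     * immediately so the line survives an abort or MPI finalization. */
    static void report_gflops(int rank, double gflops)
    {
        if (rank == 0) {
            printf("<DartMeasurement name=\"performance\" "
                   "type=\"numeric/double\">%g</DartMeasurement>\n", gflops);
            fflush(stdout);  /* mirrors the line added to the macro above */
        }
    }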
45 changes: 39 additions & 6 deletions tests/testing_zgebrd_ge2gb.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011-2020 The University of Tennessee and The University
+ * Copyright (c) 2011-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2015-2016 Inria, CNRS (LaBRI - UMR 5800), University of
@@ -95,9 +95,20 @@ int GD_cpQR( int p, int q ) {
}
}

static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...)
{
return desc->myrank;
}

static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key)
{
(void)key;
return desc->myrank;
}

int RunOneTest( parsec_context_t *parsec, int nodes, int cores, int rank, int loud,
int M, int N, int LDA, int MB, int NB, int IB, int P, int Q, int hmb,
-                int ltre0, int htre0, int ltree, int htree, int ts, int domino, int rbidiag )
+                int ltre0, int htre0, int ltree, int htree, int ts, int domino, int rbidiag, int nbrun )
{
int ret = 0;
dplasma_qrtree_t qrtre0, qrtree, lqtree;
@@ -106,7 +117,7 @@ int RunOneTest( parsec_context_t *parsec, int nodes, int cores, int rank, int lo
int MT = (M%MB==0) ? (M/MB) : (M/MB+1);
int NT = (N%NB==0) ? (N/NB) : (N/NB+1);
int cp = -1;
-    int i, nbrun = 3;
+    int i;
int rc;

//PASTE_CODE_FLOPS(FLOPS_ZGEBRD, ((DagDouble_t)M, (DagDouble_t)N));
@@ -151,6 +162,21 @@ int RunOneTest( parsec_context_t *parsec, int nodes, int cores, int rank, int lo
parsec_matrix_block_cyclic, (&dcBand, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_LAPACK,
rank, MB+1, NB, MB+1, minMN, 0, 0,
MB+1, minMN, 1, 1, 1, 1, 0, 0));
if(rank > 0 && nodes == 1 && loud == -1) {
/* Fix distributions for local-only testing */
dcA.super.super.rank_of = always_local_rank_of;
dcA.super.super.rank_of_key = always_local_rank_of_key;
dcTS0.super.super.rank_of = always_local_rank_of;
dcTS0.super.super.rank_of_key = always_local_rank_of_key;
dcTT0.super.super.rank_of = always_local_rank_of;
dcTT0.super.super.rank_of_key = always_local_rank_of_key;
dcTS.super.super.rank_of = always_local_rank_of;
dcTS.super.super.rank_of_key = always_local_rank_of_key;
dcTT.super.super.rank_of = always_local_rank_of;
dcTT.super.super.rank_of_key = always_local_rank_of_key;
dcBand.super.super.rank_of = always_local_rank_of;
dcBand.super.super.rank_of_key = always_local_rank_of_key;
}

/* Initialize the matrix */
if(loud > 3) printf("+++ Generate matrices ... ");
@@ -313,7 +339,7 @@ int RunOneTest( parsec_context_t *parsec, int nodes, int cores, int rank, int lo
time_avg += sync_time_elapsed;
gflops = (flops/1.e9)/(sync_time_elapsed);

-        if (rank == 0){
+        if (rank == 0 && loud >= 0){
fprintf(stdout,
"zgebrd_ge2gb M= %2d N= %2d NP= %2d NC= %2d P= %2d Q= %2d NB= %2d IB= %2d R-bidiag= %2d treeh= %2d treel_rb= %2d qr_a= %2d QR(domino= %2d treel_qr= %2d ) : %.2f s %f gflops\n",
M, N, nodes, cores, P, Q, NB, IB,
@@ -401,6 +427,13 @@ int main(int argc, char ** argv)
int ltree = iparam[IPARAM_LOWLVL_TREE] == DPLASMA_GREEDY_TREE ? DPLASMA_GREEDY1P_TREE : iparam[IPARAM_LOWLVL_TREE];
ltree = iparam[IPARAM_ASYNC] ? ltree : 9;

/* Warmup run */
RunOneTest(parsec, 1, iparam[IPARAM_NCORES], rank, -1, 1000, 1000, 1000, 100, 100, 10, 1, 1,
iparam[IPARAM_HMB], iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE],
ltree, iparam[IPARAM_HIGHLVL_TREE], iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO],
iparam[IPARAM_QR_TSRR], 1);


/**
* Test for varying matrix sizes m-by-n where:
* 1) m = M .. N .. K, and n = m (square)
@@ -421,15 +454,15 @@
m, m, LDA, MB, NB, IB, P, Q, iparam[IPARAM_HMB],
iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE],
ltree, iparam[IPARAM_HIGHLVL_TREE],
-                    iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] );
+                    iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR], iparam[IPARAM_NRUNS] );
}

for (m=N; m<=M; m+=K ) {
RunOneTest( parsec, nodes, iparam[IPARAM_NCORES], rank, loud,
m, N, LDA, MB, NB, IB, P, Q, iparam[IPARAM_HMB],
iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE],
ltree, iparam[IPARAM_HIGHLVL_TREE],
-                    iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] );
+                    iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR], iparam[IPARAM_NRUNS] );
}

cleanup_parsec(parsec, iparam);
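The rank_of/rank_of_key overrides above make every tile resolve to the calling rank, so each non-zero rank can run the warmup problem entirely locally, with no communication, even though the descriptors were built on a 1x1 grid. Since the same two-line override is applied to six descriptors, a small helper captures the idea (the helper is a sketch, not part of the patch):

    /* Point a block-cyclic descriptor's distribution at the local rank only;
     * every tile then appears local to the calling process. */
    static void make_all_tiles_local(parsec_matrix_block_cyclic_t *dc)
    {
        dc->super.super.rank_of     = always_local_rank_of;
        dc->super.super.rank_of_key = always_local_rank_of_key;
    }

    /* usage, equivalent to the six pairs of assignments in the hunk above:
     *   make_all_tiles_local(&dcA);   make_all_tiles_local(&dcTS0);
     *   make_all_tiles_local(&dcTT0); make_all_tiles_local(&dcTS);
     *   make_all_tiles_local(&dcTT);  make_all_tiles_local(&dcBand);
     */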
91 changes: 90 additions & 1 deletion tests/testing_zgelqf.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011-2021 The University of Tennessee and The University
+ * Copyright (c) 2011-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
@@ -21,6 +21,8 @@ static int check_solution( parsec_context_t *parsec, int loud,
parsec_tiled_matrix_t *dcB,
parsec_tiled_matrix_t *dcX );

static void warmup_zgelqf(int rank, int random_seed, parsec_context_t *parsec);

int main(int argc, char ** argv)
{
parsec_context_t* parsec;
@@ -41,6 +43,8 @@ int main(int argc, char ** argv)
PASTE_CODE_FLOPS(FLOPS_ZGELQF, ((DagDouble_t)M, (DagDouble_t)N));

LDA = max(M, LDA);
warmup_zgelqf(rank, random_seed, parsec);

/* initializing matrix structure */
PASTE_CODE_ALLOCATE_MATRIX(dcA, 1,
parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
@@ -362,3 +366,88 @@ static int check_solution( parsec_context_t *parsec, int loud,

return info_solution;
}

static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...)
{
return desc->myrank;
}

static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key)
{
(void)key;
return desc->myrank;
}

static void warmup_zgelqf(int rank, int random_seed, parsec_context_t *parsec)
{
int MB = 64;
int IB = 40;
int NB = 64;
int MT = 4;
int NT = 4;
int N = NB*NT;
int M = MB*MT;
int matrix_init = dplasmaMatrixRandom;

/* initializing matrix structure */
PASTE_CODE_ALLOCATE_MATRIX(dcA, 1,
parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
rank, MB, NB, M, N, 0, 0,
M, N, 1, 1, 1, 1, 0, 0));
dcA.super.super.rank_of = always_local_rank_of;
dcA.super.super.rank_of_key = always_local_rank_of_key;
PASTE_CODE_ALLOCATE_MATRIX(dcT, 1,
parsec_matrix_block_cyclic, (&dcT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
rank, IB, NB, MT*IB, N, 0, 0,
MT*IB, N, 1, 1, 1, 1, 0, 0));
dcT.super.super.rank_of = always_local_rank_of;
dcT.super.super.rank_of_key = always_local_rank_of_key;

/* Do the CPU warmup first */
dplasma_zplrnt( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed );
dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t*)&dcT );
parsec_taskpool_t *zgelqf = dplasma_zgelqf_New((parsec_tiled_matrix_t*)&dcA,
(parsec_tiled_matrix_t*)&dcT);
zgelqf->devices_index_mask = 1<<0; /* Only CPU ! */
parsec_context_add_taskpool(parsec, zgelqf);
parsec_context_start(parsec);
parsec_context_wait(parsec);

/* Check for which device type (skipping RECURSIVE), we need to warmup this operation */
for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
for(int i = 0; i < (int)zgelqf->nb_task_classes; i++) {
for(int j = 0; NULL != zgelqf->task_classes_array[i]->incarnations[j].hook; j++) {
if( zgelqf->task_classes_array[i]->incarnations[j].type == dtype ) {
goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
}
}
}
continue; /* No incarnation of this device type on any task class; try another type */
do_run:
for(int did = 0; did < (int)parsec_nb_devices; did++) {
parsec_device_module_t *dev = parsec_mca_device_get(did);
if(dev->type != dtype)
continue;
/* This should work, right? Unfortunately, we can't test until there is a <dev>-enabled implementation for this test */
for(int m = 0; m < MT; m++) {
for(int n = 0; n < NT; n++) {
parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n);
parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
dta = dcT.super.super.data_of(&dcT.super.super, m, n);
parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
}
}
dplasma_zplrnt( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed );
dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t*)&dcT );
parsec_taskpool_t *zgelqf_device = dplasma_zgelqf_New((parsec_tiled_matrix_t*)&dcA,
(parsec_tiled_matrix_t*)&dcT);
parsec_context_add_taskpool(parsec, zgelqf_device);
parsec_context_start(parsec);
parsec_context_wait(parsec);
dplasma_zgelqf_Destruct(zgelqf_device);
}
}

dplasma_zgelqf_Destruct(zgelqf);

}
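The nested loops in warmup_zgelqf boil down to one question per device type: does any task class of the taskpool carry an incarnation for it? Factored out as a predicate, the scan reads like this (the helper name is hypothetical; the field accesses are exactly those used in the function above):

    /* Return 1 when some task class of tp provides an incarnation (a hook)
     * for the given device type -- i.e. the type deserves its own warmup run. */
    static int taskpool_has_device_type(const parsec_taskpool_t *tp, int dtype)
    {
        for (int i = 0; i < (int)tp->nb_task_classes; i++)
            for (int j = 0; NULL != tp->task_classes_array[i]->incarnations[j].hook; j++)
                if (tp->task_classes_array[i]->incarnations[j].type == dtype)
                    return 1;
        return 0;
    }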