diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf index 79ff1a2d..3a07e349 100644 --- a/src/zgemm_NN_gpu.jdf +++ b/src/zgemm_NN_gpu.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2017-2020 The University of Tennessee and The University + * Copyright (c) 2017-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -18,7 +18,7 @@ extern "C" %{ #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ - + static void succ(int *x, int *y, int *z, int xMax, int yMax, int zMax, int l) { int xn = *x, yn = *y, zn = *z; @@ -41,7 +41,7 @@ static void succ(int *x, int *y, int *z, int xMax, int yMax, int zMax, int l) } else { zn = zn+1; } - + l--; } *x = xn; @@ -58,13 +58,13 @@ static int succ_x(int x, int y, int z, int xMax, int yMax, int zMax, int l) static int succ_y(int x, int y, int z, int xMax, int yMax, int zMax, int l) { succ(&x, &y, &z, xMax, yMax, zMax, l); - return y; + return y; } static int succ_z(int x, int y, int z, int xMax, int yMax, int zMax, int l) { succ(&x, &y, &z, xMax, yMax, zMax, l); - return z; + return z; } static void pred(int *x, int *y, int *z, int xMax, int yMax, int zMax, int l) @@ -226,9 +226,11 @@ CTL Z <- Z LOCAL_BARRIER( m/(tB*tP), n/(tC*tQ), 0, u, v ) BODY if( nb_cuda_devices > 0 ) { int g = (n / tQ) % nb_cuda_devices; - parsec_advise_data_on_device( _f_C->original, - cuda_device_index[g], - PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + if( _f_C->original->preferred_device <= 0 ) { + parsec_advise_data_on_device( _f_C->original, + cuda_device_index[g], + PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } } END @@ -354,7 +356,7 @@ GEMM(m, n, k) READ A <- A READ_A(m, k, x, y, z) READ B <- B READ_B(k, n, x, y, z) -RW C <- k == 0 ? C READ_C(m, n) +RW C <- k == 0 ? C READ_C(m, n) : C GEMM(m, n, k-1 ) -> k + 1 == descB->mt ? descC(m, n) : C GEMM(m, n, k+1) diff --git a/src/zgeqrf.jdf b/src/zgeqrf.jdf index 2a7ea982..103362b0 100644 --- a/src/zgeqrf.jdf +++ b/src/zgeqrf.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. @@ -495,6 +495,8 @@ BODY [type=CUDA device=%{ return n; %} WORK, ib, WORKC, descA->mb, parsec_body.stream ); + + parsec_gpu_push_workspace(gpu_device, gpu_stream); } END diff --git a/tests/common.h b/tests/common.h index 07c1f3fd..6c3d23e9 100644 --- a/tests/common.h +++ b/tests/common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2020 The University of Tennessee and The University + * Copyright (c) 2009-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -283,6 +283,7 @@ static inline int min(int a, int b) { return a < b ? a : b; } "\n", \ gflops); \ } \ + if(rank==0) fflush(stdout); \ (void)gflops; #endif /* _TESTSCOMMON_H */ diff --git a/tests/testing_zgebrd_ge2gb.c b/tests/testing_zgebrd_ge2gb.c index f189cc91..25a1fbbc 100644 --- a/tests/testing_zgebrd_ge2gb.c +++ b/tests/testing_zgebrd_ge2gb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2020 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2015-2016 Inria, CNRS (LaBRI - UMR 5800), University of @@ -95,9 +95,20 @@ int GD_cpQR( int p, int q ) { } } +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) +{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + int RunOneTest( parsec_context_t *parsec, int nodes, int cores, int rank, int loud, int M, int N, int LDA, int MB, int NB, int IB, int P, int Q, int hmb, - int ltre0, int htre0, int ltree, int htree, int ts, int domino, int rbidiag ) + int ltre0, int htre0, int ltree, int htree, int ts, int domino, int rbidiag, int nbrun ) { int ret = 0; dplasma_qrtree_t qrtre0, qrtree, lqtree; @@ -106,7 +117,7 @@ int RunOneTest( parsec_context_t *parsec, int nodes, int cores, int rank, int lo int MT = (M%MB==0) ? (M/MB) : (M/MB+1); int NT = (N%NB==0) ? (N/NB) : (N/NB+1); int cp = -1; - int i, nbrun = 3; + int i; int rc; //PASTE_CODE_FLOPS(FLOPS_ZGEBRD, ((DagDouble_t)M, (DagDouble_t)N)); @@ -151,6 +162,21 @@ int RunOneTest( parsec_context_t *parsec, int nodes, int cores, int rank, int lo parsec_matrix_block_cyclic, (&dcBand, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_LAPACK, rank, MB+1, NB, MB+1, minMN, 0, 0, MB+1, minMN, 1, 1, 1, 1, 0, 0)); + if(rank > 0 && nodes == 1 && loud == -1) { + /* Fix distributions for local-only testing */ + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + dcTS0.super.super.rank_of = always_local_rank_of; + dcTS0.super.super.rank_of_key = always_local_rank_of_key; + dcTT0.super.super.rank_of = always_local_rank_of; + dcTT0.super.super.rank_of_key = always_local_rank_of_key; + dcTS.super.super.rank_of = always_local_rank_of; + dcTS.super.super.rank_of_key = always_local_rank_of_key; + dcTT.super.super.rank_of = always_local_rank_of; + dcTT.super.super.rank_of_key = always_local_rank_of_key; + dcBand.super.super.rank_of = always_local_rank_of; + dcBand.super.super.rank_of_key = always_local_rank_of_key; + } /* Initialize the matrix */ if(loud > 3) printf("+++ Generate matrices ... "); @@ -313,7 +339,7 @@ int RunOneTest( parsec_context_t *parsec, int nodes, int cores, int rank, int lo time_avg += sync_time_elapsed; gflops = (flops/1.e9)/(sync_time_elapsed); - if (rank == 0){ + if (rank == 0 && loud >= 0){ fprintf(stdout, "zgebrd_ge2gb M= %2d N= %2d NP= %2d NC= %2d P= %2d Q= %2d NB= %2d IB= %2d R-bidiag= %2d treeh= %2d treel_rb= %2d qr_a= %2d QR(domino= %2d treel_qr= %2d ) : %.2f s %f gflops\n", M, N, nodes, cores, P, Q, NB, IB, @@ -401,6 +427,13 @@ int main(int argc, char ** argv) int ltree = iparam[IPARAM_LOWLVL_TREE] == DPLASMA_GREEDY_TREE ? DPLASMA_GREEDY1P_TREE : iparam[IPARAM_LOWLVL_TREE]; ltree = iparam[IPARAM_ASYNC] ? ltree : 9; + /* Warmup run */ + RunOneTest(parsec, 1, iparam[IPARAM_NCORES], rank, -1, 1000, 1000, 1000, 100, 100, 10, 1, 1, + iparam[IPARAM_HMB], iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], + ltree, iparam[IPARAM_HIGHLVL_TREE], iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO], + iparam[IPARAM_QR_TSRR], 1); + + /** * Test for varying matrix sizes m-by-n where: * 1) m = M .. N .. 
K, and n = m (square) @@ -421,7 +454,7 @@ int main(int argc, char ** argv) m, m, LDA, MB, NB, IB, P, Q, iparam[IPARAM_HMB], iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], ltree, iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); + iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR], iparam[IPARAM_NRUNS] ); } for (m=N; m<=M; m+=K ) { @@ -429,7 +462,7 @@ int main(int argc, char ** argv) m, N, LDA, MB, NB, IB, P, Q, iparam[IPARAM_HMB], iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], ltree, iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); + iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR], iparam[IPARAM_NRUNS] ); } cleanup_parsec(parsec, iparam); diff --git a/tests/testing_zgelqf.c b/tests/testing_zgelqf.c index 03658e13..4e592cde 100644 --- a/tests/testing_zgelqf.c +++ b/tests/testing_zgelqf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2021 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -21,6 +21,8 @@ static int check_solution( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcB, parsec_tiled_matrix_t *dcX ); +static void warmup_zgelqf(int rank, int random_seed, parsec_context_t *parsec); + int main(int argc, char ** argv) { parsec_context_t* parsec; @@ -41,6 +43,8 @@ int main(int argc, char ** argv) PASTE_CODE_FLOPS(FLOPS_ZGELQF, ((DagDouble_t)M, (DagDouble_t)N)); LDA = max(M, LDA); + warmup_zgelqf(rank, random_seed, parsec); + /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, @@ -362,3 +366,88 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) +{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zgelqf(int rank, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int IB = 40; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int matrix_init = dplasmaMatrixRandom; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcT, 1, + parsec_matrix_block_cyclic, (&dcT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcT.super.super.rank_of = always_local_rank_of; + dcT.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zplrnt( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t*)&dcT ); + parsec_taskpool_t *zgelqf = dplasma_zgelqf_New((parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcT); + zgelqf->devices_index_mask = 1<<0; /* Only CPU ! 
*/ + parsec_context_add_taskpool(parsec, zgelqf); + parsec_context_start(parsec); + parsec_context_wait(parsec); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zgelqf->nb_task_classes; i++) { + for(int j = 0; NULL != zgelqf->task_classes_array[i]->incarnations[j].hook; j++) { + if( zgelqf->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < NT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcT.super.super.data_of(&dcT.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zplrnt( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t*)&dcT ); + parsec_taskpool_t *zgelqf_device = dplasma_zgelqf_New((parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcT); + parsec_context_add_taskpool(parsec, zgelqf_device); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zgelqf_Destruct(zgelqf_device); + } + } + + dplasma_zgelqf_Destruct(zgelqf); + +} diff --git a/tests/testing_zgelqf_hqr.c b/tests/testing_zgelqf_hqr.c index ee528a80..147d6bcc 100644 --- a/tests/testing_zgelqf_hqr.c +++ b/tests/testing_zgelqf_hqr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2020 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -20,6 +20,7 @@ static int check_solution( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcA, parsec_tiled_matrix_t *dcB, parsec_tiled_matrix_t *dcX ); +static void warmup_hqr(parsec_context_t *parsec, int *iparam); int main(int argc, char ** argv) { @@ -47,6 +48,8 @@ int main(int argc, char ** argv) PASTE_CODE_IPARAM_LOCALS(iparam); PASTE_CODE_FLOPS(FLOPS_ZGELQF, ((DagDouble_t)M, (DagDouble_t)N)); + warmup_hqr(parsec, iparam); + LDA = max(M, LDA); /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, @@ -81,79 +84,81 @@ int main(int argc, char ** argv) rank, MB, NB, LDB, NRHS, 0, 0, N, NRHS, P, nodes/P, KP, KQ, IP, JQ)); - /* matrix generation */ - if(loud > 3) printf("+++ Generate matrices ... 
"); - dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); - if( check ) - dplasma_zlacpy( parsec, dplasmaUpperLower, - (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); - if(loud > 3) printf("Done\n"); - dplasma_hqr_init( &qrtree, - dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, - iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE], - iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); - - /* Create PaRSEC */ - PASTE_CODE_ENQUEUE_KERNEL(parsec, zgelqf_param, - (&qrtree, - (parsec_tiled_matrix_t*)&dcA, - (parsec_tiled_matrix_t*)&dcTS, - (parsec_tiled_matrix_t*)&dcTT)); - - /* lets rock! This code should be copy the PASTE_CODE_PROGRESS_KERNEL macro */ - SYNC_TIME_START(); - parsec_context_start(parsec); - TIME_START(); - parsec_context_wait(parsec); + dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); + + for(int t = 0; t < iparam[IPARAM_NRUNS]; t++) { + /* matrix generation */ + if(loud > 3) printf("+++ Generate matrices ... "); + dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + if( check ) + dplasma_zlacpy( parsec, dplasmaUpperLower, + (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + if(loud > 3) printf("Done\n"); + + /* Create PaRSEC */ + PASTE_CODE_ENQUEUE_KERNEL(parsec, zgelqf_param, + (&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT)); + + /* lets rock! 
This code should be a copy of the PASTE_CODE_PROGRESS_KERNEL macro */ + SYNC_TIME_START(); + parsec_context_start(parsec); + TIME_START(); + parsec_context_wait(parsec); + + SYNC_TIME_PRINT(rank, + ("zgelqf HQR computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d M= %d N= %d : %f gflops\n", + iparam[IPARAM_NNODES], + iparam[IPARAM_NCORES], + iparam[IPARAM_P], + iparam[IPARAM_IB], + iparam[IPARAM_MB], + iparam[IPARAM_NB], + iparam[IPARAM_QR_TS_SZE], + iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_LOWLVL_TREE], + iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_DOMINO], + iparam[IPARAM_QR_TSRR], + iparam[IPARAM_M], + iparam[IPARAM_N], + gflops = (flops/1e9)/(sync_time_elapsed))); + if(loud >= 5 && rank == 0) { + printf("\n" + "%g\n" + "\n", + gflops); + } - SYNC_TIME_PRINT(rank, - ("zgelqf HQR computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d M= %d N= %d : %f gflops\n", - iparam[IPARAM_NNODES], - iparam[IPARAM_NCORES], - iparam[IPARAM_P], - iparam[IPARAM_IB], - iparam[IPARAM_MB], - iparam[IPARAM_NB], - iparam[IPARAM_QR_TS_SZE], - iparam[IPARAM_QR_HLVL_SZE], - iparam[IPARAM_LOWLVL_TREE], - iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_DOMINO], - iparam[IPARAM_QR_TSRR], - iparam[IPARAM_M], - iparam[IPARAM_N], - gflops = (flops/1e9)/(sync_time_elapsed))); - if(loud >= 5 && rank == 0) { - printf("\n" - "%g\n" - "\n", - gflops); + dplasma_zgelqf_param_Destruct( PARSEC_zgelqf_param ); } #if defined(PARSEC_SIM) if ( rank == 0 ) { printf("zgelqf HQR simulation NP= %d NC= %d P= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d MT= %d NT= %d : %d \n", - iparam[IPARAM_NNODES], - iparam[IPARAM_NCORES], - iparam[IPARAM_P], - iparam[IPARAM_QR_TS_SZE], - iparam[IPARAM_QR_HLVL_SZE], - iparam[IPARAM_LOWLVL_TREE], - iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_DOMINO], - iparam[IPARAM_QR_TSRR], - MT, NT, - parsec_getsimulationdate( parsec )); + iparam[IPARAM_NNODES], + iparam[IPARAM_NCORES], + iparam[IPARAM_P], + iparam[IPARAM_QR_TS_SZE], + iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_LOWLVL_TREE], + iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_DOMINO], + iparam[IPARAM_QR_TSRR], + MT, NT, + parsec_getsimulationdate( parsec )); } #endif - dplasma_zgelqf_param_Destruct( PARSEC_zgelqf_param ); - if( check ) { if (N >= M) { if(loud > 2) printf("+++ Generate the Q ..."); @@ -397,3 +402,127 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...)
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_hqr(parsec_context_t *parsec, int *iparam) +{ + dplasma_qrtree_t qrtree; + int M, N, LDA, MB, NB, IB, MT; + /* Fixed problem size */ + M = 1000; + N = 1000; + LDA = 1000; + MB = 100; + NB = 100; + IB = 10; + MT = M/MB; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + iparam[IPARAM_RANK], MB, NB, LDA, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + PASTE_CODE_ALLOCATE_MATRIX(dcTS, 1, + parsec_matrix_block_cyclic, (&dcTS, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + iparam[IPARAM_RANK], IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + PASTE_CODE_ALLOCATE_MATRIX(dcTT, 1, + parsec_matrix_block_cyclic, (&dcTT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + iparam[IPARAM_RANK], IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + dcTS.super.super.rank_of = always_local_rank_of; + dcTS.super.super.rank_of_key = always_local_rank_of_key; + dcTT.super.super.rank_of = always_local_rank_of; + dcTT.super.super.rank_of_key = always_local_rank_of_key; + + dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, iparam[IPARAM_RANDOM_SEED] ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + + dplasma_hqr_init( &qrtree, + dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); + + /* Create PaRSEC */ + parsec_taskpool_t *zgelqf_param_tp = dplasma_zgelqf_param_New(&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT); + zgelqf_param_tp->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zgelqf_param_tp); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_hqr_finalize( &qrtree ); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zgelqf_param_tp->nb_task_classes; i++) { + for(int j = 0; NULL != zgelqf_param_tp->task_classes_array[i]->incarnations[j].hook; j++) { + if( zgelqf_param_tp->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < MT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTS.super.super.data_of(&dcTS.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTT.super.super.data_of(&dcTT.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, iparam[IPARAM_RANDOM_SEED] ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + + dplasma_hqr_init( &qrtree, + dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); + + /* Create PaRSEC */ + parsec_taskpool_t *zgelqf_device = dplasma_zgelqf_param_New(&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT); + parsec_context_add_taskpool(parsec, zgelqf_device); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_hqr_finalize( &qrtree ); + + parsec_taskpool_free(zgelqf_device); + } + } + + parsec_taskpool_free(zgelqf_param_tp); + + parsec_data_free(dcA.mat); + parsec_data_free(dcTS.mat); + parsec_data_free(dcTT.mat); + parsec_tiled_matrix_destroy((parsec_tiled_matrix_t*)&dcA); + parsec_tiled_matrix_destroy((parsec_tiled_matrix_t*)&dcTS); + parsec_tiled_matrix_destroy((parsec_tiled_matrix_t*)&dcTT); +} diff --git a/tests/testing_zgelqf_systolic.c b/tests/testing_zgelqf_systolic.c index 874d9552..08b7b64e 100644 --- a/tests/testing_zgelqf_systolic.c +++ b/tests/testing_zgelqf_systolic.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2020 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -20,6 +20,7 @@ static int check_solution( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcA, parsec_tiled_matrix_t *dcB, parsec_tiled_matrix_t *dcX ); +static void warmup_hqr(parsec_context_t *parsec, int *iparam); int main(int argc, char ** argv) { @@ -46,6 +47,8 @@ int main(int argc, char ** argv) PASTE_CODE_IPARAM_LOCALS(iparam); PASTE_CODE_FLOPS(FLOPS_ZGELQF, ((DagDouble_t)M, (DagDouble_t)N)); + warmup_hqr(parsec, iparam); + LDA = max(M, LDA); /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, @@ -80,59 +83,61 @@ int main(int argc, char ** argv) rank, MB, NB, LDB, NRHS, 0, 0, N, NRHS, P, nodes/P, KP, KQ, IP, JQ)); - /* matrix generation */ - if(loud > 3) printf("+++ Generate matrices ... 
"); - dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); - if( check ) - dplasma_zlacpy( parsec, dplasmaUpperLower, - (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); - if(loud > 3) printf("Done\n"); - - dplasma_systolic_init( &qrtree, - dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, - iparam[IPARAM_P], - iparam[IPARAM_Q] ); - - /* Create PaRSEC */ - PASTE_CODE_ENQUEUE_KERNEL(parsec, zgelqf_param, - (&qrtree, - (parsec_tiled_matrix_t*)&dcA, - (parsec_tiled_matrix_t*)&dcTS, - (parsec_tiled_matrix_t*)&dcTT)); - - /* lets rock! This code should be copy the PASTE_CODE_PROGRESS_KERNEL macro */ - SYNC_TIME_START(); - parsec_context_start(parsec); - TIME_START(); - parsec_context_wait(parsec); - - SYNC_TIME_PRINT(rank, - ("zgelqf systolic computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d M= %d N= %d : %f gflops\n", - iparam[IPARAM_NNODES], - iparam[IPARAM_NCORES], - iparam[IPARAM_P], - iparam[IPARAM_IB], - iparam[IPARAM_MB], - iparam[IPARAM_NB], - iparam[IPARAM_Q], - iparam[IPARAM_P], - iparam[IPARAM_LOWLVL_TREE], - iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_DOMINO], - iparam[IPARAM_QR_TSRR], - iparam[IPARAM_M], - iparam[IPARAM_N], - gflops = (flops/1e9)/(sync_time_elapsed))); - if(loud >= 5 && rank == 0) { - printf("\n" - "%g\n" - "\n", - gflops); + for(int t = 0; t < iparam[IPARAM_NRUNS]; t++) { + /* matrix generation */ + if(loud > 3) printf("+++ Generate matrices ... "); + dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + if( check ) + dplasma_zlacpy( parsec, dplasmaUpperLower, + (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + if(loud > 3) printf("Done\n"); + + dplasma_systolic_init( &qrtree, + dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_P], + iparam[IPARAM_Q] ); + + /* Create PaRSEC */ + PASTE_CODE_ENQUEUE_KERNEL(parsec, zgelqf_param, + (&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT)); + + /* lets rock! 
This code should be a copy of the PASTE_CODE_PROGRESS_KERNEL macro */ + SYNC_TIME_START(); + parsec_context_start(parsec); + TIME_START(); + parsec_context_wait(parsec); + + SYNC_TIME_PRINT(rank, + ("zgelqf systolic computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d M= %d N= %d : %f gflops\n", + iparam[IPARAM_NNODES], + iparam[IPARAM_NCORES], + iparam[IPARAM_P], + iparam[IPARAM_IB], + iparam[IPARAM_MB], + iparam[IPARAM_NB], + iparam[IPARAM_Q], + iparam[IPARAM_P], + iparam[IPARAM_LOWLVL_TREE], + iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_DOMINO], + iparam[IPARAM_QR_TSRR], + iparam[IPARAM_M], + iparam[IPARAM_N], + gflops = (flops/1e9)/(sync_time_elapsed))); + if(loud >= 5 && rank == 0) { + printf("\n" + "%g\n" + "\n", + gflops); + } + dplasma_zgelqf_param_Destruct( PARSEC_zgelqf_param ); } - #if defined(PARSEC_SIM) if ( rank == 0 ) { printf("zgelqf systolic simulation NP= %d NC= %d P= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d MT= %d NT= %d : %d \n", @@ -150,8 +155,6 @@ int main(int argc, char ** argv) } #endif - dplasma_zgelqf_param_Destruct( PARSEC_zgelqf_param ); - if( check ) { if (N >= M) { if(loud > 2) printf("+++ Generate the Q ..."); @@ -395,3 +398,123 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) +{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_hqr(parsec_context_t *parsec, int *iparam) +{ + dplasma_qrtree_t qrtree; + int M, N, LDA, MB, NB, IB, MT; + /* Fixed problem size */ + M = 1000; + N = 1000; + LDA = 1000; + MB = 100; + NB = 100; + IB = 10; + MT = M/MB; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + iparam[IPARAM_RANK], MB, NB, LDA, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + PASTE_CODE_ALLOCATE_MATRIX(dcTS, 1, + parsec_matrix_block_cyclic, (&dcTS, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + iparam[IPARAM_RANK], IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + PASTE_CODE_ALLOCATE_MATRIX(dcTT, 1, + parsec_matrix_block_cyclic, (&dcTT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + iparam[IPARAM_RANK], IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + dcTS.super.super.rank_of = always_local_rank_of; + dcTS.super.super.rank_of_key = always_local_rank_of_key; + dcTT.super.super.rank_of = always_local_rank_of; + dcTT.super.super.rank_of_key = always_local_rank_of_key; + + dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, iparam[IPARAM_RANDOM_SEED] ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + + dplasma_systolic_init( &qrtree, + dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, + 1, 1 ); + + /* Create PaRSEC */ + parsec_taskpool_t *zgelqf_sys_tp = dplasma_zgelqf_param_New(&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT); + zgelqf_sys_tp->devices_index_mask = 1<<0; /* Only CPU !
*/ + parsec_context_add_taskpool(parsec, zgelqf_sys_tp); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_systolic_finalize( &qrtree ); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zgelqf_sys_tp->nb_task_classes; i++) { + for(int j = 0; NULL != zgelqf_sys_tp->task_classes_array[i]->incarnations[j].hook; j++) { + if( zgelqf_sys_tp->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < MT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTS.super.super.data_of(&dcTS.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTT.super.super.data_of(&dcTT.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, iparam[IPARAM_RANDOM_SEED] ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + + dplasma_systolic_init( &qrtree, + dplasmaConjTrans, (parsec_tiled_matrix_t *)&dcA, + 1, 1 ); + + /* Create PaRSEC */ + parsec_taskpool_t *zgelqf_device = dplasma_zgelqf_param_New(&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT); + parsec_context_add_taskpool(parsec, zgelqf_device); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_systolic_finalize( &qrtree ); + + parsec_taskpool_free(zgelqf_device); + } + } + + parsec_taskpool_free(zgelqf_sys_tp); + + parsec_data_free(dcA.mat); + parsec_data_free(dcTS.mat); + parsec_data_free(dcTT.mat); + parsec_tiled_matrix_destroy((parsec_tiled_matrix_t*)&dcA); + parsec_tiled_matrix_destroy((parsec_tiled_matrix_t*)&dcTS); + parsec_tiled_matrix_destroy((parsec_tiled_matrix_t*)&dcTT); +} diff --git a/tests/testing_zgemm.c b/tests/testing_zgemm.c index c9f753d3..fc2dadb8 100644 --- a/tests/testing_zgemm.c +++ b/tests/testing_zgemm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2020 The University of Tennessee and The University + * Copyright (c) 2009-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -16,6 +16,7 @@ static int check_solution( parsec_context_t *parsec, int loud, int Bm, int Bn, int Bseed, dplasma_complex64_t beta, int M, int N, int Cseed, parsec_matrix_block_cyclic_t *dcCfinal ); +static void warmup_zgemm(int rank, int nodes, int random_seed, parsec_context_t *parsec); int main(int argc, char ** argv) { @@ -49,6 +50,8 @@ int main(int argc, char ** argv) LDB = max(LDB, max(K, N)); LDC = max(LDC, M); + warmup_zgemm(rank, nodes, random_seed, parsec); + PASTE_CODE_ALLOCATE_MATRIX(dcC, 1, parsec_matrix_block_cyclic, (&dcC, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, rank, MB, NB, LDC, N, 0, 0, @@ -282,3 +285,76 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static void warmup_zgemm(int rank, int nodes, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int NB = 64; + int KB = 64; + int MT = nodes; + int NT = 1; + int KT = 1; + int M = MT*MB; + int N = NT*NB; + int K = KT*KB; + int did; + unsigned int rs = (unsigned int)random_seed; + int Aseed = rand_r(&rs); + int Bseed = rand_r(&rs); + int Cseed = rand_r(&rs); + int tA = dplasmaNoTrans; + int tB = dplasmaNoTrans; + dplasma_complex64_t alpha = 0.51; + dplasma_complex64_t beta = -0.42; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, KB, M, K, 0, 0, + M, K, nodes, 1, 1, 1, 0, 0)); + + PASTE_CODE_ALLOCATE_MATRIX(dcB, 1, + parsec_matrix_block_cyclic, (&dcB, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, KB, NB, K, N, 0, 0, + K, N, 1, 1, 1, 1, 0, 0)); + + PASTE_CODE_ALLOCATE_MATRIX(dcC, 1, + parsec_matrix_block_cyclic, (&dcC, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, nodes, 1, 1, 1, 0, 0)); + + /* Do the CPU warmup first */ + dplasma_zplrnt( parsec, 0, &dcA.super, Aseed); + dplasma_zplrnt( parsec, 0, &dcB.super, Bseed); + dplasma_zplrnt( parsec, 0, &dcC.super, Cseed); + parsec_taskpool_t *zgemm = dplasma_zgemm_New(tA, tB, alpha, &dcA.super, &dcB.super, beta, &dcC.super); + zgemm->devices_index_mask = 1<<0; /* Only CPU ! 
*/ + parsec_context_add_taskpool(parsec, zgemm); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zgemm_Destruct(zgemm); + + /* Now do the other devices, skipping RECURSIVE */ + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j < NT; j++) { + if( rank == (int)dcC.super.super.rank_of(&dcC.super.super, i, j) ) { + parsec_data_t *dta = dcC.super.super.data_of(&dcC.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } + dplasma_zplrnt( parsec, 0, &dcA.super, Aseed); + dplasma_zplrnt( parsec, 0, &dcB.super, Bseed); + dplasma_zplrnt( parsec, 0, &dcC.super, Cseed); + dplasma_zgemm(parsec, tA, tB, alpha, &dcA.super, &dcB.super, beta, &dcC.super); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( &dcA.super ); + parsec_data_free(dcB.mat); dcB.mat = NULL; + parsec_tiled_matrix_destroy( &dcB.super ); + parsec_data_free(dcC.mat); dcC.mat = NULL; + parsec_tiled_matrix_destroy( &dcC.super ); +} diff --git a/tests/testing_zgemm_dtd.c b/tests/testing_zgemm_dtd.c index c54d537d..f6eb7320 100644 --- a/tests/testing_zgemm_dtd.c +++ b/tests/testing_zgemm_dtd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020 The University of Tennessee and The University + * Copyright (c) 2015-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -21,6 +21,7 @@ static int check_solution( parsec_context_t *parsec, int loud, int Bm, int Bn, int Bseed, dplasma_complex64_t beta, int M, int N, int Cseed, parsec_matrix_block_cyclic_t *dcCfinal ); +static void warmup_zgemm(int rank, int nodes, int random_seed, parsec_context_t *parsec); static int parsec_core_gemm(parsec_execution_stream_t *es, parsec_task_t *this_task) @@ -84,6 +85,8 @@ int main(int argc, char ** argv) LDB = max(LDB, max(K, N)); LDC = max(LDC, M); + warmup_zgemm(rank, nodes, random_seed, parsec); + PASTE_CODE_ALLOCATE_MATRIX(dcC, 1, parsec_matrix_block_cyclic, (&dcC, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, rank, MB, NB, LDC, N, 0, 0, @@ -669,3 +672,76 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static void warmup_zgemm(int rank, int nodes, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int NB = 64; + int KB = 64; + int MT = nodes; + int NT = 1; + int KT = 1; + int M = MT*MB; + int N = NT*NB; + int K = KT*KB; + int did; + unsigned int rs = (unsigned int)random_seed; + int Aseed = rand_r(&rs); + int Bseed = rand_r(&rs); + int Cseed = rand_r(&rs); + int tA = dplasmaNoTrans; + int tB = dplasmaNoTrans; + dplasma_complex64_t alpha = 0.51; + dplasma_complex64_t beta = -0.42; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, KB, M, K, 0, 0, + M, K, nodes, 1, 1, 1, 0, 0)); + + PASTE_CODE_ALLOCATE_MATRIX(dcB, 1, + parsec_matrix_block_cyclic, (&dcB, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, KB, NB, K, N, 0, 0, + K, N, 1, 1, 1, 1, 0, 0)); + + PASTE_CODE_ALLOCATE_MATRIX(dcC, 1, + parsec_matrix_block_cyclic, (&dcC, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, nodes, 1, 1, 1, 0, 0)); + + /* Do the CPU warmup first */ + dplasma_zplrnt( parsec, 0, &dcA.super, Aseed); + dplasma_zplrnt( parsec, 0, 
&dcB.super, Bseed); + dplasma_zplrnt( parsec, 0, &dcC.super, Cseed); + parsec_taskpool_t *zgemm = dplasma_zgemm_New(tA, tB, alpha, &dcA.super, &dcB.super, beta, &dcC.super); + zgemm->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zgemm); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zgemm_Destruct(zgemm); + + /* Now do the other devices, skipping RECURSIVE */ + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j < NT; j++) { + if( rank == (int)dcC.super.super.rank_of(&dcC.super.super, i, j) ) { + parsec_data_t *dta = dcC.super.super.data_of(&dcC.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } + dplasma_zplrnt( parsec, 0, &dcA.super, Aseed); + dplasma_zplrnt( parsec, 0, &dcB.super, Bseed); + dplasma_zplrnt( parsec, 0, &dcC.super, Cseed); + dplasma_zgemm(parsec, tA, tB, alpha, &dcA.super, &dcB.super, beta, &dcC.super); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( &dcA.super ); + parsec_data_free(dcB.mat); dcB.mat = NULL; + parsec_tiled_matrix_destroy( &dcB.super ); + parsec_data_free(dcC.mat); dcC.mat = NULL; + parsec_tiled_matrix_destroy( &dcC.super ); +} diff --git a/tests/testing_zgeqrf.c b/tests/testing_zgeqrf.c index fdf63a9f..3f3d3af1 100644 --- a/tests/testing_zgeqrf.c +++ b/tests/testing_zgeqrf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2021 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -20,6 +20,7 @@ static int check_solution( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcA, parsec_tiled_matrix_t *dcB, parsec_tiled_matrix_t *dcX ); +static void warmup_zgeqrf(int rank, int random_seed, parsec_context_t *parsec); int main(int argc, char ** argv) { @@ -43,6 +44,8 @@ int main(int argc, char ** argv) LDA = max(M, LDA); LDB = max(M, LDB); + warmup_zgeqrf(rank, random_seed, parsec); + /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, @@ -363,3 +366,71 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) 
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zgeqrf(int rank, int random_seed, parsec_context_t *parsec) +{ + int IB = 32; + int MB = 64; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int did; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcT, 1, + parsec_matrix_block_cyclic, (&dcT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcT.super.super.rank_of = always_local_rank_of; + dcT.super.super.rank_of_key = always_local_rank_of_key; + + dplasma_zpltmg( parsec, dplasmaMatrixRandom, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcT); + parsec_taskpool_t *zgeqrf = dplasma_zgeqrf_New( (parsec_tiled_matrix_t*)&dcA, (parsec_tiled_matrix_t*)&dcT); + zgeqrf->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zgeqrf); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zgeqrf_Destruct(zgeqrf); + + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j < NT; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcT.super.super.data_of(&dcT.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, dplasmaMatrixRandom, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcT); + dplasma_zgeqrf( parsec, (parsec_tiled_matrix_t*)&dcA, (parsec_tiled_matrix_t*)&dcT); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA ); + parsec_data_free(dcT.mat); dcT.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcT ); + parsec_devices_reset_load(parsec); +} diff --git a/tests/testing_zgeqrf_dtd.c b/tests/testing_zgeqrf_dtd.c index 839212ff..cd4bb238 100644 --- a/tests/testing_zgeqrf_dtd.c +++ b/tests/testing_zgeqrf_dtd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020 The University of Tennessee and The University + * Copyright (c) 2015-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -12,6 +12,18 @@ #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include "parsec/interfaces/dtd/insert_function.h" +static int check_orthogonality(parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *Q); +static int check_factorization(parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *Aorig, + parsec_tiled_matrix_t *A, + parsec_tiled_matrix_t *Q); +static int check_solution( parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *dcA, + parsec_tiled_matrix_t *dcB, + parsec_tiled_matrix_t *dcX ); +static void warmup_zgeqrf(int rank, int random_seed, parsec_context_t *parsec); + /* Global indices for the different datatypes */ static int TILE_FULL, TILE_RECTANGLE; @@ -120,17 +132,6 @@ parsec_core_tsmqr(parsec_execution_stream_t *es, parsec_task_t *this_task) return PARSEC_HOOK_RETURN_DONE; } -static int check_orthogonality(parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *Q); -static int check_factorization(parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *Aorig, - parsec_tiled_matrix_t *A, - parsec_tiled_matrix_t *Q); -static int check_solution( parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *dcA, - parsec_tiled_matrix_t *dcB, - parsec_tiled_matrix_t *dcX ); - int main(int argc, char **argv) { parsec_context_t* parsec; @@ -153,6 +154,8 @@ int main(int argc, char **argv) LDA = max(M, LDA); LDB = max(M, LDB); + warmup_zgeqrf(rank, random_seed, parsec); + /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, @@ -594,3 +597,71 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) +{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zgeqrf(int rank, int random_seed, parsec_context_t *parsec) +{ + int IB = 32; + int MB = 64; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int did; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcT, 1, + parsec_matrix_block_cyclic, (&dcT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcT.super.super.rank_of = always_local_rank_of; + dcT.super.super.rank_of_key = always_local_rank_of_key; + + dplasma_zpltmg( parsec, dplasmaMatrixRandom, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcT); + parsec_taskpool_t *zgeqrf = dplasma_zgeqrf_New( (parsec_tiled_matrix_t*)&dcA, (parsec_tiled_matrix_t*)&dcT); + zgeqrf->devices_index_mask = 1<<0; /* Only CPU ! 
*/ + parsec_context_add_taskpool(parsec, zgeqrf); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zgeqrf_Destruct(zgeqrf); + + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j < NT; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcT.super.super.data_of(&dcT.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, dplasmaMatrixRandom, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcT); + dplasma_zgeqrf( parsec, (parsec_tiled_matrix_t*)&dcA, (parsec_tiled_matrix_t*)&dcT); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA ); + parsec_data_free(dcT.mat); dcT.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcT ); + parsec_devices_reset_load(parsec); +} diff --git a/tests/testing_zgeqrf_dtd_untied.c b/tests/testing_zgeqrf_dtd_untied.c index 5e2fb5cf..7645db4e 100644 --- a/tests/testing_zgeqrf_dtd_untied.c +++ b/tests/testing_zgeqrf_dtd_untied.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020 The University of Tennessee and The University + * Copyright (c) 2015-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -12,6 +12,18 @@ #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include "parsec/interfaces/dtd/insert_function.h" +static int check_orthogonality(parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *Q); +static int check_factorization(parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *Aorig, + parsec_tiled_matrix_t *A, + parsec_tiled_matrix_t *Q); +static int check_solution( parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *dcA, + parsec_tiled_matrix_t *dcB, + parsec_tiled_matrix_t *dcX ); +static void warmup_zgeqrf(int rank, int random_seed, parsec_context_t *parsec); + /* Global indices for the different datatypes */ static int TILE_FULL, TILE_LOWER, @@ -252,17 +264,6 @@ insert_task_geqrf(parsec_execution_stream_t *es, parsec_task_t *this_task) return PARSEC_HOOK_RETURN_DONE; } -static int check_orthogonality(parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *Q); -static int check_factorization(parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *Aorig, - parsec_tiled_matrix_t *A, - parsec_tiled_matrix_t *Q); -static int check_solution( parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *dcA, - parsec_tiled_matrix_t *dcB, - parsec_tiled_matrix_t *dcX ); - int main(int argc, char ** argv) { parsec_context_t* parsec; @@ -285,6 +286,8 @@ int main(int argc, char ** argv) LDA = max(M, LDA); LDB = max(M, LDB); + warmup_zgeqrf(rank, random_seed, parsec); + /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, @@ -647,3 +650,71 @@ static int check_solution( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) 
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zgeqrf(int rank, int random_seed, parsec_context_t *parsec) +{ + int IB = 32; + int MB = 64; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int did; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcT, 1, + parsec_matrix_block_cyclic, (&dcT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcT.super.super.rank_of = always_local_rank_of; + dcT.super.super.rank_of_key = always_local_rank_of_key; + + dplasma_zpltmg( parsec, dplasmaMatrixRandom, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcT); + parsec_taskpool_t *zgeqrf = dplasma_zgeqrf_New( (parsec_tiled_matrix_t*)&dcA, (parsec_tiled_matrix_t*)&dcT); + zgeqrf->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zgeqrf); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zgeqrf_Destruct(zgeqrf); + + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j < NT; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcT.super.super.data_of(&dcT.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, dplasmaMatrixRandom, (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcT); + dplasma_zgeqrf( parsec, (parsec_tiled_matrix_t*)&dcA, (parsec_tiled_matrix_t*)&dcT); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA ); + parsec_data_free(dcT.mat); dcT.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcT ); + parsec_devices_reset_load(parsec); +} diff --git a/tests/testing_zgeqrf_hqr.c b/tests/testing_zgeqrf_hqr.c index fce601c9..09d260a1 100644 --- a/tests/testing_zgeqrf_hqr.c +++ b/tests/testing_zgeqrf_hqr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2020 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -20,6 +20,7 @@ static int check_solution( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcA, parsec_tiled_matrix_t *dcB, parsec_tiled_matrix_t *dcX ); +static void warmup_zgeqrf_hqr(int rank, int random_seed, int *iparam, parsec_context_t *parsec); int main(int argc, char ** argv) { @@ -48,6 +49,9 @@ int main(int argc, char ** argv) PASTE_CODE_FLOPS(FLOPS_ZGEQRF, ((DagDouble_t)M, (DagDouble_t)N)); LDA = max(M, LDA); + + warmup_zgeqrf_hqr(rank, random_seed, iparam, parsec); + /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, @@ -81,58 +85,61 @@ int main(int argc, char ** argv) rank, MB, NB, LDB, NRHS, 0, 0, M, NRHS, P, nodes/P, KP, KQ, IP, JQ)); - /* matrix generation */ - if(loud > 3) printf("+++ Generate matrices ... "); - dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); - if( check ) - dplasma_zlacpy( parsec, dplasmaUpperLower, - (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); - if(loud > 3) printf("Done\n"); - dplasma_hqr_init( &qrtree, - dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA, - iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE], - iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); - - /* Create PaRSEC */ - PASTE_CODE_ENQUEUE_KERNEL(parsec, zgeqrf_param, - (&qrtree, - (parsec_tiled_matrix_t*)&dcA, - (parsec_tiled_matrix_t*)&dcTS, - (parsec_tiled_matrix_t*)&dcTT)); - - /* lets rock! This code should be copy the PASTE_CODE_PROGRESS_KERNEL macro */ - SYNC_TIME_START(); - parsec_context_start(parsec); - TIME_START(); - parsec_context_wait(parsec); - - SYNC_TIME_PRINT(rank, - ("zgeqrf HQR computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d M= %d N= %d : %f gflops\n", - iparam[IPARAM_NNODES], - iparam[IPARAM_NCORES], - iparam[IPARAM_P], - iparam[IPARAM_IB], - iparam[IPARAM_MB], - iparam[IPARAM_NB], - iparam[IPARAM_QR_TS_SZE], - iparam[IPARAM_QR_HLVL_SZE], - iparam[IPARAM_LOWLVL_TREE], - iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_DOMINO], - iparam[IPARAM_QR_TSRR], - iparam[IPARAM_M], - iparam[IPARAM_N], - gflops = (flops/1e9)/(sync_time_elapsed))); - if(loud >= 5 && rank == 0) { - printf("\n" - "%g\n" - "\n", - gflops); + dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); + + for(int t = 0; t < nruns; t++) { + /* matrix generation */ + if(loud > 3) printf("+++ Generate matrices ... "); + dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + if( check ) + dplasma_zlacpy( parsec, dplasmaUpperLower, + (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + if(loud > 3) printf("Done\n"); + + /* Create PaRSEC */ + PASTE_CODE_ENQUEUE_KERNEL(parsec, zgeqrf_param, + (&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT)); + + /* lets rock! 
This code should be a copy of the PASTE_CODE_PROGRESS_KERNEL macro */
+        SYNC_TIME_START();
+        parsec_context_start(parsec);
+        TIME_START();
+        parsec_context_wait(parsec);
+
+        SYNC_TIME_PRINT(rank,
+                        ("zgeqrf HQR computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d M= %d N= %d : %f gflops\n",
+                         iparam[IPARAM_NNODES],
+                         iparam[IPARAM_NCORES],
+                         iparam[IPARAM_P],
+                         iparam[IPARAM_IB],
+                         iparam[IPARAM_MB],
+                         iparam[IPARAM_NB],
+                         iparam[IPARAM_QR_TS_SZE],
+                         iparam[IPARAM_QR_HLVL_SZE],
+                         iparam[IPARAM_LOWLVL_TREE],
+                         iparam[IPARAM_HIGHLVL_TREE],
+                         iparam[IPARAM_QR_DOMINO],
+                         iparam[IPARAM_QR_TSRR],
+                         iparam[IPARAM_M],
+                         iparam[IPARAM_N],
+                         gflops = (flops/1e9)/(sync_time_elapsed)));
+        if(loud >= 5 && rank == 0) {
+            printf("<DartMeasurement name=\"performance\" type=\"numeric/double\">\n"
+                   "%g\n"
+                   "</DartMeasurement>\n",
+                   gflops);
+        }
+        dplasma_zgeqrf_param_Destruct( PARSEC_zgeqrf_param );
     }
 
 #if defined(PARSEC_SIM)
@@ -152,8 +159,6 @@ int main(int argc, char ** argv)
     }
 #endif
 
-    dplasma_zgeqrf_param_Destruct( PARSEC_zgeqrf_param );
-
     if( check ) {
         if (M >= N) {
             if(loud > 2) printf("+++ Generate the Q ...");
@@ -397,3 +402,118 @@ static int check_solution( parsec_context_t *parsec, int loud,
 
     return info_solution;
 }
+
+static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...)
+{
+    return desc->myrank;
+}
+
+static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key)
+{
+    (void)key;
+    return desc->myrank;
+}
+
+static void warmup_zgeqrf_hqr(int rank, int random_seed, int *iparam, parsec_context_t *parsec)
+{
+    int MB = 64;
+    int IB = 40;
+    int NB = 64;
+    int MT = 4;
+    int NT = 4;
+    int N = NB*NT;
+    int M = MB*MT;
+    int LDA = N;
+    dplasma_qrtree_t qrtree;
+
+    /* initializing matrix structure */
+    PASTE_CODE_ALLOCATE_MATRIX(dcA, 1,
+        parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
+                                     rank, MB, NB, LDA, N, 0, 0,
+                                     M, N, 1, 1, 1, 1, 0, 0));
+    dcA.super.super.rank_of = always_local_rank_of;
+    dcA.super.super.rank_of_key = always_local_rank_of_key;
+    PASTE_CODE_ALLOCATE_MATRIX(dcTS, 1,
+        parsec_matrix_block_cyclic, (&dcTS, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
+                                     rank, IB, NB, MT*IB, N, 0, 0,
+                                     MT*IB, N, 1, 1, 1, 1, 0, 0));
+    dcTS.super.super.rank_of = always_local_rank_of;
+    dcTS.super.super.rank_of_key = always_local_rank_of_key;
+    PASTE_CODE_ALLOCATE_MATRIX(dcTT, 1,
+        parsec_matrix_block_cyclic, (&dcTT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
+                                     rank, IB, NB, MT*IB, N, 0, 0,
+                                     MT*IB, N, 1, 1, 1, 1, 0, 0));
+    dcTT.super.super.rank_of = always_local_rank_of;
+    dcTT.super.super.rank_of_key = always_local_rank_of_key;
+
+    /* Do the CPU warmup first */
+    dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, random_seed );
+    dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS);
+    dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT);
+
+    dplasma_hqr_init( &qrtree,
+                      dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA,
+                      iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE],
+                      iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE],
+                      iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] );
+
+    parsec_taskpool_t *zgeqrf_hqr = dplasma_zgeqrf_param_New(&qrtree,
+                                                             (parsec_tiled_matrix_t*)&dcA,
+                                                             (parsec_tiled_matrix_t*)&dcTS,
+                                                             (parsec_tiled_matrix_t*)&dcTT);
+    zgeqrf_hqr->devices_index_mask = 1<<0; /* Only CPU !
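+                                              (bit 0 of the device mask is the CPU device, so this first
+                                              warmup pass exercises the CPU incarnations of every task
+                                              class before any accelerator is involved)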
*/ + parsec_context_add_taskpool(parsec, zgeqrf_hqr); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_hqr_finalize( &qrtree ); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zgeqrf_hqr->nb_task_classes; i++) { + for(int j = 0; NULL != zgeqrf_hqr->task_classes_array[i]->incarnations[j].hook; j++) { + if( zgeqrf_hqr->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < NT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTS.super.super.data_of(&dcTS.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTT.super.super.data_of(&dcTT.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + + dplasma_hqr_init( &qrtree, + dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_LOWLVL_TREE], iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_TS_SZE], iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_QR_DOMINO], iparam[IPARAM_QR_TSRR] ); + + parsec_taskpool_t *zgeqrf_hqr_device = dplasma_zgeqrf_param_New(&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT); + parsec_context_add_taskpool(parsec, zgeqrf_hqr_device); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_hqr_finalize( &qrtree ); + dplasma_zgeqrf_param_Destruct(zgeqrf_hqr_device); + } + } + + dplasma_zgeqrf_param_Destruct(zgeqrf_hqr); +} diff --git a/tests/testing_zgeqrf_systolic.c b/tests/testing_zgeqrf_systolic.c index 06664b51..8f9db2a8 100644 --- a/tests/testing_zgeqrf_systolic.c +++ b/tests/testing_zgeqrf_systolic.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2020 The University of Tennessee and The University + * Copyright (c) 2009-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* @@ -20,6 +20,7 @@ static int check_solution( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcA, parsec_tiled_matrix_t *dcB, parsec_tiled_matrix_t *dcX ); +static void warmup_zgeqrf_systolic(int rank, int random_seed, int *iparam, parsec_context_t *parsec); int main(int argc, char ** argv) { @@ -47,6 +48,9 @@ int main(int argc, char ** argv) PASTE_CODE_FLOPS(FLOPS_ZGEQRF, ((DagDouble_t)M, (DagDouble_t)N)); LDA = max(M, LDA); + + warmup_zgeqrf_systolic(rank, random_seed, iparam, parsec); + /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, @@ -80,53 +84,56 @@ int main(int argc, char ** argv) rank, MB, NB, LDB, NRHS, 0, 0, M, NRHS, P, nodes/P, KP, KQ, IP, JQ)); - /* matrix generation */ - if(loud > 2) printf("+++ Generate matrices ... "); - dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcA, 3872); - if( check ) - dplasma_zlacpy( parsec, dplasmaUpperLower, - (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); - dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); - if(loud > 2) printf("Done\n"); - dplasma_systolic_init( &qrtree, - dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA, - iparam[IPARAM_P], - iparam[IPARAM_Q] ); - - /* Create PaRSEC */ - PASTE_CODE_ENQUEUE_KERNEL(parsec, zgeqrf_param, - (&qrtree, - (parsec_tiled_matrix_t*)&dcA, - (parsec_tiled_matrix_t*)&dcTS, - (parsec_tiled_matrix_t*)&dcTT)); - - /* lets rock! This code should be copy the PASTE_CODE_PROGRESS_KERNEL macro */ - SYNC_TIME_START(); - parsec_context_start(parsec); - TIME_START(); - parsec_context_wait(parsec); - - SYNC_TIME_PRINT(rank, - ("zgeqrf_systolic computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d M= %d N= %d : %f gflops\n", - iparam[IPARAM_NNODES], - iparam[IPARAM_NCORES], - iparam[IPARAM_P], - iparam[IPARAM_IB], - iparam[IPARAM_MB], - iparam[IPARAM_NB], - iparam[IPARAM_Q], - iparam[IPARAM_P], - iparam[IPARAM_M], - iparam[IPARAM_N], - gflops = (flops/1e9)/(sync_time_elapsed))); - if(loud >= 5 && rank == 0) { - printf("\n" - "%g\n" - "\n", - gflops); + dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_P], + iparam[IPARAM_Q] ); + + for(int t = 0; t < nruns; t++) { + /* matrix generation */ + if(loud > 2) printf("+++ Generate matrices ... "); + dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcA, 3872); + if( check ) + dplasma_zlacpy( parsec, dplasmaUpperLower, + (parsec_tiled_matrix_t *)&dcA, (parsec_tiled_matrix_t *)&dcA0 ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + if(loud > 2) printf("Done\n"); + + /* Create PaRSEC */ + PASTE_CODE_ENQUEUE_KERNEL(parsec, zgeqrf_param, + (&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT)); + + /* lets rock! 
This code should be a copy of the PASTE_CODE_PROGRESS_KERNEL macro */
+        SYNC_TIME_START();
+        parsec_context_start(parsec);
+        TIME_START();
+        parsec_context_wait(parsec);
+
+        SYNC_TIME_PRINT(rank,
+                        ("zgeqrf_systolic computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d M= %d N= %d : %f gflops\n",
+                         iparam[IPARAM_NNODES],
+                         iparam[IPARAM_NCORES],
+                         iparam[IPARAM_P],
+                         iparam[IPARAM_IB],
+                         iparam[IPARAM_MB],
+                         iparam[IPARAM_NB],
+                         iparam[IPARAM_Q],
+                         iparam[IPARAM_P],
+                         iparam[IPARAM_M],
+                         iparam[IPARAM_N],
+                         gflops = (flops/1e9)/(sync_time_elapsed)));
+        if(loud >= 5 && rank == 0) {
+            printf("<DartMeasurement name=\"performance\" type=\"numeric/double\">\n"
+                   "%g\n"
+                   "</DartMeasurement>\n",
+                   gflops);
+        }
+        dplasma_zgeqrf_param_Destruct( PARSEC_zgeqrf_param );
     }
 
 #if defined(PARSEC_SIM)
@@ -142,8 +149,6 @@ int main(int argc, char ** argv)
     }
 #endif
 
-    dplasma_zgeqrf_param_Destruct( PARSEC_zgeqrf_param );
-
     if( check ) {
         if (M >= N) {
             if(loud > 2) printf("+++ Generate the Q ...");
@@ -384,3 +389,116 @@ static int check_solution( parsec_context_t *parsec, int loud,
 
     return info_solution;
 }
+
+static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...)
+{
+    return desc->myrank;
+}
+
+static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key)
+{
+    (void)key;
+    return desc->myrank;
+}
+
+static void warmup_zgeqrf_systolic(int rank, int random_seed, int *iparam, parsec_context_t *parsec)
+{
+    int MB = 64;
+    int IB = 40;
+    int NB = 64;
+    int MT = 4;
+    int NT = 4;
+    int N = NB*NT;
+    int M = MB*MT;
+    int LDA = N;
+    dplasma_qrtree_t qrtree;
+
+    /* initializing matrix structure */
+    PASTE_CODE_ALLOCATE_MATRIX(dcA, 1,
+        parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
+                                     rank, MB, NB, LDA, N, 0, 0,
+                                     M, N, 1, 1, 1, 1, 0, 0));
+    dcA.super.super.rank_of = always_local_rank_of;
+    dcA.super.super.rank_of_key = always_local_rank_of_key;
+    PASTE_CODE_ALLOCATE_MATRIX(dcTS, 1,
+        parsec_matrix_block_cyclic, (&dcTS, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
+                                     rank, IB, NB, MT*IB, N, 0, 0,
+                                     MT*IB, N, 1, 1, 1, 1, 0, 0));
+    dcTS.super.super.rank_of = always_local_rank_of;
+    dcTS.super.super.rank_of_key = always_local_rank_of_key;
+    PASTE_CODE_ALLOCATE_MATRIX(dcTT, 1,
+        parsec_matrix_block_cyclic, (&dcTT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE,
+                                     rank, IB, NB, MT*IB, N, 0, 0,
+                                     MT*IB, N, 1, 1, 1, 1, 0, 0));
+    dcTT.super.super.rank_of = always_local_rank_of;
+    dcTT.super.super.rank_of_key = always_local_rank_of_key;
+
+    /* Do the CPU warmup first */
+    dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, random_seed );
+    dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS);
+    dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT);
+
+    dplasma_systolic_init( &qrtree,
+                           dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA,
+                           iparam[IPARAM_P],
+                           iparam[IPARAM_Q] );
+
+    parsec_taskpool_t *zgeqrf_systolic = dplasma_zgeqrf_param_New(&qrtree,
+                                                                  (parsec_tiled_matrix_t*)&dcA,
+                                                                  (parsec_tiled_matrix_t*)&dcTS,
+                                                                  (parsec_tiled_matrix_t*)&dcTT);
+    zgeqrf_systolic->devices_index_mask = 1<<0; /* Only CPU !
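+                                                   (the device-type scan below then re-enqueues one fresh
+                                                   taskpool per non-CPU device that provides an incarnation,
+                                                   after advising the tiles onto that device)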
*/ + parsec_context_add_taskpool(parsec, zgeqrf_systolic); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_systolic_finalize( &qrtree ); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zgeqrf_systolic->nb_task_classes; i++) { + for(int j = 0; NULL != zgeqrf_systolic->task_classes_array[i]->incarnations[j].hook; j++) { + if( zgeqrf_systolic->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < NT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTS.super.super.data_of(&dcTS.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcTT.super.super.data_of(&dcTT.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, iparam[IPARAM_MATRIX_INIT], (parsec_tiled_matrix_t *)&dcA, random_seed ); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTS); + dplasma_zlaset( parsec, dplasmaUpperLower, 0., 0., (parsec_tiled_matrix_t *)&dcTT); + + dplasma_systolic_init( &qrtree, + dplasmaNoTrans, (parsec_tiled_matrix_t *)&dcA, + iparam[IPARAM_P], + iparam[IPARAM_Q] ); + + parsec_taskpool_t *device_zgeqrf_systolic = dplasma_zgeqrf_param_New(&qrtree, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcTS, + (parsec_tiled_matrix_t*)&dcTT); + parsec_context_add_taskpool(parsec, device_zgeqrf_systolic); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_systolic_finalize( &qrtree ); + dplasma_zgeqrf_param_Destruct(device_zgeqrf_systolic); + } + } + + dplasma_zgeqrf_param_Destruct(zgeqrf_systolic); +} diff --git a/tests/testing_zgesvd.c b/tests/testing_zgesvd.c index 66b8e109..02787cd7 100644 --- a/tests/testing_zgesvd.c +++ b/tests/testing_zgesvd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2020 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2015-2016 Inria, CNRS (LaBRI - UMR 5800), University of @@ -13,6 +13,7 @@ #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" static int check_solution(int N, const double *E1, const double *E2); +static void warmup_zgesvd(int rank, int random_seed, parsec_context_t *parsec); int main(int argc, char ** argv) { @@ -20,7 +21,8 @@ int main(int argc, char ** argv) int iparam[IPARAM_SIZEOF]; int ret = 0; double *s0 = NULL; - double *s1; + double *s1 = NULL; + double *e = NULL; int minMN; int info_solution; double time_ge2gb, time_gb2bd, time_solve = -1.; @@ -52,6 +54,8 @@ int main(int argc, char ** argv) LDA = max(M, LDA); + warmup_zgesvd(rank, random_seed, parsec); + if ( M < N ) { fprintf(stderr, "This testing can only perform SVD on matrices with M >= N\n"); return EXIT_FAILURE; @@ -68,123 +72,122 @@ int main(int argc, char ** argv) rank, MB+1, NB, MB+1, minMN, 0, 0, MB+1, minMN, 1, 1, 1, 1, IP, JQ)); - /* Initialize the matrix */ - if(loud > 3) printf("+++ Generate matrices ... "); - - /* Generate the matrix on rank 0 */ - if ( check ) { + s1 = (double*)malloc( minMN * sizeof(double)); + e = (double*)malloc( minMN * sizeof(double)); - /* Generate the singular values vector as in latms routines for check purpose */ - if (rank == 0) - { - double tmp = 1. / (double)N; - double alp = ( 1. - tmp ) / ((double)( N - 1 )); - int i; - s0 = (double *) malloc(minMN * sizeof(double)); - - s0[0] = 1.; - for(i=1; i < minMN; i++){ - s0[i] = (double)(N-i-1) * alp + tmp; - } - } + for(int t = 0; t < nruns; t++) { + /* Initialize the matrix */ + if(loud > 3) printf("+++ Generate matrices ... "); - dplasma_zlatms( parsec, dplasmaGeneral, (double)N, (parsec_tiled_matrix_t *)&dcA, 3872); - } - else { - dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcA, 3872); - } + /* Generate the matrix on rank 0 */ + if ( check ) { - /* Create Parsec */ - PASTE_CODE_ENQUEUE_KERNEL(parsec, zgebrd_ge2gb, - (IB, - (parsec_tiled_matrix_t*)&dcA, - (parsec_tiled_matrix_t*)&dcBand)); + /* Generate the singular values vector as in latms routines for check purpose */ + if (rank == 0 && NULL ==s0 ) + { + double tmp = 1. / (double)N; + double alp = ( 1. - tmp ) / ((double)( N - 1 )); + int i; + s0 = (double *) malloc(minMN * sizeof(double)); - /* lets rock! */ - SYNC_TIME_START(); - rc = parsec_context_start(parsec); - PARSEC_CHECK_ERROR(rc, "parsec_context_start"); - TIME_START(); - rc = parsec_context_wait(parsec); - PARSEC_CHECK_ERROR(rc, "parsec_context_wait"); - SYNC_TIME_STOP(); - time_ge2gb = sync_time_elapsed; + s0[0] = 1.; + for(i=1; i < minMN; i++){ + s0[i] = (double)(N-i-1) * alp + tmp; + } + } - if( rank == 0 ) { - double *e; + dplasma_zlatms( parsec, dplasmaGeneral, (double)N, (parsec_tiled_matrix_t *)&dcA, 3872); + } + else { + dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcA, 3872); + } - s1 = (double*)malloc( minMN * sizeof(double)); - e = (double*)malloc( minMN * sizeof(double)); + /* Create Parsec */ + PASTE_CODE_ENQUEUE_KERNEL(parsec, zgebrd_ge2gb, + (IB, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcBand)); -/* #if defined(__ICC) || defined(__INTEL_COMPILER) */ -/* mkl_set_num_threads( iparam[IPARAM_NCORES] ); */ -/* #endif */ - /* Reduce the band */ + /* lets rock! 
*/ + SYNC_TIME_START(); + rc = parsec_context_start(parsec); + PARSEC_CHECK_ERROR(rc, "parsec_context_start"); TIME_START(); - info_solution = LAPACKE_zgbbrd( LAPACK_COL_MAJOR, - 'N', - M, N, - 0, 0, NB, - dcBand.mat, MB+1, - s1, e, - NULL, 1, - NULL, 1, - NULL, 1 ); - TIME_STOP(); - time_gb2bd = time_elapsed; - - /* Solve the bidiagonal SVD problem */ - if (info_solution == 0){ + rc = parsec_context_wait(parsec); + PARSEC_CHECK_ERROR(rc, "parsec_context_wait"); + SYNC_TIME_STOP(); + time_ge2gb = sync_time_elapsed; + + if( rank == 0 ) { + /* #if defined(__ICC) || defined(__INTEL_COMPILER) */ + /* mkl_set_num_threads( iparam[IPARAM_NCORES] ); */ + /* #endif */ + /* Reduce the band */ TIME_START(); - info_solution = LAPACKE_zbdsqr( LAPACK_COL_MAJOR, 'U', - minMN, 0, 0, 0, + info_solution = LAPACKE_zgbbrd( LAPACK_COL_MAJOR, + 'N', + M, N, + 0, 0, NB, + dcBand.mat, MB+1, s1, e, - NULL, 1, NULL, 1, NULL, 1 ); + NULL, 1, + NULL, 1, + NULL, 1 ); TIME_STOP(); - time_solve = time_elapsed; + time_gb2bd = time_elapsed; + + /* Solve the bidiagonal SVD problem */ + if (info_solution == 0){ + TIME_START(); + info_solution = LAPACKE_zbdsqr( LAPACK_COL_MAJOR, 'U', + minMN, 0, 0, 0, + s1, e, + NULL, 1, NULL, 1, NULL, 1 ); + TIME_STOP(); + time_solve = time_elapsed; + } + + /* #if defined(__ICC) || defined(__INTEL_COMPILER) */ + /* mkl_set_num_threads( 1 ); */ + /* #endif */ + fprintf(stderr, "WARNING: This code is using the non optimized Lapack zbdsqr subroutine to reduce the band to bi-diagonal form. Please replace this call by the multi-threaded PLASMA implementation in order to get performance\n"); + printf("zgeqrf GESVD computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d R-bidiag= %d M= %d N= %d : %e %e %e / %f gflops\n", + iparam[IPARAM_NNODES], + iparam[IPARAM_NCORES], + iparam[IPARAM_P], + iparam[IPARAM_IB], + iparam[IPARAM_MB], + iparam[IPARAM_NB], + iparam[IPARAM_QR_TS_SZE], + iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_LOWLVL_TREE], + iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_DOMINO], + iparam[IPARAM_QR_TSRR], + iparam[IPARAM_M], + iparam[IPARAM_N], + time_ge2gb, time_gb2bd, time_solve, + gflops = (flops/1e9)/(time_ge2gb+time_gb2bd+time_solve)); + + #if defined(PARSEC_SIM) + printf("zgeqrf GESVD simulation NP= %d NC= %d P= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d MT= %d NT= %d : %d \n", + iparam[IPARAM_NNODES], + iparam[IPARAM_NCORES], + iparam[IPARAM_P], + iparam[IPARAM_QR_TS_SZE], + iparam[IPARAM_QR_HLVL_SZE], + iparam[IPARAM_LOWLVL_TREE], + iparam[IPARAM_HIGHLVL_TREE], + iparam[IPARAM_QR_DOMINO], + iparam[IPARAM_QR_TSRR], + MT, NT, + parsec_getsimulationdate( parsec )); + #endif } - free(e); -/* #if defined(__ICC) || defined(__INTEL_COMPILER) */ -/* mkl_set_num_threads( 1 ); */ -/* #endif */ - fprintf(stderr, "WARNING: This code is using the non optimized Lapack zbdsqr subroutine to reduce the band to bi-diagonal form. 
Please replace this call by the multi-threaded PLASMA implementation in order to get performance\n"); - printf("zgeqrf GESVD computation NP= %d NC= %d P= %d IB= %d MB= %d NB= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d R-bidiag= %d M= %d N= %d : %e %e %e / %f gflops\n", - iparam[IPARAM_NNODES], - iparam[IPARAM_NCORES], - iparam[IPARAM_P], - iparam[IPARAM_IB], - iparam[IPARAM_MB], - iparam[IPARAM_NB], - iparam[IPARAM_QR_TS_SZE], - iparam[IPARAM_QR_HLVL_SZE], - iparam[IPARAM_LOWLVL_TREE], - iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_DOMINO], - iparam[IPARAM_QR_TSRR], - iparam[IPARAM_M], - iparam[IPARAM_N], - time_ge2gb, time_gb2bd, time_solve, - gflops = (flops/1e9)/(time_ge2gb+time_gb2bd+time_solve)); - -#if defined(PARSEC_SIM) - printf("zgeqrf GESVD simulation NP= %d NC= %d P= %d qr_a= %d qr_p = %d treel= %d treeh= %d domino= %d RR= %d MT= %d NT= %d : %d \n", - iparam[IPARAM_NNODES], - iparam[IPARAM_NCORES], - iparam[IPARAM_P], - iparam[IPARAM_QR_TS_SZE], - iparam[IPARAM_QR_HLVL_SZE], - iparam[IPARAM_LOWLVL_TREE], - iparam[IPARAM_HIGHLVL_TREE], - iparam[IPARAM_QR_DOMINO], - iparam[IPARAM_QR_TSRR], - MT, NT, - parsec_getsimulationdate( parsec )); -#endif + dplasma_zgebrd_ge2gb_Destruct( PARSEC_zgebrd_ge2gb ); } - dplasma_zgebrd_ge2gb_Destruct( PARSEC_zgebrd_ge2gb ); - if( check && (rank==0) ) { if (info_solution == 0 ) { info_solution = check_solution(minMN, s0, s1); @@ -200,10 +203,12 @@ int main(int argc, char ** argv) " ---- TESTING ZGESVD .. M >= N .. FAILED !\n" "***************************************************\n"); } - free(s1); - free(s0); } + free(s1); + free(s0); + free(e); + parsec_data_free(dcA.mat); parsec_data_free(dcBand.mat); @@ -259,3 +264,111 @@ static int check_solution(int N, const double *E1, const double *E2) } return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) +{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zgesvd(int rank, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int IB = 40; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + double *s1, *e; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcBand, 1, + parsec_matrix_block_cyclic, (&dcBand, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_LAPACK, + rank, MB+1, NB, MB+1, M, 0, 0, + MB+1, M, 1, 1, 1, 1, 0, 0)); + dcBand.super.super.rank_of = always_local_rank_of; + dcBand.super.super.rank_of_key = always_local_rank_of_key; + s1 = (double*)malloc( M * sizeof(double)); + e = (double*)malloc( M * sizeof(double)); + + /* Do the CPU warmup first */ + dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcA, random_seed); + parsec_taskpool_t *zgesvd = dplasma_zgebrd_ge2gb_New(IB, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcBand); + zgesvd->devices_index_mask = 1<<0; /* Only CPU ! 
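+                                         (the ge2gb taskpool stays on the CPU here; the LAPACKE_zgbbrd and
+                                         LAPACKE_zbdsqr calls just below warm up the band-reduction and
+                                         bidiagonal-SVD stages, which always run on the CPU)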
*/
+    parsec_context_add_taskpool(parsec, zgesvd);
+    parsec_context_start(parsec);
+    parsec_context_wait(parsec);
+    (void)LAPACKE_zgbbrd( LAPACK_COL_MAJOR,
+                          'N',
+                          M, N,
+                          0, 0, NB,
+                          dcBand.mat, MB+1,
+                          s1, e,
+                          NULL, 1,
+                          NULL, 1,
+                          NULL, 1 );
+    (void)LAPACKE_zbdsqr( LAPACK_COL_MAJOR, 'U',
+                          M, 0, 0, 0,
+                          s1, e,
+                          NULL, 1, NULL, 1, NULL, 1 );
+
+    /* Check for which device types (skipping RECURSIVE) we need to warm up this operation */
+    for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
+        for(int i = 0; i < (int)zgesvd->nb_task_classes; i++) {
+            for(int j = 0; NULL != zgesvd->task_classes_array[i]->incarnations[j].hook; j++) {
+                if( zgesvd->task_classes_array[i]->incarnations[j].type == dtype ) {
+                    goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */
+                }
+            }
+        }
+        continue; /* No incarnation of this device type on any task class; try another type */
+      do_run:
+        for(int did = 0; did < (int)parsec_nb_devices; did++) {
+            parsec_device_module_t *dev = parsec_mca_device_get(did);
+            if(dev->type != dtype)
+                continue;
+            /* This should work, right? Unfortunately, we can't test it until there is an implementation of this test enabled for such a device */
+            for(int m = 0; m < MT; m++) {
+                for(int n = 0; n < NT; n++) {
+                    parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n);
+                    parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
+                    if(m == 0) {
+                        dta = dcBand.super.super.data_of(&dcBand.super.super, m, n);
+                        parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
+                    }
+                }
+            }
+            dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcA, random_seed);
+            parsec_taskpool_t *zgesvd_device = dplasma_zgebrd_ge2gb_New(IB,
+                                   (parsec_tiled_matrix_t*)&dcA,
+                                   (parsec_tiled_matrix_t*)&dcBand);
+            parsec_context_add_taskpool(parsec, zgesvd_device);
+            parsec_context_start(parsec);
+            parsec_context_wait(parsec);
+            dplasma_zgebrd_ge2gb_Destruct( zgesvd_device );
+            /* No need to redo zgbbrd and zbdsqr as those are LAPACK / CPU-only */
+        }
+    }
+
+    free(e);
+    free(s1);
+    dplasma_zgebrd_ge2gb_Destruct( zgesvd );
+
+}
diff --git a/tests/testing_zgetrf_incpiv.c b/tests/testing_zgetrf_incpiv.c
index fce18b09..2eef974c 100644
--- a/tests/testing_zgetrf_incpiv.c
+++ b/tests/testing_zgetrf_incpiv.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009-2021 The University of Tennessee and The University
+ * Copyright (c) 2009-2023 The University of Tennessee and The University
  *                         of Tennessee Research Foundation. All rights
  *                         reserved.
* @@ -14,11 +14,11 @@ static int check_solution( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcA, parsec_tiled_matrix_t *dcB, parsec_tiled_matrix_t *dcX ); - static int check_inverse( parsec_context_t *parsec, int loud, parsec_tiled_matrix_t *dcA, parsec_tiled_matrix_t *dcInvA, parsec_tiled_matrix_t *dcI ); +static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec); int main(int argc, char ** argv) { @@ -40,6 +40,7 @@ int main(int argc, char ** argv) PASTE_CODE_FLOPS(FLOPS_ZGETRF, ((DagDouble_t)M,(DagDouble_t)N)); LDA = max(M, LDA); + warmup_zgetrf(rank, random_seed, parsec); if ( M != N && check ) { fprintf(stderr, "Check is impossible if M != N\n"); @@ -253,3 +254,100 @@ static int check_inverse( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) +{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int IB = 40; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int matrix_init = dplasmaMatrixRandom; + int info; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcL, 1, + parsec_matrix_block_cyclic, (&dcL, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcL.super.super.rank_of = always_local_rank_of; + dcL.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcIPIV, 1, + parsec_matrix_block_cyclic, (&dcIPIV, PARSEC_MATRIX_INTEGER, PARSEC_MATRIX_TILE, + rank, MB, 1, M, NT, 0, 0, + M, NT, 1, 1, 1, 1, 0, 0)); + dcIPIV.super.super.rank_of = always_local_rank_of; + dcIPIV.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + parsec_taskpool_t *zgetrf_incpiv = dplasma_zgetrf_incpiv_New((parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcL, + (parsec_tiled_matrix_t*)&dcIPIV, + &info); + zgetrf_incpiv->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zgetrf_incpiv); + parsec_context_start(parsec); + parsec_context_wait(parsec); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zgetrf_incpiv->nb_task_classes; i++) { + for(int j = 0; NULL != zgetrf_incpiv->task_classes_array[i]->incarnations[j].hook; j++) { + if( zgetrf_incpiv->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < NT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcL.super.super.data_of(&dcL.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcIPIV.super.super.data_of(&dcIPIV.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + parsec_taskpool_t *zgetrf_incpiv_device = dplasma_zgetrf_incpiv_New((parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcL, + (parsec_tiled_matrix_t*)&dcIPIV, + &info); + parsec_context_add_taskpool(parsec, zgetrf_incpiv_device); + parsec_context_start(parsec); + parsec_context_wait(parsec); + + dplasma_zgetrf_incpiv_Destruct(zgetrf_incpiv_device); + } + } + + dplasma_zgetrf_incpiv_Destruct(zgetrf_incpiv); + +} diff --git a/tests/testing_zgetrf_incpiv_dtd.c b/tests/testing_zgetrf_incpiv_dtd.c index d710ef9c..e1044774 100644 --- a/tests/testing_zgetrf_incpiv_dtd.c +++ b/tests/testing_zgetrf_incpiv_dtd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020 The University of Tennessee and The University + * Copyright (c) 2015-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -12,6 +12,17 @@ #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include "parsec/interfaces/dtd/insert_function.h" +static int check_solution( parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *dcA, + parsec_tiled_matrix_t *dcB, + parsec_tiled_matrix_t *dcX ); + +static int check_inverse( parsec_context_t *parsec, int loud, + parsec_tiled_matrix_t *dcA, + parsec_tiled_matrix_t *dcInvA, + parsec_tiled_matrix_t *dcI ); +static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec); + /* Global indices for the different datatypes */ static int TILE_FULL, TILE_RECTANGLE, @@ -124,16 +135,6 @@ parsec_core_ssssm(parsec_execution_stream_t *es, parsec_task_t * this_task) return PARSEC_HOOK_RETURN_DONE; } -static int check_solution( parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *dcA, - parsec_tiled_matrix_t *dcB, - parsec_tiled_matrix_t *dcX ); - -static int check_inverse( parsec_context_t *parsec, int loud, - parsec_tiled_matrix_t *dcA, - parsec_tiled_matrix_t *dcInvA, - parsec_tiled_matrix_t *dcI ); - int main(int argc, char ** argv) { parsec_context_t* parsec; @@ -159,6 +160,7 @@ int main(int argc, char ** argv) fprintf(stderr, "Check is impossible if M != N\n"); check = 0; } + warmup_zgetrf(rank, random_seed, parsec); /* initializing matrix structure */ PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, @@ -555,3 +557,100 @@ static int check_inverse( parsec_context_t *parsec, int loud, return info_solution; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) 
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zgetrf(int rank, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int IB = 40; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int matrix_init = dplasmaMatrixRandom; + int info; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcL, 1, + parsec_matrix_block_cyclic, (&dcL, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcL.super.super.rank_of = always_local_rank_of; + dcL.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcIPIV, 1, + parsec_matrix_block_cyclic, (&dcIPIV, PARSEC_MATRIX_INTEGER, PARSEC_MATRIX_TILE, + rank, MB, 1, M, NT, 0, 0, + M, NT, 1, 1, 1, 1, 0, 0)); + dcIPIV.super.super.rank_of = always_local_rank_of; + dcIPIV.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + parsec_taskpool_t *zgetrf_incpiv = dplasma_zgetrf_incpiv_New((parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcL, + (parsec_tiled_matrix_t*)&dcIPIV, + &info); + zgetrf_incpiv->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zgetrf_incpiv); + parsec_context_start(parsec); + parsec_context_wait(parsec); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zgetrf_incpiv->nb_task_classes; i++) { + for(int j = 0; NULL != zgetrf_incpiv->task_classes_array[i]->incarnations[j].hook; j++) { + if( zgetrf_incpiv->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < NT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcL.super.super.data_of(&dcL.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcIPIV.super.super.data_of(&dcIPIV.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zpltmg( parsec, matrix_init, (parsec_tiled_matrix_t *)&dcA, random_seed ); + parsec_taskpool_t *zgetrf_incpiv_device = dplasma_zgetrf_incpiv_New((parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcL, + (parsec_tiled_matrix_t*)&dcIPIV, + &info); + parsec_context_add_taskpool(parsec, zgetrf_incpiv_device); + parsec_context_start(parsec); + parsec_context_wait(parsec); + + dplasma_zgetrf_incpiv_Destruct(zgetrf_incpiv_device); + } + } + + dplasma_zgetrf_incpiv_Destruct(zgetrf_incpiv); + +} diff --git a/tests/testing_zheev.c b/tests/testing_zheev.c index e44dda64..c2845e50 100644 --- a/tests/testing_zheev.c +++ b/tests/testing_zheev.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2020 The University of Tennessee and The University + * Copyright (c) 2011-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -21,6 +21,7 @@ #undef PRINTF_HEAVY static int check_solution(int N, double *E1, double *E2, double eps); +static void warmup_zherbt(int rank, int random_seed, int uplo, parsec_context_t *parsec); int main(int argc, char *argv[]) { @@ -43,6 +44,8 @@ int main(int argc, char *argv[]) LDA = dplasma_imax( LDA, N ); LDB = dplasma_imax( LDB, N ); + warmup_zherbt(rank, random_seed, uplo, parsec); + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, rank, MB, NB, LDA, N, 0, 0, @@ -52,25 +55,27 @@ int main(int argc, char *argv[]) rank, IB, NB, MT*IB, N, 0, 0, MT*IB, N, P, nodes/P, KP, KP, IP, JQ)); - /* Fill A with randomness */ - dplasma_zplghe( parsec, (double)N, uplo, - (parsec_tiled_matrix_t *)&dcA, 3872); + for(int t = 0; t < nruns; t++) { + /* Fill A with randomness */ + dplasma_zplghe( parsec, (double)N, uplo, + (parsec_tiled_matrix_t *)&dcA, random_seed); #ifdef PRINTF_HEAVY - printf("########### A (initial, tile storage)\n"); - dplasma_zprint( parsec, uplo, (parsec_tiled_matrix_t *)&dcA ); + printf("########### A (initial, tile storage)\n"); + dplasma_zprint( parsec, uplo, (parsec_tiled_matrix_t *)&dcA ); #endif - /* Step 1 - Reduction A to band matrix */ - PASTE_CODE_ENQUEUE_KERNEL(parsec, zherbt, - (uplo, IB, - (parsec_tiled_matrix_t*)&dcA, - (parsec_tiled_matrix_t*)&dcT)); - PASTE_CODE_PROGRESS_KERNEL(parsec, zherbt); + /* Step 1 - Reduction A to band matrix */ + PASTE_CODE_ENQUEUE_KERNEL(parsec, zherbt, + (uplo, IB, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcT)); + PASTE_CODE_PROGRESS_KERNEL(parsec, zherbt); #ifdef PRINTF_HEAVY - printf("########### A (reduced to band form)\n"); - dplasma_zprint( parsec, uplo, &dcA); + printf("########### A (reduced to band form)\n"); + dplasma_zprint( parsec, uplo, &dcA); #endif - + dplasma_zherbt_Destruct( PARSEC_zherbt ); + } goto fin; /* Step 2 - Conversion of the tiled band to 1D band storage */ @@ -243,17 +248,16 @@ goto fin; free(W0); free(D); free(E); } - dplasma_zherbt_Destruct( PARSEC_zherbt 
); parsec_taskpool_free( &PARSEC_diag_band_to_rect->super ); dplasma_zhbrdt_Destruct( PARSEC_zhbrdt ); parsec_data_free(dcBAND.mat); + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcBAND); +fin: parsec_data_free(dcA.mat); parsec_data_free(dcT.mat); - parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcBAND); parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA); parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcT); -fin: cleanup_parsec(parsec, iparam); return EXIT_SUCCESS; @@ -297,3 +301,85 @@ static int check_solution(int N, double *E1, double *E2, double eps) return info_solution; } +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) +{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zherbt(int rank, int random_seed, int uplo, parsec_context_t *parsec) +{ + int MB = 64; + int IB = 40; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int LDA = N; + + /* initializing matrix structure */ + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, MB, NB, LDA, N, 0, 0, + N, N, 1, 1, 1, 1, 0, 0)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + PASTE_CODE_ALLOCATE_MATRIX(dcT, 1, + parsec_matrix_block_cyclic, (&dcT, PARSEC_MATRIX_COMPLEX_DOUBLE, PARSEC_MATRIX_TILE, + rank, IB, NB, MT*IB, N, 0, 0, + MT*IB, N, 1, 1, 1, 1, 0, 0)); + dcT.super.super.rank_of = always_local_rank_of; + dcT.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zplghe( parsec, (double)N, uplo, (parsec_tiled_matrix_t *)&dcA, random_seed); + parsec_taskpool_t *zherbt = dplasma_zherbt_New(uplo, IB, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcT); + zherbt->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zherbt); + parsec_context_start(parsec); + parsec_context_wait(parsec); + + /* Check for which device type (skipping RECURSIVE), we need to warmup this operation */ + for(int dtype = PARSEC_DEV_RECURSIVE+1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) { + for(int i = 0; i < (int)zherbt->nb_task_classes; i++) { + for(int j = 0; NULL != zherbt->task_classes_array[i]->incarnations[j].hook; j++) { + if( zherbt->task_classes_array[i]->incarnations[j].type == dtype ) { + goto do_run; /* We found one class that was on that device, no need to try more incarnations or task classes */ + } + } + } + continue; /* No incarnation of this device type on any task class; try another type */ + do_run: + for(int did = 0; did < (int)parsec_nb_devices; did++) { + parsec_device_module_t *dev = parsec_mca_device_get(did); + if(dev->type != dtype) + continue; + /* This should work, right? 
Unfortunately, we can't test until there is a -enabled implementation for this test */ + for(int m = 0; m < MT; m++) { + for(int n = 0; n < NT; n++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + dta = dcT.super.super.data_of(&dcT.super.super, m, n); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + dplasma_zplghe( parsec, (double)N, uplo, (parsec_tiled_matrix_t *)&dcA, random_seed); + parsec_taskpool_t *zherbt_device = dplasma_zherbt_New(uplo, IB, + (parsec_tiled_matrix_t*)&dcA, + (parsec_tiled_matrix_t*)&dcT); + parsec_context_add_taskpool(parsec, zherbt_device); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zherbt_Destruct(zherbt_device); + } + } + + dplasma_zherbt_Destruct(zherbt); +} diff --git a/tests/testing_zpoinv.c b/tests/testing_zpoinv.c index f907a189..01675de5 100644 --- a/tests/testing_zpoinv.c +++ b/tests/testing_zpoinv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2020 The University of Tennessee and The University + * Copyright (c) 2009-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -12,6 +12,8 @@ #include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" +static void warmup_zpoinv(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec); + int main(int argc, char ** argv) { parsec_context_t* parsec; @@ -36,29 +38,33 @@ int main(int argc, char ** argv) KP = 1; KQ = 1; + warmup_zpoinv(rank, uplo, random_seed, parsec); + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, rank, MB, NB, LDA, N, 0, 0, N, N, P, nodes/P, uplo)); - /* matrix generation */ - if(loud > 3) printf("+++ Generate matrices ... "); - dplasma_zplghe( parsec, (double)(N), uplo, - (parsec_tiled_matrix_t *)&dcA, random_seed); - if(loud > 3) printf("Done\n"); - - if (async) { - PASTE_CODE_ENQUEUE_KERNEL(parsec, zpoinv, - (uplo, (parsec_tiled_matrix_t*)&dcA, &info)); - PASTE_CODE_PROGRESS_KERNEL(parsec, zpoinv); - dplasma_zpoinv_Destruct( PARSEC_zpoinv ); - } - else { - SYNC_TIME_START(); - info = dplasma_zpoinv_sync( parsec, uplo, (parsec_tiled_matrix_t*)&dcA ); - SYNC_TIME_PRINT(rank, ("zpoinv\tPxQ= %3d %-3d NB= %4d N= %7d : %14f gflops\n", - P, Q, NB, N, - gflops=(flops/1e9)/sync_time_elapsed)); + for(int t = 0; t < nruns; t++) { + /* matrix generation */ + if(loud > 3) printf("+++ Generate matrices ... "); + dplasma_zplghe( parsec, (double)(N), uplo, + (parsec_tiled_matrix_t *)&dcA, random_seed); + if(loud > 3) printf("Done\n"); + + if (async) { + PASTE_CODE_ENQUEUE_KERNEL(parsec, zpoinv, + (uplo, (parsec_tiled_matrix_t*)&dcA, &info)); + PASTE_CODE_PROGRESS_KERNEL(parsec, zpoinv); + dplasma_zpoinv_Destruct( PARSEC_zpoinv ); + } + else { + SYNC_TIME_START(); + info = dplasma_zpoinv_sync( parsec, uplo, (parsec_tiled_matrix_t*)&dcA ); + SYNC_TIME_PRINT(rank, ("zpoinv\tPxQ= %3d %-3d NB= %4d N= %7d : %14f gflops\n", + P, Q, NB, N, + gflops=(flops/1e9)/sync_time_elapsed)); + } } if( 0 == rank && info != 0 ) { @@ -96,3 +102,68 @@ int main(int argc, char ** argv) cleanup_parsec(parsec, iparam); return ret; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) 
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zpoinv(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int NB = 64; + int MT = 4; + int NT = 4; + int M = MB*MT; + int N = NB*NT; + int did; + int info; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, uplo)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zplghe(parsec, (double)(N), uplo, &dcA.super, random_seed); + parsec_taskpool_t *zpoinv = dplasma_zpoinv_New(uplo, &dcA.super, &info ); + zpoinv->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zpoinv); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zpoinv_Destruct(zpoinv); + + /* Now do the other devices, skipping RECURSIVE */ + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + if(PARSEC_MATRIX_LOWER == uplo) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j <= i; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } else { + for(int i = 0; i < MT; i++) { + for(int j = i; j < NT; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } + dplasma_zplghe(parsec, (double)(N), uplo, &dcA.super, random_seed); + dplasma_zpoinv( parsec, uplo, &dcA.super ); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA ); +} diff --git a/tests/testing_zpotrf.c b/tests/testing_zpotrf.c index 5245adb9..c59fc33d 100644 --- a/tests/testing_zpotrf.c +++ b/tests/testing_zpotrf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2021 The University of Tennessee and The University + * Copyright (c) 2009-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -12,6 +12,8 @@ #include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" +static void warmup_zpotrf(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec); + int main(int argc, char ** argv) { parsec_context_t* parsec; @@ -35,6 +37,8 @@ int main(int argc, char ** argv) KP = 1; KQ = 1; + warmup_zpotrf(rank, uplo, random_seed, parsec); + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, rank, MB, NB, LDA, N, 0, 0, @@ -67,7 +71,7 @@ int main(int argc, char ** argv) } else { - PASTE_CODE_ENQUEUE_PROGRESS_DESTRUCT_KERNEL(parsec, zpotrf, + PASTE_CODE_ENQUEUE_PROGRESS_DESTRUCT_KERNEL(parsec, zpotrf, ( uplo, (parsec_tiled_matrix_t*)&dcA, &info), dplasma_zpotrf_Destruct( PARSEC_zpotrf )); } @@ -130,3 +134,69 @@ int main(int argc, char ** argv) cleanup_parsec(parsec, iparam); return ret; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) 
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zpotrf(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int did; + int info; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, uplo)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zplghe(parsec, (double)(N), uplo, &dcA.super, random_seed); + parsec_taskpool_t *zpotrf = dplasma_zpotrf_New(uplo, &dcA.super, &info ); + zpotrf->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zpotrf); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zpotrf_Destruct(zpotrf); + + /* Now do the other devices, skipping RECURSIVE */ + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + if(PARSEC_MATRIX_LOWER == uplo) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j <= i; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } else { + for(int i = 0; i < MT; i++) { + for(int j = i; j < NT; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } + dplasma_zplghe( parsec, (double)(N), uplo, + (parsec_tiled_matrix_t *)&dcA, random_seed); + dplasma_zpotrf( parsec, uplo, &dcA.super ); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA ); +} diff --git a/tests/testing_zpotrf_dtd.c b/tests/testing_zpotrf_dtd.c index 08beebe8..32e2dd38 100644 --- a/tests/testing_zpotrf_dtd.c +++ b/tests/testing_zpotrf_dtd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2020 The University of Tennessee and The University + * Copyright (c) 2013-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -20,6 +20,8 @@ #include #endif /* defined(DPLASMA_HAVE_CUDA) */ +static void warmup_zpotrf(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec); + /* Global index for the full tile datatype */ static int TILE_FULL; @@ -226,6 +228,8 @@ int main(int argc, char **argv) KP = 1; KQ = 1; + warmup_zpotrf(rank, uplo, random_seed, parsec); + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, rank, MB, NB, LDA, N, 0, 0, @@ -544,3 +548,69 @@ int main(int argc, char **argv) cleanup_parsec(parsec, iparam); return ret; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) 
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zpotrf(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int did; + int info; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, uplo)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zplghe(parsec, (double)(N), uplo, &dcA.super, random_seed); + parsec_taskpool_t *zpotrf = dplasma_zpotrf_New(uplo, &dcA.super, &info ); + zpotrf->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zpotrf); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zpotrf_Destruct(zpotrf); + + /* Now do the other devices, skipping RECURSIVE */ + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + if(PARSEC_MATRIX_LOWER == uplo) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j <= i; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } else { + for(int i = 0; i < MT; i++) { + for(int j = i; j < NT; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } + dplasma_zplghe( parsec, (double)(N), uplo, + (parsec_tiled_matrix_t *)&dcA, random_seed); + dplasma_zpotrf( parsec, uplo, &dcA.super ); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA ); +} diff --git a/tests/testing_zpotrf_dtd_untied.c b/tests/testing_zpotrf_dtd_untied.c index 63458744..55c39026 100644 --- a/tests/testing_zpotrf_dtd_untied.c +++ b/tests/testing_zpotrf_dtd_untied.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020 The University of Tennessee and The University + * Copyright (c) 2015-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -26,6 +26,8 @@ #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include "parsec/interfaces/dtd/insert_function.h" +static void warmup_zpotrf(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec); + /* Global index for the full tile datatype */ static int TILE_FULL; @@ -351,6 +353,8 @@ int main(int argc, char **argv) KP = 1; KQ = 1; + warmup_zpotrf(rank, uplo, random_seed, parsec); + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, rank, MB, NB, LDA, N, 0, 0, @@ -481,3 +485,69 @@ int main(int argc, char **argv) cleanup_parsec(parsec, iparam); return ret; } + +static uint32_t always_local_rank_of(parsec_data_collection_t * desc, ...) 
+{ + return desc->myrank; +} + +static uint32_t always_local_rank_of_key(parsec_data_collection_t * desc, parsec_data_key_t key) +{ + (void)key; + return desc->myrank; +} + +static void warmup_zpotrf(int rank, dplasma_enum_t uplo, int random_seed, parsec_context_t *parsec) +{ + int MB = 64; + int NB = 64; + int MT = 4; + int NT = 4; + int N = NB*NT; + int M = MB*MT; + int did; + int info; + + PASTE_CODE_ALLOCATE_MATRIX(dcA, 1, + parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE, + rank, MB, NB, M, N, 0, 0, + M, N, 1, 1, uplo)); + dcA.super.super.rank_of = always_local_rank_of; + dcA.super.super.rank_of_key = always_local_rank_of_key; + + /* Do the CPU warmup first */ + dplasma_zplghe(parsec, (double)(N), uplo, &dcA.super, random_seed); + parsec_taskpool_t *zpotrf = dplasma_zpotrf_New(uplo, &dcA.super, &info ); + zpotrf->devices_index_mask = 1<<0; /* Only CPU ! */ + parsec_context_add_taskpool(parsec, zpotrf); + parsec_context_start(parsec); + parsec_context_wait(parsec); + dplasma_zpotrf_Destruct(zpotrf); + + /* Now do the other devices, skipping RECURSIVE */ + /* We know that there is a GPU-enabled version of this operation, so warm it up if some device is enabled */ + for(did = 2; did < (int)parsec_nb_devices; did++) { + if(PARSEC_MATRIX_LOWER == uplo) { + for(int i = 0; i < MT; i++) { + for(int j = 0; j <= i; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } else { + for(int i = 0; i < MT; i++) { + for(int j = i; j < NT; j++) { + parsec_data_t *dta = dcA.super.super.data_of(&dcA.super.super, i, j); + parsec_advise_data_on_device( dta, did, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } + } + } + dplasma_zplghe( parsec, (double)(N), uplo, + (parsec_tiled_matrix_t *)&dcA, random_seed); + dplasma_zpotrf( parsec, uplo, &dcA.super ); + parsec_devices_release_memory(); + } + + parsec_data_free(dcA.mat); dcA.mat = NULL; + parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA ); +}
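---
Every warmup function added by this patch repeats the same two-phase pattern: one run of the taskpool restricted to the CPU device (devices_index_mask = 1<<0), then one run per non-RECURSIVE device type for which some task class has an incarnation. A minimal sketch of that shared driver is given below for reference; it relies only on the PaRSEC calls already used in the patch, but the helper itself (run_warmup, the make/destruct callbacks, the header paths) is a hypothetical refactoring, not dplasma code, and the per-test matrix generation and parsec_advise_data_on_device calls would still live in the callers.

/* Hypothetical sketch of the common warmup driver, assuming the PaRSEC
 * device API used throughout this patch. */
#include <parsec.h>
#include <parsec/mca/device/device.h> /* assumed header for parsec_mca_device_get() */

typedef parsec_taskpool_t *(*warmup_make_fn)(void *arg);   /* wraps a dplasma_*_New() */
typedef void (*warmup_destruct_fn)(parsec_taskpool_t *tp); /* wraps a dplasma_*_Destruct() */

static void run_warmup(parsec_context_t *parsec, warmup_make_fn make,
                       warmup_destruct_fn destruct, void *arg)
{
    /* Phase 1: CPU-only pass; bit 0 of the device mask is the CPU device */
    parsec_taskpool_t *tp = make(arg);
    tp->devices_index_mask = 1 << 0;
    parsec_context_add_taskpool(parsec, tp);
    parsec_context_start(parsec);
    parsec_context_wait(parsec);

    /* Phase 2: one pass per device type (skipping RECURSIVE) for which
     * at least one task class has an incarnation */
    for(int dtype = PARSEC_DEV_RECURSIVE + 1; dtype < PARSEC_DEV_MAX_NB_TYPE; dtype++) {
        int found = 0;
        for(int i = 0; !found && i < (int)tp->nb_task_classes; i++)
            for(int j = 0; NULL != tp->task_classes_array[i]->incarnations[j].hook; j++)
                if( tp->task_classes_array[i]->incarnations[j].type == dtype ) {
                    found = 1;
                    break;
                }
        if( !found ) continue;
        for(int did = 0; did < (int)parsec_nb_devices; did++) {
            if( parsec_mca_device_get(did)->type != dtype ) continue;
            /* callers would regenerate input and advise tiles onto device 'did' here */
            parsec_taskpool_t *dtp = make(arg); /* fresh taskpool, all devices enabled */
            parsec_context_add_taskpool(parsec, dtp);
            parsec_context_start(parsec);
            parsec_context_wait(parsec);
            destruct(dtp);
        }
    }
    destruct(tp);
}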