diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu
index 929d1f7de88..0f887753bf5 100644
--- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu
+++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu
@@ -148,62 +148,175 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops,
 #else
-#define SEG_ADD(s) \
-    l += s; \
-    while (l >= lines) { \
-        l -= lines; \
-        c += width; \
-    }
-
-__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines,
-                                                         size_t nb_size,
-                                                         OPAL_PTRDIFF_TYPE nb_extent,
-                                                         unsigned char * b_source,
-                                                         unsigned char * b_destination )
+__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops,
+                                                         size_t size,
+                                                         OPAL_PTRDIFF_TYPE extent,
+                                                         unsigned char* source,
+                                                         unsigned char* destination )
 {
-    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
-    uint32_t num_threads = gridDim.x * blockDim.x;
-
-    //size_t lines = (size_t)lines;
-    size_t size = nb_size / 8;
-    size_t extent = nb_extent / 8;
-    uint64_t * source = (uint64_t *) b_source;
-    uint64_t *destination = (uint64_t *) b_destination;
-    uint64_t val[KERNEL_UNROLL];
+    uint32_t i, u, tid, num_threads, warp_id, tid_per_warp, nb_warps, nb_warps_x, nb_warps_y, pos_x, pos_y, size_last_y, size_last_x;
+    uint32_t size_nb, extent_nb;
+    uint64_t *_source_tmp, *_destination_tmp, *source_64, *destination_64, *_source_left_tmp, *_destination_left_tmp;
+    uint64_t val[UNROLL_16];
 
-    int col = 0;
-    for (int width = 32; width > 0 && col < size; width >>= 1) {
-        while (size-col >= width) {
-            const int warp_id = tid / width;
-            const int warp_tid = tid & (width-1);
-            const int warp_nb = num_threads / width;
-            const int c = col + warp_tid;
-            int l = warp_id * KERNEL_UNROLL;
-            uint64_t *src = source + c;
-            uint64_t *dst = destination + c;
-            for (int b=0; b<KERNEL_UNROLL; b++) {
[...]
+// #define SEG_ADD(s) \
+//     l += s; \
+//     while (l >= lines) { \
+//         l -= lines; \
+//         c += width; \
+//     }
+//
+// __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines,
+//                                                          size_t nb_size,
+//                                                          OPAL_PTRDIFF_TYPE nb_extent,
+//                                                          unsigned char * b_source,
+//                                                          unsigned char * b_destination )
+// {
+//     uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+//     uint32_t num_threads = gridDim.x * blockDim.x;
+//
+//     //size_t lines = (size_t)lines;
+//     size_t size = nb_size / 8;
+//     size_t extent = nb_extent / 8;
+//     uint64_t * source = (uint64_t *) b_source;
+//     uint64_t *destination = (uint64_t *) b_destination;
+//     uint64_t val[KERNEL_UNROLL];
+//
+//     int col = 0;
+//     for (int width = 32; width > 0 && col < size; width >>= 1) {
+//         while (size-col >= width) {
+//             const int warp_id = tid / width;
+//             const int warp_tid = tid & (width-1);
+//             const int warp_nb = num_threads / width;
+//             const int c = col + warp_tid;
+//             int l = warp_id * KERNEL_UNROLL;
+//             uint64_t *src = source + c;
+//             uint64_t *dst = destination + c;
+//             for (int b=0; b<KERNEL_UNROLL; b++) {
[...]
 #endif

diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu
index 05ff906b9a4..e17ad4a386f 100644
--- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu
+++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu
@@ -297,9 +297,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector( opal_convertor_t* pC
         total_packed += iov[iov_count].iov_len;
     }
 complete_conversion:
-    for (i = 0; i < NB_STREAMS; i++) {
-        cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]);
-    }
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME(start);
 #endif
@@ -461,9 +459,9 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM,
 //    num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block;
 //    printf("extent %ld, size %ld, count %ld\n", _loop->extent, _end_loop->size, _copy_loops);
 #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL
-    cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]);
+    cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 #else
-    pack_contiguous_loop_cuda_kernel_global<<<32, 8*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination);
+    pack_contiguous_loop_cuda_kernel_global<<<16, 8*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination);
 #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */
 
 #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
@@ -473,7 +471,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM,
     *(COUNT) -= _copy_loops;
 #endif
 
-    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME( end );
@@ -584,7 +582,7 @@ void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM,
     GET_TIME(start);
 #endif
 
-    cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[0]);
+    cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 
 #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
     *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp;
@@ -593,7 +591,7 @@
     *(COUNT) -= _copy_loops;
 #endif
 
-    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME( end );
@@ -638,9 +636,9 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM,
         printf("can not get dev mem, %s\n", cuda_err);
     }
 #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL
-    cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]);
+    cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 #else
-    pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev);
+    pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev);
 #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */
 
 #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
@@ -650,7 +648,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM,
     *(COUNT) -= _copy_loops;
 #endif
 
-    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME( end );
diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu
index 703e52280b5..9be53d2d5a7 100644
--- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu
+++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu
@@ -178,9 +178,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* p
         total_unpacked += iov[iov_count].iov_len;
     }
 complete_conversion:
-    for (i = 0; i < NB_STREAMS; i++) {
-        cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]);
-    }
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
     *max_data = total_unpacked;
     pConvertor->bConverted += total_unpacked;  /* update the already converted bytes */
     *out_size = iov_count;
@@ -955,9 +953,9 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM,
 //    tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD;
 //    num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block;
 #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL
-    cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]);
+    cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 #else
-    unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination);
+    unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination);
 #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */
 
 #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
@@ -967,7 +965,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM,
     *(COUNT) -= _copy_loops;
 #endif
 
-    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME( end );
     total_time = ELAPSED_TIME( start, end );
@@ -1002,7 +1000,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM,
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME(start);
 #endif
-    cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->ddt_cuda_stream[0]);
+    cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 
 #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
     *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp;
@@ -1011,7 +1009,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM,
     *(COUNT) -= _copy_loops;
 #endif
 
-    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME( end );
@@ -1057,9 +1055,9 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM,
         printf("can not get dev mem, %s\n", cuda_err);
     }
 #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL
-    cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]);
+    cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 #else
-    unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination);
+    unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination);
 #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */
 
 #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
@@ -1069,7 +1067,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM,
     *(COUNT) -= _copy_loops;
 #endif
 
-    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 //    cudaHostUnregister(_source);
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME( end );
diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c
index c8985db7913..1ae08565b73 100644
--- a/opal/datatype/opal_datatype_pack.c
+++ b/opal/datatype/opal_datatype_pack.c
@@ -416,7 +416,7 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor,
     pos_desc = pStack->index;
     pElem = &(description[pos_desc]);
 
-    return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data);
+    //return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data);
     if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) {
         return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data);
     } else {
diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c
index 5f51b3f828b..815f7b1e4bf 100644
--- a/opal/datatype/opal_datatype_unpack.c
+++ b/opal/datatype/opal_datatype_unpack.c
@@ -610,7 +610,7 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor,
     pos_desc = pStack->index;
     pElem = &(description[pos_desc]);
 
-    return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data);
+    //return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data);
     if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) {
         return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data);
     } else {
diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c
index afc33e1075e..de3f43a8759 100644
--- a/test/datatype/ddt_benchmark.c
+++ b/test/datatype/ddt_benchmark.c
@@ -1276,7 +1276,7 @@ int main( int argc, char* argv[] )
         printf("----matrix size %d-----\n", mat_size);
         if( outputFlags & CHECK_PACK_UNPACK ) {
             for (i = 1; i <= 5; i++) {
-                local_copy_with_convertor(pdt, 1, 200000000, mat_size);
+            //    local_copy_with_convertor(pdt, 1, 200000000, mat_size);
             }
         }
         OBJ_RELEASE( pdt ); assert( pdt == NULL );
@@ -1339,13 +1339,13 @@ int main( int argc, char* argv[] )
     }
 
-    for (blk_len = 4000; blk_len <= 4000; blk_len += 2000) {
+    for (blk_len = 1000; blk_len <= 4000; blk_len += 2000) {
         printf( ">>--------------------------------------------<<\n" );
         printf( "Vector data-type (1024 times %d double stride 512)\n", blk_len );
         pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2);
         if( outputFlags & CHECK_PACK_UNPACK ) {
             for (i = 0; i < 4; i++) {
-                // vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2);
+                vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2);
 //                vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128);
             }
         }
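
Note on the recurring change: every launch/copy and its matching `cudaStreamSynchronize` moves from the hard-coded `ddt_cuda_stream[0]` to `ddt_cuda_stream[cuda_streams->current_stream_id]`, so successive pack/unpack calls are no longer all pinned to stream 0, and the end-of-conversion sync waits only on the stream that was actually used. On the `OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL` path, each call moves `_copy_loops` rows of `_end_loop->size` bytes spaced `_loop->extent` bytes apart: pack gives the destination a tight pitch so the rows are gathered back to back, and unpack swaps the pitches to scatter. The following is a minimal standalone sketch of that argument pattern, using plain CUDA with hypothetical sizes; it is an illustration, not code from this patch:

```c
#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
    /* Hypothetical vector layout: rows of `size` bytes, `extent` bytes apart. */
    const size_t size       = 4 * sizeof(double);  /* contiguous bytes per row  */
    const size_t extent     = 8 * sizeof(double);  /* stride between row starts */
    const size_t copy_loops = 1024;                /* number of rows            */

    unsigned char *src, *dst;
    cudaStream_t stream;   /* stands in for ddt_cuda_stream[current_stream_id] */
    cudaMalloc((void**)&src, extent * copy_loops);
    cudaMalloc((void**)&dst, size * copy_loops);
    cudaStreamCreate(&stream);

    /* Pack: a tight destination pitch (dpitch == width) gathers the strided
     * source rows back to back, matching pack_contiguous_loop_cuda() above. */
    cudaMemcpy2DAsync(dst, size,         /* contiguous destination        */
                      src, extent,       /* strided source                */
                      size, copy_loops,  /* width (bytes) x height (rows) */
                      cudaMemcpyDeviceToDevice, stream);

    /* Unpack is the mirror image: strided destination, tight source. */
    cudaMemcpy2DAsync(src, extent, dst, size, size, copy_loops,
                      cudaMemcpyDeviceToDevice, stream);

    /* As in the patch, synchronize only the stream the work ran on. */
    cudaStreamSynchronize(stream);
    printf("packed/unpacked %zu bytes\n", size * copy_loops);

    cudaFree(src); cudaFree(dst); cudaStreamDestroy(stream);
    return 0;
}
```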