diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6a55fa38..9cf047e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -174,14 +174,14 @@ else()
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 ")
 
     if(CMAKE_COMPILER_IS_GNUCC)
-        set(CMAKE_CXX_FLAGS_RELEASE "-O4 -ffast-math")
+        set(CMAKE_CXX_FLAGS_RELEASE "-O4")
         set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g  -Wall -pedantic")
         set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Bdynamic")
         if(NOT WIN32)
             set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ldl -lz")
         endif()
     elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-        set(CMAKE_CXX_FLAGS_RELEASE "-O3 -ffast-math")
+        set(CMAKE_CXX_FLAGS_RELEASE "-O3")
         set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g  -Wall -pedantic")
         set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lz")
     endif()
@@ -211,8 +211,8 @@ if(APR_USE_CUDA)
     message(STATUS "APR: Building CUDA for APR")
     set(CMAKE_CUDA_STANDARD 14)
     set(CMAKE_CUDA_RUNTIME_LIBRARY "Static")
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --default-stream per-thread -Xptxas -v -DAPR_USE_CUDA")
-    set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math") # -lineinfo for profiling
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=false --default-stream per-thread -Xptxas -v -DAPR_USE_CUDA")
+    set(CMAKE_CUDA_FLAGS_RELEASE "-O3") # -lineinfo for profiling
     set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g -G")
     if(APR_BENCHMARK)
         set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DAPR_BENCHMARK")
@@ -226,6 +226,7 @@ if(APR_USE_CUDA)
             src/algorithm/LocalIntensityScale.cu
             src/algorithm/OVPC.cu
             src/data_structures/APR/access/GPUAccess.cu
+            src/data_structures/APR/access/LinearAccessCuda.cu
             src/numerics/miscCuda.cu
             src/numerics/APRDownsampleGPU.cu
             src/numerics/PixelNumericsGPU.cu
@@ -241,6 +242,7 @@ if(APR_BUILD_STATIC_LIB)
     # generate static library used as a intermediate step in generating fat lib
     set(STATIC_TARGET_NAME staticLib)
     add_library(${STATIC_TARGET_NAME} STATIC $<TARGET_OBJECTS:aprObjLib> ${APR_CUDA_SOURCE_FILES})
+    set_property(TARGET ${STATIC_TARGET_NAME} PROPERTY CUDA_ARCHITECTURES OFF)
     target_compile_features(${STATIC_TARGET_NAME} PUBLIC cxx_std_14)
     set_target_properties(${STATIC_TARGET_NAME} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME})
     set_target_properties(${STATIC_TARGET_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION OFF)
@@ -262,7 +264,7 @@ if(APR_BUILD_SHARED_LIB)
 # generate fat shared library
     set(SHARED_TARGET_NAME sharedLib)
     add_library(${SHARED_TARGET_NAME} SHARED $<TARGET_OBJECTS:aprObjLib> ${APR_CUDA_SOURCE_FILES})
-
+    set_property(TARGET ${SHARED_TARGET_NAME} PROPERTY CUDA_ARCHITECTURES OFF)
     target_include_directories(${SHARED_TARGET_NAME} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src> $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}>)
     set_target_properties(${SHARED_TARGET_NAME} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME})
     set_target_properties(${SHARED_TARGET_NAME} PROPERTIES LIBRARY_OUTPUT_NAME ${LIBRARY_NAME})
diff --git a/examples/Example_get_apr.h b/examples/Example_get_apr.h
index c1be9d2b..6d787811 100644
--- a/examples/Example_get_apr.h
+++ b/examples/Example_get_apr.h
@@ -30,7 +30,7 @@ struct cmdLineOptions{
     bool auto_parameters = false;
 
     float Ip_th = 0;
-    float lambda = -1;
+    float lambda = 3.0;
     float sigma_th = 0;
     float rel_error = 0.1;
     float grad_th = 1;
diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp
index b34e1b74..91858629 100644
--- a/src/algorithm/APRConverter.hpp
+++ b/src/algorithm/APRConverter.hpp
@@ -117,7 +117,7 @@ class APRConverter {
     PixelData<float> local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors
     PixelData<float> local_scale_temp2;
 
-    void applyParameters(APR& aAPR,APRParameters& aprParameters);
+    void applyParameters(APRParameters& aprParameters);
 
     template<typename T>
     void computeL(APR& aAPR,PixelData<T>& input_image);
@@ -184,7 +184,7 @@ void APRConverter<ImageType>::get_apr_custom_grad_scale(APR& aAPR,PixelData<Imag
     }
 
     aAPR.parameters = par;
-    applyParameters(aAPR,par);
+    applyParameters(par);
     solveForAPR(aAPR);
     generateDatastructures(aAPR);
 
@@ -215,6 +215,10 @@ void APRConverter<ImageType>::computeL(APR& aAPR,PixelData<T>& input_image){
 
     fine_grained_timer.start_timer("offset image");
 
+    // offset image by factor (this is required if there are zero areas in the background with
+    // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
+    // Warning both of these could result in over-flow!
+
     if (std::is_floating_point<ImageType>::value) {
         image_temp.copyFromMesh(input_image);
     } else {
@@ -247,7 +251,7 @@ void APRConverter<ImageType>::computeL(APR& aAPR,PixelData<T>& input_image){
 }
 
 template<typename ImageType>
-void APRConverter<ImageType>::applyParameters(APR& aAPR,APRParameters& aprParameters) {
+void APRConverter<ImageType>::applyParameters(APRParameters& aprParameters) {
     //
     //  Apply the main parameters
     //
@@ -261,39 +265,7 @@ void APRConverter<ImageType>::applyParameters(APR& aAPR,APRParameters& aprParame
     }
     fine_grained_timer.stop_timer();
 
-    fine_grained_timer.start_timer("threshold");
-    iComputeGradient.threshold_gradient(grad_temp,local_scale_temp2,aprParameters.Ip_th + bspline_offset);
-    fine_grained_timer.stop_timer();
-
-    float max_th = 60000;
-
-#ifdef HAVE_OPENMP
-#pragma omp parallel for default(shared)
-#endif
-    for (size_t i = 0; i < grad_temp.mesh.size(); ++i) {
-
-        float rescaled = local_scale_temp.mesh[i];
-        if (rescaled < aprParameters.sigma_th) {
-            rescaled = (rescaled < aprParameters.sigma_th_max) ? max_th : par.sigma_th;
-            local_scale_temp.mesh[i] = rescaled;
-        }
-    }
-
-#ifdef HAVE_LIBTIFF
-    if(par.output_steps) {
-        TiffUtils::saveMeshAsTiff(par.output_dir + "local_intensity_scale_rescaled.tif", local_scale_temp);
-    }
-#endif
-
-#ifdef HAVE_OPENMP
-#pragma omp parallel for default(shared)
-#endif
-    for (size_t i = 0; i < grad_temp.mesh.size(); ++i) {
-
-        if(grad_temp.mesh[i] < aprParameters.grad_th){
-            grad_temp.mesh[i] = 0;
-        }
-    }
+    iComputeGradient.applyParameters(grad_temp, local_scale_temp, local_scale_temp2, aprParameters, bspline_offset);
 }
 
 
@@ -401,7 +373,7 @@ inline bool APRConverter<ImageType>::get_lrf(APR &aAPR, PixelData<T>& input_imag
 template<typename ImageType>
 inline bool APRConverter<ImageType>::get_ds(APR &aAPR) {
 
-    applyParameters(aAPR,par);
+    applyParameters(par);
     aAPR.parameters = par;
 
     solveForAPR(aAPR);
@@ -422,103 +394,45 @@ inline bool APRConverter<ImageType>::get_ds(APR &aAPR) {
  */
 template<typename ImageType> template<typename T>
 inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input_image) {
-    if (!initPipelineAPR(aAPR, input_image)) return false;
 
+    if (!initPipelineAPR(aAPR, input_image)) return false;
 
     initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num);
 
-    method_timer.start_timer("compute_gradient_magnitude_using_bsplines and local instensity scale CUDA");
-    APRTimer t(true);
-    APRTimer d(true);
-    t.start_timer(" =========== ALL");
-    {
-
-        computation_timer.start_timer("init_mem");
-        PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image)
-
-        /////////////////////////////////
-        /// Pipeline
-        ////////////////////////
-        //offset image by factor (this is required if there are zero areas in the background with uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
-        // Warning both of these could result in over-flow (if your image is non zero, with a 'buffer' and has intensities up to uint16_t maximum value then set image_type = "", i.e. uncomment the following line)
-
-        if (std::is_same<uint16_t, ImageType>::value) {
-            bspline_offset = 100;
-            image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
-        } else if (std::is_same<uint8_t, ImageType>::value) {
-            bspline_offset = 5;
-            image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
-        } else {
-            image_temp.copyFromMesh(input_image);
-        }
-
-        computation_timer.stop_timer();
-
-        std::vector<GpuProcessingTask<ImageType>> gpts;
-
-        int numOfStreams = 1;
-        int repetitionsPerStream = 1;
-
-        computation_timer.start_timer("compute_L");
-        // Create streams and send initial task to do
-        for (int i = 0; i < numOfStreams; ++i) {
-            gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
-            gpts.back().sendDataToGpu();
-            gpts.back().processOnGpu();
-        }
-        computation_timer.stop_timer();
-
-
-        for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
-            int c = i % numOfStreams;
-
-            computation_timer.start_timer("apply_parameters");
-            // get data from previous task
-            gpts[c].getDataFromGpu();
-
-            computation_timer.stop_timer();
-
-            // in theory we get new data and send them to task
-            if (i  < numOfStreams * (repetitionsPerStream - 1)) {
-                gpts[c].sendDataToGpu();
-                gpts[c].processOnGpu();
-            }
-
-            // Postprocess on CPU
-            std::cout << "--------- start CPU processing ---------- " << i << std::endl;
-
-            computation_timer.start_timer("solve_for_apr");
-            iPullingScheme.initialize_particle_cell_tree(aAPR.aprInfo);
-
-            PixelData<float> lst(local_scale_temp, true);
-
-#ifdef HAVE_LIBTIFF
-            if (par.output_steps){
-                TiffUtils::saveMeshAsTiff(par.output_dir + "local_intensity_scale_step.tif", lst);
-            }
-#endif
+    computation_timer.start_timer("init_mem");
+    PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image)
 
-#ifdef HAVE_LIBTIFF
-            if (par.output_steps){
-                TiffUtils::saveMeshAsTiff(par.output_dir + "gradient_step.tif", grad_temp);
-            }
-#endif
+    /////////////////////////////////
+    /// Pipeline
+    ////////////////////////
+    // offset image by factor (this is required if there are zero areas in the background with
+    // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
+    // Warning both of these could result in over-flow!
 
-            iLocalParticleSet.get_local_particle_cell_set(iPullingScheme,lst, local_scale_temp2,par);
+    if (std::is_same<uint16_t, ImageType>::value) {
+        bspline_offset = 100;
+        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
+    } else if (std::is_same<uint8_t, ImageType>::value) {
+        bspline_offset = 5;
+        image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
+    } else {
+        image_temp.copyFromMesh(input_image);
+    }
 
-            iPullingScheme.pulling_scheme_main();
+    GpuProcessingTask<ImageType> gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max());
+    gpt.sendDataToGpu();
+    gpt.processOnGpu();
+    auto linearAccessGpu = gpt.getDataFromGpu();
 
-            computation_timer.stop_timer();
+    aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();
 
-            computation_timer.start_timer("generate_data_structures");
-            generateDatastructures(aAPR);
-            computation_timer.stop_timer();
-        }
-        std::cout << "Total n ENDED" << std::endl;
+    // generateDatastructures(aAPR) for linearAcceess for CUDA
+    aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
+    aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
+    aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
+    aAPR.apr_initialized = true;
 
-    }
-    t.stop_timer();
-    method_timer.stop_timer();
+    std::cout << "CUDA pipeline finished!\n";
 
     return true;
 }
@@ -560,7 +474,7 @@ inline bool APRConverter<ImageType>::get_apr_cpu(APR &aAPR, PixelData<T> &input_
         method_timer.stop_timer();
     }
 
-    applyParameters(aAPR,par);
+    applyParameters(par);
 
     computation_timer.stop_timer();
 
@@ -592,7 +506,7 @@ template<typename ImageType> template<typename T>
 inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_image) {
 // TODO: CUDA pipeline is temporarily turned off and CPU version is always chosen.
 //       After revising a CUDA pipeline remove "#if true // " part.
-#if true // #ifndef APR_USE_CUDA
+#ifndef APR_USE_CUDA
     return get_apr_cpu(aAPR, input_image);
 #else
     return get_apr_cuda(aAPR, input_image);
diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp
index 53c3d7cd..6b682fdf 100644
--- a/src/algorithm/ComputeGradient.hpp
+++ b/src/algorithm/ComputeGradient.hpp
@@ -38,6 +38,35 @@ class ComputeGradient {
     template<typename T>
     void calc_inv_bspline_z(PixelData<T> &input);
 
+    template<typename T>
+    void applyParameters(PixelData<T> &grad_temp, PixelData<float> &local_scale_temp, PixelData<float> &local_scale_temp2, APRParameters &aprParameters, float bspline_offset) {
+        threshold_gradient(grad_temp,local_scale_temp2,aprParameters.Ip_th + bspline_offset);
+
+        float max_th = 60000;
+
+#ifdef HAVE_OPENMP
+#pragma omp parallel for default(shared)
+#endif
+        for (size_t i = 0; i < grad_temp.mesh.size(); ++i) {
+
+            float rescaled = local_scale_temp.mesh[i];
+            if (rescaled < aprParameters.sigma_th) {
+                rescaled = (rescaled < aprParameters.sigma_th_max) ? max_th : aprParameters.sigma_th;
+                local_scale_temp.mesh[i] = rescaled;
+            }
+        }
+
+#ifdef HAVE_OPENMP
+#pragma omp parallel for default(shared)
+#endif
+        for (size_t i = 0; i < grad_temp.mesh.size(); ++i) {
+
+            if(grad_temp.mesh[i] < aprParameters.grad_th){
+                grad_temp.mesh[i] = 0;
+            }
+        }
+    }
+
     struct three_temps {
         float temp_1, temp_2, temp_3;
     };
@@ -65,6 +94,20 @@ class ComputeGradient {
 
     inline float impulse_resp_back(float k, float rho, float omg, float gamma, float c0);
 
+    typedef struct {
+        std::vector<float> bc1_vec;
+        std::vector<float> bc2_vec;
+        std::vector<float> bc3_vec;
+        std::vector<float> bc4_vec;
+        size_t k0;
+        float b1;
+        float b2;
+        float norm_factor;
+        size_t minLen;
+    } BsplineParams;
+
+    BsplineParams prepareBSplineParams(size_t dimLen, float lambda, float tol, int maxFilterLen = -1);
+
 };
 
 template<typename ImageType,typename tempType>
@@ -115,7 +158,6 @@ inline void ComputeGradient::get_gradient(PixelData<ImageType> &image_temp, Pixe
             timer.stop_timer();
         }
     }
-
 }
 
 
@@ -208,81 +250,45 @@ void ComputeGradient::get_smooth_bspline_3D(PixelData<T>& input, float lambda) {
 inline float ComputeGradient::impulse_resp(float k,float rho,float omg){
     //  Impulse Response Function
 
-    return (pow(rho,(std::abs(k)))*sin((std::abs(k) + 1)*omg)) / sin(omg);
+    return (powf(rho,(std::abs(k)))*sinf((std::abs(k) + 1)*omg)) / sinf(omg);
 
 }
 
 inline float ComputeGradient::impulse_resp_back(float k,float rho,float omg,float gamma,float c0){
     //  Impulse Response Function (nominator eq. 4.8, denominator from eq. 4.7)
 
-    return c0*pow(rho,std::abs(k))*(cos(omg*std::abs(k)) + gamma*sin(omg*std::abs(k)))*(1.0/(pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2)));
+    return c0*powf(rho,std::abs(k))*(cosf(omg*std::abs(k)) + gamma*sinf(omg*std::abs(k)))*(1.0/(powf((1 - 2.0*rho*cosf(omg) + pow(rho,2)),2)));
 }
 
-
-/**
- * floating point output -> no rounding or under-/overflow check
- */
-template<typename T>
-std::enable_if_t<std::is_floating_point<T>::value, T>
-round(float val, size_t &errCount) {
-    return val;
-}
-
-/**
- * integer output -> check for under-/overflow and round
- */
-template<typename T>
-std::enable_if_t<!std::is_floating_point<T>::value, T>
-round(float val, size_t &errCount) {
-
-    val = std::round(val);
-
-    if(val < std::numeric_limits<T>::min() || val > std::numeric_limits<T>::max()) {
-        errCount++;
-    }
-    return val;
-}
-
-
-
-template<typename T>
-void ComputeGradient::bspline_filt_rec_y(PixelData<T>& image,float lambda,float tol, int k0Len) {
-    //
-    //  Bevan Cheeseman 2016
-    //
-    // Recursive Filter Implimentation for Smoothing BSplines
+ComputeGradient::BsplineParams ComputeGradient::prepareBSplineParams(size_t dimLen, float lambda, float tol, int maxFilterLen) {
+    // Recursive Filter Implementation for Smoothing BSplines
     // B-Spline Signal Processing: Part 11-Efficient Design and Applications, Unser 1993
 
-    float xi = 1 - 96*lambda + 24*lambda*sqrt(3 + 144*lambda); // eq 4.6
-    float rho = (24*lambda - 1 - sqrt(xi))/(24*lambda)*sqrt((1/xi)*(48*lambda + 24*lambda*sqrt(3 + 144*lambda))); // eq 4.5
-    float omg = atan(sqrt((1/xi)*(144*lambda - 1))); // eq 4.6
+    float xi = 1 - 96*lambda + 24*lambda * sqrtf(3 + 144*lambda);
+    float rho = (24*lambda - 1 - sqrtf(xi)) / (24*lambda) * sqrtf((1/xi) * (48*lambda + 24*lambda * sqrtf(3 + 144*lambda)));
+    float omg = atan(sqrtf((1/xi) * (144*lambda - 1)));
+    float c0 = (1 + powf(rho,2)) / (1-powf(rho,2)) * (1 - 2*rho * cosf(omg) + powf(rho,2)) / (1 + 2*rho*cosf(omg) + powf(rho,2));
+    float gamma = (1 - powf(rho,2)) / (1+powf(rho,2)) * (1 / tan(omg));
 
-    float c0 = (1+ pow(rho,2))/(1-pow(rho,2)) * (1 - 2*rho*cos(omg) + pow(rho,2))/(1 + 2*rho*cos(omg) + pow(rho,2)); // eq 4.8
-    float gamma = (1-pow(rho,2))/(1+pow(rho,2)) * (1/tan(omg)); // eq 4.8
+    const float b1 = 2*rho*cosf(omg);
+    const float b2 = -powf(rho,2.0);
 
-    const float b1 = 2*rho*cos(omg);
-    const float b2 = -pow(rho,2.0);
+    const size_t idealK0Len = ceil(std::abs(logf(tol) / logf(rho)));
+    const size_t k0 = maxFilterLen > 0 ? maxFilterLen : idealK0Len;
+    const size_t minLen = maxFilterLen > 0 ? maxFilterLen : std::min(idealK0Len, dimLen);
 
-    const size_t z_num = image.z_num;
-    const size_t x_num = image.x_num;
-    const size_t y_num = image.y_num;
-//    const size_t minLen = y_num;
-    const size_t minLen = k0Len > 0 ? k0Len : std::min((size_t)(ceil(std::abs(log(tol)/log(rho)))),y_num);
-
-    const size_t k0 = k0Len > 0 ? k0Len : (size_t)(ceil(std::abs(log(tol)/log(rho))));
+    const float norm_factor = powf((1 - 2.0*rho*cosf(omg) + powf(rho,2)),2);
 
+    // std::cout << std::fixed << std::setprecision(9) << "CPU xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << std::endl;
 
-    const float norm_factor = pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2);
-//    std::cout << "CPUy xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl;
     // for boundaries
-    std::vector<float> impulse_resp_vec_f(k0+3);  //forward
-    for (size_t k = 0; k < (k0+3); ++k) {
-        impulse_resp_vec_f[k] = impulse_resp(k,rho,omg);
+    std::vector<float> impulse_resp_vec_f(k0+1);  //forward
+    for (size_t k = 0; k < (k0+1); ++k) {
+        impulse_resp_vec_f[k] = impulse_resp(k, rho, omg);
     }
-
-    std::vector<float> impulse_resp_vec_b(k0+3);  //backward
-    for (size_t k = 0; k < (k0+3); ++k) {
-        impulse_resp_vec_b[k] = impulse_resp_back(k,rho,omg,gamma,c0);
+    std::vector<float> impulse_resp_vec_b(k0+1);  //backward
+    for (size_t k = 0; k < (k0+1); ++k) {
+        impulse_resp_vec_b[k] = impulse_resp_back(k, rho, omg, gamma, c0);
     }
 
     std::vector<float> bc1_vec(k0, 0);  //forward
@@ -291,9 +297,8 @@ void ComputeGradient::bspline_filt_rec_y(PixelData<T>& image,float lambda,float
     for (size_t k = 0; k < k0; ++k) {
         bc1_vec[k] += impulse_resp_vec_f[k+1];
     }
-
     //assumes a constant value at the end of the filter when the required ghost is bigger then the image
-    for(size_t k = (minLen); k < k0;k++){
+    for (size_t k = minLen; k < k0; k++) {
         bc1_vec[minLen-1] += bc1_vec[k];
     }
 
@@ -302,8 +307,7 @@ void ComputeGradient::bspline_filt_rec_y(PixelData<T>& image,float lambda,float
     for (size_t k = 0; k < k0; ++k) {
         bc2_vec[k] = impulse_resp_vec_f[k];
     }
-
-    for(size_t k = (minLen); k < k0;k++){
+    for (size_t k = minLen; k < k0; k++) {
         bc2_vec[minLen-1] += bc2_vec[k];
     }
 
@@ -313,8 +317,7 @@ void ComputeGradient::bspline_filt_rec_y(PixelData<T>& image,float lambda,float
     for (size_t k = 0; k < (k0-1); ++k) {
         bc3_vec[k+1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k+2];
     }
-
-    for(size_t k = (minLen); k < k0;k++){
+    for (size_t k = minLen; k < k0;k++) {
         bc3_vec[minLen-1] += bc3_vec[k];
     }
 
@@ -324,11 +327,64 @@ void ComputeGradient::bspline_filt_rec_y(PixelData<T>& image,float lambda,float
     for (size_t k = 1; k < k0; ++k) {
         bc4_vec[k] += 2*impulse_resp_vec_b[k];
     }
-
-    for(size_t k = (minLen); k < k0;k++){
+    for (size_t k = minLen; k < k0; k++) {
         bc4_vec[minLen-1] += bc4_vec[k];
     }
 
+    return BsplineParams {
+        std::move(bc1_vec),
+        std::move(bc2_vec),
+        std::move(bc3_vec),
+        std::move(bc4_vec),
+        k0,
+        b1,
+        b2,
+        norm_factor,
+        minLen
+    };
+}
+
+/**
+ * floating point output -> no rounding or under-/overflow check
+ */
+template<typename T>
+std::enable_if_t<std::is_floating_point<T>::value, T>
+round(float val, size_t &errCount) {
+    return val;
+}
+
+/**
+ * integer output -> check for under-/overflow and round
+ */
+template<typename T>
+std::enable_if_t<!std::is_floating_point<T>::value, T>
+round(float val, size_t &errCount) {
+
+    val = std::round(val);
+
+    if(val < std::numeric_limits<T>::min() || val > std::numeric_limits<T>::max()) {
+        errCount++;
+        std::cout << val << " " << (float)std::numeric_limits<T>::min() << " " << (float)std::numeric_limits<T>::max() << std::endl;
+    }
+    return val;
+}
+
+
+
+template<typename T>
+void ComputeGradient::bspline_filt_rec_y(PixelData<T>& image,float lambda,float tol, int k0Len) {
+    //
+    //  Bevan Cheeseman 2016
+    //
+    // Recursive Filter Implementation for Smoothing BSplines
+    // B-Spline Signal Processing: Part 11-Efficient Design and Applications, Unser 1993
+
+    const size_t z_num = image.z_num;
+    const size_t x_num = image.x_num;
+    const size_t y_num = image.y_num;
+
+    auto p = prepareBSplineParams(y_num, lambda, tol, k0Len);
+
     APRTimer btime;
     btime.verbose_flag = false;
 
@@ -350,37 +406,35 @@ void ComputeGradient::bspline_filt_rec_y(PixelData<T>& image,float lambda,float
             const size_t iynum = x * y_num;
 
             //boundary conditions
-            for (size_t k = 0; k < minLen; ++k) {
-                temp1 += bc1_vec[k]*image.mesh[jxnumynum + iynum + k];
-                temp2 += bc2_vec[k]*image.mesh[jxnumynum + iynum + k];
+            for (size_t k = 0; k < p.minLen; ++k) {
+                temp1 += p.bc1_vec[k]*image.mesh[jxnumynum + iynum + k];
+                temp2 += p.bc2_vec[k]*image.mesh[jxnumynum + iynum + k];
             }
 
             //boundary conditions
-            for (size_t k = 0; k < minLen; ++k) {
-                temp3 += bc3_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k];
-                temp4 += bc4_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k];
+            for (size_t k = 0; k < p.minLen; ++k) {
+                temp3 += p.bc3_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k];
+                temp4 += p.bc4_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k];
             }
 
             //initialize the sequence
-            image.mesh[jxnumynum + iynum + 0] = temp2;
-            image.mesh[jxnumynum + iynum + 1] = temp1;
+            image.mesh[jxnumynum + iynum + 0] = round<T>(temp2, error_count);
+            image.mesh[jxnumynum + iynum + 1] = round<T>(temp1, error_count);
 
             for (auto it = (image.mesh.begin()+jxnumynum + iynum + 2); it !=  (image.mesh.begin()+jxnumynum + iynum + y_num); ++it) {
-                float  temp = temp1*b1 + temp2*b2 + *it;
+
+                float  temp = temp1*p.b1 + temp2*p.b2 + *it;
                 *it = round<T>(temp, error_count);
                 temp2 = temp1;
                 temp1 = temp;
             }
 
-            image.mesh[jxnumynum + iynum + y_num - 2] = round<T>(temp3*norm_factor, error_count);
-            image.mesh[jxnumynum + iynum + y_num - 1] = round<T>(temp4*norm_factor, error_count);
-
-
+            image.mesh[jxnumynum + iynum + y_num - 2] = round<T>(temp3*p.norm_factor, error_count);
+            image.mesh[jxnumynum + iynum + y_num - 1] = round<T>(temp4*p.norm_factor, error_count);
         }
     }
     btime.stop_timer();
 
-
     btime.start_timer("backward_loop_y");
     #ifdef HAVE_OPENMP
 	#pragma omp parallel for default(shared) reduction(+: error_count)
@@ -391,13 +445,12 @@ void ComputeGradient::bspline_filt_rec_y(PixelData<T>& image,float lambda,float
         for (int64_t i = x_num - 1; i >= 0; --i) {
             const size_t iynum = i * y_num;
 
-            float temp2 = image.mesh[jxnumynum + iynum + y_num - 1]/norm_factor;
-            float temp1 = image.mesh[jxnumynum + iynum + y_num - 2]/norm_factor;
+            float temp2 = image.mesh[jxnumynum + iynum + y_num - 1]/p.norm_factor;
+            float temp1 = image.mesh[jxnumynum + iynum + y_num - 2]/p.norm_factor;
 
             for (auto it = (image.mesh.begin()+jxnumynum + iynum + y_num-3); it !=  (image.mesh.begin()+jxnumynum + iynum-1); --it) {
-                float temp = temp1*b1 + temp2*b2 + *it;
-
-                *it = round<T>(temp*norm_factor, error_count);
+                float temp = temp1*p.b1 + temp2*p.b2 + *it;
+                *it = round<T>(temp*p.norm_factor, error_count);
 
                 temp2 = temp1;
                 temp1 = temp;
@@ -417,90 +470,13 @@ void ComputeGradient::bspline_filt_rec_z(PixelData<T>& image,float lambda,float
     //
     //  Bevan Cheeseman 2016
     //
-    //  Recursive Filter Implimentation for Smoothing BSplines
-
-    float xi = 1 - 96*lambda + 24*lambda*sqrt(3 + 144*lambda);
-    float rho = (24*lambda - 1 - sqrt(xi))/(24*lambda)*sqrt((1/xi)*(48*lambda + 24*lambda*sqrt(3 + 144*lambda)));
-    float omg = atan(sqrt((1/xi)*(144*lambda - 1)));
-    float c0 = (1+ pow(rho,2))/(1-pow(rho,2)) * (1 - 2*rho*cos(omg) + pow(rho,2))/(1 + 2*rho*cos(omg) + pow(rho,2));
-    float gamma = (1-pow(rho,2))/(1+pow(rho,2)) * (1/tan(omg));
-
-    const float b1 = 2*rho*cos(omg);
-    const float b2 = -pow(rho,2.0);
+    //  Recursive Filter Implementation for Smoothing BSplines
 
     const size_t z_num = image.z_num;
     const size_t x_num = image.x_num;
     const size_t y_num = image.y_num;
-    //const size_t minLen = std::min(z_num, std::min(x_num, y_num));
-    //const size_t minLen = z_num;
-
-    const size_t minLen = k0Len > 0 ? k0Len : std::min((size_t)(ceil(std::abs(log(tol)/log(rho)))), z_num);
-
-    const size_t k0 = k0Len > 0 ? k0Len :(size_t)(ceil(std::abs(log(tol)/log(rho))));
-
-    const float norm_factor = pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2);
-//    std::cout << "CPUz xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl;
-
-    //////////////////////////////////////////////////////////////
-    //
-    //  Setting up boundary conditions
-    //
-    //////////////////////////////////////////////////////////////
-
-    std::vector<float> impulse_resp_vec_f(k0+3);  //forward
-    for (size_t k = 0; k < (k0+3);k++){
-        impulse_resp_vec_f[k] = impulse_resp(k,rho,omg);
-    }
-
-    std::vector<float> impulse_resp_vec_b(k0+3);  //backward
-    for (size_t k = 0; k < (k0+3);k++){
-        impulse_resp_vec_b[k] = impulse_resp_back(k,rho,omg,gamma,c0);
-    }
-
-    std::vector<float> bc1_vec(k0, 0);  //forward
-    //y(1) init
-    bc1_vec[1] = impulse_resp_vec_f[0];
-    for(size_t k = 0; k < k0; k++){
-        bc1_vec[k] += impulse_resp_vec_f[k+1];
-    }
-
-    //assumes a constant value at the end of the filter when the required ghost is bigger then the image
-    for(size_t k = (minLen); k < k0;k++){
-        bc1_vec[minLen-1] += bc1_vec[k];
-    }
-
-
-    std::vector<float> bc2_vec(k0, 0);  //backward
-    //y(0) init
-    for(size_t k = 0; k < k0; k++){
-        bc2_vec[k] = impulse_resp_vec_f[k];
-    }
-
-    for(size_t k = (minLen); k < k0;k++){
-        bc2_vec[minLen-1] += bc2_vec[k];
-    }
-
-    std::vector<float> bc3_vec(k0, 0);  //forward
-    //y(N-1) init
-    bc3_vec[0] = impulse_resp_vec_b[1];
-    for(size_t k = 0; k < (k0-1); k++){
-        bc3_vec[k+1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k+2];
-    }
-
-    for(size_t k = (minLen); k < k0;k++){
-        bc3_vec[minLen-1] += bc3_vec[k];
-    }
-
-    std::vector<float> bc4_vec(k0, 0);  //backward
-    //y(N) init
-    bc4_vec[0] = impulse_resp_vec_b[0];
-    for(size_t k = 1; k < k0; k++){
-        bc4_vec[k] += 2*impulse_resp_vec_b[k];
-    }
 
-    for(size_t k = (minLen); k < k0;k++){
-        bc4_vec[minLen-1] += bc4_vec[k];
-    }
+    auto p = prepareBSplineParams(z_num, lambda, tol, k0Len);
 
     //forwards direction
     std::vector<float> temp_vec1(y_num,0);
@@ -523,18 +499,18 @@ void ComputeGradient::bspline_filt_rec_z(PixelData<T>& image,float lambda,float
 
         size_t iynum = i * y_num;
 
-        for (size_t j = 0; j < minLen; ++j) {
+        for (size_t j = 0; j < p.minLen; ++j) {
             size_t index = j * x_num * y_num + iynum;
             #ifdef HAVE_OPENMP
 	        #pragma omp simd
             #endif
             for (int64_t k = y_num - 1; k >= 0; k--) {
                 //forwards boundary condition
-                temp_vec1[k] += bc1_vec[j] * image.mesh[index + k];
-                temp_vec2[k] += bc2_vec[j] * image.mesh[index + k];
+                temp_vec1[k] += p.bc1_vec[j] * image.mesh[index + k];
+                temp_vec2[k] += p.bc2_vec[j] * image.mesh[index + k];
                 //backwards boundary condition
-                temp_vec3[k] += bc3_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k];
-                temp_vec4[k] += bc4_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k];
+                temp_vec3[k] += p.bc3_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k];
+                temp_vec4[k] += p.bc4_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k];
             }
         }
 
@@ -557,7 +533,7 @@ void ComputeGradient::bspline_filt_rec_z(PixelData<T>& image,float lambda,float
 	        #pragma omp simd
             #endif
             for (size_t k = 0; k < y_num; ++k) {
-                temp_vec2[k] = round<T>(1.0f*image.mesh[index + k] + b1*temp_vec1[k] + b2*temp_vec2[k], error_count);
+                temp_vec2[k] = round<T>(image.mesh[index + k] + p.b1*temp_vec1[k] + p.b2*temp_vec2[k], error_count);
             }
 
             std::swap(temp_vec1, temp_vec2);
@@ -568,12 +544,12 @@ void ComputeGradient::bspline_filt_rec_z(PixelData<T>& image,float lambda,float
         //initialization
         for (int64_t k = y_num - 1; k >= 0; --k) {
             //y(N)
-            image.mesh[(z_num - 1)*x_num*y_num  + iynum + k] = round<T>(temp_vec4[k]*norm_factor, error_count);
+            image.mesh[(z_num - 1)*x_num*y_num  + iynum + k] = round<T>(temp_vec4[k]*p.norm_factor, error_count);
         }
 
         for (int64_t k = y_num - 1; k >= 0; --k) {
             //y(N-1)
-            image.mesh[(z_num - 2)*x_num*y_num  + iynum + k] = round<T>(temp_vec3[k]*norm_factor, error_count);
+            image.mesh[(z_num - 2)*x_num*y_num  + iynum + k] = round<T>(temp_vec3[k]*p.norm_factor, error_count);
         }
 
         //main loop
@@ -584,8 +560,8 @@ void ComputeGradient::bspline_filt_rec_z(PixelData<T>& image,float lambda,float
 	        #pragma omp simd
             #endif
             for (int64_t k = y_num - 1; k >= 0; --k) {
-                float temp = (image.mesh[index + k] +  b1*temp_vec3[k] + b2*temp_vec4[k]);
-                image.mesh[index + k] = round<T>(temp*norm_factor, error_count);
+                float temp = (image.mesh[index + k] + p.b1*temp_vec3[k] + p.b2*temp_vec4[k]);
+                image.mesh[index + k] = round<T>(temp*p.norm_factor, error_count);
                 temp_vec4[k] = temp_vec3[k];
                 temp_vec3[k] = temp;
             }
@@ -605,85 +581,11 @@ void ComputeGradient::bspline_filt_rec_x(PixelData<T>& image,float lambda,float
     //
     //  Recursive Filter Implimentation for Smoothing BSplines
 
-    float xi = 1 - 96*lambda + 24*lambda*sqrt(3 + 144*lambda);
-    float rho = (24*lambda - 1 - sqrt(xi))/(24*lambda)*sqrt((1/xi)*(48*lambda + 24*lambda*sqrt(3 + 144*lambda)));
-    float omg = atan(sqrt((1/xi)*(144*lambda - 1)));
-    float c0 = (1+ pow(rho,2))/(1-pow(rho,2)) * (1 - 2*rho*cos(omg) + pow(rho,2))/(1 + 2*rho*cos(omg) + pow(rho,2));
-    float gamma = (1-pow(rho,2))/(1+pow(rho,2)) * (1/tan(omg));
-
-    const float b1 = 2*rho*cos(omg);
-    const float b2 = -pow(rho,2.0);
-
     const size_t z_num = image.z_num;
     const size_t x_num = image.x_num;
     const size_t y_num = image.y_num;
 
-//    const size_t minLen = x_num;
-    const size_t minLen = k0Len > 0 ? k0Len : std::min((size_t)(ceil(std::abs(log(tol)/log(rho)))), x_num);
-    const size_t k0 = k0Len > 0 ? k0Len : ((size_t)(ceil(std::abs(log(tol)/log(rho)))));
-    const float norm_factor = pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2);
-
-//    std::cout << "CPUx xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl;
-
-    //////////////////////////////////////////////////////////////
-    //
-    //  Setting up boundary conditions
-    //
-    //////////////////////////////////////////////////////////////
-
-    std::vector<float> impulse_resp_vec_f(k0+3);  //forward
-    for (size_t k = 0; k < (k0+3);k++){
-        impulse_resp_vec_f[k] = impulse_resp(k,rho,omg);
-    }
-
-    std::vector<float> impulse_resp_vec_b(k0+3);  //backward
-    for (size_t k = 0; k < (k0+3);k++){
-        impulse_resp_vec_b[k] = impulse_resp_back(k,rho,omg,gamma,c0);
-    }
-
-    std::vector<float> bc1_vec(k0, 0);  //forward
-    //y(1) init
-    bc1_vec[1] = impulse_resp_vec_f[0];
-    for(size_t k = 0; k < k0;k++){
-        bc1_vec[k] += impulse_resp_vec_f[k+1];
-    }
-
-    //assumes a constant value at the end of the filter when the required ghost is bigger then the image
-    for(size_t k = (minLen); k < k0;k++){
-        bc1_vec[minLen-1] += bc1_vec[k];
-    }
-
-    std::vector<float> bc2_vec(k0, 0);  //backward
-    //y(0) init
-    for(size_t k = 0; k < k0;k++){
-        bc2_vec[k] = impulse_resp_vec_f[k];
-    }
-
-    for(size_t k = (minLen); k < k0;k++){
-        bc2_vec[minLen-1] += bc2_vec[k];
-    }
-
-    std::vector<float> bc3_vec(k0, 0);  //forward
-    //y(N-1) init
-    bc3_vec[0] = impulse_resp_vec_b[1];
-    for(size_t k = 0; k < (k0-1);k++){
-        bc3_vec[k+1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k+2];
-    }
-
-    for(size_t k = (minLen); k < k0;k++){
-        bc3_vec[minLen-1] += bc3_vec[k];
-    }
-
-    std::vector<float> bc4_vec(k0, 0);  //backward
-    //y(N) init
-    bc4_vec[0] = impulse_resp_vec_b[0];
-    for(size_t k = 1; k < k0;k++){
-        bc4_vec[k] += 2*impulse_resp_vec_b[k];
-    }
-
-    for(size_t k = (minLen); k < k0;k++){
-        bc4_vec[minLen-1] += bc4_vec[k];
-    }
+    auto p = prepareBSplineParams(x_num, lambda, tol, k0Len);
 
     //forwards direction
 
@@ -705,15 +607,15 @@ void ComputeGradient::bspline_filt_rec_x(PixelData<T>& image,float lambda,float
 
         size_t jxnumynum = j * y_num * x_num;
 
-        for (size_t i = 0; i < minLen; ++i) {
+        for (size_t i = 0; i < p.minLen; ++i) {
 
             for (size_t k = 0; k < y_num; ++k) {
                 //forwards boundary condition
-                temp_vec1[k] += bc1_vec[i]*image.mesh[jxnumynum + i*y_num + k];
-                temp_vec2[k] += bc2_vec[i]*image.mesh[jxnumynum + i*y_num + k];
+                temp_vec1[k] += p.bc1_vec[i]*image.mesh[jxnumynum + i*y_num + k];
+                temp_vec2[k] += p.bc2_vec[i]*image.mesh[jxnumynum + i*y_num + k];
                 //backwards boundary condition
-                temp_vec3[k] += bc3_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k];
-                temp_vec4[k] += bc4_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k];
+                temp_vec3[k] += p.bc3_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k];
+                temp_vec4[k] += p.bc4_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k];
             }
         }
 
@@ -735,7 +637,7 @@ void ComputeGradient::bspline_filt_rec_x(PixelData<T>& image,float lambda,float
             #pragma omp simd
             #endif
             for (int64_t k = y_num - 1; k >= 0; k--) {
-                temp_vec2[k] = round<T>(image.mesh[index + k] + b1*temp_vec1[k] + b2*temp_vec2[k], error_count);
+                temp_vec2[k] = round<T>(image.mesh[index + k] + p.b1*temp_vec1[k] + p.b2*temp_vec2[k], error_count);
             }
 
             std::swap(temp_vec1, temp_vec2);
@@ -748,12 +650,12 @@ void ComputeGradient::bspline_filt_rec_x(PixelData<T>& image,float lambda,float
         //initialization
         for (int64_t k = y_num - 1; k >= 0; --k) {
             //y(N)
-            image.mesh[jxnumynum  + (x_num - 1)*y_num + k] = round<T>(temp_vec4[k]*norm_factor, error_count);
+            image.mesh[jxnumynum  + (x_num - 1)*y_num + k] = round<T>(temp_vec4[k]*p.norm_factor, error_count);
         }
 
         for (int64_t k = y_num - 1; k >= 0; --k) {
             //y(N-1)
-            image.mesh[jxnumynum  + (x_num - 2)*y_num + k] = round<T>(temp_vec3[k]*norm_factor, error_count);
+            image.mesh[jxnumynum  + (x_num - 2)*y_num + k] = round<T>(temp_vec3[k]*p.norm_factor, error_count);
         }
 
         //main loop
@@ -764,8 +666,8 @@ void ComputeGradient::bspline_filt_rec_x(PixelData<T>& image,float lambda,float
             #pragma omp simd
             #endif
             for (int64_t k = y_num - 1; k >= 0; k--){
-                float temp = (image.mesh[index + k] + b1*temp_vec3[ k]+  b2*temp_vec4[ k]);
-                image.mesh[index + k] = round<T>(temp*norm_factor, error_count);
+                float temp = (image.mesh[index + k] + p.b1*temp_vec3[ k]+  p.b2*temp_vec4[ k]);
+                image.mesh[index + k] = round<T>(temp*p.norm_factor, error_count);
                 temp_vec4[k] = temp_vec3[k];
                 temp_vec3[k] = temp;
             }
@@ -813,8 +715,7 @@ void ComputeGradient::calc_inv_bspline_y(PixelData<T>& input){
             }
 
             //LHS boundary condition
-            input.mesh[j*x_num*y_num + i*y_num] = a2*temp_vec[0];
-            input.mesh[j*x_num*y_num + i*y_num] += (a1+a3)*temp_vec[1];
+            input.mesh[j*x_num*y_num + i*y_num] = a1*temp_vec[1] + a2*temp_vec[0] + a3 * temp_vec[1];
 
             for (int64_t k = 1; k < (y_num-1);k++){
                 const int64_t idx = j * x_num * y_num + i * y_num + k;
@@ -822,8 +723,7 @@ void ComputeGradient::calc_inv_bspline_y(PixelData<T>& input){
             }
 
             //RHS boundary condition
-            input.mesh[j*x_num*y_num + i*y_num + y_num - 1] = (a1+a3)*temp_vec[y_num - 2];
-            input.mesh[j*x_num*y_num + i*y_num + y_num - 1] += a2*temp_vec[y_num - 1];
+            input.mesh[j*x_num*y_num + i*y_num + y_num - 1] = a1*temp_vec[y_num - 2] + a2*temp_vec[y_num - 1] + a3*temp_vec[y_num - 2];
         }
     }
 }
@@ -1015,11 +915,15 @@ void ComputeGradient::calc_bspline_fd_ds_mag(const PixelData<S> &input, PixelDat
 
                 //compute the boundary values
                 if (y_num >= 2) {
-                    temp[0] = sqrt(pow((right[0] - left[0]) / (2 * hx), 2.0) + pow((down[0] - up[0]) / (2 * hz), 2.0) +
-                                   pow((center[1] - center[0 /* boundary */]) / (2 * hy), 2.0));
-                    temp[y_num - 1] = sqrt(pow((right[y_num - 1] - left[y_num - 1]) / (2 * hx), 2.0) +
-                                           pow((down[y_num - 1] - up[y_num - 1]) / (2 * hz), 2.0) +
-                                           pow((center[y_num - 1 /* boundary */] - center[y_num - 2]) / (2 * hy), 2.0));
+                    float dx = (right[0] - left[0]) / (2 * hx);
+                    float dz = (down[0] - up[0]) / (2 * hz);
+                    float dy = (center[1] - center[0 /* boundary */]) / (2 * hy);
+                    temp[0] = sqrtf(dx*dx + dz*dz + dy*dy);
+
+                    dx = (right[y_num - 1] - left[y_num - 1]) / (2 * hx);
+                    dz = (down[y_num - 1] - up[y_num - 1]) / (2 * hz);
+                    dy = (center[y_num - 1 /* boundary */] - center[y_num - 2]) / (2 * hy);
+                    temp[y_num - 1] = sqrtf(dx*dx + dz*dz + dy*dy);
                 } else {
                     temp[0] = 0; // same values minus same values in x/y/z
                 }
@@ -1029,8 +933,10 @@ void ComputeGradient::calc_bspline_fd_ds_mag(const PixelData<S> &input, PixelDat
 #pragma omp simd
 #endif
                 for (size_t y = 1; y < y_num - 1; ++y) {
-                    temp[y] = sqrt(pow((right[y] - left[y]) / (2 * hx), 2.0) + pow((down[y] - up[y]) / (2 * hz), 2.0) +
-                                   pow((center[y + 1] - center[y - 1]) / (2 * hy), 2.0));
+                    float dx = (right[y] - left[y]) / (2 * hx);
+                    float dz = (down[y] - up[y]) / (2 * hz);
+                    float dy = (center[y + 1] - center[y - 1]) / (2 * hy);
+                    temp[y] = sqrtf(dx*dx + dz*dz + dy*dy);
                 }
 
                 // Set as a downsampled gradient maximum from 2x2x2 gradient cubes
diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu
index cf636d5f..c4f0e849 100644
--- a/src/algorithm/ComputeGradientCuda.cu
+++ b/src/algorithm/ComputeGradientCuda.cu
@@ -1,28 +1,30 @@
-#include "ComputeGradientCuda.hpp"
-#include "APRParameters.hpp"
 #include <iostream>
-#include <memory>
+#include <chrono>
+#include <cstdint>
+#include <algorithm>
 
 #include <cuda_runtime.h>
-#include <device_launch_parameters.h>
 
+#include "ComputeGradientCuda.hpp"
+#include "APRParameters.hpp"
 #include "data_structures/Mesh/PixelData.hpp"
-#include "dsGradient.cuh"
-
-#include "invBspline.cuh"
-#include <thrust/device_vector.h>
-#include <thrust/device_ptr.h>
-#include "bsplineXdir.cuh"
-#include "bsplineYdir.cuh"
-#include "bsplineZdir.cuh"
 #include "data_structures/Mesh/downsample.cuh"
 #include "algorithm/ComputePullingScheme.cuh"
-#include "algorithm/LocalIntensityScaleCuda.h"
 #include "algorithm/LocalIntensityScale.cuh"
 #include "misc/CudaTools.cuh"
 #include "misc/CudaMemory.cuh"
-#include <chrono>
-#include <cstdint>
+#include "algorithm/ParticleCellTreeCuda.cuh"
+#include "algorithm/PullingSchemeCuda.hpp"
+#include "data_structures/APR/access/LinearAccessCuda.hpp"
+
+#include "dsGradient.cuh"
+#include "invBspline.cuh"
+#include "bsplineParams.h"
+#include "bsplineXdir.cuh"
+#include "bsplineYdir.cuh"
+#include "bsplineZdir.cuh"
+
+
 
 namespace {
     typedef struct {
@@ -36,47 +38,52 @@ namespace {
         float norm_factor;
     } BsplineParams;
 
+    struct BsplineParamsCudaMemoryHandlers {
+        ScopedCudaMemHandler<float*, H2D> bc1;
+        ScopedCudaMemHandler<float*, H2D> bc2;
+        ScopedCudaMemHandler<float*, H2D> bc3;
+        ScopedCudaMemHandler<float*, H2D> bc4;
+    };
+
     float impulse_resp(float k, float rho, float omg) {
         //  Impulse Response Function
-        return (pow(rho, (std::abs(k))) * sin((std::abs(k) + 1) * omg)) / sin(omg);
+        return (powf(rho, (std::abs(k))) * sinf((std::abs(k) + 1) * omg)) / sinf(omg);
     }
 
     float impulse_resp_back(float k, float rho, float omg, float gamma, float c0) {
         //  Impulse Response Function (nominator eq. 4.8, denominator from eq. 4.7)
-        return c0 * pow(rho, std::abs(k)) * (cos(omg * std::abs(k)) + gamma * sin(omg * std::abs(k))) *
-               (1.0 / (pow((1 - 2.0 * rho * cos(omg) + pow(rho, 2)), 2)));
+        return c0 * powf(rho, std::abs(k)) * (cosf(omg * std::abs(k)) + gamma * sinf(omg * std::abs(k))) *
+               (1.0 / (powf((1 - 2.0 * rho * cosf(omg) + powf(rho, 2)), 2)));
     }
 
-    template<typename T>
-    BsplineParams prepareBsplineStuff(const PixelData<T> &image, float lambda, float tol, int maxFilterLen = -1) {
+    BsplineParams prepareBsplineStuff(size_t dimLen, float lambda, float tol, int maxFilterLen = -1) {
+
         // Recursive Filter Implimentation for Smoothing BSplines
         // B-Spline Signal Processing: Part II - Efficient Design and Applications, Unser 1993
 
-        float xi = 1 - 96 * lambda + 24 * lambda * sqrt(3 + 144 * lambda); // eq 4.6
-        float rho = (24 * lambda - 1 - sqrt(xi)) / (24 * lambda) *
-                    sqrt((1 / xi) * (48 * lambda + 24 * lambda * sqrt(3 + 144 * lambda))); // eq 4.5
-        float omg = atan(sqrt((1 / xi) * (144 * lambda - 1))); // eq 4.6
+        float xi = 1 - 96 * lambda + 24 * lambda * sqrtf(3 + 144 * lambda); // eq 4.6
+        float rho = (24 * lambda - 1 - sqrtf(xi)) / (24 * lambda) *
+                    sqrtf((1 / xi) * (48 * lambda + 24 * lambda * sqrtf(3 + 144 * lambda))); // eq 4.5
 
-        float c0 = (1 + pow(rho, 2)) / (1 - pow(rho, 2)) * (1 - 2 * rho * cos(omg) + pow(rho, 2)) /
-                   (1 + 2 * rho * cos(omg) + pow(rho, 2)); // eq 4.8
-        float gamma = (1 - pow(rho, 2)) / (1 + pow(rho, 2)) * (1 / tan(omg)); // eq 4.8
+        float omg = atan(sqrtf((1 / xi) * (144 * lambda - 1))); // eq 4.6
 
-        const float b1 = 2 * rho * cos(omg);
-        const float b2 = -pow(rho, 2.0);
+        float c0 = (1 + powf(rho, 2)) / (1 - powf(rho, 2)) * (1 - 2 * rho * cosf(omg) + powf(rho, 2)) /
+                   (1 + 2 * rho * cosf(omg) + powf(rho, 2)); // eq 4.8
+        float gamma = (1 - powf(rho, 2)) / (1 + powf(rho, 2)) * (1 / tan(omg)); // eq 4.8
 
-        const size_t idealK0Len = ceil(std::abs(log(tol) / log(rho)));
-        const size_t minDimension = std::min(image.z_num, std::min(image.x_num, image.y_num));
-        const size_t k0 = maxFilterLen > 0 ? maxFilterLen : std::min(idealK0Len, minDimension);
+        const float b1 = 2 * rho * cosf(omg);
+        const float b2 = -powf(rho, 2.0);
 
-        const float norm_factor = pow((1 - 2.0 * rho * cos(omg) + pow(rho, 2)), 2);
-        std::cout << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1
-                  << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl;
+        const size_t idealK0Len = ceil(std::abs(logf(tol) / logf(rho)));
+        const size_t k0 = maxFilterLen > 0 ? maxFilterLen : idealK0Len;
+        const size_t minLen = maxFilterLen > 0 ? maxFilterLen : std::min(idealK0Len, dimLen);
 
-        // ------- Calculating boundary conditions
+        const float norm_factor = powf((1 - 2.0 * rho * cosf(omg) + powf(rho, 2)), 2);
+  
+//        std::cout << std::fixed << std::setprecision(9) << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1
+//                  << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << " lambda=" << lambda << " tol=" << tol << std::endl;
 
-        // forward boundaries
-        std::vector<float> impulse_resp_vec_f(k0 + 1);
-        for (size_t k = 0; k < impulse_resp_vec_f.size(); ++k) impulse_resp_vec_f[k] = impulse_resp(k, rho, omg);
+        // ------- Calculating boundary conditions
 
         size_t boundaryLen = sizeof(float) * k0;
         PinnedMemoryUniquePtr<float> bc1{(float*)getPinnedMemory(boundaryLen)};
@@ -84,11 +91,19 @@ namespace {
         PinnedMemoryUniquePtr<float> bc3{(float*)getPinnedMemory(boundaryLen)};
         PinnedMemoryUniquePtr<float> bc4{(float*)getPinnedMemory(boundaryLen)};
 
+        // forward boundaries
+        std::vector<float> impulse_resp_vec_f(k0 + 1);
+        for (size_t k = 0; k < impulse_resp_vec_f.size(); ++k) impulse_resp_vec_f[k] = impulse_resp(k, rho, omg);
+
         //y(0) init
         for (size_t k = 0; k < k0; ++k) bc1[k] = impulse_resp_vec_f[k];
+        for (size_t k = minLen; k < k0; ++k) bc1[minLen - 1] += bc1[k];
+
         //y(1) init
+        for (size_t k = 0; k < k0; ++k) bc2[k] = 0;
         bc2[1] = impulse_resp_vec_f[0];
         for (size_t k = 0; k < k0; ++k) bc2[k] += impulse_resp_vec_f[k + 1];
+        for (size_t k = minLen; k < k0; ++k) bc2[minLen - 1] += bc2[k];
 
         // backward boundaries
         std::vector<float> impulse_resp_vec_b(k0 + 1);
@@ -96,11 +111,16 @@ namespace {
             impulse_resp_vec_b[k] = impulse_resp_back(k, rho, omg, gamma, c0);
 
         //y(N-1) init
+        for (size_t k = 0; k < k0; ++k) bc3[k] = 0;
         bc3[0] = impulse_resp_vec_b[1];
         for (size_t k = 0; k < (k0 - 1); ++k) bc3[k + 1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k + 2];
+        for (size_t k = minLen; k < k0; ++k) bc3[minLen - 1] += bc3[k];
+
         //y(N) init
+        for (size_t k = 0; k < k0; ++k) bc4[k] = 0;
         bc4[0] = impulse_resp_vec_b[0];
         for (size_t k = 1; k < k0; ++k) bc4[k] += 2 * impulse_resp_vec_b[k];
+        for (size_t k = minLen; k < k0; ++k) bc4[minLen - 1] += bc4[k];
 
         return BsplineParams{
                 std::move(bc1),
@@ -113,72 +133,55 @@ namespace {
                 norm_factor
         };
     }
-}
 
-/**
- * Thresholds output basing on input values. When input is <= thresholdLevel then output is set to 0 and is not changed otherwise.
- * @param input
- * @param output
- * @param length - len of input/output arrays
- * @param thresholdLevel
- */
-template <typename T, typename S>
-__global__ void threshold(const T *input, S *output, size_t length, float thresholdLevel) {
-    size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
-    if (idx < length) {
-        if (input[idx] <= thresholdLevel) { output[idx] = 0; }
-    }
-}
-
-template <typename ImgType, typename T>
-void runThreshold(ImgType *cudaImage, T *cudaGrad, size_t x_num, size_t y_num, size_t z_num, float Ip_th, cudaStream_t aStream) {
-    dim3 threadsPerBlock(64);
-    dim3 numBlocks((x_num * y_num * z_num + threadsPerBlock.x - 1)/threadsPerBlock.x);
-    threshold<<<numBlocks,threadsPerBlock, 0, aStream>>>(cudaImage, cudaGrad, x_num * y_num * z_num, Ip_th);
-};
-
-/**
- * Thresholds input array to have minimum thresholdLevel.
- * @param input
- * @param length - len of input/output arrays
- * @param thresholdLevel
- */
-template <typename T>
-__global__ void thresholdImg(T *input, size_t length, float thresholdLevel) {
-    size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
-    if (idx < length) {
-        if (input[idx] < thresholdLevel) { input[idx] = thresholdLevel; }
+    auto transferSpline(BsplineParams &aParams, cudaStream_t aStream) {
+        ScopedCudaMemHandler<float*, H2D> bc1(aParams.bc1.get(), aParams.k0, aStream);
+        ScopedCudaMemHandler<float*, H2D> bc2(aParams.bc2.get(), aParams.k0, aStream);
+        ScopedCudaMemHandler<float*, H2D> bc3(aParams.bc3.get(), aParams.k0, aStream);
+        ScopedCudaMemHandler<float*, H2D> bc4(aParams.bc4.get(), aParams.k0, aStream);
+
+        return std::pair<BsplineParamsCuda, BsplineParamsCudaMemoryHandlers> {
+                BsplineParamsCuda {
+                        bc1.get(),
+                        bc2.get(),
+                        bc3.get(),
+                        bc4.get(),
+                        aParams.k0,
+                        aParams.b1,
+                        aParams.b2,
+                        aParams.norm_factor
+                },
+
+                BsplineParamsCudaMemoryHandlers {
+                        std::move(bc1),
+                        std::move(bc2),
+                        std::move(bc3),
+                        std::move(bc4)
+                }
+        };
     }
 }
 
-template <typename T>
-void runThresholdImg(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, float Ip_th_offset, cudaStream_t aStream) {
-    dim3 threadsPerBlock(64);
-    dim3 numBlocks((x_num * y_num * z_num + threadsPerBlock.x - 1) / threadsPerBlock.x);
-    thresholdImg<<< numBlocks, threadsPerBlock, 0, aStream >>> (cudaImage, x_num * y_num * z_num, Ip_th_offset);
-};
-
 template <typename ImgType>
 void getGradientCuda(const PixelData<ImgType> &image, PixelData<float> &local_scale_temp,
                      ImgType *cudaImage, ImgType *cudaGrad, float *cudalocal_scale_temp,
-                     BsplineParams &p, float *bc1, float *bc2, float *bc3, float *bc4, float *boundary,
+                     BsplineParamsCuda &px, BsplineParamsCuda &py, BsplineParamsCuda &pz, float *boundary,
                      float bspline_offset, const APRParameters &par, cudaStream_t aStream) {
 
-    runThresholdImg(cudaImage, image.x_num, image.y_num, image.z_num, par.Ip_th + bspline_offset, aStream);
+    // TODO: Used PixelDataDim in all methods below and change input parameter from image to imageDim
 
-    runBsplineYdir(cudaImage, image.x_num, image.y_num, image.z_num, bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, boundary, aStream);
-    runBsplineXdir(cudaImage, image.x_num, image.y_num, image.z_num, bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream);
-    runBsplineZdir(cudaImage, image.x_num, image.y_num, image.z_num, bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream);
+    if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream);
+    if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, aStream);
+    if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream);
 
-    runKernelGradient(cudaImage, cudaGrad, image.x_num, image.y_num, image.z_num, local_scale_temp.x_num, local_scale_temp.y_num, par.dx, par.dy, par.dz, aStream);
 
-    runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream);
+    runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream);
 
-    runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
-    runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
-    runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
+    runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream);
 
-    runThreshold(cudalocal_scale_temp, cudaGrad, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, par.Ip_th, aStream);
+    if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
+    if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
+    if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream);
 }
 
 class CurrentTime {
@@ -199,6 +202,50 @@ public:
     }
 };
 
+
+/**
+ * Thresholds output basing on input values. When input is <= thresholdLevel then output is set to 0 and is not changed otherwise.
+ * @param input
+ * @param output
+ * @param length - len of input/output arrays
+ * @param thresholdLevel
+ */
+template <typename T, typename S>
+__global__ void threshold(const T *input, S *output, size_t length, float thresholdLevel) {
+    size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx < length) {
+        if (input[idx] <= thresholdLevel) { output[idx] = 0; }
+    }
+}
+
+template <typename ImgType, typename T>
+void runThreshold(ImgType *cudaImage, T *cudaGrad, size_t x_num, size_t y_num, size_t z_num, float Ip_th, cudaStream_t aStream) {
+    dim3 threadsPerBlock(64);
+    dim3 numBlocks((x_num * y_num * z_num + threadsPerBlock.x - 1)/threadsPerBlock.x);
+    threshold<<<numBlocks,threadsPerBlock, 0, aStream>>>(cudaImage, cudaGrad, x_num * y_num * z_num, Ip_th);
+};
+
+template<typename T>
+__global__ void rescaleAndThreshold(T *data, size_t len, float sigmaThreshold, float sigmaThresholdMax) {
+    const float max_th = 60000.0;
+    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < len) {
+        float rescaled = data[idx];
+        if (rescaled < sigmaThreshold) {
+            rescaled = (rescaled < sigmaThresholdMax) ? max_th : sigmaThreshold;
+        }
+        data[idx] = rescaled;
+    }
+}
+
+template <typename T>
+void runRescaleAndThreshold(T *data, size_t len, float sigma, float sigmaMax, cudaStream_t aStream) {
+    dim3 threadsPerBlock(64);
+    dim3 numBlocks((len + threadsPerBlock.x - 1) / threadsPerBlock.x);
+    rescaleAndThreshold <<< numBlocks, threadsPerBlock, 0, aStream >>> (data, len, sigma, sigmaMax);
+}
+
+
 template <typename U>
 template <typename ImgType>
 class GpuProcessingTask<U>::GpuProcessingTaskImpl {
@@ -207,6 +254,7 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
     const PixelData<ImgType> &iCpuImage;
     PixelData<float> &iCpuLevels;
     const APRParameters &iParameters;
+    GenInfo iAprInfo;
     float iBsplineOffset;
     int iMaxLevel;
 
@@ -227,6 +275,11 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
     const size_t boundaryLen;
     ScopedCudaMemHandler<float*, JUST_ALLOC> boundary;
 
+    ParticleCellTreeCuda pctc;
+
+    ScopedCudaMemHandler<uint16_t*, JUST_ALLOC> y_vec; // for LinearAccess
+    LinearAccessCudaStructs lacs;
+
     /**
      * @return newly created stream
      */
@@ -238,61 +291,90 @@ class GpuProcessingTask<U>::GpuProcessingTaskImpl {
 
 public:
 
-    GpuProcessingTaskImpl(const PixelData<ImgType> &image, PixelData<float> &levels, const APRParameters &parameters, float bspline_offset, int maxLevel) :
-        iCpuImage(image),
+    // TODO: Remove need for passing 'levels' to GpuProcessingTask
+    //       It was used during development to control internal computation like filters, gradient, levels etc. but
+    //       once all is done there is no need for it anymore
+    GpuProcessingTaskImpl(const PixelData<ImgType> &inputImage, PixelData<float> &levels, const APRParameters &parameters, float bspline_offset, int maxLevel) :
+        iCpuImage(inputImage),
         iCpuLevels(levels),
         iStream(getStream()),
-        image (image, iStream),
+        image (inputImage, iStream),
         gradient (levels, iStream),
         local_scale_temp (levels, iStream),
         local_scale_temp2 (levels, iStream),
         iParameters(parameters),
+        iAprInfo(iCpuImage.getDimension()),
         iBsplineOffset(bspline_offset),
         iMaxLevel(maxLevel),
-        params(prepareBsplineStuff(image, parameters.lambda, tolerance)),
-        bc1(params.bc1.get(), params.k0, iStream),
-        bc2(params.bc2.get(), params.k0, iStream),
-        bc3(params.bc3.get(), params.k0, iStream),
-        bc4(params.bc4.get(), params.k0, iStream),
-        boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)image.x_num * (size_t)image.z_num},
-        boundary{nullptr, boundaryLen, iStream}
+        // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension.
+        //       Should be fixed when other parts of pipeline are ready.
+//        params(prepareBsplineStuff((size_t)inputImage.x_num, parameters.lambda, tolerance)),
+//        bc1(params.bc1.get(), params.k0, iStream),
+//        bc2(params.bc2.get(), params.k0, iStream),
+//        bc3(params.bc3.get(), params.k0, iStream),
+//        bc4(params.bc4.get(), params.k0, iStream),
+        boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num},
+        boundary{nullptr, boundaryLen, iStream},
+        pctc(iAprInfo, iStream),
+        y_vec(nullptr, iAprInfo.getSize(), iStream)
     {
 //        std::cout << "\n=============== GpuProcessingTaskImpl ===================\n\n";
-        std::cout << iCpuImage << std::endl;
-        std::cout << iCpuLevels << std::endl;
-        std::cout << "\n\n\n";
-
+//        std::cout << iCpuImage << std::endl;
+//        std::cout << iCpuLevels << std::endl;
     }
 
     void sendDataToGpu() {
-        CurrentTime ct;
-        uint64_t start = ct.microseconds();
+//        CurrentTime ct;
+//        uint64_t start = ct.microseconds();
         image.copyH2D();
-        std::cout << "SEND time: " << ct.microseconds() - start << std::endl;
+//        checkCuda(cudaStreamSynchronize(iStream));
+//        std::cout << "SEND time: " << ct.microseconds() - start << std::endl;
     }
 
-    void getDataFromGpu() {
-        CurrentTime ct;
-        uint64_t start = ct.microseconds();
-        local_scale_temp.copyD2H();
-        cudaStreamSynchronize(iStream);
-        std::cout << "RCV time: " << ct.microseconds() - start << std::endl;
+    LinearAccessCudaStructs getDataFromGpu() {
+//        CurrentTime ct;
+//        uint64_t start = ct.microseconds();
+//        local_scale_temp.copyD2H();
+//        checkCuda(cudaStreamSynchronize(iStream));
+//        std::cout << "RCV time: " << ct.microseconds() - start << std::endl;
+        return std::move(lacs);
     }
 
     void processOnGpu() {
         CurrentTime ct;
         uint64_t start = ct.microseconds();
+
+        // TODO: temporarily bspline params are generated here
+        //       In principle this is OK and correct but would be faster (for processing series of same size images) if
+        //       they would be calculated in constructor of GpuProcessingTaskImpl class (once).
+        BsplineParams px = prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance);
+        auto cudax = transferSpline(px, iStream);
+        auto splineCudaX = cudax.first;
+        BsplineParams py = prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance);
+        auto cuday = transferSpline(py, iStream);
+        auto splineCudaY = cuday.first;
+        BsplineParams pz = prepareBsplineStuff(iCpuImage.z_num, iParameters.lambda, tolerance);
+        auto cudaz = transferSpline(pz, iStream);
+        auto splineCudaZ = cudaz.first;
+
         getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(),
-                        params, bc1.get(), bc2.get(), bc3.get(), bc4.get(), boundary.get(),
+                         splineCudaX, splineCudaY, splineCudaZ, boundary.get(),
                         iBsplineOffset, iParameters, iStream);
-        std::cout << "1: " << ct.microseconds() - start << std::endl;
         runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream);
-        std::cout << "2: " << ct.microseconds() - start << std::endl;
+
+        // Apply parameters from APRConverter:
+        runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream);
+        runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream);
+        runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream);
+        // TODO: automatic parameters are not implemented for GPU pipeline (yet)
+
         float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz));
         float level_factor = pow(2, iMaxLevel) * min_dim;
         const float mult_const = level_factor/iParameters.rel_error;
         runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream);
-        std::cout << "3: " << ct.microseconds() - start << std::endl;
+
+        computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream);
+        computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream);
     }
 
     ~GpuProcessingTaskImpl() {
@@ -302,11 +384,11 @@ public:
 };
 
 template <typename ImgType>
-GpuProcessingTask<ImgType>::GpuProcessingTask(PixelData<ImgType> &image, PixelData<float> &levels, const APRParameters &parameters, float bspline_offset, int maxLevel)
-: impl{new GpuProcessingTaskImpl<ImgType>(image, levels, parameters, bspline_offset, maxLevel)} {std::cout << "GpuProcessingTask\n";}
+GpuProcessingTask<ImgType>::GpuProcessingTask(const PixelData<ImgType> &image, PixelData<float> &levels, const APRParameters &parameters, float bspline_offset, int maxLevel)
+: impl{new GpuProcessingTaskImpl<ImgType>(image, levels, parameters, bspline_offset, maxLevel)} { }
 
 template <typename ImgType>
-GpuProcessingTask<ImgType>::~GpuProcessingTask() {std::cout << "~GpuProcessingTask\n";}
+GpuProcessingTask<ImgType>::~GpuProcessingTask() { }
 
 template <typename ImgType>
 GpuProcessingTask<ImgType>::GpuProcessingTask(GpuProcessingTask&&) = default;
@@ -315,18 +397,11 @@ template <typename ImgType>
 void GpuProcessingTask<ImgType>::sendDataToGpu() {impl->sendDataToGpu();}
 
 template <typename ImgType>
-void GpuProcessingTask<ImgType>::getDataFromGpu() {impl->getDataFromGpu();}
+LinearAccessCudaStructs GpuProcessingTask<ImgType>::getDataFromGpu() {return impl->getDataFromGpu();}
 
 template <typename ImgType>
 void GpuProcessingTask<ImgType>::processOnGpu() {impl->processOnGpu();}
 
-template <typename ImgType>
-void GpuProcessingTask<ImgType>::doAll() {
-    sendDataToGpu();
-    processOnGpu();
-    getDataFromGpu();
-}
-
 // explicit instantiation of handled types
 template class GpuProcessingTask<uint16_t>;
 template class GpuProcessingTask<float>;
@@ -336,29 +411,39 @@ template class GpuProcessingTask<float>;
 
 // explicit instantiation of handled types
 template void cudaFilterBsplineFull(PixelData<float> &, float, float, TypeOfRecBsplineFlags, int);
+template void cudaFilterBsplineFull(PixelData<uint16_t> &, float, float, TypeOfRecBsplineFlags, int);
+template void cudaFilterBsplineFull(PixelData<int16_t> &, float, float, TypeOfRecBsplineFlags, int);
+template void cudaFilterBsplineFull(PixelData<uint8_t> &, float, float, TypeOfRecBsplineFlags, int);
+
+
+
 template <typename ImgType>
 void cudaFilterBsplineFull(PixelData<ImgType> &input, float lambda, float tolerance, TypeOfRecBsplineFlags flags, int maxFilterLen) {
     cudaStream_t  aStream = 0;
 
-    BsplineParams p = prepareBsplineStuff(input, lambda, tolerance, maxFilterLen);
-    ScopedCudaMemHandler<float*, H2D> bc1(p.bc1.get(), p.k0);
-    ScopedCudaMemHandler<float*, H2D> bc2(p.bc2.get(), p.k0);
-    ScopedCudaMemHandler<float*, H2D> bc3(p.bc3.get(), p.k0);
-    ScopedCudaMemHandler<float*, H2D> bc4(p.bc4.get(), p.k0);
-    ScopedCudaMemHandler<PixelData<ImgType>, D2H | H2D> cudaInput(input);
+    ScopedCudaMemHandler<PixelData<ImgType>, D2H | H2D> cudaInput(input, aStream);
 
-    APRTimer timer(true);
+    APRTimer timer(false);
     timer.start_timer("GpuDeviceTimeFull");
     if (flags & BSPLINE_Y_DIR) {
+        BsplineParams p = prepareBsplineStuff((size_t)input.y_num, lambda, tolerance, maxFilterLen);
+        auto cuda = transferSpline(p, aStream);
+        auto splineCuda = cuda.first;
         int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * input.x_num * input.z_num;
-        ScopedCudaMemHandler<float*, JUST_ALLOC> boundary(nullptr, boundaryLen); // allocate memory on device
-        runBsplineYdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, boundary.get(), aStream);
+        ScopedCudaMemHandler<float*, JUST_ALLOC> boundary(nullptr, boundaryLen, aStream); // allocate memory on device
+        runBsplineYdir(cudaInput.get(), input.getDimension(), splineCuda, boundary.get(), aStream);
     }
     if (flags & BSPLINE_X_DIR) {
-        runBsplineXdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream);
+        BsplineParams p = prepareBsplineStuff((size_t)input.x_num, lambda, tolerance, maxFilterLen);
+        auto cuda = transferSpline(p, aStream);
+        auto splineCuda = cuda.first;
+        runBsplineXdir(cudaInput.get(), input.getDimension(), splineCuda, aStream);
     }
     if (flags & BSPLINE_Z_DIR) {
-        runBsplineZdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream);
+        BsplineParams p = prepareBsplineStuff((size_t)input.z_num, lambda, tolerance, maxFilterLen);
+        auto cuda = transferSpline(p, aStream);
+        auto splineCuda = cuda.first;
+        runBsplineZdir(cudaInput.get(), input.getDimension(), splineCuda, aStream);
     }
     timer.stop_timer();
 }
@@ -367,16 +452,18 @@ void cudaFilterBsplineFull(PixelData<ImgType> &input, float lambda, float tolera
 template void cudaInverseBspline(PixelData<float> &, TypeOfInvBsplineFlags);
 template <typename ImgType>
 void cudaInverseBspline(PixelData<ImgType> &input, TypeOfInvBsplineFlags flags) {
-    ScopedCudaMemHandler<PixelData<ImgType>, H2D | D2H> cudaInput(input);
+    cudaStream_t  aStream = 0;
+
+    ScopedCudaMemHandler<PixelData<ImgType>, H2D | D2H> cudaInput(input, aStream);
 
     if (flags & INV_BSPLINE_Y_DIR) {
-        runInvBsplineYdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, 0);
+        runInvBsplineYdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, aStream);
     }
     if (flags & INV_BSPLINE_X_DIR) {
-        runInvBsplineXdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, 0);
+        runInvBsplineXdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, aStream);
     }
     if (flags & INV_BSPLINE_Z_DIR) {
-        runInvBsplineZdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, 0);
+        runInvBsplineZdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, aStream);
     }
 }
 
@@ -384,62 +471,59 @@ void cudaInverseBspline(PixelData<ImgType> &input, TypeOfInvBsplineFlags flags)
 template void computeLevelsCuda(const PixelData<float> &, PixelData<float> &, int, float, float, float, float);
 template <typename ImageType>
 void computeLevelsCuda(const PixelData<ImageType> &grad_temp, PixelData<float> &local_scale_temp, int maxLevel, float relError,  float dx, float dy, float dz) {
-    ScopedCudaMemHandler<const PixelData<ImageType>, H2D> cudaGrad(grad_temp);
-    ScopedCudaMemHandler<PixelData<float>, D2H | H2D> cudaLis(local_scale_temp);
+    cudaStream_t  aStream = 0;
+
+    ScopedCudaMemHandler<const PixelData<ImageType>, H2D> cudaGrad(grad_temp, aStream);
+    ScopedCudaMemHandler<PixelData<float>, D2H | H2D> cudaLis(local_scale_temp, aStream);
 
     float min_dim = std::min(dy, std::min(dx, dz));
     float level_factor = pow(2, maxLevel) * min_dim;
     const float mult_const = level_factor/relError;
-    cudaStream_t aStream = 0;
     runComputeLevels(cudaGrad.get(), cudaLis.get(), grad_temp.mesh.size(), mult_const, aStream);
 }
 
 // explicit instantiation of handled types
 template void getGradient(PixelData<float> &, PixelData<float> &, PixelData<float> &, PixelData<float> &, float, const APRParameters &);
+template void getGradient(PixelData<uint16_t> &, PixelData<uint16_t> &, PixelData<float> &, PixelData<float> &, float, const APRParameters &);
+
 template <typename ImgType>
 void getGradient(PixelData<ImgType> &image, PixelData<ImgType> &grad_temp, PixelData<float> &local_scale_temp, PixelData<float> &local_scale_temp2, float bspline_offset, const APRParameters &par) {
-    ScopedCudaMemHandler<PixelData<ImgType>, D2H | H2D> cudaImage(image);
-    ScopedCudaMemHandler<PixelData<ImgType>, D2H | H2D> cudaGrad(grad_temp);
-    ScopedCudaMemHandler<PixelData<float>, D2H> cudalocal_scale_temp(local_scale_temp);
-    ScopedCudaMemHandler<PixelData<float>, D2H> cudalocal_scale_temp2(local_scale_temp2);
-
-    float tolerance = 0.0001;
-    BsplineParams p = prepareBsplineStuff(image, par.lambda, tolerance);
+    cudaStream_t  aStream = 0;
+    ScopedCudaMemHandler<PixelData<ImgType>, D2H | H2D> cudaImage(image, aStream);
+    ScopedCudaMemHandler<PixelData<ImgType>, D2H | H2D> cudaGrad(grad_temp, aStream);
+    ScopedCudaMemHandler<PixelData<float>, D2H> cudalocal_scale_temp(local_scale_temp, aStream);
+    ScopedCudaMemHandler<PixelData<float>, D2H> cudalocal_scale_temp2(local_scale_temp2, aStream);
 
-    ScopedCudaMemHandler<float*, H2D> bc1 (p.bc1.get(), p.k0);
-    ScopedCudaMemHandler<float*, H2D> bc2 (p.bc2.get(), p.k0);
-    ScopedCudaMemHandler<float*, H2D> bc3 (p.bc3.get(), p.k0);
-    ScopedCudaMemHandler<float*, H2D> bc4 (p.bc4.get(), p.k0);
     int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * image.x_num * image.z_num;
-    ScopedCudaMemHandler<float*, JUST_ALLOC> boundary(nullptr, boundaryLen);
+    ScopedCudaMemHandler<float*, JUST_ALLOC> boundary(nullptr, boundaryLen, aStream);
 
-    getGradientCuda(image, local_scale_temp, cudaImage.get(), cudaGrad.get(), cudalocal_scale_temp.get(),
-                    p, bc1.get(), bc2.get(), bc3.get(), bc4.get(), boundary.get(),
-                    bspline_offset, par, 0);
-}
+    float tolerance = 0.0001;
 
-// explicit instantiation of handled types
-template void thresholdImg(PixelData<float> &, const float);
-template <typename T>
-void thresholdImg(PixelData<T> &image, const float threshold) {
-    ScopedCudaMemHandler<PixelData<T>, H2D | D2H> cudaImage(image);
+    // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension.
+    //       Should be fixed when other parts of pipeline are ready.
 
-    runThresholdImg(cudaImage.get(), image.x_num, image.y_num, image.z_num, threshold, 0);
-}
+    // FIX BSPLINE PARAMS !!!!!!!! to get full gradient pipeline test working !!!!!!!!!!!!!!!!!!!!!!!!!1
 
-// explicit instantiation of handled types
-template void thresholdGradient(PixelData<float> &, const PixelData<float> &, const float);
-template <typename T>
-void thresholdGradient(PixelData<float> &output, const PixelData<T> &input, const float Ip_th) {
-    ScopedCudaMemHandler<const PixelData<T>, H2D> cudaInput(input);
-    ScopedCudaMemHandler<PixelData<float>, H2D | D2H> cudaOutput(output);
 
-    runThreshold(cudaInput.get(), cudaOutput.get(), input.x_num, input.y_num, input.z_num, Ip_th, 0);
+    BsplineParams px = prepareBsplineStuff(image.x_num, par.lambda, tolerance);
+    auto cudax = transferSpline(px, aStream);
+    auto splineCudaX = cudax.first;
+    BsplineParams py = prepareBsplineStuff(image.y_num, par.lambda, tolerance);
+    auto cuday = transferSpline(py, aStream);
+    auto splineCudaY = cuday.first;
+    BsplineParams pz = prepareBsplineStuff(image.z_num, par.lambda, tolerance);
+    auto cudaz = transferSpline(pz, aStream);
+    auto splineCudaZ = cudaz.first;
+
+    getGradientCuda(image, local_scale_temp, cudaImage.get(), cudaGrad.get(), cudalocal_scale_temp.get(),
+                    splineCudaX, splineCudaY, splineCudaZ, boundary.get(), bspline_offset, par, aStream);
 }
 
 void cudaDownsampledGradient(PixelData<float> &input, PixelData<float> &grad, const float hx, const float hy, const float hz) {
-    ScopedCudaMemHandler<PixelData<float>, H2D | D2H> cudaInput(input);
-    ScopedCudaMemHandler<PixelData<float>, D2H> cudaGrad(grad);
+    cudaStream_t  aStream = 0;
+
+    ScopedCudaMemHandler<PixelData<float>, H2D | D2H> cudaInput(input, aStream);
+    ScopedCudaMemHandler<PixelData<float>, D2H> cudaGrad(grad, aStream);
 
-    runKernelGradient(cudaInput.get(), cudaGrad.get(), input.x_num, input.y_num, input.z_num, grad.x_num, grad.y_num, hx, hy, hz, 0);
+    runKernelGradient(cudaInput.get(), cudaGrad.get(), input.getDimension(), grad.getDimension(), hx, hy, hz, aStream);
 }
diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp
index 36bb70b1..837d29f5 100644
--- a/src/algorithm/ComputeGradientCuda.hpp
+++ b/src/algorithm/ComputeGradientCuda.hpp
@@ -7,7 +7,7 @@
 
 #include "data_structures/Mesh/PixelData.hpp"
 #include "algorithm/APRParameters.hpp"
-
+#include "data_structures/APR/access/LinearAccessCuda.hpp"
 
 // Test helpers and definitions
 using TypeOfRecBsplineFlags = uint16_t;
@@ -32,10 +32,6 @@ template <typename ImageType>
 void computeLevelsCuda(const PixelData<ImageType> &grad_temp, PixelData<float> &local_scale_temp, int maxLevel, float relError,  float dx = 1, float dy = 1, float dz = 1);
 template <typename ImgType>
 void getGradient(PixelData<ImgType> &image, PixelData<ImgType> &grad_temp, PixelData<float> &local_scale_temp, PixelData<float> &local_scale_temp2, float bspline_offset, const APRParameters &par);
-template <typename T>
-void thresholdImg(PixelData<T> &image, const float threshold);
-template <typename T>
-void thresholdGradient(PixelData<float> &output, const PixelData<T> &input, const float Ip_th);
 void cudaDownsampledGradient(PixelData<float> &input, PixelData<float> &grad, const float hx, const float hy, const float hz);
 
 template <typename ImgType>
@@ -46,14 +42,13 @@ class GpuProcessingTask {
 
 public:
 
-    GpuProcessingTask(PixelData<ImgType> &image, PixelData<float> &levels, const APRParameters &parameters, float bspline_offset, int maxLevel);
+    GpuProcessingTask(const PixelData<ImgType> &image, PixelData<float> &levels, const APRParameters &parameters, float bspline_offset, int maxLevel);
     ~GpuProcessingTask();
     GpuProcessingTask(GpuProcessingTask&&);
 
     void sendDataToGpu();
-    void getDataFromGpu();
+    LinearAccessCudaStructs getDataFromGpu();
     void processOnGpu();
-    void doAll();
 };
 
 #endif //LIBAPR_COMPUTEGRADIENTCUDA_HPP
diff --git a/src/algorithm/ComputePullingScheme.cuh b/src/algorithm/ComputePullingScheme.cuh
index 28450f30..51b88143 100644
--- a/src/algorithm/ComputePullingScheme.cuh
+++ b/src/algorithm/ComputePullingScheme.cuh
@@ -9,8 +9,13 @@ template <typename T>
 __global__ void computeLevels(const T *grad, float *lis, size_t len, float mult_const) {
     size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < len) {
-        //divide gradient magnitude by Local Intensity Scale (first step in calculating the Local Resolution Estimate L(y), minus constants)
-        uint32_t d = (grad[idx] / lis[idx]) * mult_const;
+        // divide gradient magnitude by Local Intensity Scale (first step in calculating the Local Resolution Estimate L(y), minus constants)
+        // TODO: This part is using a "trick" to convert first to int and then to uint32_t
+        //       Without that some numbers on CPU and GPU are converted to different values...
+        //       For example -6507.28 without conversion to int is converted to 0 but in CPU we got huge value.
+        //       Anyway - both CPU & GPU sides should be checked and maybe some better way of it should be
+        //       used - currently we've got undefined result of such operation.
+        uint32_t d = (int)((grad[idx] / lis[idx]) * mult_const);
         //incorporate other factors and compute the level of the Particle Cell, effectively construct LPC L_n
         lis[idx] = (d == 0) ? 0 : 31 - __clz(d); // fast log2
     }
diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu
index 5539baef..1593b5ab 100644
--- a/src/algorithm/LocalIntensityScale.cu
+++ b/src/algorithm/LocalIntensityScale.cu
@@ -11,24 +11,17 @@
 //#include <cuda_runtime.h>
 
 #include "misc/CudaTools.cuh"
-
+#include "data_structures/Mesh/paddPixelData.cuh"
 
 /**
+ * Calculates mean in Y direction
  *
- * How it works along y-dir (let's suppose offset = 2 and number of workers = 8 for simplicity):
- *
- * image idx: 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2
- *
- * loop #1
- * workersIdx 0 1 2 3 4 5 6 7
- * loop #2
- * workersIdx             6 7 0 1 2 3 4 5
- * loop #3
- * workersIdx                         4 5 6 7 0 1 2 3
- * ..............
- *
- * so #offset workers must wait in each loop to have next elements to sum
- *
+ * NOTE: This is not optimal implementation but.. correct and more or less fast as previous one.
+ *       The reason for change was to have results exactly same as in CPU side.
+ *       Currently after reading whole y-dir line of data mean calculation is done only by one from all threads in block
+ *       so here is some room for improvements.
+ *       If needed may be optimized in future. The main limitation is size of shared memory needed which
+ *       limits number of CUDA blocks that can run in parallel.
  * @tparam T
  * @param image
  * @param offset
@@ -37,44 +30,117 @@
  * @param z_num
  */
 template <typename T>
-__global__ void meanYdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num) {
+__global__ void meanYdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num, bool boundaryReflect) {
     // NOTE: Block size in x/z direction must be 1
     const size_t workersOffset = (blockIdx.z * x_num + blockIdx.x) * y_num;
     const int numOfWorkers = blockDim.y;
-    const unsigned int active = __activemask();
     const int workerIdx = threadIdx.y;
+
+    extern __shared__ char sharedMemChar[];
+    T *buffer = (T*) sharedMemChar;
+    T *data = (T*) &buffer[y_num];
+
+    // Read whole line of data from y-direction
     int workerOffset = workerIdx;
+    while (workerOffset < y_num) {
+        buffer[workerOffset] = image[workersOffset + workerOffset];
+        workerOffset += numOfWorkers;
+    }
+
+    const int divisor = 2 * offset  + 1;
+    size_t currElementOffset = 0;
+    size_t saveElementOffset = 0;
+    size_t nextElementOffset = 1;
+
+    if (workerIdx == 0) {
+        // clear shared mem
+        for (int i = offset; i < divisor; ++i) data[i] = 0;
 
-    int offsetInTheLoop = 0;
-    T sum = 0;
-    T v = 0;
-    bool waitForNextLoop = false;
-    int countNumOfSumElements = 1;
-    while(workerOffset < y_num) {
-        if (!waitForNextLoop) v = image[workersOffset + workerOffset];
-        bool waitForNextValues = (workerIdx + offsetInTheLoop) % numOfWorkers >= (numOfWorkers - offset);
-        for (int off = 1; off <= offset; ++off) {
-            T prevElement = __shfl_sync(active, v, workerIdx + blockDim.y - off, blockDim.y);
-            T nextElement = __shfl_sync(active, v, workerIdx + off, blockDim.y);
-            // LHS boundary check + don't add previous values if they were added in a previous loop execution
-            if (workerOffset >= off && !waitForNextLoop) {sum += prevElement; ++countNumOfSumElements;}
-            // RHS boundary check + don't read next values since they are not read yet
-            if (!waitForNextValues && workerOffset + off < y_num) {sum += nextElement; ++countNumOfSumElements;}
+        // saturate cache with #offset elements since it will allow to calculate first element value on LHS
+        float sum = 0;
+        int count = 0;
+        while (count <= offset) {
+            T v = buffer[currElementOffset];
+            sum += v;
+            data[count] = v;
+            if (boundaryReflect && count > 0) {
+                data[2 * offset - count + 1] = v;
+                sum += v;
+            }
+            currElementOffset += nextElementOffset;
+            ++count;
         }
-        waitForNextLoop = waitForNextValues;
-        if (!waitForNextLoop) {
+
+        if (boundaryReflect) {
+            count += offset; // elements in above loop in range [1, offset] were summed twice
+        }
+
+        // Pointer in circular buffer
+        int beginPtr = (offset + 1) % divisor;
+
+        // main loop going through all elements in range [0, y_num - 1 - offset], so till last element that
+        // does not need handling RHS for offset '^'
+        // x x x x ... x x x x x x x
+        //                 o o ^ o o
+        //
+        const int lastElement = y_num - 1 - offset;
+        for (int y = 0; y <= lastElement; ++y) {
+            // Calculate and save currently processed element and move to the new one
+            buffer[saveElementOffset] = sum / count;
+            saveElementOffset += nextElementOffset;
+
+            // There is no more elements to process in that loop, all stuff left to be processed is already in 'data' buffer
+            if (y == lastElement) break;
+
+            // Read new element
+            T v = buffer[currElementOffset];
+
+            // Update sum to cover [-offset, offset] of currently processed element
+            sum -= data[beginPtr];
             sum += v;
-            image[workersOffset + workerOffset] = sum / countNumOfSumElements;
 
-            // workere is done with current element - move to next one
-            sum = 0;
-            countNumOfSumElements = 1;
-            workerOffset += numOfWorkers;
+            // Store new element in circularBuffer
+            data[beginPtr] = v;
+
+            // Move to next elements to read and in circular buffer
+            count = min(count + 1, divisor);
+            beginPtr = (beginPtr + 1) % divisor;
+            currElementOffset += nextElementOffset;
+        }
+
+        // Handle last #offset elements on RHS
+        int boundaryPtr = (beginPtr - 1 - 1 + (2 * offset + 1)) % divisor;
+
+        while (saveElementOffset < currElementOffset) {
+            // If filter length is too big in comparison to processed dimension
+            // do not decrease 'count' and do not remove first element from moving filter
+            // since 'sum' of filter elements contains all elements from processed dimension:
+            // dim elements:        xxxxxx
+            // filter elements:  oooooo^ooooo   (o - offset elements, ^ - middle of the filter)
+            // In such a case first 'o' element should not be removed when filter moves right.
+            if (y_num - (currElementOffset - saveElementOffset) / nextElementOffset > offset || boundaryReflect) {
+                if (!boundaryReflect) count = count - 1;
+                sum -= data[beginPtr];
+            }
+
+            if (boundaryReflect) {
+                sum += data[boundaryPtr];
+                boundaryPtr = (boundaryPtr - 1 + (2 * offset + 1)) % divisor;
+            }
+
+            buffer[saveElementOffset] = sum / count;
+            beginPtr = (beginPtr + 1) % divisor;
+            saveElementOffset += nextElementOffset;
         }
-        offsetInTheLoop += offset;
     }
-}
 
+    // Save whole line of data
+    workerOffset = workerIdx;
+    while (workerOffset < y_num) {
+        image[workersOffset + workerOffset] = buffer[workerOffset];
+        workerOffset += numOfWorkers;
+    }
+}
 constexpr int NumberOfWorkers = 32; // Cannot be greater than 32 since there is no inter-warp communication implemented.
 
 /**
@@ -93,7 +159,7 @@ constexpr int NumberOfWorkers = 32; // Cannot be greater than 32 since there is
  * read/write operations for given element.
  */
 template <typename T>
-__global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num) {
+__global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num, bool boundaryReflect = false) {
     const size_t workerOffset = blockIdx.y * blockDim.y + threadIdx.y + (blockIdx.z * blockDim.z + threadIdx.z) * y_num * x_num;
     const int workerYoffset = blockIdx.y * blockDim.y + threadIdx.y ;
     const int workerIdx = threadIdx.y;
@@ -113,43 +179,72 @@ __global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_
         // saturate cache with #offset elements since it will allow to calculate first element value on LHS
         float sum = 0;
         int count = 0;
-        while (count < offset) {
+        while (count <= offset) {
             T v = image[workerOffset + currElementOffset];
             sum += v;
             data[count][workerIdx] = v;
+            if (boundaryReflect && count > 0) {data[2 * offset - count + 1][workerIdx] = v; sum += v;}
             currElementOffset += nextElementOffset;
             ++count;
         }
 
+        if (boundaryReflect) {
+            count += offset; // elements in above loop in range [1, offset] were summed twice
+        }
+
         // Pointer in circular buffer
-        int beginPtr = offset;
+        int beginPtr = (offset + 1) % divisor;
+
+        // main loop going through all elements in range [0, x_num - 1 - offset], so till last element that
+        // does not need handling RHS for offset '^'
+        // x x x x ... x x x x x x x
+        //                 o o ^ o o
+        //
+        const int lastElement = x_num - 1 - offset;
+        for (int x = 0; x <= lastElement; ++x) {
+            // Calculate and save currently processed element and move to the new one
+            image[workerOffset + saveElementOffset] = sum / count;
+            saveElementOffset += nextElementOffset;
+
+            // There is no more elements to process in that loop, all stuff left to be processed is already in 'data' buffer
+            if (x == lastElement) break;
 
-        // main loop going through all elements in range [0, x_num-offset)
-        for (int x = 0; x < x_num - offset; ++x) {
             // Read new element
             T v = image[workerOffset + currElementOffset];
 
             // Update sum to cover [-offset, offset] of currently processed element
-            sum += v;
             sum -= data[beginPtr][workerIdx];
+            sum += v;
 
-            // Save and move pointer
+            // Store new element in circularBuffer
             data[beginPtr][workerIdx] = v;
-            beginPtr = (beginPtr + 1) % divisor;
 
-            // Update count and save currently processed element
+            // Move to next elements to read and in circular buffer
             count = min(count + 1, divisor);
-            image[workerOffset + saveElementOffset] = sum / count;
-
-            // Move to next elements
+            beginPtr = (beginPtr + 1) % divisor;
             currElementOffset += nextElementOffset;
-            saveElementOffset += nextElementOffset;
         }
 
         // Handle last #offset elements on RHS
+        int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor;
+
         while (saveElementOffset < currElementOffset) {
-            count = count - 1;
-            sum -= data[beginPtr][workerIdx];
+            // If filter length is too big in comparison to processed dimension
+            // do not decrease 'count' and do not remove first element from moving filter
+            // since 'sum' of filter elements contains all elements from processed dimension:
+            // dim elements:        xxxxxx
+            // filter elements:  oooooo^ooooo   (o - offset elements, ^ - middle of the filter)
+            // In such a case first 'o' element should not be removed when filter moves right.
+            if (x_num - (currElementOffset - saveElementOffset)/nextElementOffset > offset || boundaryReflect) {
+                if (!boundaryReflect) count = count - 1;
+                sum -= data[beginPtr][workerIdx];
+            }
+
+            if (boundaryReflect) {
+                sum += data[boundaryPtr][workerIdx];
+                boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor;
+            }
+
             image[workerOffset + saveElementOffset] = sum / count;
             beginPtr = (beginPtr + 1) % divisor;
             saveElementOffset += nextElementOffset;
@@ -173,7 +268,7 @@ __global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_
  * read/write operations for given element.
  */
 template <typename T>
-__global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num) {
+__global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num, bool boundaryReflect = false) {
     const size_t workerOffset = blockIdx.y * blockDim.y + threadIdx.y + (blockIdx.z * blockDim.z + threadIdx.z) * y_num; // *.z is 'x'
     const int workerYoffset = blockIdx.y * blockDim.y + threadIdx.y ;
     const int workerIdx = threadIdx.y;
@@ -193,43 +288,72 @@ __global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_
         // saturate cache with #offset elements since it will allow to calculate first element value on LHS
         float sum = 0;
         int count = 0;
-        while (count < offset) {
+        while (count <= offset) {
             T v = image[workerOffset + currElementOffset];
             sum += v;
             data[count][workerIdx] = v;
+            if (boundaryReflect && count > 0) {data[2 * offset - count + 1][workerIdx] = v; sum += v;}
             currElementOffset += nextElementOffset;
             ++count;
         }
 
+        if (boundaryReflect) {
+            count += offset; // elements in above loop in range [1, offset] were summed twice
+        }
+
         // Pointer in circular buffer
-        int beginPtr = offset;
+        int beginPtr = (offset + 1) % divisor;
+
+        // main loop going through all elements in range [0, z_num - 1 - offset], so till last element that
+        // does not need handling RHS for offset '^'
+        // x x x x ... x x x x x x x
+        //                 o o ^ o o
+        //
+        const int lastElement = z_num - 1 - offset;
+        for (int z = 0; z <= lastElement; ++z) {
+            // Calculate and save currently processed element and move to the new one
+            image[workerOffset + saveElementOffset] = sum / count;
+            saveElementOffset += nextElementOffset;
+
+            // There is no more elements to process in that loop, all stuff left to be processed is already in 'data' buffer
+            if (z == lastElement) break;
 
-        // main loop going through all elements in range [0, z_num-offset)
-        for (int z = 0; z < z_num - offset; ++z) {
             // Read new element
             T v = image[workerOffset + currElementOffset];
 
             // Update sum to cover [-offset, offset] of currently processed element
-            sum += v;
             sum -= data[beginPtr][workerIdx];
+            sum += v;
 
-            // Save and move pointer
+            // Store new element in circularBuffer
             data[beginPtr][workerIdx] = v;
-            beginPtr = (beginPtr + 1) % divisor;
 
-            // Update count and save currently processed element
+            // Move to next elements to read and in circular buffer
             count = min(count + 1, divisor);
-            image[workerOffset + saveElementOffset] = sum / count;
-
-            // Move to next elements
+            beginPtr = (beginPtr + 1) % divisor;
             currElementOffset += nextElementOffset;
-            saveElementOffset += nextElementOffset;
         }
 
         // Handle last #offset elements on RHS
+        int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor;
+
         while (saveElementOffset < currElementOffset) {
-            count = count - 1;
-            sum -= data[beginPtr][workerIdx];
+            // If filter length is too big in comparison to processed dimension
+            // do not decrease 'count' and do not remove first element from moving filter
+            // since 'sum' of filter elements contains all elements from processed dimension:
+            // dim elements:        xxxxxx
+            // filter elements:  oooooo^ooooo   (o - offset elements, ^ - middle of the filter)
+            // In such a case first 'o' element should not be removed when filter moves right.
+            if (z_num - (currElementOffset - saveElementOffset)/nextElementOffset > offset || boundaryReflect) {
+                if (!boundaryReflect) count = count - 1;
+                sum -= data[beginPtr][workerIdx];
+            }
+
+            if (boundaryReflect) {
+                sum += data[boundaryPtr][workerIdx];
+                boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor;
+            }
+
             image[workerOffset + saveElementOffset] = sum / count;
             beginPtr = (beginPtr + 1) % divisor;
             saveElementOffset += nextElementOffset;
@@ -238,48 +362,49 @@ __global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_
 }
 
 template <typename T>
-void runMeanYdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) {
+void runMeanYdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream, bool boundaryReflect) {
     dim3 threadsPerBlock(1, NumberOfWorkers, 1);
     dim3 numBlocks((x_num + threadsPerBlock.x - 1)/threadsPerBlock.x,
                    1,
                    (z_num + threadsPerBlock.z - 1)/threadsPerBlock.z);
-    meanYdir<<<numBlocks,threadsPerBlock, 0, aStream>>>(cudaImage, offset, x_num, y_num, z_num);
+    const int sharedMemorySize = sizeof(T) * y_num + (offset * 2 + 1) * sizeof(float);
+    meanYdir<<<numBlocks,threadsPerBlock, sharedMemorySize, aStream>>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect);
 }
 
 template <typename T>
-void runMeanXdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) {
+void runMeanXdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream, bool boundaryReflect) {
     dim3 threadsPerBlock(1, NumberOfWorkers, 1);
     dim3 numBlocks(1,
                    (y_num + threadsPerBlock.y - 1) / threadsPerBlock.y,
                    (z_num + threadsPerBlock.z - 1) / threadsPerBlock.z);
     // Shared memory size  - it is able to keep filter len elements for each worker.
     const int sharedMemorySize = (offset * 2 + 1) * sizeof(float) * NumberOfWorkers;
-    meanXdir<<<numBlocks,threadsPerBlock, sharedMemorySize, aStream>>>(cudaImage, offset, x_num, y_num, z_num);
+    meanXdir<<<numBlocks,threadsPerBlock, sharedMemorySize, aStream>>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect);
 }
 
 template <typename T>
-void runMeanZdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) {
+void runMeanZdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream, bool boundaryReflect) {
     dim3 threadsPerBlock(1, NumberOfWorkers, 1);
     dim3 numBlocks(1,
                    (y_num + threadsPerBlock.y - 1) / threadsPerBlock.y,
                    (x_num + threadsPerBlock.x - 1) / threadsPerBlock.x); // intentionally here for better memory readings
     // Shared memory size  - it is able to keep filter len elements for each worker.
     const int sharedMemorySize = (offset * 2 + 1) * sizeof(float) * NumberOfWorkers;
-    meanZdir<<<numBlocks,threadsPerBlock, sharedMemorySize, aStream>>>(cudaImage, offset, x_num, y_num, z_num);
+    meanZdir<<<numBlocks,threadsPerBlock, sharedMemorySize, aStream>>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect);
 }
 
-template <typename T, typename S>
-void runMean(T *cudaImage, const PixelData<S> &image, int offsetX, int offsetY, int offsetZ, TypeOfMeanFlags flags, cudaStream_t aStream) {
+template <typename T>
+void runMean(T *cudaImage, const PixelDataDim dim, int offsetX, int offsetY, int offsetZ, TypeOfMeanFlags flags, cudaStream_t aStream, bool boundaryReflect = false) {
     if (flags & MEAN_Y_DIR) {
-        runMeanYdir(cudaImage, offsetY, image.x_num, image.y_num, image.z_num, aStream);
+        runMeanYdir(cudaImage, offsetY, dim.x, dim.y, dim.z, aStream, boundaryReflect);
     }
 
     if (flags & MEAN_X_DIR) {
-        runMeanXdir(cudaImage, offsetX, image.x_num, image.y_num, image.z_num, aStream);
+        runMeanXdir(cudaImage, offsetX, dim.x, dim.y, dim.z, aStream, boundaryReflect);
     }
 
     if (flags & MEAN_Z_DIR) {
-        runMeanZdir(cudaImage, offsetZ, image.x_num, image.y_num, image.z_num, aStream);
+        runMeanZdir(cudaImage, offsetZ, dim.x, dim.y, dim.z, aStream, boundaryReflect);
     }
 }
 
@@ -314,30 +439,58 @@ void runAbsDiff1D(T *data, const T *reference, size_t len, cudaStream_t aStream)
 }
 
 template<typename T>
-__global__ void rescaleAndThreshold(T *data, size_t len, float varRescale, float sigmaThreshold, float sigmaThresholdMax) {
-    const float max_th = 60000.0;
+__global__ void rescale(T *data, size_t len, float varRescale) {
     size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < len) {
         float rescaled = varRescale * data[idx];
-        if (rescaled < sigmaThreshold) {
-            rescaled = (rescaled < sigmaThresholdMax) ? max_th : sigmaThreshold;
-        }
         data[idx] = rescaled;
     }
 }
 
 template <typename T>
-void runRescaleAndThreshold(T *data, size_t len, float varRescale, float sigma, float sigmaMax, cudaStream_t aStream) {
+void runRescale(T *data, size_t len, float varRescale, cudaStream_t aStream) {
     dim3 threadsPerBlock(64);
     dim3 numBlocks((len + threadsPerBlock.x - 1) / threadsPerBlock.x);
-    rescaleAndThreshold <<< numBlocks, threadsPerBlock, 0, aStream >>> (data, len, varRescale, sigma, sigmaMax);
+    rescale <<< numBlocks, threadsPerBlock, 0, aStream >>>(data, len, varRescale);
+}
+
+template <typename S>
+__global__ void constantScale(S *image, size_t len) {
+    // This is totally naive and slow implementation (only 1 thread is used) just to have CPU
+    // code implemented in CUDA. This code will not be run in any normal usage of APR
+    // and it is just here for sanity check and or super small images cases (like few pixels)
+    // so DO NOT TRY TO OPTIMIZE IT - use your time for something more productive or have
+    // some beers... still better than writing fast version of this code.
+
+    float min_val = 660000;
+    double sum = 0;
+
+    for (size_t i = 0; i < len; ++i) {
+        float tmp = image[i];
+
+        sum += tmp;
+        if (tmp < min_val) min_val = tmp;
+    }
+
+    float scale_val = (float) (sum / (float)len - min_val);
+
+    for (size_t i = 0; i < len; ++i) {
+        image[i] = scale_val;
+    }
+}
+
+template <typename S>
+void runConstantScale(S *image, PixelDataDim &dim, cudaStream_t aStream) {
+    // Check kernel description for further info!
+    constantScale<<<1, 1, 0, aStream>>>(image, dim.size());
 }
 
 template <typename T, typename S>
 void runLocalIntensityScalePipeline(const PixelData<T> &image, const APRParameters &par, S *cudaImage, S *cudaTemp, cudaStream_t aStream) {
     float var_rescale;
     std::vector<int> var_win;
-    LocalIntensityScale().get_window_alt(var_rescale, var_win, par,image);
+    auto lis = LocalIntensityScale();
+    lis.get_window_alt(var_rescale, var_win, par, image);
     size_t win_y = var_win[0];
     size_t win_x = var_win[1];
     size_t win_z = var_win[2];
@@ -345,12 +498,61 @@ void runLocalIntensityScalePipeline(const PixelData<T> &image, const APRParamete
     size_t win_x2 = var_win[4];
     size_t win_z2 = var_win[5];
 
-    // --------- CUDA ----------------
-    runCopy1D(cudaImage, cudaTemp, image.mesh.size(), aStream);
-    runMean(cudaImage, image, win_x, win_y, win_z, MEAN_ALL_DIR, aStream);
-    runAbsDiff1D(cudaImage, cudaTemp, image.mesh.size(), aStream);
-    runMean(cudaImage, image, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream);
-    runRescaleAndThreshold(cudaImage, image.mesh.size(), var_rescale, par.sigma_th, par.sigma_th_max, aStream);
+
+
+    bool constant_scale = false;
+
+    if (par.constant_intensity_scale || (lis.number_active_dimensions == 0)) {
+        // include the case where the local intensity scale doesn't make sense due to the image being to small.
+        // (This is for just edge cases and sanity checking)
+        constant_scale = true;
+    }
+
+    PixelDataDim imageSize = image.getDimension();
+
+    if (!constant_scale) {
+        CudaMemoryUniquePtr<S> paddedImage;
+        CudaMemoryUniquePtr<S> paddedTemp;
+        PixelDataDim paddSize(std::max(win_y, win_y2), std::max(win_x, win_x2), std::max(win_z, win_z2));
+        PixelDataDim paddedImageSize = imageSize + paddSize + paddSize; // padding on both ends of each dimension
+
+        S *ci = cudaImage;
+        S *ct = cudaTemp;
+        PixelDataDim dim = image.getDimension();
+
+        if (par.reflect_bc_lis) {
+            // padd
+            S *mem = nullptr;
+            checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size()));
+            paddedImage.reset(mem);
+            mem = nullptr;
+            checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size()));
+            paddedTemp.reset(mem);
+
+            runPaddPixels(cudaImage, paddedImage.get(), imageSize, paddedImageSize, paddSize, aStream);
+            runPaddPixels(cudaTemp, paddedTemp.get(), imageSize, paddedImageSize, paddSize, aStream);
+
+            ci = paddedImage.get();
+            ct = paddedTemp.get();
+            dim = paddedImageSize;
+        }
+
+        // Run LIS pipeline
+        runCopy1D(ci, ct, dim.size(), aStream);
+        runMean(ci, dim, win_x, win_y, win_z, MEAN_ALL_DIR, aStream, false);
+        runAbsDiff1D(ci, ct, dim.size(), aStream);
+        runMean(ci, dim, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream, false);
+        runRescale(ci, dim.size(), var_rescale, aStream);
+
+        if (par.reflect_bc_lis) {
+            // unpadd
+            runUnpaddPixels(ci, cudaImage, paddedImageSize, imageSize, paddSize, aStream);
+            runUnpaddPixels(ct, cudaTemp, paddedImageSize, imageSize, paddSize, aStream);
+        }
+    }
+    else {
+        runConstantScale(cudaImage, imageSize, aStream);
+    }
 }
 
 template void runLocalIntensityScalePipeline<float,float>(const PixelData<float>&, const APRParameters&, float*, float*, cudaStream_t);
@@ -360,24 +562,26 @@ template void runLocalIntensityScalePipeline<float,float>(const PixelData<float>
 // =================================================== TEST helpers
 // TODO: should be moved somewhere
 template <typename T>
-void calcMean(PixelData<T> &image, int offset, TypeOfMeanFlags flags) {
-    ScopedCudaMemHandler<PixelData<T>, H2D | D2H> cudaImage(image);
-    APRTimer timer(true);
-    timer.start_timer("GpuDeviceTimeFull");
-    runMean(cudaImage.get(), image, offset, offset, offset, flags, 0);
-    timer.stop_timer();
+void calcMean(PixelData<T> &image, int offset, TypeOfMeanFlags flags, bool boundaryReflect) {
+    cudaStream_t aStream = 0;
+
+    ScopedCudaMemHandler<PixelData<T>, H2D | D2H> cudaImage(image, aStream);
+
+    runMean(cudaImage.get(), image.getDimension(), offset, offset, offset, flags, 0, boundaryReflect);
 }
 
 // explicit instantiation of handled types
-template void calcMean(PixelData<float>&, int, TypeOfMeanFlags);
-template void calcMean(PixelData<uint16_t>&, int, TypeOfMeanFlags);
+template void calcMean(PixelData<float>&, int, TypeOfMeanFlags, bool);
+template void calcMean(PixelData<uint16_t>&, int, TypeOfMeanFlags, bool);
 
 
 template <typename T>
 void getLocalIntensityScale(PixelData<T> &image, PixelData<T> &temp, const APRParameters &par) {
-    ScopedCudaMemHandler<PixelData<T>, H2D | D2H> cudaImage(image);
-    ScopedCudaMemHandler<PixelData<T>, D2H> cudaTemp(temp);
+    cudaStream_t aStream = 0;
+
+    ScopedCudaMemHandler<PixelData<T>, H2D | D2H> cudaImage(image, aStream);
+    ScopedCudaMemHandler<PixelData<T>, D2H> cudaTemp(temp, aStream);
 
-    runLocalIntensityScalePipeline(image, par, cudaImage.get(), cudaTemp.get(), 0);
+    runLocalIntensityScalePipeline(image, par, cudaImage.get(), cudaTemp.get(), aStream);
 }
 template void getLocalIntensityScale(PixelData<float>&, PixelData<float>&, const APRParameters&);
diff --git a/src/algorithm/LocalIntensityScale.hpp b/src/algorithm/LocalIntensityScale.hpp
index 3d5942c2..e576efd5 100644
--- a/src/algorithm/LocalIntensityScale.hpp
+++ b/src/algorithm/LocalIntensityScale.hpp
@@ -16,6 +16,8 @@ class LocalIntensityScale {
     bool active_x = true;
     bool active_z = true;
 
+public:
+
     int number_active_dimensions = 3;
 
 
@@ -153,13 +155,13 @@ void get_local_intensity_scale(PixelData<float> &local_scale_temp, PixelData<flo
     void calc_abs_diff(const PixelData<T> &input_image, PixelData<T> &var);
 
     template<typename T>
-    void calc_sat_mean_z(PixelData<T> &input, const size_t offset);
+    void calc_sat_mean_z(PixelData<T> &input, const size_t offset, bool boundaryReflect = false);
 
     template<typename T>
-    void calc_sat_mean_x(PixelData<T> &input, const size_t offset);
+    void calc_sat_mean_x(PixelData<T> &input, const size_t offset, bool boundaryReflect = false);
 
     template<typename T>
-    void calc_sat_mean_y(PixelData<T> &input, const size_t offset);
+    void calc_sat_mean_y(PixelData<T> &input, const size_t offset, bool boundaryReflect = false);
 
     void get_window(float &var_rescale, std::vector<int> &var_win, const APRParameters &par);
 
@@ -302,195 +304,337 @@ inline void LocalIntensityScale::get_window_alt(float& var_rescale, std::vector<
     }
 }
 
-/**
- * Calculates a O(1) recursive mean using SAT.
- * @tparam T
- * @param input
- * @param offset
- */
 template<typename T>
-inline void LocalIntensityScale::calc_sat_mean_y(PixelData<T>& input, const size_t offset){
+inline void LocalIntensityScale::calc_sat_mean_y(PixelData<T>& input, const size_t offset, bool boundaryReflect) {
     const size_t z_num = input.z_num;
     const size_t x_num = input.x_num;
     const size_t y_num = input.y_num;
 
-    std::vector<T> temp_vec(y_num);
-    float divisor = 2 * offset + 1;
+    const size_t divisor = offset + 1 + offset;
+
+    auto &mesh = input.mesh;
+    const size_t dimLen = y_num;
 
     #ifdef HAVE_OPENMP
-	#pragma omp parallel for default(shared) firstprivate(temp_vec)
+    #pragma omp parallel for default(shared)
     #endif
-    for(size_t j = 0; j < z_num; ++j) {
-        for(size_t i = 0; i < x_num; ++i){
-            size_t index = j * x_num*y_num + i * y_num;
-
-            //first pass over and calculate cumsum
-            float temp = 0;
-            for (size_t k = 0; k < y_num; ++k) {
-                temp += input.mesh[index + k];
-                temp_vec[k] = temp;
+    for (size_t j = 0; j < z_num; ++j) {
+        for (size_t i = 0; i < x_num; ++i) {
+            size_t index = j * x_num * y_num + i * y_num;
+
+            size_t count = 0;
+            size_t currElementOffset = 0;
+            size_t nextElementOffset = 1;
+            size_t saveElementOffset = 0;
+
+            std::vector<T> circularBuffer(divisor, 0);
+            T sum = 0;
+
+            while (count <= offset) {
+                auto v = mesh[index + currElementOffset];
+                sum += v;
+                circularBuffer[count] = v;
+                if (boundaryReflect && count > 0) { circularBuffer[2 * offset - count + 1] = v; sum += v;}
+
+                currElementOffset += nextElementOffset;
+                count++;
             }
 
-            //handling boundary conditions (LHS)
-            for (size_t k = 0; k <= offset; ++k) {
-                input.mesh[index + k] = 0;
-            }
+            if (boundaryReflect) count += offset;
 
-            //second pass calculate mean
-            for (size_t k = offset + 1; k < y_num; ++k) {
-                input.mesh[index + k] = -temp_vec[k - offset - 1]/divisor;
-            }
+            int beginPtr = (offset + 1) % divisor;
 
-            //second pass calculate mean
-            for (size_t k = 0; k < (y_num-offset); ++k) {
-                input.mesh[index + k] += temp_vec[k + offset]/divisor;
-            }
+            const int lastElement = dimLen - 1 - offset;
+            for (int i = 0; i <= lastElement; ++i) {
+                mesh[index + saveElementOffset] = sum / count;
+                saveElementOffset += nextElementOffset;
+
+                if (i == lastElement) break;
+
+                auto v = mesh[index + currElementOffset];
 
-            float counter = 0;
-            //handling boundary conditions (RHS)
-            for (size_t k = (y_num - offset); k < (y_num); ++k) {
-                counter++;
-                input.mesh[index + k]*= divisor;
-                input.mesh[index + k]+= temp_vec[y_num-1];
-                input.mesh[index + k]*= 1.0/(divisor - counter);
+                sum -= circularBuffer[beginPtr];
+                sum += v;
+
+                circularBuffer[beginPtr] = v;
+
+                count = std::min(count + 1, divisor);
+                beginPtr = (beginPtr + 1) % divisor;
+                currElementOffset += nextElementOffset;
             }
 
-            //handling boundary conditions (LHS), need to rehandle the boundary
-            for (size_t k = 1; k <= offset; ++k) {
-                input.mesh[index + k] *= divisor/(k + offset + 1.0);
+            int boundaryPtr = (beginPtr - 1 - 1 + divisor) % divisor;
+            while(saveElementOffset < currElementOffset) {
+                // If filter length is too big in comparison to processed dimension
+                // do not decrease 'count' since 'sum' of filter elements contains all elements from
+                // processed dimension:
+                // dim elements:        xxxxxx
+                // filter elements:   oooooo^ooooo   (o - offset elements, ^ - middle of the filter
+                bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset) / nextElementOffset > offset;
+
+                if (removeElementFromFilter) {
+                    if (!boundaryReflect) count = count - 1;
+                }
+                if (removeElementFromFilter || boundaryReflect) {
+                    sum -= circularBuffer[beginPtr];
+                }
+                if (boundaryReflect) {
+                    sum += circularBuffer[boundaryPtr];
+                }
+
+                mesh[index + saveElementOffset] = sum / count;
+
+                boundaryPtr = (boundaryPtr - 1 + divisor) % divisor;
+                beginPtr = (beginPtr + 1) % divisor;
+                saveElementOffset += nextElementOffset;
             }
 
-            //end point boundary condition
-            input.mesh[index] *= divisor/(offset + 1.0);
         }
     }
 }
 
 template<typename T>
-inline void LocalIntensityScale::calc_sat_mean_x(PixelData<T>& input, const size_t offset) {
+inline void LocalIntensityScale::calc_sat_mean_x(PixelData<T>& input, const size_t offset, bool boundaryReflect) {
+
     const size_t z_num = input.z_num;
     const size_t x_num = input.x_num;
     const size_t y_num = input.y_num;
 
-    std::vector<T> temp_vec(y_num*(2*offset + 1),0);
+    const size_t divisor = offset + 1 + offset;
+    std::vector<T> circularBuffer(y_num * divisor, 0);
+    std::vector<T> sum(y_num, 0);
 
-    #ifdef HAVE_OPENMP
-	#pragma omp parallel for default(shared) firstprivate(temp_vec)
-    #endif
-    for(size_t j = 0; j < z_num; j++) {
+    auto &mesh = input.mesh;
+    const size_t dimLen = x_num;
+
+    if (dimLen < offset) {
+        throw std::runtime_error("offset cannot be bigger than processed dimension length!");
+    }
+
+#ifdef HAVE_OPENMP
+#pragma omp parallel for default(shared) firstprivate(circularBuffer, sum)
+#endif
+    for (size_t j = 0; j < z_num; j++) {
         size_t jxnumynum = j * x_num * y_num;
 
-        for(size_t k = 0; k < y_num ; k++){
-            temp_vec[k] = input.mesh[jxnumynum + k];
-        }
+        size_t count = 0; // counts number of active elements in filter
+        size_t currElementOffset = 0; // offset of element in processed dimension
+        size_t nextElementOffset = 1;
+        size_t saveElementOffset = 0; // offset used to finish RHS boundary
+
+        // Clear buffers so they can be reused in next 'z_num' loop
+        std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop
+        std::fill(circularBuffer.begin(), circularBuffer.end(), 0);
 
-        for(size_t i = 1; i < 2 * offset + 1; i++) {
-            for(size_t k = 0; k < y_num; k++) {
-                temp_vec[i*y_num + k] = input.mesh[jxnumynum + i*y_num + k] + temp_vec[(i-1)*y_num + k];
+        // saturate circular buffer with #offset elements since it will allow to calculate first element value on LHS
+        while (count <= offset) {
+            for (size_t k = 0; k < y_num; ++k) {
+                auto v = mesh[jxnumynum + currElementOffset * y_num + k];
+                sum[k] += v;
+                circularBuffer[count * y_num + k] = v;
+                if (boundaryReflect && count > 0) { circularBuffer[(2 * offset - count + 1) * y_num + k] = v; sum[k] += v;}
             }
+
+            currElementOffset += nextElementOffset;
+            ++count;
         }
 
-        // LHS boundary
-        for(size_t i = 0; i < offset + 1; i++){
-            for(size_t k = 0; k < y_num; k++) {
-                input.mesh[jxnumynum + i * y_num + k] = (temp_vec[(i + offset) * y_num + k]) / (i + offset + 1);
-            }
+        if (boundaryReflect) {
+            count += offset; // elements in above loop in range [1, offset] were summed twice
         }
 
-        // middle
-        size_t current_index = offset + 1;
-        size_t index_modulo = 0;
-        for(size_t i = offset + 1; i < x_num - offset; i++){
-            // the current cumsum
-            index_modulo = (current_index + offset) % (2*offset + 1); // current_index - offset - 1
-            size_t previous_modulo = (current_index + offset - 1) % (2*offset + 1); // the index of previous cumsum
-
-            for(size_t k = 0; k < y_num; k++) {
-                float temp = input.mesh[jxnumynum + (i + offset)*y_num + k] + temp_vec[previous_modulo*y_num + k];
-                input.mesh[jxnumynum + i*y_num + k] = (temp - temp_vec[index_modulo*y_num + k]) /
-                                                      (2*offset + 1);
-                temp_vec[index_modulo*y_num + k] = temp;
+        // Pointer in circular buffer
+        int beginPtr = (offset + 1) % divisor;
+
+        // main loop going through all elements in range [0, x_num - 1 - offset], so till last element that
+        // does not need handling RHS for offset '^'
+        // x x x x ... x x x x x x x
+        //                 o o ^ o o
+        //
+        const size_t lastElement = x_num - 1 - offset;
+        for (size_t x = 0; x <= lastElement; ++x) {
+            // Calculate and save currently processed element and move to the new one
+            for (size_t k = 0; k < y_num; ++k) {
+                mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count;
+            }
+            saveElementOffset += nextElementOffset;
+
+            // There is no more elements to process in that loop, all stuff left to be processed is already in 'circularBuffer' buffer
+            if (x == lastElement) break;
+
+            for (size_t k = 0; k < y_num; ++k) {
+                // Read new element
+                T v = mesh[jxnumynum + currElementOffset * y_num + k];
+
+                // Update sum to cover [-offset, offset] of currently processed element
+                sum[k] -= circularBuffer[beginPtr * y_num + k];
+                sum[k] += v;
+
+                // Store new element in circularBuffer
+                circularBuffer[beginPtr * y_num + k] = v;
             }
 
-            current_index = (current_index + 1) % (2*offset + 1);
+            // Move to next elements to read and in circular buffer
+            count  = std::min(count + 1, divisor);
+            beginPtr = (beginPtr + 1) % divisor;
+            currElementOffset += nextElementOffset;
         }
 
-        // RHS boundary
-        current_index = (current_index + offset) % (2*offset + 1);
-        for(size_t i = x_num - offset; i < x_num; i++){
-            for(size_t k = 0; k < y_num; k++){
-                input.mesh[jxnumynum + i*y_num + k] = (temp_vec[index_modulo*y_num + k] -
-                                                       temp_vec[current_index*y_num + k]) / (x_num - i + offset);
+        // boundaryPtr is used only in boundaryReflect mode, adding divisor makes it always non-negative value
+        int boundaryPtr = (beginPtr - 1 - 1 + divisor) % divisor;
+
+        // Handle last #offset elements on RHS
+        while(saveElementOffset < currElementOffset) {
+            // If filter length is too big in comparison to processed dimension
+            // do not decrease 'count' since 'sum' of filter elements contains all elements from
+            // processed dimension:
+            // dim elements:        xxxxxx
+            // filter elements:   oooooo^ooooo   (o - offset elements, ^ - middle of the filter)
+            bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset)/nextElementOffset > offset;
+
+            if (removeElementFromFilter) {
+                if (!boundaryReflect) count = count - 1;
+            }
+
+            for (size_t k = 0; k < y_num; ++k) {
+                if (removeElementFromFilter || boundaryReflect) {
+                    sum[k] -= circularBuffer[beginPtr * y_num + k];
+                }
+
+                if (boundaryReflect) {
+                    sum[k] += circularBuffer[boundaryPtr * y_num + k];
+                }
+
+                mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count;
             }
-            current_index = (current_index + 1) % (2*offset + 1);
+
+            boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor;
+            beginPtr = (beginPtr + 1) % divisor;
+            saveElementOffset += nextElementOffset;
         }
     }
 }
 
 template<typename T>
-inline void LocalIntensityScale::calc_sat_mean_z(PixelData<T>& input,const size_t offset) {
+inline void LocalIntensityScale::calc_sat_mean_z(PixelData<T>& input, const size_t offset, bool boundaryReflect) {
+
     const size_t z_num = input.z_num;
     const size_t x_num = input.x_num;
     const size_t y_num = input.y_num;
 
-    std::vector<T> temp_vec(y_num*(2*offset + 1),0);
-    size_t xnumynum = x_num * y_num;
+    const size_t divisor = offset + 1 + offset;
+    std::vector<T> circularBuffer(y_num * divisor, 0);
+    std::vector<T> sum(y_num, 0);
 
-    #ifdef HAVE_OPENMP
-	#pragma omp parallel for default(shared) firstprivate(temp_vec)
-    #endif
-    for(size_t i = 0; i < x_num; i++) {
+    auto &mesh = input.mesh;
+    size_t dimLen = z_num;
 
-        size_t iynum = i * y_num;
+    if (dimLen < offset) {
+        throw std::runtime_error("offset cannot be bigger than processed dimension length!");
+    }
 
-        //prefetching
-        for(size_t k = 0; k < y_num ; k++){
-            temp_vec[k] = input.mesh[iynum + k];
-        }
+#ifdef HAVE_OPENMP
+#pragma omp parallel for default(shared) firstprivate(circularBuffer, sum)
+#endif
+    for (size_t j = 0; j < x_num; j++) {
+        size_t jxnumynum = j * y_num;
+
+        size_t count = 0; // counts number of active elements in filter
+        size_t currElementOffset = 0; // offset of element in processed dimension
+        size_t nextElementOffset = x_num;
+        size_t saveElementOffset = 0; // offset used to finish RHS boundary
+
+        // Clear buffers so they can be reused in next 'x_num' loop
+        std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop
+        std::fill(circularBuffer.begin(), circularBuffer.end(), 0);
 
-        for(size_t j = 1; j < 2 * offset + 1; j++) {
-            for(size_t k = 0; k < y_num; k++) {
-                temp_vec[j*y_num + k] = input.mesh[j * xnumynum + iynum + k] + temp_vec[(j-1)*y_num + k];
+        // saturate circular buffer with #offset elements since it will allow to calculate first element value on LHS
+        while(count <= offset) {
+            for (size_t k = 0; k < y_num; ++k) {
+                auto v = mesh[jxnumynum + currElementOffset * y_num + k];
+                sum[k] += v;
+                circularBuffer[count * y_num + k] = v;
+                if (boundaryReflect && count > 0) { circularBuffer[(2 * offset - count + 1) * y_num + k] = v; sum[k] += v;}
             }
+
+            currElementOffset += nextElementOffset;
+            ++count;
         }
 
-        // LHS boundary
-        for(size_t j = 0; j < offset + 1; j++){
-            for(size_t k = 0; k < y_num; k++) {
-                input.mesh[j * xnumynum + iynum + k] = (temp_vec[(j + offset)*y_num + k]) / (j + offset + 1);
-            }
+        if (boundaryReflect) {
+            count += offset; // elements in above loop in range [1, offset] were summed twice
         }
 
-        // middle
-        size_t current_index = offset + 1;
-        size_t index_modulo = 0;
-        for(size_t j = offset + 1; j < z_num - offset; j++){
+        // Pointer in circular buffer
+        int beginPtr = (offset + 1) % divisor;
+
+        // main loop going through all elements in range [0, z_num - 1 - offset], so till last element that
+        // does not need handling RHS for offset '^'
+        // x x x x ... x x x x x x x
+        //                 o o ^ o o
+        //
+        const size_t lastElement = z_num - 1 - offset;
+        for (size_t z = 0; z <= lastElement; ++z) {
+            // Calculate and save currently processed element and move to the new one
+            for (size_t k = 0; k < y_num; ++k) {
+                mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count;
+            }
+            saveElementOffset += nextElementOffset;
+
+            // There is no more elements to process in that loop, all stuff left to be processed is already in 'circularBuffer' buffer
+            if (z == lastElement) break;
+
+            for (size_t k = 0; k < y_num; ++k) {
+                // Read new element
+                T v = mesh[jxnumynum + currElementOffset * y_num + k];
 
-            index_modulo = (current_index + offset) % (2*offset + 1); // current_index - offset - 1
-            size_t previous_modulo = (current_index + offset - 1) % (2*offset + 1); // the index of previous cumsum
+                // Update sum to cover [-offset, offset] of currently processed element
+                sum[k] -= circularBuffer[beginPtr * y_num + k];
+                sum[k] += v;
 
-            for(size_t k = 0; k < y_num; k++) {
-                // the current cumsum
-                float temp = input.mesh[(j + offset) * xnumynum + iynum + k] + temp_vec[previous_modulo*y_num + k];
-                input.mesh[j * xnumynum + iynum + k] = (temp - temp_vec[index_modulo*y_num + k]) /
-                                                       (2*offset + 1);
-                temp_vec[index_modulo*y_num + k] = temp;
+                // Save new element
+                circularBuffer[beginPtr * y_num + k] = v;
             }
 
-            current_index = (current_index + 1) % (2*offset + 1);
+            // Move to next elements to read and in circular buffer
+            count  = std::min(count + 1, divisor);
+            beginPtr = (beginPtr + 1) % divisor;
+            currElementOffset += nextElementOffset;
         }
 
-        // RHS boundary
-        current_index = (current_index + offset) % (2*offset + 1);
-        for(size_t j = z_num - offset; j < z_num; j++){
-            for(size_t k = 0; k < y_num; k++){
-                input.mesh[j * xnumynum + iynum + k] = (temp_vec[index_modulo*y_num + k] -
-                                                        temp_vec[current_index*y_num + k]) / (z_num - j + offset);
+        // boundaryPtr is used only in boundaryReflect mode, adding divisor makes it always non-negative value
+        int boundaryPtr = (beginPtr - 1 - 1 + divisor) % divisor;
+
+        // Handle last #offset elements on RHS
+        while(saveElementOffset < currElementOffset) {
+            // If filter length is too big in comparison to processed dimension
+            // do not decrease 'count' since 'sum' of filter elements contains all elements from
+            // processed dimension:
+            // dim elements:        xxxxxx
+            // filter elements:   oooooo^ooooo   (o - offset elements, ^ - middle of the filter)
+            bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset)/nextElementOffset > offset;
+
+            if (removeElementFromFilter) {
+                if (!boundaryReflect) count = count - 1;
+            }
+
+            for (size_t k = 0; k < y_num; ++k) {
+                if (removeElementFromFilter || boundaryReflect) {
+                    sum[k] -= circularBuffer[beginPtr * y_num + k];
+                }
+
+                if (boundaryReflect) {
+                    sum[k] += circularBuffer[boundaryPtr * y_num + k];
+                }
+
+                mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count;
             }
 
-            current_index = (current_index + 1) % (2*offset + 1);
+            boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor;
+            beginPtr = (beginPtr + 1) % divisor;
+            saveElementOffset += nextElementOffset;
         }
     }
 }
 
-#endif //PARTPLAY_LOCAL_INTENSITY_SCALE_HPP
+#endif
diff --git a/src/algorithm/LocalIntensityScaleCuda.h b/src/algorithm/LocalIntensityScaleCuda.h
index a635a156..f572d5e5 100644
--- a/src/algorithm/LocalIntensityScaleCuda.h
+++ b/src/algorithm/LocalIntensityScaleCuda.h
@@ -16,7 +16,7 @@ constexpr TypeOfMeanFlags MEAN_Z_DIR = 0x04;
 constexpr TypeOfMeanFlags MEAN_ALL_DIR = MEAN_Y_DIR | MEAN_X_DIR | MEAN_Z_DIR;
 
 template <typename T>
-void calcMean(PixelData<T> &image, int offset, TypeOfMeanFlags flags = MEAN_ALL_DIR);
+void calcMean(PixelData<T> &image, int offset, TypeOfMeanFlags flags = MEAN_ALL_DIR, bool boundaryReflect = false);
 
 template <typename T>
 void getLocalIntensityScale(PixelData<T> &image, PixelData<T> &temp, const APRParameters &par);
diff --git a/src/algorithm/LocalParticleCellSet.hpp b/src/algorithm/LocalParticleCellSet.hpp
index 7935076b..f834805a 100644
--- a/src/algorithm/LocalParticleCellSet.hpp
+++ b/src/algorithm/LocalParticleCellSet.hpp
@@ -49,6 +49,10 @@ inline int __builtin_clz(unsigned int x)
 
 #endif
 
+#include "algorithm/PullingScheme.hpp"
+#include "algorithm/PullingSchemeSparse.hpp"
+#include "io/TiffUtils.hpp"
+
 class LocalParticleCellSet {
 
 public:
diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu
index f568212b..80765bca 100644
--- a/src/algorithm/OVPC.cu
+++ b/src/algorithm/OVPC.cu
@@ -1,44 +1,34 @@
 #include "PullingSchemeCuda.hpp"
 
 #include <cuda_runtime.h>
-#include <device_launch_parameters.h>
-//#include <device_functions.h>
-#include <cuda_runtime_api.h>
 
 #include "misc/CudaTools.cuh"
 #include "data_structures/Mesh/downsample.cuh"
+#include "algorithm/OVPC.h"
+#include "algorithm/ParticleCellTreeCuda.cuh"
 
-namespace {
-    using ElementType = uint8_t;
-    static constexpr int BIT_SHIFT = 6;
-    static constexpr ElementType OVPC_SEED = 1;
-    static constexpr ElementType OVPC_BOUNDARY = 2;
-    static constexpr ElementType OVPC_FILLER = 3;
-
-    static constexpr ElementType  SEED_MASK = OVPC_SEED << BIT_SHIFT;
-    static constexpr ElementType  BOUNDARY_MASK = OVPC_BOUNDARY << BIT_SHIFT;
-    static constexpr ElementType  FILLER_MASK = OVPC_FILLER << BIT_SHIFT;
-    static constexpr ElementType  MASK = 0x03 << BIT_SHIFT;
-}
 
 template <typename T, typename S>
-__global__ void copy1D(const T *input, S *output, size_t length) {
+__global__ void copyAndClampLevels(const T *input, S *output, size_t length, int levelMin, int levelMax) {
     size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
     if (idx < length) {
-        output[idx] = input[idx];
+        T v = input[idx];
+        if (v > levelMax) v = levelMax;
+        if (v < levelMin) v = levelMin;
+        output[idx] = v;
     }
 }
 
 template <typename T, typename S>
-void runCopy1D(T *inputData, S *outputData, size_t lenght, cudaStream_t aStream) {
+void runCopyAndClampLevels(T *inputData, S *outputData, size_t lenght, int levelMin, int levelMax, cudaStream_t aStream) {
     dim3 threadsPerBlock(128);
     dim3 numBlocks((lenght + threadsPerBlock.x - 1)/threadsPerBlock.x);
-    copy1D<<<numBlocks,threadsPerBlock, 0, aStream>>>(inputData, outputData, lenght);
+    copyAndClampLevels<<<numBlocks,threadsPerBlock, 0, aStream>>>(inputData, outputData, lenght, levelMin, levelMax);
 };
 
 
 template <typename T>
-__global__ void oneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int level) {
+__global__ void firstStep(T *data, size_t xLen, size_t yLen, size_t zLen, int level) {
     const int xi = (blockIdx.x * blockDim.x) + threadIdx.x;
     const int yi = (blockIdx.y * blockDim.y) + threadIdx.y;
     const int zi = (blockIdx.z * blockDim.z) + threadIdx.z;
@@ -51,39 +41,38 @@ __global__ void oneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int lev
     int zmin = zi > 0 ? zi - 1 : 0;
     int zmax = zi < zLen - 1 ? zi + 1 : zLen - 1;
 
-    bool ok = true;
-    bool neig = false;
+    bool hasNeighHigherLevel = false;
+    bool hasNeighSameLevel = false;
     for (int z = zmin; z <= zmax; ++z) {
         for (int x = xmin; x <= xmax; ++x) {
             for (int y = ymin; y <= ymax; ++y) {
                 const size_t idx = z * xLen * yLen + x * yLen + y;
-                T currentLevel = ~MASK & data[idx];
-                if (currentLevel > level) { ok = false; break; }
-                else if (currentLevel == level) neig = true;
+                T currentLevel = ~OVPC::MASK & data[idx];
+                if (currentLevel > level) { hasNeighHigherLevel = true; break; }
+                else if (currentLevel == level) hasNeighSameLevel = true;
             }
         }
     }
-    if (ok) {
+    if (!hasNeighHigherLevel) {
         const size_t idx = zi * xLen * yLen + xi * yLen + yi;
         T status = data[idx];
-        if (status == level) data[idx] |= SEED_MASK;
-        else if (neig) data[idx] |= BOUNDARY_MASK;
-        else data[idx] |= FILLER_MASK;
+        if (status == level) data[idx] |= OVPC::SEED;
+        else if (hasNeighSameLevel) data[idx] |= OVPC::BOUNDARY;
+        else data[idx] |= OVPC::FILLER;
     }
 }
 
 template <typename T>
-void runOneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int level, cudaStream_t aStream) {
+void runFirstStep(T *data, size_t xLen, size_t yLen, size_t zLen, int level, cudaStream_t aStream) {
     dim3 threadsPerBlock(1, 128, 1);
     dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x,
                    (yLen + threadsPerBlock.y - 1) / threadsPerBlock.y,
                    (zLen + threadsPerBlock.z - 1) / threadsPerBlock.z);
-//    dim3 numBlocks((xLen * yLen * zLen + threadsPerBlock.x - 1)/threadsPerBlock.x);
-    oneLevel<<<numBlocks,threadsPerBlock, 0, aStream>>>(data, xLen, yLen, zLen, level);
+    firstStep<<<numBlocks,threadsPerBlock, 0, aStream>>>(data, xLen, yLen, zLen, level);
 };
 
 template <typename T>
-__global__ void secondPhase(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMax) {
+__global__ void secondStep(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMin) {
     const int xi = (blockIdx.x * blockDim.x) + threadIdx.x;
     const int yi = (blockIdx.y * blockDim.y) + threadIdx.y;
     const int zi = (blockIdx.z * blockDim.z) + threadIdx.z;
@@ -103,73 +92,94 @@ __global__ void secondPhase(T *data, T *child, size_t xLen, size_t yLen, size_t
         for (int x = xmin; x <= xmax; ++x) {
             for (int y = ymin; y <= ymax; ++y) {
                 size_t children_index = z * xLenc * yLenc + x * yLenc + y;
-                child[children_index] = status >= (OVPC_SEED << BIT_SHIFT) ? 0 : child[children_index] >> BIT_SHIFT;
+                child[children_index] = status >= (OVPC::OVPC_SEED << OVPC::BIT_SHIFT) ? 0 : child[children_index] >> OVPC::BIT_SHIFT;
             }
         }
     }
-    if (isLevelMax) data[zi * xLen * yLen + xi * yLen + yi] = status >> BIT_SHIFT;
+    if (isLevelMin) data[zi * xLen * yLen + xi * yLen + yi] = status >> OVPC::BIT_SHIFT;
 }
 
 template <typename T>
-void runSecondPhase(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMax, cudaStream_t aStream) {
+void runSecondStep(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMax, cudaStream_t aStream) {
     dim3 threadsPerBlock(1, 128, 1);
     dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x,
                    (yLen + threadsPerBlock.y - 1) / threadsPerBlock.y,
                    (zLen + threadsPerBlock.z - 1) / threadsPerBlock.z);
-    secondPhase<<<numBlocks,threadsPerBlock, 0, aStream>>>(data, child, xLen, yLen, zLen, xLenc, yLenc, zLenc, isLevelMax);
+    secondStep<<<numBlocks,threadsPerBlock, 0, aStream>>>(data, child, xLen, yLen, zLen, xLenc, yLenc, zLenc, isLevelMax);
 };
 
+
 // explicit instantiation of handled types
-template void computeOVPC(const PixelData<float>&, PixelData<TreeElementType>&, int, int);
+template std::vector<PixelData<uint8_t>> computeOvpcCuda(const PixelData<float>&, const GenInfo&);
+template std::vector<PixelData<uint8_t>> computeOvpcCuda(const PixelData<int>&, const GenInfo&);
+
+/**
+ * CUDA implementation of Pullin Scheme (OVPC - Optimal Valid Particle Cell set).
+ * @tparam T - type of input levels
+ * @param input - input levels computed in earlier stages
+ * @param gi - GenInfo for given APR
+ *
+ * @return - PCT for CPU (copied from GPU)
+ */
+template <typename T>
+std::vector<PixelData<uint8_t>> computeOvpcCuda(const PixelData<T> &input, const GenInfo &gi) {
+    // Copy input to CUDA mem and prepare CUDA representation of particle cell tree which will be filled after computing
+    // all steps
 
-template <typename T, typename S>
-void computeOVPC(const PixelData<T> &input, PixelData<S> &output, int levelMin, int levelMax) {
-    ScopedCudaMemHandler<const PixelData<T>, H2D> in(input);
-    ScopedCudaMemHandler<PixelData<S>, D2H> mem(output);
-
-    // TODO: This is not needed later - just for having clear debug
-    //cudaMemset(mem.get(), 0, mem.getNumOfBytes());
-
-    // =============== Create pyramid
-    std::vector<S*> levels(levelMax + 1, nullptr);
-    std::vector<size_t> xSize(levelMax + 1);
-    std::vector<size_t> ySize(levelMax + 1);
-    std::vector<size_t> zSize(levelMax + 1);
-
-    int xDS = input.x_num;
-    int yDS = input.y_num;
-    int zDS = input.z_num;
-
-    size_t offset = 0;
-    for (int l = levelMax; l >= levelMin; --l) {
-        levels[l] = reinterpret_cast<TreeElementType *>(mem.get()) + offset;
-        xSize[l] = xDS;
-        ySize[l] = yDS;
-        zSize[l] = zDS;
-
-        offset += xDS * yDS * zDS * sizeof(TreeElementType);
-        // round up to 16-bytes
-        const size_t alignemet = 16;
-        offset = ((offset + alignemet - 1) / alignemet ) * alignemet;
-
-        xDS = ceil(xDS/2.0);
-        yDS = ceil(yDS/2.0);
-        zDS = ceil(zDS/2.0);
-    }
+    cudaStream_t stream = nullptr;
+
+    ScopedCudaMemHandler<const PixelData<T>, H2D> in(input, stream);
+
+    ParticleCellTreeCuda pct(gi, stream);
+    int levelMin = gi.l_min;
+    int levelMax = gi.l_max - 1;
 
-    runCopy1D(in.get(), levels[levelMax], in.getSize(), 0);
 
+    // feel the highes level of PCT with provided levels and clamp values to be within [levelMin, levelMax] range
+    runCopyAndClampLevels(in.get(), pct[levelMax], in.getSize(), levelMin, levelMax, stream);
+
+    // Downsample with max reduction to levelMin to fill rest of the tree
+    for (int l = levelMax - 1; l >= levelMin; --l) {
+        runDownsampleMax(pct[l + 1], pct[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], stream);
+    }
+
+    // ================== Phase 1 - top to down
+    for (int l = levelMin; l <= levelMax; ++l) {
+        runFirstStep(pct[l], gi.x_num[l], gi.y_num[l], gi.z_num[l], l, stream);
+    }
+    // ================== Phase 1 - down to top
     for (int l = levelMax - 1; l >= levelMin; --l) {
-        runDownsampleMax(levels[l + 1], levels[l], xSize[l + 1], ySize[l + 1], zSize[l + 1], 0);
+        runSecondStep(pct[l], pct[l+1], gi.x_num[l], gi.y_num[l], gi.z_num[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], l == levelMin, stream);
     }
 
+    return pct.getPCTcpu();
+}
+
+// explicit instantiation of handled types
+template void computeOvpcCuda(float *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream);
+template void computeOvpcCuda(int *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream);
+
+
+template <typename ImgType>
+void computeOvpcCuda(ImgType *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream) {
+    int levelMin = gi.l_min;
+    int levelMax = gi.l_max - 1;
+
+
+    // feel the highes level of PCT with provided levels and clamp values to be within [levelMin, levelMax] range
+    runCopyAndClampLevels(in, pct[levelMax], gi.y_num[levelMax]*gi.x_num[levelMax]*gi.z_num[levelMax], levelMin, levelMax, stream);
+
+    // Downsample with max reduction to levelMin to fill rest of the tree
+    for (int l = levelMax - 1; l >= levelMin; --l) {
+        runDownsampleMax(pct[l + 1], pct[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], stream);
+    }
 
     // ================== Phase 1 - top to down
     for (int l = levelMin; l <= levelMax; ++l) {
-        runOneLevel(levels[l], xSize[l], ySize[l], zSize[l], l, 0);
+        runFirstStep(pct[l], gi.x_num[l], gi.y_num[l], gi.z_num[l], l, stream);
     }
     // ================== Phase 1 - down to top
     for (int l = levelMax - 1; l >= levelMin; --l) {
-        runSecondPhase(levels[l], levels[l+1], xSize[l], ySize[l], zSize[l], xSize[l+1], ySize[l+1], zSize[l+1], l == levelMin, 0);
+        runSecondStep(pct[l], pct[l+1], gi.x_num[l], gi.y_num[l], gi.z_num[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], l == levelMin, stream);
     }
-};
+}
\ No newline at end of file
diff --git a/src/algorithm/OVPC.h b/src/algorithm/OVPC.h
index f8e975ac..f55bfee3 100644
--- a/src/algorithm/OVPC.h
+++ b/src/algorithm/OVPC.h
@@ -9,11 +9,13 @@
 
 #include <vector>
 #include "data_structures/Mesh/PixelData.hpp"
-#include "data_structures/APR/APRAccess.hpp"
+#include "data_structures/APR/GenInfo.hpp"
 #include "algorithm/PullingScheme.hpp"
 
 
 class OVPC {
+
+public:
     // Element big enouth to keep all the levels + 2 highest bits for type
     // for uint8_t we have [ 2 bit - type(empty, seed, boundary, filler) |  6 bit - level(0-63) ]
     using ElementType = uint8_t;
@@ -31,9 +33,8 @@ class OVPC {
     int iLevelMin;
     std::vector<PixelData<ElementType>> iParticleCellTree;
 
-public:
     template <typename T>
-    OVPC(const APRAccess &aAprAccess, const PixelData<T> &aInputLevels) {
+    OVPC(const GenInfo &aAprAccess, const PixelData<T> &aInputLevels) {
         // Level Max is one less since we are working on downsampled version
         iLevelMax = aAprAccess.l_max - 1;
         iLevelMin = aAprAccess.l_min;
@@ -43,8 +44,8 @@ class OVPC {
         iParticleCellTree[iLevelMax].init(aInputLevels.y_num, aInputLevels.x_num, aInputLevels.z_num);
         fillLevel(iLevelMax, aInputLevels);
 
-        // Downsample with max reduction to levelMin to fill the rest of the tree
-        for(int level = iLevelMax - 1; level >= iLevelMin; --level) {
+        // Downsample with max reduction to levelMin to fill rest of the tree
+        for (int level = iLevelMax - 1; level >= iLevelMin; --level) {
             downsample(iParticleCellTree[level + 1], iParticleCellTree[level],
                        [](const float &x, const float &y) -> float { return std::max(x, y); },
                        [](const float &x) -> float { return x; }, true);
diff --git a/src/algorithm/ParticleCellTreeCuda.cuh b/src/algorithm/ParticleCellTreeCuda.cuh
new file mode 100644
index 00000000..d3bc6160
--- /dev/null
+++ b/src/algorithm/ParticleCellTreeCuda.cuh
@@ -0,0 +1,77 @@
+#ifndef PARTICLE_CELL_TREE_CUDA_CUH
+#define PARTICLE_CELL_TREE_CUDA_CUH
+
+
+#include "data_structures/APR/GenInfo.hpp"
+#include "algorithm/PullingScheme.hpp"
+#include "misc/CudaTools.cuh"
+
+/*
+ * CUDA representation of PCT (Particle Cell Tree)
+ * Allocates memory and initialize it to EMPTY
+ *
+ * Allows acces to each level via subscription operator:
+ * ParticleCellTreeCuda pct(aprInfo);
+ * pct[level]
+ *
+ * getPCTcpu and uploadPCT2GPU handle interaction with CPU code (mainly for test/debug purposes).
+ */
+class ParticleCellTreeCuda {
+    ScopedCudaMemHandler<uint8_t*, JUST_ALLOC> mem;
+    std::vector<size_t> startOffsets;
+    GenInfo gi;
+    size_t numOfElements = 0;
+    cudaStream_t stream = nullptr;
+
+public:
+
+    ParticleCellTreeCuda(const GenInfo &aprInfo, const cudaStream_t aStream) : gi(aprInfo), stream(aStream) {
+        // Calculate size of needed memory for PCT and offsets for particular levels
+        int l_max = aprInfo.l_max - 1;
+        int l_min = aprInfo.l_min;
+
+        startOffsets.resize(l_max + 1, 0);
+
+        for (int l = l_min; l <= l_max; ++l) {
+            auto yLen = ceil(aprInfo.org_dims[0] / PullingScheme::powr(2.0, l_max - l + 1));
+            auto xLen = ceil(aprInfo.org_dims[1] / PullingScheme::powr(2.0, l_max - l + 1));
+            auto zLen = ceil(aprInfo.org_dims[2] / PullingScheme::powr(2.0, l_max - l + 1));
+            size_t levelSize = yLen * xLen * zLen;
+            startOffsets[l] = numOfElements;
+            numOfElements += levelSize;
+        }
+
+        // Initialize memory, it is not binded to any CPU memory so we provide nullptr
+        mem.initialize(nullptr, numOfElements, stream);
+        cudaMemsetAsync(mem.get(), EMPTY, numOfElements, stream);
+    }
+
+    inline uint8_t* operator[](size_t level) { return mem.get() + startOffsets[level]; }
+
+    auto getPCTcpu() {
+        std::vector<PixelData<uint8_t>> pct = PullingScheme::generateParticleCellTree(gi);
+        for (int i = gi.l_min; i < gi.l_max; ++i) {
+            checkCuda(cudaMemcpyAsync(pct[i].mesh.get(), (*this)[i], pct[i].mesh.size(), cudaMemcpyDeviceToHost, stream));
+        }
+        checkCuda(cudaStreamSynchronize(stream));
+
+        return pct;
+    }
+
+    void downloadPCTfromGPU(std::vector<PixelData<uint8_t>> &pct) {
+        for (int i = gi.l_min; i < gi.l_max; ++i) {
+            checkCuda(cudaMemcpyAsync(pct[i].mesh.get(), (*this)[i], pct[i].mesh.size(), cudaMemcpyDeviceToHost, stream));
+        }
+        checkCuda(cudaStreamSynchronize(stream));
+    }
+
+    void uploadPCT2GPU(const std::vector<PixelData<uint8_t>> &pct) {
+        for (int i = gi.l_min; i < gi.l_max; ++i) {
+            checkCuda(cudaMemcpyAsync((*this)[i], pct[i].mesh.get(), pct[i].mesh.size(), cudaMemcpyHostToDevice, stream));
+        }
+        checkCuda(cudaStreamSynchronize(stream));
+    }
+};
+
+
+#endif
diff --git a/src/algorithm/PullingScheme.hpp b/src/algorithm/PullingScheme.hpp
index 58ae9ee2..05b0b723 100644
--- a/src/algorithm/PullingScheme.hpp
+++ b/src/algorithm/PullingScheme.hpp
@@ -13,14 +13,21 @@
 #include "data_structures/Mesh/ImagePatch.hpp"
 #include <vector>
 
+// Main types
 #define EMPTY 0
 #define SEED_TYPE 1
 #define BOUNDARY_TYPE 2
 #define FILLER_TYPE 3
+
+// Type used in linear/random access
+#define UPSAMPLING_SEED_TYPE 4
+
+// Types specific for this implementation of Pulling Scheme (OVPC is not using them)
 #define ASCENDANT 8
 #define PROPOGATE 15
 #define ASCENDANTNEIGHBOUR 16
 
+
 #define NEIGHBOURLOOP(jn,in,kn, boundaries) \
 for(jn = boundaries[0][0]; jn < boundaries[0][1]; jn++) \
     for(in = boundaries[1][0]; in < boundaries[1][1]; in++) \
@@ -51,13 +58,13 @@ for(jn = j * 2; jn < j * 2 + children_boundaries[0]; jn++) \
 
 class PullingScheme {
 
-    double powr(uint64_t num,uint64_t pow2){
+public:
+
+    static double powr(uint64_t num,uint64_t pow2){
         //return (uint64_t) std::round(std::pow(num,pow2));
         return std::round(pow(num,pow2));
     }
 
-
-public:
     template<typename T>
     void fill(float k, const PixelData<T> &input);
 
@@ -65,6 +72,7 @@ class PullingScheme {
     void fill_patch(float level, const PixelData<T> &input, ImagePatch& patch);
 
     void pulling_scheme_main();
+    static std::vector<PixelData<uint8_t>> generateParticleCellTree(const GenInfo &aprInfo);
     void initialize_particle_cell_tree(const GenInfo &aprInfo);
     std::vector<PixelData<uint8_t>>& getParticleCellTree() { return particle_cell_tree; }
 
@@ -86,6 +94,25 @@ class PullingScheme {
     int l_max;
 };
 
+
+inline std::vector<PixelData<uint8_t>> PullingScheme::generateParticleCellTree(const GenInfo &aprInfo) {
+    int l_max = aprInfo.l_max - 1;
+    int l_min = aprInfo.l_min;
+
+    std::vector<PixelData<uint8_t>> pct;
+    pct.resize(l_max + 1);
+
+    for (int l = l_min; l <= l_max; ++l) {
+        pct[l].initWithValue(ceil(aprInfo.org_dims[0] / PullingScheme::powr(2.0, l_max - l + 1)),
+                             ceil(aprInfo.org_dims[1] / PullingScheme::powr(2.0, l_max - l + 1)),
+                             ceil(aprInfo.org_dims[2] / PullingScheme::powr(2.0, l_max - l + 1)),
+                             EMPTY);
+
+    }
+
+    return pct;
+}
+
 /**
  * Initializes particle_cell_tree up to level (max - 1)
  */
@@ -93,14 +120,7 @@ inline void PullingScheme::initialize_particle_cell_tree(const GenInfo &aprInfo)
     l_max = aprInfo.l_max - 1;
     l_min = aprInfo.l_min;
 
-    particle_cell_tree.resize(l_max + 1);
-
-    for (int l = l_min; l <= l_max; ++l) {
-        particle_cell_tree[l].initWithValue(ceil(aprInfo.org_dims[0] / powr(2.0, l_max - l + 1)),
-                                            ceil(aprInfo.org_dims[1] / powr(2.0, l_max - l + 1)),
-                                            ceil(aprInfo.org_dims[2] / powr(2.0, l_max - l + 1)),
-                                            EMPTY);
-    }
+    particle_cell_tree = generateParticleCellTree(aprInfo);
 }
 
 /**
diff --git a/src/algorithm/PullingSchemeCuda.hpp b/src/algorithm/PullingSchemeCuda.hpp
index 79a23560..12aa81d3 100644
--- a/src/algorithm/PullingSchemeCuda.hpp
+++ b/src/algorithm/PullingSchemeCuda.hpp
@@ -7,11 +7,15 @@
 
 
 #include "data_structures/Mesh/PixelData.hpp"
+#include "data_structures/APR/GenInfo.hpp"
+#include "algorithm/ParticleCellTreeCuda.cuh"
 
 using TreeElementType = uint8_t;
 
-template <typename T, typename S>
-void computeOVPC(const PixelData<T> &input, PixelData<S> &output, int levelMin, int levelMax);
+template <typename T>
+std::vector<PixelData<uint8_t>> computeOvpcCuda(const PixelData<T> &input, const GenInfo &gi);
 
+template <typename ImgType>
+void computeOvpcCuda(ImgType *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream);
 
 #endif //LIBAPR_PULLINGSCHEMECUDA_HPP
diff --git a/src/algorithm/bsplineParams.h b/src/algorithm/bsplineParams.h
new file mode 100644
index 00000000..44dbd1c1
--- /dev/null
+++ b/src/algorithm/bsplineParams.h
@@ -0,0 +1,19 @@
+#ifndef APR_BSPLINEPARAMS_H
+#define APR_BSPLINEPARAMS_H
+
+
+#include <cstddef>
+
+
+struct BsplineParamsCuda {
+    float *bc1;
+    float *bc2;
+    float *bc3;
+    float *bc4;
+    size_t k0;
+    float b1;
+    float b2;
+    float norm_factor;
+};
+
+#endif //APR_BSPLINEPARAMS_H
diff --git a/src/algorithm/bsplineXdir.cuh b/src/algorithm/bsplineXdir.cuh
index be0a5f78..1df52a80 100644
--- a/src/algorithm/bsplineXdir.cuh
+++ b/src/algorithm/bsplineXdir.cuh
@@ -5,9 +5,11 @@
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
 #include <cinttypes>
+#include "cudaMisc.cuh"
+#include "bsplineParams.h"
 
 /**
- * Runs bspline recursive filter in X direction. Each processed 2D patch consist of number of workes
+ * Runs bspline recursive filter in X direction. Each processed 2D patch consist of number of workers
  * (distributed in Y direction) and each of them is handling the whole row in X-dir.
  * Next patches are build on a top of first (like patch1 in example below) and they cover
  * whole y-dimension. Such a setup should be run for every plane in z-direction.
@@ -59,42 +61,44 @@
  * @param norm_factor - filter norm factor
  */
 template<typename T>
-__global__ void bsplineXdir(T *image, size_t x_num, size_t y_num,
-                            const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0,
-                            float b1, float b2, float norm_factor) {
+__global__ void bsplineXdir(T *image, PixelDataDim dim, BsplineParamsCuda p, bool *error) {
 
     const int yDirOffset = blockIdx.y * blockDim.y + threadIdx.y;
-    const size_t zDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * x_num * y_num;
-    const size_t nextElementXdirOffset = y_num;
-    const size_t dirLen = x_num;
+    const size_t zDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * dim.x * dim.y;
+    const size_t nextElementXdirOffset = dim.y;
+    const size_t dirLen = dim.x;
+    const size_t minLen = min(dirLen, p.k0);
 
-    if (yDirOffset < y_num) {
+    if (yDirOffset < dim.y) {
         float temp1 = 0;
         float temp2 = 0;
         float temp3 = 0;
         float temp4 = 0;
+
         // calculate boundary values
-        for (int k = 0; k < k0; ++k) {
+        for (int k = 0; k < minLen; ++k) {
             T val = image[zDirOffset + k * nextElementXdirOffset + yDirOffset];
-            temp1 += bc1[k] * val;
-            temp2 += bc2[k] * val;
+            temp1 += p.bc1[k] * val;
+            temp2 += p.bc2[k] * val;
             val = image[zDirOffset + (dirLen - 1 - k) * nextElementXdirOffset + yDirOffset];
-            temp3 += bc3[k] * val;
-            temp4 += bc4[k] * val;
+            temp3 += p.bc3[k] * val;
+            temp4 += p.bc4[k] * val;
         }
 
+        size_t errorCnt = 0;
+
         // set boundary values in two first and two last points processed direction
-        image[zDirOffset + 0 * nextElementXdirOffset + yDirOffset] = temp1;
-        image[zDirOffset + 1 * nextElementXdirOffset + yDirOffset] = temp2;
-        image[zDirOffset + (dirLen - 2) * nextElementXdirOffset + yDirOffset] = temp3 * norm_factor;
-        image[zDirOffset + (dirLen - 1) * nextElementXdirOffset + yDirOffset] = temp4 * norm_factor;
+        image[zDirOffset + 0 * nextElementXdirOffset + yDirOffset] = round<T>(temp1, errorCnt);
+        image[zDirOffset + 1 * nextElementXdirOffset + yDirOffset] = round<T>(temp2, errorCnt);
+        image[zDirOffset + (dirLen - 2) * nextElementXdirOffset + yDirOffset] = round<T>(temp3 * p.norm_factor, errorCnt);
+        image[zDirOffset + (dirLen - 1) * nextElementXdirOffset + yDirOffset] = round<T>(temp4 * p.norm_factor, errorCnt);
 
         // Causal Filter loop
         int64_t offset = zDirOffset + 2 * nextElementXdirOffset + yDirOffset;
         int64_t offsetLimit = zDirOffset + (dirLen - 2) * nextElementXdirOffset;
         while (offset < offsetLimit) {
             __syncthreads(); // only needed for speed imporovement (memory coalescing)
-            const float temp = temp1 * b2 + temp2 * b1 + image[offset];
+            const float temp = round<T>(image[offset] + p.b1 * temp2 + p.b2 * temp1, errorCnt);
             image[offset] = temp;
             temp1 = temp2;
             temp2 = temp;
@@ -107,13 +111,15 @@ __global__ void bsplineXdir(T *image, size_t x_num, size_t y_num,
         offsetLimit = zDirOffset;
         while (offset >= offsetLimit) {
             __syncthreads(); // only needed for speed imporovement (memory coalescing)
-            const float temp = temp3 * b1 + temp4 * b2 + image[offset];
-            image[offset] = temp * norm_factor;
+            const float temp = image[offset] + p.b1 * temp3 + p.b2 * temp4;
+            image[offset] = round<T>(temp * p.norm_factor, errorCnt);
             temp4 = temp3;
             temp3 = temp;
 
             offset -= nextElementXdirOffset;
         }
+
+        if (errorCnt > 0) *error = true;
     }
 }
 
@@ -121,15 +127,24 @@ __global__ void bsplineXdir(T *image, size_t x_num, size_t y_num,
  * Function for launching a kernel
  */
 template<typename T>
-void runBsplineXdir(T *cudaImage, size_t x_num, size_t y_num, size_t z_num,
-                    const float *bc1, const float *bc2, const float *bc3, const float *bc4,
-                    size_t k0, float b1, float b2, float norm_factor, cudaStream_t aStream) {
+void runBsplineXdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaStream_t aStream) {
     constexpr int numOfWorkersYdir = 128;
     dim3 threadsPerBlockX(1, numOfWorkersYdir, 1);
     dim3 numBlocksX(1,
-                    (y_num + threadsPerBlockX.y - 1) / threadsPerBlockX.y,
-                    (z_num + threadsPerBlockX.z - 1) / threadsPerBlockX.z);
-    bsplineXdir<T> <<<numBlocksX, threadsPerBlockX, 0, aStream>>> (cudaImage, x_num, y_num, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor);
+                    (dim.y + threadsPerBlockX.y - 1) / threadsPerBlockX.y,
+                    (dim.z + threadsPerBlockX.z - 1) / threadsPerBlockX.z);
+    // In case of error this will be set to true by one of the kernels (CUDA does not guarantee which kernel will set global variable if more then one kernel
+    // access it but this is enough for us to know that somewhere in one on more kernels overflow was detected.
+    bool isErrorDetected = false;
+    {
+        ScopedCudaMemHandler<bool*, H2D | D2H> error(&isErrorDetected, 1, aStream);
+        bsplineXdir<T> <<<numBlocksX, threadsPerBlockX, 0, aStream>>>(cudaImage, dim, p, error.get());
+    }
+
+    if (isErrorDetected) {
+        throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineXdir - "
+                                    "try squashing the input image to a narrower range or use APRConverter<float>");
+    }
 }
 
 #endif
diff --git a/src/algorithm/bsplineYdir.cuh b/src/algorithm/bsplineYdir.cuh
index b9dc2f25..e9905b64 100644
--- a/src/algorithm/bsplineYdir.cuh
+++ b/src/algorithm/bsplineYdir.cuh
@@ -5,12 +5,15 @@
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
 #include <cinttypes>
+#include "cudaMisc.cuh"
+#include "bsplineParams.h"
+
 
 /**
  * Runs bspline recursive filter in Y direction - divided into two phases:
  * 1. calculate boundary conditions
  * 2. run recursive filter as a set of 2D patches:
- * Each processed 2D patch consist of number of workes
+ * Each processed 2D patch consist of number of workers
  * (distributed in Y direction) and each of them is handling the whole row in Y-dir.
  * Next patches are build on next to it in the x-dir to cover whole x * z domain.
  *
@@ -57,44 +60,45 @@
 
 
 template<typename T>
-__global__ void bsplineYdirBoundary(T *image, size_t x_num, size_t y_num, size_t z_num,
-                                    const float *bc1_vec, const float *bc2_vec, const float *bc3_vec, const float *bc4_vec,
-                                    size_t k0, float *boundary) {
+__global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, BsplineParamsCuda p, float *boundary, bool *error) {
     const int xzIndexOfWorker = (blockIdx.x * blockDim.x) + threadIdx.x;
     const int xzIndexOfBlock = (blockIdx.x * blockDim.x);
 
     const int numOfWorkers = blockDim.x;
     const int currentWorkerId = threadIdx.x;
-    const size_t workersOffset = xzIndexOfBlock * y_num; // per each (x,z) coordinate we have y-row
+    const size_t workersOffset = xzIndexOfBlock * dim.y; // per each (x,z) coordinate we have y-row
+
+    const int64_t maxXZoffset = dim.x * dim.z;
 
-    const int64_t maxXZoffset = x_num * z_num;
+    const size_t dirLen = dim.y;
+    const size_t minLen = min(dirLen, p.k0);
 
     extern __shared__ float sharedMem[];
     float *bc1_vec2 = &sharedMem[0];
-    float *bc2_vec2 = &bc1_vec2[k0];
-    T *cache = (T*)&bc2_vec2[k0];
+    float *bc2_vec2 = &bc1_vec2[p.k0];
+    float *cache = (float*)&bc2_vec2[p.k0];
 
     // Read from global mem to cache
-    for (int i = currentWorkerId; i < k0 * numOfWorkers; i += numOfWorkers) {
-        if (i < k0) {
-            bc1_vec2[i] = bc1_vec[i];
-            bc2_vec2[i] = bc2_vec[i];
+    for (int i = currentWorkerId; i < p.k0 * numOfWorkers; i += numOfWorkers) {
+        if (i < p.k0) {
+            bc1_vec2[i] = p.bc1[i];
+            bc2_vec2[i] = p.bc2[i];
         }
-        int offs = i % k0;
-        int work = i / k0;
-        if (work + xzIndexOfBlock < maxXZoffset) {
-            cache[work * k0 + offs] = image[workersOffset + y_num * work + offs];
+        int offs = i % p.k0;
+        int work = i / p.k0;
+        if (work + xzIndexOfBlock < maxXZoffset && offs < dirLen) {
+            cache[work * p.k0 + offs] = image[workersOffset + dim.y * work + offs];
         }
     }
     __syncthreads();
 
     //forwards direction
-    if (xzIndexOfWorker < x_num * z_num) {
+    if (xzIndexOfWorker < dim.x * dim.z) {
         float temp1 = 0;
         float temp2 = 0;
-        for (size_t k = 0; k < k0; ++k) {
-            temp1 += bc1_vec2[k] * cache[currentWorkerId * k0 + k];
-            temp2 += bc2_vec2[k] * cache[currentWorkerId * k0 + k];
+        for (size_t k = 0; k < minLen; ++k) {
+            temp1 += bc1_vec2[k] * (T)cache[currentWorkerId * p.k0 + k];
+            temp2 += bc2_vec2[k] * (T)cache[currentWorkerId * p.k0 + k];
         }
         boundary[xzIndexOfWorker*4 + 0] = temp1;
         boundary[xzIndexOfWorker*4 + 1] = temp2;
@@ -103,57 +107,61 @@ __global__ void bsplineYdirBoundary(T *image, size_t x_num, size_t y_num, size_t
     // ----------------- second end
     __syncthreads();
 
-    for (int i = currentWorkerId; i < k0 * numOfWorkers; i += numOfWorkers) {
-        if (i < k0) {
-            bc1_vec2[i] = bc3_vec[i];
-            bc2_vec2[i] = bc4_vec[i];
+    for (int i = currentWorkerId; i < p.k0 * numOfWorkers; i += numOfWorkers) {
+        if (i < p.k0) {
+            bc1_vec2[i] = p.bc3[i];
+            bc2_vec2[i] = p.bc4[i];
         }
-        int offs = i % k0;
-        int work = i / k0;
-        if (work + xzIndexOfBlock < maxXZoffset) {
-            cache[work * k0 + offs] = image[workersOffset + y_num * work + y_num - 1 - offs];
+        int offs = i % p.k0;
+        int work = i / p.k0;
+        if (work + xzIndexOfBlock < maxXZoffset  && offs < dirLen) {
+            cache[work * p.k0 + offs] = image[workersOffset + dim.y * work + dim.y - 1 - offs];
         }
     }
     __syncthreads();
 
+    size_t errorCnt = 0;
+
     //forwards direction
-    if (xzIndexOfWorker < x_num * z_num) {
+    if (xzIndexOfWorker < dim.x * dim.z) {
         float temp3 = 0;
         float temp4 = 0;
-        for (size_t k = 0; k < k0; ++k) {
-            temp3 += bc1_vec2[k] * cache[currentWorkerId * k0 + k];
-            temp4 += bc2_vec2[k] * cache[currentWorkerId * k0 + k];
+        for (size_t k = 0; k < minLen; ++k) {
+            temp3 += bc1_vec2[k] * (T)cache[currentWorkerId * p.k0 + k];
+            temp4 += bc2_vec2[k] * (T)cache[currentWorkerId * p.k0 + k];
         }
-        boundary[xzIndexOfWorker*4 + 2] = temp3;
-        boundary[xzIndexOfWorker*4 + 3] = temp4;
+        boundary[xzIndexOfWorker*4 + 2] = round<T>(temp3 * p.norm_factor, errorCnt);
+        boundary[xzIndexOfWorker*4 + 3] = round<T>(temp4 * p.norm_factor, errorCnt);
     }
+
+    if (errorCnt > 0) *error = true;
 }
 
 constexpr int blockWidth = 32;
 constexpr int numOfThreads = 32;
 extern __shared__ char sharedMemProcess[];
 template<typename T>
-__global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_num, const size_t z_num, size_t k0,
-                                   const float b1, const float b2, const float norm_factor, float *boundary) {
+__global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, BsplineParamsCuda p, float *boundary, bool *error) {
     const int numOfWorkers = blockDim.x;
     const int currentWorkerId = threadIdx.x;
     const int xzOffset = blockIdx.x * blockDim.x;
-    const int64_t maxXZoffset = x_num * z_num;
-    const int64_t workersOffset = xzOffset * y_num;
+    const int64_t maxXZoffset = dim.x * dim.z;
+    const int64_t workersOffset = xzOffset * dim.y;
 
-    T (*cache)[blockWidth + 0] = (T (*)[blockWidth + 0]) &sharedMemProcess[0];
+    float (*cache)[blockWidth + 0] = (float (*)[blockWidth + 0]) &sharedMemProcess[0];
 
     float temp1, temp2;
+    size_t errorCnt = 0;
 
     // ---------------- forward direction -------------------------------------------
-    for (int yBlockBegin = 0; yBlockBegin < y_num - 2; yBlockBegin += blockWidth) {
+    for (int yBlockBegin = 0; yBlockBegin < dim.y - 2; yBlockBegin += blockWidth) {
 
         // Read from global mem to cache
         for (int i = currentWorkerId; i < blockWidth * numOfWorkers; i += numOfWorkers) {
             int offs = i % blockWidth;
             int work = i / blockWidth;
-            if (offs + yBlockBegin < (y_num - 2) && work + xzOffset < maxXZoffset) {
-                cache[work][(offs + work)%blockWidth] = image[workersOffset + y_num * work + offs + yBlockBegin];
+            if (offs + yBlockBegin < (dim.y - 2) && work + xzOffset < maxXZoffset) {
+                cache[work][(offs + work)%blockWidth] = image[workersOffset + dim.y * work + offs + yBlockBegin];
             }
         }
         __syncthreads();
@@ -166,8 +174,8 @@ __global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_
                 cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = temp1;
                 cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = temp2;
             }
-            for (size_t k = yBlockBegin == 0 ? 2 : 0; k < blockWidth && k + yBlockBegin < y_num - 2; ++k) {
-                float  temp = temp1*b2 + temp2*b1 + cache[currentWorkerId][(k + currentWorkerId)%blockWidth];
+            for (size_t k = yBlockBegin == 0 ? 2 : 0; k < blockWidth && k + yBlockBegin < dim.y - 2; ++k) {
+                float  temp = temp2*p.b1 + temp1*p.b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth];
                 cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp;
                 temp1 = temp2;
                 temp2 = temp;
@@ -179,37 +187,37 @@ __global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_
         for (int i = currentWorkerId; i < blockWidth * numOfWorkers; i += numOfWorkers) {
             int offs = i % blockWidth;
             int work = i / blockWidth;
-            if (offs + yBlockBegin < (y_num - 2) && work + xzOffset < maxXZoffset) {
-                image[workersOffset + y_num * work + offs + yBlockBegin] = cache[work][(offs + work)%blockWidth];
+            if (offs + yBlockBegin < (dim.y - 2) && work + xzOffset < maxXZoffset) {
+                image[workersOffset + dim.y * work + offs + yBlockBegin] = round<T>(cache[work][(offs + work)%blockWidth], errorCnt);
             }
         }
         __syncthreads();
     }
 
     // ---------------- backward direction -------------------------------------------
-    for (int yBlockBegin = y_num - 1; yBlockBegin >= 0; yBlockBegin -= blockWidth) {
+    for (int yBlockBegin = dim.y - 1; yBlockBegin >= 0; yBlockBegin -= blockWidth) {
 
         // Read from global mem to cache
         for (int i = currentWorkerId; i < blockWidth * numOfWorkers; i += numOfWorkers) {
             int offs = i % blockWidth;
             int work = i / blockWidth;
             if (yBlockBegin - offs >= 0 && work + xzOffset < maxXZoffset) {
-                cache[work][(offs + work)%blockWidth] = image[workersOffset + y_num * work - offs + yBlockBegin];
+                cache[work][(offs + work)%blockWidth] = image[workersOffset + dim.y * work - offs + yBlockBegin];
             }
         }
         __syncthreads();
 
         // Do operations
         if (xzOffset + currentWorkerId < maxXZoffset) {
-            if (yBlockBegin == y_num - 1) {
-                temp1 = boundary[(xzOffset + currentWorkerId) * 4 + 3];
-                temp2 = boundary[(xzOffset + currentWorkerId) * 4 + 2];
-                cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = norm_factor * temp1;
-                cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = norm_factor * temp2;
+            if (yBlockBegin == dim.y - 1) {
+                temp1 = boundary[(xzOffset + currentWorkerId) * 4 + 3] / p.norm_factor;
+                temp2 = boundary[(xzOffset + currentWorkerId) * 4 + 2] / p.norm_factor;
+                cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = p.norm_factor * temp1;
+                cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = p.norm_factor * temp2;
             }
-            for (int64_t k = yBlockBegin == y_num - 1 ? 2 : 0; k < blockWidth && yBlockBegin - k >= 0; ++k) {
-                float  temp = temp2*b1 + temp1*b2 + cache[currentWorkerId][(k + currentWorkerId)%blockWidth];
-                cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp * norm_factor;
+            for (int64_t k = yBlockBegin == dim.y - 1 ? 2 : 0; k < blockWidth && yBlockBegin - k >= 0; ++k) {
+                float  temp = temp2*p.b1 + temp1*p.b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth];
+                cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp * p.norm_factor;
                 temp1 = temp2;
                 temp2 = temp;
             }
@@ -221,25 +229,35 @@ __global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_
             int offs = i % blockWidth;
             int work = i / blockWidth;
             if (yBlockBegin - offs >= 0 && work + xzOffset < maxXZoffset) {
-                image[workersOffset + y_num * work - offs + yBlockBegin] = cache[work][(offs + work)%blockWidth];
+                image[workersOffset + dim.y * work - offs + yBlockBegin] = round<T>(cache[work][(offs + work)%blockWidth], errorCnt);
             }
         }
         __syncthreads();
     }
+
+    if (errorCnt > 0) *error = true;
 }
 
 /**
  * Function for launching a kernel
  */
 template <typename T>
-void runBsplineYdir(T *cudaImage, size_t x_num, size_t y_num, size_t z_num,
-                    const float *bc1, const float *bc2, const float *bc3, const float *bc4,
-                    size_t k0, float b1, float b2, float norm_factor, float *boundary, cudaStream_t aStream) {
+void runBsplineYdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, float *boundary, cudaStream_t aStream) {
+
     dim3 threadsPerBlock(numOfThreads);
-    dim3 numBlocks((x_num * z_num + threadsPerBlock.x - 1) / threadsPerBlock.x);
-    size_t sharedMemSize = (2 /*bc vectors*/) * (k0) * sizeof(float) + numOfThreads * (k0) * sizeof(T);
-    bsplineYdirBoundary<T> <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>> (cudaImage, x_num, y_num, z_num, bc1, bc2, bc3, bc4, k0, boundary);
-    sharedMemSize = numOfThreads * blockWidth * sizeof(T);
-    bsplineYdirProcess<T> <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>> (cudaImage, x_num, y_num, z_num, k0, b1, b2, norm_factor, boundary);
+    dim3 numBlocks((dim.x * dim.z + threadsPerBlock.x - 1) / threadsPerBlock.x);
+    size_t sharedMemSize = (2 /*bc vectors*/) * (p.k0) * sizeof(float) + numOfThreads * (p.k0) * sizeof(float);
+    bool isErrorDetected = false;
+    {
+        ScopedCudaMemHandler<bool *, H2D | D2H> error(&isErrorDetected, 1, aStream);
+        bsplineYdirBoundary<T> <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get());
+        sharedMemSize = numOfThreads * blockWidth * sizeof(float);
+        bsplineYdirProcess<T> <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get());
+    }
+
+    if (isErrorDetected) {
+        throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineYdir - "
+                                    "try squashing the input image to a narrower range or use APRConverter<float>");
+    }
 }
 #endif
diff --git a/src/algorithm/bsplineZdir.cuh b/src/algorithm/bsplineZdir.cuh
index 33a5b420..43550ff8 100644
--- a/src/algorithm/bsplineZdir.cuh
+++ b/src/algorithm/bsplineZdir.cuh
@@ -5,6 +5,9 @@
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
 #include <cinttypes>
+#include "cudaMisc.cuh"
+#include "bsplineParams.h"
+
 
 /**
  * Runs bspline recursive filter in Z direction. Each processed 2D patch consist of number of workes
@@ -60,42 +63,44 @@
  * @param norm_factor - filter norm factor
  */
 template<typename T>
-__global__ void bsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_num,
-                            const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0,
-                            float b1, float b2, float norm_factor) {
+__global__ void bsplineZdir(T *image, PixelDataDim dim, BsplineParamsCuda p, bool *error) {
 
     const int yDirOffset = blockIdx.y * blockDim.y + threadIdx.y;
-    const size_t xDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * y_num; // x is in 'z' to have good memory coalescing
-    const size_t nextElementZdirOffset = x_num * y_num;
-    const size_t dirLen = z_num;
+    const size_t xDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * dim.y; // x is in 'z' to have good memory coalescing
+    const size_t nextElementZdirOffset = dim.x * dim.y;
+    const size_t dirLen = dim.z;
+    const size_t minLen = min(dirLen, p.k0);
 
-    if (yDirOffset < y_num) {
+    if (yDirOffset < dim.y) {
         float temp1 = 0;
         float temp2 = 0;
         float temp3 = 0;
         float temp4 = 0;
+
         // calculate boundary values
-        for (int k = 0; k < k0; ++k) {
+        for (int k = 0; k < minLen; ++k) {
             T val = image[xDirOffset + k * nextElementZdirOffset + yDirOffset];
-            temp1 += bc1[k] * val;
-            temp2 += bc2[k] * val;
+            temp1 += p.bc1[k] * val;
+            temp2 += p.bc2[k] * val;
             val = image[xDirOffset + (dirLen - 1 - k) * nextElementZdirOffset + yDirOffset];
-            temp3 += bc3[k] * val;
-            temp4 += bc4[k] * val;
+            temp3 += p.bc3[k] * val;
+            temp4 += p.bc4[k] * val;
         }
 
+        size_t errorCnt = 0;
+
         // set boundary values in two first and two last points processed direction
-        image[xDirOffset + 0 * nextElementZdirOffset + yDirOffset] = temp1;
-        image[xDirOffset + 1 * nextElementZdirOffset + yDirOffset] = temp2;
-        image[xDirOffset + (dirLen - 2) * nextElementZdirOffset + yDirOffset] = temp3 * norm_factor;
-        image[xDirOffset + (dirLen - 1) * nextElementZdirOffset + yDirOffset] = temp4 * norm_factor;
+        image[xDirOffset + 0 * nextElementZdirOffset + yDirOffset] = round<T>(temp1, errorCnt);
+        image[xDirOffset + 1 * nextElementZdirOffset + yDirOffset] = round<T>(temp2, errorCnt);
+        image[xDirOffset + (dirLen - 2) * nextElementZdirOffset + yDirOffset] = round<T>(temp3 * p.norm_factor, errorCnt);
+        image[xDirOffset + (dirLen - 1) * nextElementZdirOffset + yDirOffset] = round<T>(temp4 * p.norm_factor, errorCnt);
 
         // Causal Filter loop
         int64_t offset = xDirOffset + 2 * nextElementZdirOffset + yDirOffset;
         int64_t offsetLimit = xDirOffset + (dirLen - 2) * nextElementZdirOffset;
         while (offset < offsetLimit) {
             __syncthreads(); // only needed for speed imporovement (memory coalescing)
-            const float temp = temp1 * b2 + temp2 * b1 + image[offset];
+            const float temp = round<T>(image[offset] + p.b1 * temp2 + p.b2 * temp1, errorCnt);
             image[offset] = temp;
             temp1 = temp2;
             temp2 = temp;
@@ -108,13 +113,15 @@ __global__ void bsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_num,
         offsetLimit = xDirOffset;
         while (offset >= offsetLimit) {
             __syncthreads(); // only needed for speed imporovement (memory coalescing)
-            const float temp = temp3 * b1 + temp4 * b2 + image[offset];
-            image[offset] = temp * norm_factor;
+            const float temp = image[offset] + p.b1 * temp3 + p.b2 * temp4;
+            image[offset] = round<T>(temp * p.norm_factor, errorCnt);
             temp4 = temp3;
             temp3 = temp;
 
             offset -= nextElementZdirOffset;
         }
+
+        if (errorCnt > 0) *error = true;
     }
 }
 
@@ -122,15 +129,24 @@ __global__ void bsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_num,
  * Function for launching a kernel
  */
 template<typename T>
-void runBsplineZdir(T *cudaImage, size_t x_num, size_t y_num, size_t z_num,
-                    const float *bc1, const float *bc2, const float *bc3, const float *bc4,
-                    size_t k0, float b1, float b2, float norm_factor, cudaStream_t aStream) {
+void runBsplineZdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaStream_t aStream) {
     constexpr int numOfWorkersYdir = 128;
     dim3 threadsPerBlockZ(1, numOfWorkersYdir, 1);
     dim3 numBlocksZ(1,
-                    (y_num + threadsPerBlockZ.y - 1) / threadsPerBlockZ.y,
-                    (x_num + threadsPerBlockZ.x - 1) / threadsPerBlockZ.x);
-    bsplineZdir<T> <<<numBlocksZ, threadsPerBlockZ, 0, aStream>>> (cudaImage, x_num, y_num, z_num, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor);
+                    (dim.y + threadsPerBlockZ.y - 1) / threadsPerBlockZ.y,
+                    (dim.x + threadsPerBlockZ.x - 1) / threadsPerBlockZ.x);
+    // In case of error this will be set to true by one of the kernels (CUDA does not guarantee which kernel will set global variable if more then one kernel
+    // access it but this is enough for us to know that somewhere in one on more kernels overflow was detected.
+    bool isErrorDetected = false;
+    {
+        ScopedCudaMemHandler<bool*, H2D | D2H> error(&isErrorDetected, 1, aStream);
+        bsplineZdir<T> <<<numBlocksZ, threadsPerBlockZ, 0, aStream>>> (cudaImage, dim, p, error.get());
+    }
+
+    if (isErrorDetected) {
+        throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineZdir - "
+                                    "try squashing the input image to a narrower range or use APRConverter<float>");
+    }
 }
 
 #endif
diff --git a/src/algorithm/cudaMisc.cuh b/src/algorithm/cudaMisc.cuh
new file mode 100644
index 00000000..7442c60b
--- /dev/null
+++ b/src/algorithm/cudaMisc.cuh
@@ -0,0 +1,66 @@
+#ifndef CUDAMISC_CUH
+#define CUDAMISC_CUH
+
+
+#include <type_traits>
+
+
+/**
+ * floating point output -> no rounding or under-/overflow check
+ */
+template<typename T>
+__device__ std::enable_if_t<std::is_floating_point<T>::value, T> round(float val, size_t &errCount) {
+    return val;
+}
+
+/**
+ * integer output -> check for under-/overflow and round
+ *
+ * CUDA is not supporting std::numeric_limits<T> so this results in belows manual checking of different
+ * data types range. In theory we could use --expt-relaxed-constexpr flag but since it is experimental
+ * and without guarantee of long existence for now it is better to stick to belows definitions.
+ */
+template<typename T>
+__device__  std::enable_if_t<std::is_same<T, uint8_t>::value, uint8_t> round(float val, size_t &errCount) {
+    val = std::round(val);
+    if (val < 0 || val > 255) { errCount++; }
+    return val;
+}
+
+template<typename T>
+__device__  std::enable_if_t<std::is_same<T, int8_t>::value, int8_t> round(float val, size_t &errCount) {
+    val = std::round(val);
+    if (val < -128 || val > 127) { errCount++; }
+    return val;
+}
+
+template<typename T>
+__device__  std::enable_if_t<std::is_same<T, uint16_t>::value, uint16_t> round(float val, size_t &errCount) {
+    val = std::round(val);
+    if (val < 0 || val > 65535) { errCount++; }
+    return val;
+}
+
+template<typename T>
+__device__  std::enable_if_t<std::is_same<T, int16_t>::value, int16_t> round(float val, size_t &errCount) {
+    val = std::round(val);
+    if (val < -32768 || val > 32767) { errCount++; }
+    return val;
+}
+
+template<typename T>
+__device__  std::enable_if_t<std::is_same<T, uint32_t>::value, uint32_t> round(float val, size_t &errCount) {
+    val = std::round(val);
+    if (val < 0 || val > 4294967295) { errCount++; }
+    return val;
+}
+
+template<typename T>
+__device__  std::enable_if_t<std::is_same<T, int32_t>::value, int32_t> round(float val, size_t &errCount) {
+    val = std::round(val);
+    if (val < -2147483648 || val > 2147483647) { errCount++; }
+    return val;
+}
+
+
+#endif
diff --git a/src/algorithm/dsGradient.cuh b/src/algorithm/dsGradient.cuh
index de4a2c77..8e2efc84 100644
--- a/src/algorithm/dsGradient.cuh
+++ b/src/algorithm/dsGradient.cuh
@@ -5,11 +5,14 @@
 
 template<typename T>
 __global__ void
-gradient(const T *input, size_t x_num, size_t y_num, size_t z_num, T *grad, size_t x_num_ds, size_t y_num_ds,
-         float hx, float hy, float hz) {
+gradient(const T *input, PixelDataDim inputDim, T *grad, PixelDataDim gradDim, float hx, float hy, float hz) {
     const int xi = ((blockIdx.x * blockDim.x) + threadIdx.x) * 2;
     const int yi = ((blockIdx.y * blockDim.y) + threadIdx.y) * 2;
     const int zi = ((blockIdx.z * blockDim.z) + threadIdx.z) * 2;
+    const auto x_num = inputDim.x;
+    const auto y_num = inputDim.y;
+    const auto z_num = inputDim.z;
+
     if (xi >= x_num || yi >= y_num || zi >= z_num) return;
 
     const size_t xnumynum = x_num * y_num;
@@ -33,28 +36,28 @@ gradient(const T *input, size_t x_num, size_t y_num, size_t z_num, T *grad, size
             for (int y = 1; y <= 2; ++y) {
                 float xd = (temp[z][x - 1][y] - temp[z][x + 1][y]) / (2 * hx);
                 xd = xd * xd;
-                float yd = (temp[z - 1][x][y] - temp[z + 1][x][y]) / (2 * hy);
-                yd = yd * yd;
-                float zd = (temp[z][x][y - 1] - temp[z][x][y + 1]) / (2 * hz);
+                float zd = (temp[z - 1][x][y] - temp[z + 1][x][y]) / (2 * hz);
                 zd = zd * zd;
-                float gm = __fsqrt_rn(xd + yd + zd);
+                float yd = (temp[z][x][y - 1] - temp[z][x][y + 1]) / (2 * hy);
+                yd = yd * yd;
+                float gm = sqrtf(xd + zd + yd);
                 if (gm > maxGrad) maxGrad = gm;
             }
 
-    const size_t idx = zi / 2 * x_num_ds * y_num_ds + xi / 2 * y_num_ds + yi / 2;
+    const size_t idx = zi / 2 * gradDim.x * gradDim.y + xi / 2 * gradDim.y + yi / 2;
     grad[idx] = maxGrad;
 }
 
 template<typename T>
 void runKernelGradient(const T *cudaInput, T *cudaGrad,
-                       size_t xLenInput, size_t yLenInput, size_t zLenInput,
-                       size_t xLenGradient, size_t yLenGradient,
+                       PixelDataDim inputDim,
+                       PixelDataDim gradDim,
                        float hx, float hy, float hz, cudaStream_t aStream) {
     dim3 threadsPerBlock(1, 64, 1);
-    dim3 numBlocks((xLenInput + threadsPerBlock.x - 1) / threadsPerBlock.x,
-                   (yLenInput + threadsPerBlock.y - 1) / threadsPerBlock.y,
-                   (zLenInput + threadsPerBlock.z - 1) / threadsPerBlock.z);
-    gradient <<<numBlocks, threadsPerBlock, 0, aStream>>> (cudaInput, xLenInput, yLenInput, zLenInput, cudaGrad, xLenGradient, yLenGradient, hx, hy, hz);
+    dim3 numBlocks((inputDim.x + threadsPerBlock.x - 1) / threadsPerBlock.x,
+                   (inputDim.y + threadsPerBlock.y - 1) / threadsPerBlock.y,
+                   (inputDim.z + threadsPerBlock.z - 1) / threadsPerBlock.z);
+    gradient <<<numBlocks, threadsPerBlock, 0, aStream>>> (cudaInput, inputDim, cudaGrad, gradDim, hx, hy, hz);
 }
 
 
diff --git a/src/algorithm/invBspline.cuh b/src/algorithm/invBspline.cuh
index d422abf1..7c27d853 100644
--- a/src/algorithm/invBspline.cuh
+++ b/src/algorithm/invBspline.cuh
@@ -9,14 +9,18 @@ __global__ void invBsplineYdir(T *image, size_t x_num, size_t y_num, size_t z_nu
     int workerOffset = workerIdx;
     int loopNum = 0;
 
-    T p = 0;
-    T v = 0;
+    const float a1 = 1.0/6.0;
+    const float a2 = 4.0/6.0;
+    const float a3 = 1.0/6.0;
+
+    float p = 0;
+    float v = 0;
     bool notLastInRow = true;
     while (workerOffset < y_num) {
         if (notLastInRow) v = image[workersOffset + workerOffset];
-        T temp = __shfl_sync(active, v, workerIdx + blockDim.y - 1, blockDim.y);
+        float temp = __shfl_sync(active, v, workerIdx + blockDim.y - 1, blockDim.y);
         p = notLastInRow ? temp : p;
-        T n = __shfl_sync(active, v, workerIdx + 1, blockDim.y);
+        float n = __shfl_sync(active, v, workerIdx + 1, blockDim.y);
 
         // handle boundary (reflective mode)
         if (workerOffset == 0) p = n;
@@ -24,7 +28,7 @@ __global__ void invBsplineYdir(T *image, size_t x_num, size_t y_num, size_t z_nu
 
         notLastInRow = (workerIdx + 1 + loopNum) % blockDim.y != 0;
         if (notLastInRow) {
-            v = (p + v * 4 + n) / 6.0;
+            v = a1 * p + a2 * v + a3 * n;
             image[workersOffset + workerOffset] = v;
             workerOffset += blockDim.y;
         }
@@ -49,21 +53,25 @@ __global__ void invBsplineXdir(T *image, size_t x_num, size_t y_num, size_t z_nu
     const int workerIdx = blockIdx.y * blockDim.y + threadIdx.y ;
     const int nextElementOffset = y_num;
 
+    const float a1 = 1.0/6.0;
+    const float a2 = 4.0/6.0;
+    const float a3 = 1.0/6.0;
+
     if (workerIdx < y_num) {
         int currElementOffset = 0;
 
         T v1 = image[workerOffset + currElementOffset];
         T v2 = image[workerOffset + currElementOffset + nextElementOffset];
-        image[workerOffset + currElementOffset] = (2 * v2 + 4 * v1) / 6.0;
+        image[workerOffset + currElementOffset] = a1 * v2 + a2 * v1 + a3 * v2;
 
         for (int x = 2; x < x_num; ++x) {
             T v3 = image[workerOffset + currElementOffset + 2 * nextElementOffset];
-            image[workerOffset + currElementOffset + nextElementOffset] = (v1 + 4 * v2 + v3) / 6.0;
+            image[workerOffset + currElementOffset + nextElementOffset] = (a1 * v1 + a2 * v2 + a3 * v3);
             v1 = v2;
             v2 = v3;
             currElementOffset += nextElementOffset;
         }
-        image[workerOffset + currElementOffset + nextElementOffset] = (2 * v1 + 4 * v2) / 6.0;
+        image[workerOffset + currElementOffset + nextElementOffset] = (a1 + a3) * v1 + a2 * v2;
     }
 }
 
@@ -83,21 +91,25 @@ __global__ void invBsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_nu
     const int workerIdx = blockIdx.y * blockDim.y + threadIdx.y ;
     const int nextElementOffset = x_num * y_num;
 
+    const float a1 = 1.0/6.0;
+    const float a2 = 4.0/6.0;
+    const float a3 = 1.0/6.0;
+
     if (workerIdx < y_num) {
         int currElementOffset = 0;
 
         T v1 = image[workerOffset + currElementOffset];
         T v2 = image[workerOffset + currElementOffset + nextElementOffset];
-        image[workerOffset + currElementOffset] = (2 * v2 + 4 * v1) / 6.0;
+        image[workerOffset + currElementOffset] = a1 * v2 + a2 * v1 + a1 * v2;
 
         for (int x = 2; x < z_num; ++x) {
             T v3 = image[workerOffset + currElementOffset + 2 * nextElementOffset];
-            image[workerOffset + currElementOffset + nextElementOffset] = (v1 + 4 * v2 + v3) / 6.0;
+            image[workerOffset + currElementOffset + nextElementOffset] = a1 * v1 + a2 * v2 + a3 * v3;
             v1 = v2;
             v2 = v3;
             currElementOffset += nextElementOffset;
         }
-        image[workerOffset + currElementOffset + nextElementOffset] = (2 * v1 + 4 * v2) / 6.0;
+        image[workerOffset + currElementOffset + nextElementOffset] = (a1 + a3) * v1 + a2 * v2;
     }
 }
 
diff --git a/src/data_structures/APR/GenInfo.hpp b/src/data_structures/APR/GenInfo.hpp
index f8fd090e..8d5da2bd 100644
--- a/src/data_structures/APR/GenInfo.hpp
+++ b/src/data_structures/APR/GenInfo.hpp
@@ -5,6 +5,11 @@
 #ifndef LIBAPR_GENINFO_HPP
 #define LIBAPR_GENINFO_HPP
 
+
+#include <sstream>
+#include <vector>
+#include <cmath>
+
 //Note this function sets up the domain for the APR for a given input size.
 class GenInfo {
 
@@ -29,6 +34,16 @@ class GenInfo {
 
     std::vector<int> level_size; // precomputation of the size of each level, used by the iterators.
 
+    GenInfo() {}
+    GenInfo(const PixelDataDim &dim) { init(dim); }
+
+    size_t getSize() const { return (size_t)y_num[l_max] * x_num[l_max] * z_num[l_max]; }
+
+    //initialize the information given the original dimensions
+    void init(const PixelDataDim &dim) {
+        init(dim.y, dim.x, dim.z);
+    }
+
     //initialize the information given the original dimensions
     void init(uint64_t y_org,uint64_t x_org,uint64_t z_org){
 
@@ -64,6 +79,11 @@ class GenInfo {
         }
     }
 
+    //initialize the information given the original dimensions
+    void init_tree(const PixelDataDim &dim){
+        init_tree(dim.y, dim.x, dim.z);
+    }
+
     //initialize the information given the original dimensions
     void init_tree(uint64_t y_org,uint64_t x_org,uint64_t z_org){
 
@@ -97,6 +117,26 @@ class GenInfo {
             z_num[l] = ceil(z_org / cellSize);
         }
     }
+
+    friend std::ostream & operator<<(std::ostream &os, const GenInfo &gi) {
+        os << "GenInfo {\n";
+        os << "    Original dimensions(y/x/z): [" << gi.org_dims[0] << ", " << gi.org_dims[1] << ", " << gi.org_dims[2] << "]\n";
+        os << "    Original size: " << gi.getSize() << "\n";
+        os << "    Number of dimensions: " << static_cast<int>(gi.number_dimensions) << "\n";
+        os << "    l_min, l_max: {" << gi.l_min << " - " << gi.l_max << "}\n";
+        os << "    total number of particles: " << gi.total_number_particles << "\n";
+        os << "    y_num, x_num, z_num:\n";
+        for (int l = gi.l_min; l <= gi.l_max; ++l) {
+            os << "        level [" << l << "] =  " << gi.y_num[l] << ", " << gi.x_num[l] << ", " << gi.z_num[l] << "\n";
+        }
+        os << "    level_size:\n";
+        for (int l = gi.l_min; l <= gi.l_max; ++l) {
+            os << "        level " << l << ": " << gi.level_size[l] << "\n";
+        }
+        os << "}";
+
+        return os;
+    }
 };
 
 
diff --git a/src/data_structures/APR/access/LinearAccess.hpp b/src/data_structures/APR/access/LinearAccess.hpp
index 5f92c0ef..b92476c2 100644
--- a/src/data_structures/APR/access/LinearAccess.hpp
+++ b/src/data_structures/APR/access/LinearAccess.hpp
@@ -11,6 +11,7 @@
 #include "data_structures/Mesh/PixelData.hpp"
 
 #include "algorithm/APRParameters.hpp"
+#include "algorithm/PullingScheme.hpp"
 
 #include "APRAccessStructures.hpp"
 
@@ -225,44 +226,43 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet
 
     initialize_xz_linear();
 
+    // *********************************************************************************************************************
+    //                       FULL RESOLUTION
+    // *********************************************************************************************************************
     //edge case
     if(level_max()<=2){
         // For performance reasons and clarity of the code, it doesn't make sense here to handle these cases. Below assumes there is atleast levels <=2;
 
         //just initialize full resolution
         const auto level_start = level_xz_vec[level_max()];
-        uint64_t counter = 0;
+        uint64_t particleCounter = 0;
         for (int z = 0; z < z_num(level_max()); ++z) {
             for (int x = 0; x < x_num(level_max()); ++x) {
                 const size_t offset_pc_data = z * x_num(level_max()) + x;
-                for (int y = 0; y < y_num(level_max()); ++y) {
-
-                    counter++;
-                }
-                xz_end_vec[level_start + offset_pc_data] = counter;
+                particleCounter += y_num(level_max());
+                xz_end_vec[level_start + offset_pc_data] = particleCounter;
             }
         }
-        y_vec.resize(counter);
-        counter = 0;
+        genInfo->total_number_particles = xz_end_vec.back();
+        y_vec.resize(genInfo->total_number_particles);
 
+        size_t idx = 0;
         for (int z = 0; z < z_num(level_max()); ++z) {
             for (int x = 0; x < x_num(level_max()); ++x) {
-
                 for (int y = 0; y < y_num(level_max()); ++y) {
-                    y_vec[counter] = y;
-                    counter++;
+                    y_vec[idx++] = y;
                 }
             }
         }
 
-
         return;
     }
 
-    // ========================================================================
+    // *********************************************************************************************************************
+    //                       FIRST STEP
+    // *********************************************************************************************************************
     apr_timer.start_timer("first_step");
 
-    const uint8_t UPSAMPLING_SEED_TYPE = 4;
     const uint8_t seed_us = UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization
     for (int level = level_min()+1; level < level_max(); ++level) {
         const size_t xLen = genInfo->x_num[level];
@@ -293,7 +293,9 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet
     }
     apr_timer.stop_timer();
 
-    // ========================================================================
+    // *********************************************************************************************************************
+    //                       SECOND STEP
+    // *********************************************************************************************************************
     apr_timer.start_timer("second_step");
 
 
@@ -328,14 +330,15 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet
         }
     }
 
+
+// *********************************************************************************************************************
+//                       SECOND STEP LAST LEVEL
+//
+//    l_max - 1 is special as it also has the l_max information that then needs to be upsampled.
+// *********************************************************************************************************************
     std::vector<uint64_t> temp_max_xz;
     temp_max_xz.resize(genInfo->z_num[genInfo->l_max - 1]*genInfo->x_num[genInfo->l_max - 1],0);
 
-    /*
-     * l_max - 1 is special as it also has the l_max information that then needs to be upsampled.
-     *
-     */
-
     size_t l_minus_1 = genInfo->l_max - 1;
     const size_t xLen = genInfo->x_num[l_minus_1];
     const size_t zLen = genInfo->z_num[l_minus_1];
@@ -409,6 +412,11 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet
 
     apr_timer.stop_timer();
 
+
+    // *********************************************************************************************************************
+    //                       THIRD STEP - Get Y values
+    // *********************************************************************************************************************
+
     apr_timer.start_timer("init y");
 
     genInfo->total_number_particles = xz_end_vec.back();
@@ -452,10 +460,11 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet
         }
     }
 
-    /*
-     * l_max - 1 is special as it also has the l_max information that then needs to be upsampled.
-     *
-     */
+    // *********************************************************************************************************************
+    //                       4th STEP LAST LEVEL
+    //
+    //    l_max - 1 is special as it also has the l_max information that then needs to be upsampled.
+    // *********************************************************************************************************************
 
 
 #ifdef HAVE_OPENMP
@@ -545,7 +554,6 @@ inline void LinearAccess::initialize_linear_structure_sparse(APRParameters& apr_
     // ========================================================================
     apr_timer.start_timer("first_step");
 
-    const uint8_t UPSAMPLING_SEED_TYPE = 4;
     const uint8_t seed_us = UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization
     for (int level = level_min()+1; level < level_max(); ++level) {
         const size_t xLen = genInfo->x_num[level];
diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu
new file mode 100644
index 00000000..1a876d0e
--- /dev/null
+++ b/src/data_structures/APR/access/LinearAccessCuda.cu
@@ -0,0 +1,638 @@
+#include "LinearAccessCuda.hpp"
+
+#include "misc/CudaTools.cuh"
+#include "algorithm/ParticleCellTreeCuda.cuh"
+
+// CUDA version of GenInfo structure
+typedef struct GenInfoCuda_t {
+    int l_min;
+    int l_max;
+
+    int *org_dims; // fixed size: [3]
+
+    uint8_t number_dimensions;
+
+    int *x_num;
+    int *y_num;
+    int *z_num;
+
+    // this differs from original GenInfo structure
+    // since we need to be able to send data back from GPU to CPU
+    uint64_t *total_number_particles;
+
+    int *level_size;
+
+    uint64_t get_total_number_particles() const { return *total_number_particles; }
+
+    __device__ int level_max() const { return l_max; }
+    __device__ int level_min() const { return l_min; }
+
+} GenInfoCuda;
+
+// -----------------------------
+
+/*
+ * Class for easy transfering to/from GPU of GenInfo structure.
+ */
+class GenInfoGpuAccess {
+    GenInfo &gi;
+
+    cudaStream_t iStream;
+
+    ScopedCudaMemHandler<int*, H2D | D2H> org_dims;
+    ScopedCudaMemHandler<int*, H2D | D2H> x_num;
+    ScopedCudaMemHandler<int*, H2D | D2H> y_num;
+    ScopedCudaMemHandler<int*, H2D | D2H> z_num;
+    ScopedCudaMemHandler<uint64_t*, H2D | D2H> total_number_particles;
+    ScopedCudaMemHandler<int*, H2D | D2H> level_size;
+
+
+public:
+    GenInfoGpuAccess(GenInfo &genInfo, cudaStream_t cudaStream) :
+        gi(genInfo),
+        iStream(cudaStream),
+        org_dims(gi.org_dims, 3, iStream),
+        x_num(gi.x_num.data(), gi.x_num.size(), iStream),
+        y_num(gi.y_num.data(), gi.y_num.size(), iStream),
+        z_num(gi.z_num.data(), gi.z_num.size(), iStream),
+        total_number_particles(&gi.total_number_particles, 1, iStream),
+        level_size(gi.level_size.data(), gi.level_size.size(), iStream)
+    {
+    }
+
+    GenInfoCuda getGenInfoCuda() {
+        GenInfoCuda gic;
+
+        gic.l_min = gi.l_min;
+        gic.l_max = gi.l_max;
+        gic.org_dims = org_dims.get();
+        gic.number_dimensions = gi.number_dimensions;
+        gic.x_num = x_num.get();
+        gic.y_num = y_num.get();
+        gic.z_num = z_num.get();
+        gic.total_number_particles = total_number_particles.get();
+        gic.level_size = level_size.get();
+
+        return gic;
+    }
+
+    ~GenInfoGpuAccess() {
+        copyDtoH();
+    }
+
+    void copyHtoD() {
+        // The only data that can change between CPU & GPU (the rest values are fixed based on input image dimension)
+        total_number_particles.copyH2D();
+    }
+
+    void copyDtoH() {
+        // The only data that can change between CPU & GPU (the rest values are fixed based on input image dimension)
+        total_number_particles.copyD2H();
+    }
+};
+
+// *********************************************************************************************************************
+//                       FULL RESOLUTION
+// *********************************************************************************************************************
+/**
+ * Handle edge case for #levels <= 2
+ * For performance reasons and clarity of the code,
+ * it doesn't make sense here to handle these cases.
+ * Below assumes there is at least levels <=2;
+ * @param level_xz
+ * @param xz_end
+ * @param y
+ * @param gic - cuda version of GenInfo
+ */
+__global__ void fullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, GenInfoCuda gic) {
+
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
+    const unsigned levelMax = gic.level_max();
+    const uint64_t xMax = gic.x_num[levelMax];
+    const uint64_t yMax = gic.y_num[levelMax];
+    const uint64_t zMax = gic.z_num[levelMax];
+
+
+    if (x < xMax && z < zMax) {
+        const uint64_t levelStart = level_xz[levelMax];
+        uint64_t offset_pc_data = z * xMax + x;
+        uint64_t particleCounter = (1 + x + z * xMax) * yMax;
+
+        xz_end[levelStart + offset_pc_data] = particleCounter;
+
+        for (int i = 0; i < yMax; ++i) {
+            uint64_t idx = (xMax * z + x) * yMax + i;
+            y[idx] = i;
+        }
+    }
+
+    if (x == 0 && z == 0) {
+        *gic.total_number_particles = xMax * yMax * zMax;
+    }
+}
+
+void runFullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, const GenInfo &gi, GenInfoGpuAccess &giga, cudaStream_t aStream) {
+    dim3 threadsPerBlock(32, 1, 1);
+
+    dim3 numBlocks( (gi.x_num[gi.l_max] + threadsPerBlock.x - 1)/threadsPerBlock.x,
+                    1,
+                    (gi.z_num[gi.l_max] + threadsPerBlock.z - 1)/threadsPerBlock.z);
+    fullResolution<<<numBlocks, threadsPerBlock, 0, aStream>>>(level_xz, xz_end, y, giga.getGenInfoCuda());
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("----------------------------------Error: %s\n", cudaGetErrorString(err));
+        throw std::runtime_error("runFullResolution failed");
+    }
+}
+
+
+// *********************************************************************************************************************
+//                       FIRST STEP
+// *********************************************************************************************************************
+
+static constexpr uint8_t seed_us = UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization
+
+
+__global__ void firstStep(const uint8_t *prevLevel, uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic) {
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
+    const uint64_t xLen = gic.x_num[level];
+    const uint64_t yLen = gic.y_num[level];
+    const uint64_t zLen = gic.z_num[level];
+    const uint64_t xLenDS = gic.x_num[level - 1];
+    const uint64_t yLenDS = gic.y_num[level - 1];
+
+    if (x < xLen && z < zLen) {
+        const size_t offset_part_map_ds = (x / 2) * yLenDS + (z / 2) * yLenDS * xLenDS;
+        const size_t offset_part_map = x * yLen + z * yLen * xLen;
+
+        for (size_t y = 0; y < yLenDS; ++y) {
+            uint8_t  status = prevLevel[offset_part_map_ds + y];
+            if (status > 0 && status <= min_type) {
+                currLevel[offset_part_map + 2 * y] = seed_us;                    // 2 * y
+                currLevel[offset_part_map + min(2 * y + 1, yLen - 1)] = seed_us; // 2 * y + 1
+            }
+        }
+    }
+}
+
+void runFirstStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, cudaStream_t aStream) {
+    dim3 threadsPerBlock(32, 1, 1);
+
+    for (int level = gi.l_min + 1; level < gi.l_max; ++level) {
+        dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x,
+                        1,
+                        (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z);
+        auto *p_mapPrev = p_map[level - 1];
+        auto *p_mapCurr = p_map[level];
+        firstStep<<<numBlocks, threadsPerBlock, 0, aStream>>>(p_mapPrev, p_mapCurr, level, min_type, giga.getGenInfoCuda());
+    }
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("----------------------------------Error: %s\n", cudaGetErrorString(err));
+        throw std::runtime_error("runFirstStep failed");
+    }
+}
+
+
+// *********************************************************************************************************************
+//                       SECOND STEP
+// *********************************************************************************************************************
+
+
+__global__ void secondStep(const uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end) {
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
+    const uint64_t xLen = gic.x_num[level];
+    const uint64_t yLen = gic.y_num[level];
+    const uint64_t zLen = gic.z_num[level];
+
+    const uint64_t level_start = level_xz[level];
+
+    if (x < xLen && z < zLen) {
+        const size_t offset_pc_data = z * xLen + x;
+        const size_t offset_part_map = yLen * offset_pc_data;
+
+        uint64_t counter = 0;
+
+        for (size_t y = 0; y < yLen; ++y) {
+            uint8_t  status = currLevel[offset_part_map + y];
+            if (status > min_type && status <= UPSAMPLING_SEED_TYPE) {
+                counter++;
+            }
+        }
+
+        xz_end[level_start + offset_pc_data] = counter;
+    }
+}
+
+void runSecondStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, cudaStream_t aStream) {
+    dim3 threadsPerBlock(32, 1, 1);
+
+    for (int level = gi.l_min; level < gi.l_max - 1; ++level) {
+        dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x,
+                        1,
+                        (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z);
+        auto *p_mapCurr = p_map[level];
+        secondStep<<<numBlocks, threadsPerBlock, 0, aStream>>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end);
+    }
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("----------------------------------Error: %s\n", cudaGetErrorString(err));
+        throw std::runtime_error("runSecondStep failed");
+    }
+}
+
+
+// *********************************************************************************************************************
+//                       SECOND STEP LAST LEVEL
+//
+//    l_max - 1 is special as it also has the l_max information that then needs to be upsampled.
+// *********************************************************************************************************************
+
+
+__global__ void secondStepLastLevel(const uint8_t *currLevel, int level_minus_1, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end) {
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
+    const uint64_t xLen = gic.x_num[level_minus_1];
+    const uint64_t yLen = gic.y_num[level_minus_1];
+    const uint64_t zLen = gic.z_num[level_minus_1];
+
+    const uint64_t  xLen_m = gic.x_num[level_minus_1 + 1]; // level max
+    const uint64_t  yLen_m = gic.y_num[level_minus_1 + 1]; // level max
+    const uint64_t  zLen_m = gic.z_num[level_minus_1 + 1]; // level max
+
+    const uint64_t level_start = level_xz[level_minus_1];
+    const uint64_t level_start_m = level_xz[level_minus_1 + 1]; // level max
+
+
+    if (x < xLen && z < zLen) {
+        const size_t offset_pc_data = z * xLen + x;
+        const size_t offset_part_map = yLen * offset_pc_data;
+
+        uint64_t counter = 0;
+        uint64_t counter_l = 0;
+
+        for (size_t y = 0; y < yLen; ++y) {
+            uint8_t  status = currLevel[offset_part_map + y];
+            if (status > min_type && status <= UPSAMPLING_SEED_TYPE) {
+                counter++;
+            }
+            else if (status > 0 && status <= min_type) {
+                counter_l++;
+
+                if ((2 * y) < (yLen_m - 1)) {
+                    counter_l++;
+                }
+            }
+        }
+
+        xz_end[level_start + offset_pc_data] = counter;
+
+        // In original CPU code value of counter_l is remembered in temporary buffer and later
+        // write down to xz_end vector. Here is the solution without need of temp. buffer.
+        for (size_t dz = 0; dz <= 1; dz++) {
+            for (size_t dx = 0; dx <= 1; dx++) {
+                size_t uz = 2 * z + dz; // upsampled z
+                size_t ux = 2 * x + dx; // upsampled x
+                if (uz < zLen_m && ux < xLen_m) {
+                    const size_t offset_pc_data_m = uz * xLen_m + ux;
+                    xz_end[level_start_m + offset_pc_data_m] = counter_l;
+                }
+            }
+        }
+
+    }
+}
+
+__global__ void secondStepCountParticles(GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint64_t counter_total) {
+    // std::partial_sum on one CUDA core naive implementation
+    size_t sum = xz_end[0];
+    for (size_t i = 1; i < counter_total; i++) {
+        sum += xz_end[i];
+        xz_end[i] = sum;
+    }
+
+    *gic.total_number_particles = xz_end[counter_total -1];
+}
+
+void runSecondStepLastLevel(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint64_t counter_total, cudaStream_t aStream) {
+    dim3 threadsPerBlock(32, 1, 1);
+    dim3 numBlocks( (gi.x_num[gi.l_max - 1] + threadsPerBlock.x - 1)/threadsPerBlock.x,
+                    1,
+                    (gi.z_num[gi.l_max - 1] + threadsPerBlock.z - 1)/threadsPerBlock.z);
+
+    int level = gi.l_max - 1;
+    auto *p_mapCurr = p_map[level];
+    secondStepLastLevel<<<numBlocks, threadsPerBlock, 0, aStream>>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end);
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("----------------------------------Error: %s\n", cudaGetErrorString(err));
+        throw std::runtime_error("runSecondStepLastLevel #1 failed");
+    }
+
+    secondStepCountParticles<<<1, 1, 0, aStream>>>(giga.getGenInfoCuda(), level_xz, xz_end, counter_total);
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("----------------------------------Error: %s\n", cudaGetErrorString(err));
+        throw std::runtime_error("runSecondStepLastLevel #2 failed");
+    }
+}
+
+
+// *********************************************************************************************************************
+//                       THIRD STEP - Get Y values
+// *********************************************************************************************************************
+
+
+__global__ void getYvalues(const uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec) {
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
+    const uint64_t xLen = gic.x_num[level];
+    const uint64_t yLen = gic.y_num[level];
+    const uint64_t zLen = gic.z_num[level];
+
+    const uint64_t level_start = level_xz[level];
+
+    if (x < xLen && z < zLen) {
+        const size_t offset_pc_data = z * xLen + x;
+        const size_t offset_part_map = yLen * offset_pc_data;
+
+        uint64_t counter = 0;
+
+        uint64_t offset_y  = xz_end[level_start + offset_pc_data - 1];
+
+        for (size_t y = 0; y < yLen; ++y) {
+            uint8_t  status = currLevel[offset_part_map + y];
+            if (status > min_type && status <= UPSAMPLING_SEED_TYPE) {
+                y_vec[counter + offset_y] = y;
+                counter++;
+            }
+        }
+    }
+}
+
+void runGetYvalues(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, cudaStream_t aStream) {
+    dim3 threadsPerBlock(32, 1, 1);
+
+    for (int level = gi.l_min; level < gi.l_max - 1; ++level) {
+        dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x,
+                        1,
+                        (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z);
+        auto *p_mapCurr = p_map[level];
+        getYvalues<<<numBlocks, threadsPerBlock, 0, aStream>>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec);
+    }
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("----------------------------------Error: %s\n", cudaGetErrorString(err));
+        throw std::runtime_error("runGetYvalues failed");
+    }
+}
+
+
+// *********************************************************************************************************************
+//                       4th STEP LAST LEVEL
+//
+//    l_max - 1 is special as it also has the l_max information that then needs to be upsampled.
+// *********************************************************************************************************************
+
+
+__global__ void fourthStep(const uint8_t *currLevel, int level_minus_1, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec) {
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
+    const uint64_t xLen = gic.x_num[level_minus_1];
+    const uint64_t yLen = gic.y_num[level_minus_1];
+    const uint64_t zLen = gic.z_num[level_minus_1];
+
+    const uint64_t  xLen_m = gic.x_num[level_minus_1 + 1]; // level max
+    const uint64_t  yLen_m = gic.y_num[level_minus_1 + 1]; // level max
+
+    const uint64_t level_start_minus_1 = level_xz[level_minus_1];
+    const uint64_t level_start_m = level_xz[level_minus_1 + 1]; // level max
+
+
+    if (x < xLen && z < zLen) {
+        const size_t offset_pc_data = z * xLen + x;
+
+        const size_t offset_pc_data_m = (z*2) * xLen_m + x * 2;
+        const size_t offset_part_map = yLen * offset_pc_data; // current level
+
+        uint64_t counter = 0;
+        uint64_t counter_l = 0;
+
+        uint64_t offset_y = xz_end[level_start_minus_1 + offset_pc_data - 1];
+        uint64_t offset_y_m = xz_end[level_start_m + offset_pc_data_m -1];
+
+        for (size_t y = 0; y < yLen; ++y) {
+            uint8_t  status = currLevel[offset_part_map + y];
+            if (status > min_type && status <= UPSAMPLING_SEED_TYPE) {
+                y_vec[counter + offset_y] = y;
+                counter++;
+            }
+            else if (status > 0 && status <= min_type) {
+                y_vec[counter_l + offset_y_m] = 2*y;
+                counter_l++;
+
+                if ((2 * y) < (yLen_m - 1)) {
+                    y_vec[counter_l + offset_y_m] = 2*y + 1;
+                    counter_l++;
+                }
+            }
+        }
+    }
+}
+
+__global__ void fourthStepLastLevel(GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec) {
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
+
+    int maxLevel = gic.level_max();
+    const uint64_t  xLen_m = gic.x_num[maxLevel]; // level max
+    const uint64_t  zLen_m = gic.z_num[maxLevel]; // level max
+
+    const uint64_t level_start_m = level_xz[maxLevel];
+
+
+    if (x < xLen_m && z < zLen_m) {
+
+        // first check if it's not already there
+        if ( ((z % 2) != 0) || ((x % 2) != 0) ) {
+            const size_t offset_pc_data_m = z * xLen_m + x;
+            const size_t offset_pc_data_m_f = (z/2) * 2 * xLen_m + (x/2) * 2;
+
+            uint64_t offset_y_b_f = xz_end[level_start_m + offset_pc_data_m_f - 1];
+            uint64_t offset_y_e_f = xz_end[level_start_m + offset_pc_data_m_f];
+            uint64_t offset_y_b   = xz_end[level_start_m + offset_pc_data_m - 1];
+
+            for (uint64_t idx = offset_y_b_f; idx < offset_y_e_f; ++idx) {
+                y_vec[offset_y_b++] = y_vec[idx];
+            }
+        }
+
+    }
+}
+
+void runFourthStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, uint64_t counter_total, cudaStream_t aStream) {
+    dim3 threadsPerBlock(32, 1, 1);
+    dim3 numBlocks( (gi.x_num[gi.l_max] + threadsPerBlock.x - 1)/threadsPerBlock.x,
+                    1,
+                    (gi.z_num[gi.l_max] + threadsPerBlock.z - 1)/threadsPerBlock.z);
+
+    int level = gi.l_max - 1;
+    auto *p_mapCurr = p_map[level];
+    fourthStep<<<numBlocks, threadsPerBlock, 0, aStream>>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec);
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("----------------------------------Error: %s\n", cudaGetErrorString(err));
+        throw std::runtime_error("runFourthStep #1 failed");
+    }
+
+    fourthStepLastLevel<<<numBlocks, threadsPerBlock, 0, aStream>>>(giga.getGenInfoCuda(), level_xz, xz_end, y_vec);
+
+    cudaError_t err2 = cudaGetLastError();
+    if (err2 != cudaSuccess) {
+        printf("----------------------------------Error: %s\n", cudaGetErrorString(err));
+        throw std::runtime_error("runFourthStep #2 failed");
+    }
+}
+
+
+// *********************************************************************************************************************
+//   MAIN FUNC TO CALL - implements logic of  LinearAccess::initialize_linear_structure CPU func.
+// *********************************************************************************************************************
+
+
+/*
+ * This function does everything:
+ * - creates CPU structures
+ * - copies everything to GPU
+ * - run computation of all linear-structures
+ * - copy it back to CPU
+ * - returns all the structure
+ *
+ *  In current shape it is a good function for testing implementation rather than using it in production code.
+ *  Production code should use parts of it and work on pre-allocated memory - probably in GpuProcessingTask.
+ */
+LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector<PixelData<uint8_t>> &pct) {
+
+    cudaStream_t aStream = nullptr;
+
+    // Copy input to CUDA mem and prepare CUDA representation of particle cell tree which will be filled after computing
+    // all steps
+    ParticleCellTreeCuda p_map (gi, aStream);
+    p_map.uploadPCT2GPU(pct);
+
+    uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2;
+
+    VectorData<uint16_t> y_vec(true);
+    VectorData<uint64_t> xz_end_vec(true);
+    VectorData<uint64_t> level_xz_vec(true);
+
+    // initialize_xz_linear() - CPU impl.
+    uint64_t counter_total = 1; //the buffer val to allow -1 calls without checking.
+    level_xz_vec.resize(gi.l_max + 2, 0); //includes a buffer for -1 calls, and therefore needs to be called with level + 1;
+    level_xz_vec[0] = 1; //allowing for the offset.
+    for (int i = 0; i <= gi.l_max; ++i) {
+        counter_total += gi.x_num[i] * gi.z_num[i];
+        level_xz_vec[i + 1] = counter_total;
+    }
+    xz_end_vec.resize(counter_total, 0);
+
+//    auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; };
+//    prt(y_vec);
+//    prt(xz_end_vec);
+//    prt(level_xz_vec);
+
+    // TODO: This is temporary solution.
+    //       Since in CPU code size of y_vec is calculated 'on the fly' and in CUDA code it would be much better
+    //       to have pre-allocated memory for that - currently y_vec is pre-allocated to have maximum size. This is not
+    //       optimal but always working solution. If any better idea pop up - it will be changed.
+    size_t maxYvecSize = gi.x_num[gi.l_max] * gi.y_num[gi.l_max] * gi.z_num[gi.l_max];
+    y_vec.resize(maxYvecSize);
+
+
+    {
+        ScopedCudaMemHandler<uint16_t *, D2H> y_vec_cuda(y_vec.data(), y_vec.size());
+        ScopedCudaMemHandler<uint64_t *, D2H> xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size());
+        ScopedCudaMemHandler<uint64_t *, H2D | D2H> level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size());
+        GenInfoGpuAccess giga(gi, aStream);
+        if (gi.l_max <= 2) {
+            runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), gi, giga, aStream);
+        }
+        else {
+            runFirstStep(gi, giga, p_map, min_type, aStream);
+            runSecondStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), aStream);
+            runSecondStepLastLevel(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), counter_total, aStream);
+            runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), aStream);
+            runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), counter_total, aStream);
+        }
+    }
+
+    // TODO: Resized back to correct size, should it be initialized to this size in the first place or pre-allocation for
+    //       full size is more than enough? (for example in case of computing particles for multiple frames with same resolution
+    //       we can get different size of particles for each frame - with preallocated buffer we can do all of them on it).
+    y_vec.resize(gi.total_number_particles);
+
+    // Transfer changes to PCT from GPU to CPU (this is needed only for tests)
+    p_map.downloadPCTfromGPU(pct);
+
+
+    LinearAccessCudaStructs lac;
+    lac.y_vec.swap(y_vec);
+    lac.xz_end_vec.swap(xz_end_vec);
+    lac.level_xz_vec.swap(level_xz_vec);
+
+    return lac;
+}
+
+void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, const APRParameters &apr_parameters, LinearAccessCudaStructs &lacs, cudaStream_t aStream) {
+
+    uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2;
+
+    VectorData<uint64_t> xz_end_vec(true);
+    VectorData<uint64_t> level_xz_vec(true);
+
+    // initialize_xz_linear() - CPU impl.
+    uint64_t counter_total = 1; //the buffer val to allow -1 calls without checking.
+    level_xz_vec.resize(gi.l_max + 2, 0); //includes a buffer for -1 calls, and therefore needs to be called with level + 1;
+    level_xz_vec[0] = 1; //allowing for the offset.
+    for (int i = 0; i <= gi.l_max; ++i) {
+        counter_total += gi.x_num[i] * gi.z_num[i];
+        level_xz_vec[i + 1] = counter_total;
+    }
+    xz_end_vec.resize(counter_total, 0);
+
+
+    {
+        ScopedCudaMemHandler<uint64_t *, D2H> xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size());
+        ScopedCudaMemHandler<uint64_t *, H2D | D2H> level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size());
+        GenInfoGpuAccess giga(gi, aStream);
+        if (gi.l_max <= 2) {
+            runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, gi, giga, aStream);
+        }
+        else {
+            runFirstStep(gi, giga, p_map, min_type, aStream);
+            runSecondStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), aStream);
+            runSecondStepLastLevel(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), counter_total, aStream);
+            runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, aStream);
+            runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, counter_total, aStream);
+        }
+    }
+
+    VectorData<uint16_t> y_vec(true);
+    y_vec.resize(gi.total_number_particles);
+    checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda, gi.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, aStream));
+    checkCuda(cudaStreamSynchronize(aStream));
+
+    lacs.y_vec.swap(y_vec);
+    lacs.xz_end_vec.swap(xz_end_vec);
+    lacs.level_xz_vec.swap(level_xz_vec);
+}
diff --git a/src/data_structures/APR/access/LinearAccessCuda.hpp b/src/data_structures/APR/access/LinearAccessCuda.hpp
new file mode 100644
index 00000000..27d56ab6
--- /dev/null
+++ b/src/data_structures/APR/access/LinearAccessCuda.hpp
@@ -0,0 +1,20 @@
+#ifndef APR_LINEARACCESSCUDA_HPP
+#define APR_LINEARACCESSCUDA_HPP
+
+#include "algorithm/APRParameters.hpp"
+#include "data_structures/Mesh/PixelData.hpp"
+#include "data_structures/APR/GenInfo.hpp"
+#include "algorithm/ParticleCellTreeCuda.cuh"
+
+typedef struct {
+    VectorData<uint16_t> y_vec;
+    VectorData<uint64_t> xz_end_vec;
+    VectorData<uint64_t> level_xz_vec;
+} LinearAccessCudaStructs;
+
+LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector<PixelData<uint8_t>> &pct);
+
+void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, const APRParameters &apr_parameters, LinearAccessCudaStructs &lacs, cudaStream_t aStream);
+
+
+#endif //APR_LINEARACCESSCUDA_HPP
diff --git a/src/data_structures/APR/access/RandomAccess.hpp b/src/data_structures/APR/access/RandomAccess.hpp
index 0daf7a54..18366d99 100644
--- a/src/data_structures/APR/access/RandomAccess.hpp
+++ b/src/data_structures/APR/access/RandomAccess.hpp
@@ -1210,7 +1210,7 @@ inline void RandomAccess::initialize_tree_access(RandomAccess& APROwn_access, st
 }
 
 
-void RandomAccess::init_data_structure_tree(RandomAccess& APROwn_access, SparseGaps<std::pair<uint16_t,YGap_map>>& y_begin){
+inline void RandomAccess::init_data_structure_tree(RandomAccess& APROwn_access, SparseGaps<std::pair<uint16_t,YGap_map>>& y_begin){
     uint64_t cumsum = 0;
 
     APRTimer apr_timer(false);
@@ -1423,7 +1423,7 @@ inline void RandomAccess::initialize_tree_access_sparse(RandomAccess& APROwn_acc
 }
 
 
-void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(APRParameters& apr_parameters, SparseGaps<SparseParticleCellMap> &p_map) {
+inline void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(APRParameters& apr_parameters, SparseGaps<SparseParticleCellMap> &p_map) {
     //
     //  Initialize the new structure;
     //
@@ -1513,7 +1513,7 @@ void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(APRParame
                 gap.global_index_begin_offset = 0;
                 uint64_t counter = 0;
 
-                uint16_t prev_y = -2; //init
+                uint16_t prev_y = 65534; // Originally = -2 which is 65534 when assigned to uint16 - removing compiler error //init
 
                 auto& mesh = p_map.data[i][offset_pc_data][0].mesh;
 
@@ -1577,7 +1577,7 @@ void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(APRParame
 
             auto& mesh = p_map.data[i][offset_pc_data1][0].mesh;
 
-            uint16_t prev_y = -2; //init
+            uint16_t prev_y = 65534; // Originally = -2 which is 65534 when assigned to uint16 - removing compiler error //init
 
             //SPARSE iteration
             for (auto it=mesh.begin(); it!=mesh.end(); ++it) {
diff --git a/src/data_structures/Mesh/ImagePatch.hpp b/src/data_structures/Mesh/ImagePatch.hpp
index a249efdd..01d27fd3 100644
--- a/src/data_structures/Mesh/ImagePatch.hpp
+++ b/src/data_structures/Mesh/ImagePatch.hpp
@@ -38,7 +38,7 @@ struct ImagePatch {
 };
 
 
-void initPatchGlobal(ImagePatch& patch, int z_begin_global, int z_end_global, int x_begin_global, int x_end_global, int y_begin_global, int y_end_global) {
+inline void initPatchGlobal(ImagePatch& patch, int z_begin_global, int z_end_global, int x_begin_global, int x_end_global, int y_begin_global, int y_end_global) {
     patch.z_begin_global = z_begin_global;
     patch.x_begin_global = x_begin_global;
     patch.y_begin_global = y_begin_global;
diff --git a/src/data_structures/Mesh/PixelData.cu b/src/data_structures/Mesh/PixelData.cu
index fd27f4d5..35924482 100644
--- a/src/data_structures/Mesh/PixelData.cu
+++ b/src/data_structures/Mesh/PixelData.cu
@@ -10,11 +10,14 @@
 #include "misc/CudaTools.cuh"
 
 #include "downsample.cuh"
-#include <vector>
+#include "paddPixelData.cuh"
+
 
 // explicit instantiation of handled types
 template void downsampleMeanCuda(const PixelData<float>&, PixelData<float>&);
 template void downsampleMaxCuda(const  PixelData<float>&, PixelData<float>&);
+template void paddPixelsCuda(const PixelData<float> &input, PixelData<float> &output, const PixelDataDim &padSize);
+template void unpaddPixelsCuda(const PixelData<float> &input, PixelData<float> &output, const PixelDataDim &padSize);
 
 template <typename T, typename S>
 void downsampleMeanCuda(const PixelData<T> &input, PixelData<S> &output) {
@@ -31,3 +34,19 @@ void downsampleMaxCuda(const PixelData<T> &input, PixelData<S> &output) {
 
     runDownsampleMax(in.get(), out.get(), input.x_num, input.y_num, input.z_num, 0);
 };
+
+template <typename T>
+void paddPixelsCuda(const PixelData<T> &input, PixelData<T> &output, const PixelDataDim &padSize) {
+    ScopedCudaMemHandler<const PixelData<T>, H2D> inputData(input);
+    ScopedCudaMemHandler<PixelData<T>, D2H> outputData(output);
+
+    runPaddPixels(inputData.get(), outputData.get(), input.getDimension(), output.getDimension(), padSize, 0);
+};
+
+template <typename T>
+void unpaddPixelsCuda(const PixelData<T> &input, PixelData<T> &output, const PixelDataDim &padSize) {
+    ScopedCudaMemHandler<const PixelData<T>, H2D> inputData(input);
+    ScopedCudaMemHandler<PixelData<T>, D2H> outputData(output);
+
+    runUnpaddPixels(inputData.get(), outputData.get(), input.getDimension(), output.getDimension(), padSize, 0);
+};
diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp
index 931b95a3..f0127920 100644
--- a/src/data_structures/Mesh/PixelData.hpp
+++ b/src/data_structures/Mesh/PixelData.hpp
@@ -34,9 +34,11 @@ struct PixelDataDim {
     size_t x;
     size_t z;
 
-    PixelDataDim(size_t y, size_t x, size_t z) : y(y), x(x), z(z) {}
+    constexpr PixelDataDim(size_t y, size_t x, size_t z) : y(y), x(x), z(z) {}
 
     size_t size() const { return y * x * z; }
+    size_t maxDimSize() const { return std::max(x, std::max(y, z)); }
+    int numOfDimensions() const { return (int)(x > 1) + (int)(y > 1) + (int)(z > 1); }
 
     PixelDataDim operator+(const PixelDataDim &rhs) const { return {y + rhs.y, x + rhs.x, z + rhs.z}; }
     PixelDataDim operator-(const PixelDataDim &rhs) const { return {y - rhs.y, x - rhs.x, z - rhs.z}; }
@@ -147,10 +149,6 @@ public :
         usePinnedMemory = usePinned;
     }
 
-    void setUsePinnedMemory(bool usePinned){
-        usePinnedMemory = usePinned;
-    }
-
     inline uint64_t size() const{
         return vec.size();
     }
@@ -281,8 +279,19 @@ public :
         std::swap(usePinnedMemory, aObj.usePinnedMemory);
         std::swap(vecMemory, aObj.vecMemory);
         vec.swap(aObj.vec);
+#ifdef APR_USE_CUDA
+        std::swap(vecMemoryPinned, aObj.vecMemoryPinned);
+#endif
     }
 
+    VectorData(VectorData &&aObj) {
+        usePinnedMemory = aObj.usePinnedMemory;
+        vecMemory.swap(aObj.vecMemory);
+        vec = std::move(aObj.vec);
+#ifdef APR_USE_CUDA
+        vecMemoryPinned =std::move(aObj.vecMemoryPinned);
+#endif
+    }
 
     /**
      * Apply unary operator to each element in parallel, writing the result to VectorData 'output'.
@@ -436,6 +445,19 @@ public :
      */
     PixelData(int aSizeOfY, int aSizeOfX, int aSizeOfZ, T aInitVal) { initWithValue(aSizeOfY, aSizeOfX, aSizeOfZ, aInitVal); }
 
+    /**
+     * Constructor - initialize initial size of mesh to provided values
+     * @param aDims - PixelDataDim with length of each dimension
+     */
+    PixelData(PixelDataDim aDims) { init(aDims.y, aDims.x, aDims.z); }
+
+    /**
+     * Constructor - creates mesh with provided dimentions initialized to aInitVal
+     * @param aDims - PixelDataDim with length of each dimension
+     * @param aInitVal - initial value of all elements
+     */
+    PixelData(PixelDataDim aDims, T aInitVal) { initWithValue(aDims.y, aDims.x, aDims.z, aInitVal); }
+
     /**
      * Move constructor
      * @param aObj mesh to be moved
@@ -498,6 +520,16 @@ public :
      * @return element @(y, x, z)
      */
     T& operator()(int y, int x, int z) {
+        // TODO: In number of places during running tests below check shows problems.
+        //       Investigate and try to fix. Such check in future probably should be permanent
+        //       to discover all problems rather than hiding them.
+#ifndef NDEBUG  // with Cmake we need to use double neg. condition since there is not ifdef DEBUG defined :(
+        if ((y < 0 || y >= y_num) || (x < 0 || x >= x_num) || (z < 0 || z >= z_num)) {
+//            std::cerr << "Provided coordinates=(" << y << ", " << x << ", " << z;
+//            std::cerr << ") while PixelData size=(" << y_num << ", " << x_num << ", " << z_num << ")" << std::endl;
+//            throw std::runtime_error("Provided (y,x,z) coordinates are out of range!");
+        }
+#endif
         y = std::min(y, y_num-1);
         x = std::min(x, x_num-1);
         z = std::min(z, z_num-1);
@@ -710,6 +742,10 @@ public :
         init(y_num_ds, x_num_ds, z_num_ds, aUsePinnedMemory);
     }
 
+    void initDownsampled(const PixelDataDim &dim, bool aUsePinnedMemory) {
+        initDownsampled(dim.y, dim.x, dim.z, aUsePinnedMemory);
+    }
+
     /**
      * Initializes mesh with size of half of provided dimensions (rounding up if not divisible by 2) and initialize values
      * @param aSizeOfY
@@ -725,6 +761,10 @@ public :
         initWithValue(y_num_ds, x_num_ds, z_num_ds, aInitVal, aUsePinnedMemory);
     }
 
+    void initDownsampled(const PixelDataDim &dim, T aInitVal, bool aUsePinnedMemory) {
+        initDownsampled(dim.y, dim.x, dim.z, aInitVal, aUsePinnedMemory);
+    }
+
     /**
      * Initializes mesh with size of half of provided mesh dimensions (rounding up if not divisible by 2)
      * @param aMesh - mesh used to get dimensions
@@ -950,15 +990,16 @@ void downsample(const PixelData<T> &aInput, PixelData<S> &aOutput, R reduce, C c
                 const size_t shy = std::min(2*y + 1, y_num - 1);
                 const size_t idx = z * x_num_ds * y_num_ds + x * y_num_ds + y;
                 outMesh[idx] =  constant_operator(
-                        reduce(reduce(reduce(reduce(reduce(reduce(reduce(        // inMesh coordinates
+                        reduce(reduce(reduce(reduce(                             // inMesh coordinates
                                inMesh[2*z * x_num * y_num + 2*x * y_num + 2*y],  // z,   x,   y
-                               inMesh[2*z * x_num * y_num + 2*x * y_num + shy]), // z,   x,   y+1
                                inMesh[2*z * x_num * y_num + shx * y_num + 2*y]), // z,   x+1, y
-                               inMesh[2*z * x_num * y_num + shx * y_num + shy]), // z,   x+1, y+1
                                inMesh[shz * x_num * y_num + 2*x * y_num + 2*y]), // z+1, x,   y
-                               inMesh[shz * x_num * y_num + 2*x * y_num + shy]), // z+1, x,   y+1
                                inMesh[shz * x_num * y_num + shx * y_num + 2*y]), // z+1, x+1, y
-                               inMesh[shz * x_num * y_num + shx * y_num + shy])  // z+1, x+1, y+1
+                               reduce(reduce(reduce(
+                               inMesh[2*z * x_num * y_num + 2*x * y_num + shy],  // z,   x,   y+1
+                               inMesh[2*z * x_num * y_num + shx * y_num + shy]), // z,   x+1, y+1
+                               inMesh[shz * x_num * y_num + 2*x * y_num + shy]), // z+1, x,   y+1
+                               inMesh[shz * x_num * y_num + shx * y_num + shy])) // z+1, x+1, y+1
                 );
             }
         }
diff --git a/src/data_structures/Mesh/PixelDataCuda.h b/src/data_structures/Mesh/PixelDataCuda.h
index 34f7a56c..97f2144e 100644
--- a/src/data_structures/Mesh/PixelDataCuda.h
+++ b/src/data_structures/Mesh/PixelDataCuda.h
@@ -1,17 +1,35 @@
-//
-// Created by Krzysztof Gonciarz on 4/9/18.
-//
-
 #ifndef LIBAPR_PIXELDATACUDA_H
 #define LIBAPR_PIXELDATACUDA_H
 
 
 #include "PixelData.hpp"
 
+
 template<typename T, typename S>
 void downsampleMeanCuda(const PixelData<T> &aInput, PixelData<S> &aOutput);
 
 template <typename T, typename S>
 void downsampleMaxCuda(const PixelData<T> &input, PixelData<S> &output);
 
-#endif //LIBAPR_PIXELDATACUDA_H
+/**
+ * Copies data from input to output (which is bigger by pad size) reflecting around the edge pixels.
+ * @tparam T
+ * @param input
+ * @param output
+ * @param padSize
+ */
+template <typename T>
+void paddPixelsCuda(const PixelData<T> &input, PixelData<T> &output, const PixelDataDim &padSize);
+
+/**
+ * Copies data from input to output (which is smaller by pad size).
+ * @tparam T
+ * @param input
+ * @param output
+ * @param padSize
+ */
+template <typename T>
+void unpaddPixelsCuda(const PixelData<T> &input, PixelData<T> &output, const PixelDataDim &padSize);
+
+#endif
+
diff --git a/src/data_structures/Mesh/downsample.cuh b/src/data_structures/Mesh/downsample.cuh
index 947db945..a6548a52 100644
--- a/src/data_structures/Mesh/downsample.cuh
+++ b/src/data_structures/Mesh/downsample.cuh
@@ -24,14 +24,14 @@ __global__ void downsampleMean(const T *input, S *output, size_t x_num, size_t y
     size_t idx = (zi * x_num + xi) * y_num + yi;
 
     // Go through all elements in 2x2
-    T v = input[idx];
+    S v = input[idx];
     v +=  input[idx + xs * y_num];
     v +=  input[idx +              zs * x_num * y_num];
     v +=  input[idx + xs * y_num + zs * x_num * y_num];
 
     // Get data from odd thread to even one
     const int workerIdx = threadIdx.y;
-    T a = __shfl_sync(__activemask(), v, workerIdx + 1);
+    S a = __shfl_sync(__activemask(), v, workerIdx + 1);
 
     // downsampled dimensions twice smaller (rounded up)
 
diff --git a/src/data_structures/Mesh/paddPixelData.cuh b/src/data_structures/Mesh/paddPixelData.cuh
new file mode 100644
index 00000000..dae96d79
--- /dev/null
+++ b/src/data_structures/Mesh/paddPixelData.cuh
@@ -0,0 +1,81 @@
+#ifndef LIBAPR_PADDPIXELDATA_CUH
+#define LIBAPR_PADDPIXELDATA_CUH
+
+
+#include "data_structures/Mesh/PixelData.hpp"
+
+
+template <typename T>
+__global__ void paddPixels(const T* input, T *output, const PixelDataDim inputSize, const PixelDataDim outputSize, const PixelDataDim padSize) {
+    size_t yIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    size_t xIdx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t zIdx = blockIdx.z * blockDim.z + threadIdx.z;
+
+    // copy data to output (padded) cube
+    if (yIdx < outputSize.y && xIdx < outputSize.x && zIdx < outputSize.z) {
+
+        // output cube index
+        size_t outputIdx = (zIdx * outputSize.x + xIdx) * outputSize.y + yIdx;
+
+        // input cube index
+        int yIn = yIdx - padSize.y;
+        if (yIn < 0) yIn = -yIn;                                      // reflected boundary on LHS
+        if (yIn >= inputSize.y) yIn -= 2 * (yIn - (inputSize.y - 1)); // reflected boundary on RHS
+
+        int xIn = xIdx - padSize.x;
+        if (xIn < 0) xIn = -xIn;                                      // reflected boundary on LHS
+        if (xIn >= inputSize.x) xIn -= 2 * (xIn - (inputSize.x - 1)); // reflected boundary on RHS
+
+        int zIn = zIdx - padSize.z;
+        if (zIn < 0) zIn = -zIn;                                      // reflected boundary on LHS
+        if (zIn >= inputSize.z) zIn -= 2 * (zIn - (inputSize.z - 1)); // reflected boundary on RHS
+
+        size_t inputIdx = (zIn * inputSize.x + xIn) * inputSize.y + yIn;
+
+        output[outputIdx] = input[inputIdx];
+    }
+}
+
+template <typename T>
+void runPaddPixels(const T* input, T *output, const PixelDataDim &inputSize, const PixelDataDim &outputSize, const PixelDataDim &padSize, cudaStream_t aStream) {
+    dim3 threadsPerBlock(1, 64, 1);
+    dim3 numBlocks((outputSize.x + threadsPerBlock.x - 1) / threadsPerBlock.x,
+                   (outputSize.y + threadsPerBlock.y - 1) / threadsPerBlock.y,
+                   (outputSize.z + threadsPerBlock.z - 1) / threadsPerBlock.z);
+
+    paddPixels<<<numBlocks, threadsPerBlock, 0, aStream>>>(input, output, inputSize, outputSize, padSize);
+}
+
+template <typename T>
+__global__ void unpaddPixels(const T* input, T *output, const PixelDataDim inputSize, const PixelDataDim outputSize, const PixelDataDim padSize) {
+    size_t yIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    size_t xIdx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t zIdx = blockIdx.z * blockDim.z + threadIdx.z;
+
+    // copy data to output (unpadded) cube
+    if (yIdx < outputSize.y && xIdx < outputSize.x && zIdx < outputSize.z) {
+
+        // output cube index
+        size_t outputIdx = (zIdx * outputSize.x + xIdx) * outputSize.y + yIdx;
+
+        // input cube index (map coordinates of output cube to internal cube of padded cube)
+        int yIn = yIdx + padSize.y;
+        int xIn = xIdx + padSize.x;
+        int zIn = zIdx + padSize.z;
+        size_t inputIdx = (zIn * inputSize.x + xIn) * inputSize.y + yIn;
+
+        output[outputIdx] = input[inputIdx];
+    }
+}
+
+template <typename T>
+void runUnpaddPixels(const T* input, T *output, const PixelDataDim &inputSize, const PixelDataDim &outputSize, const PixelDataDim &padSize, cudaStream_t aStream) {
+    dim3 threadsPerBlock(1, 64, 1);
+    dim3 numBlocks((outputSize.x + threadsPerBlock.x - 1) / threadsPerBlock.x,
+                   (outputSize.y + threadsPerBlock.y - 1) / threadsPerBlock.y,
+                   (outputSize.z + threadsPerBlock.z - 1) / threadsPerBlock.z);
+
+    unpaddPixels<<<numBlocks, threadsPerBlock, 0, aStream>>>(input, output, inputSize, outputSize, padSize);
+}
+
+#endif
diff --git a/src/misc/CudaMemory.cuh b/src/misc/CudaMemory.cuh
index e237779f..fbe125e9 100644
--- a/src/misc/CudaMemory.cuh
+++ b/src/misc/CudaMemory.cuh
@@ -11,14 +11,20 @@
 
 #include <cassert>
 
-inline cudaError_t checkCuda(cudaError_t result) {
-#if defined(DEBUG) || defined(_DEBUG)
-    if (result != cudaSuccess) {
-        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
-        assert(result == cudaSuccess);
+
+// TODO: this method is duplicated in CudaTools.cuh
+//       Somehow including it here break compilation - fix it please.
+#define checkCuda(ans) { cudaAssert2((ans), __FILE__, __LINE__); }
+inline void cudaAssert2(cudaError_t code, const char *file, int line, bool abort=true)
+{
+#if defined(DEBUG) || defined(_DEBUG) || !defined(NDEBUG)
+    if (code != cudaSuccess)
+    {
+        fprintf(stderr,"GPUassert: (%d) %s %s %d\n", code, cudaGetErrorString(code), file, line);
+        assert(code == cudaSuccess); // If debugging it helps to see call tree somehow
+        if (abort) exit(code);
     }
 #endif
-    return result;
 }
 
 inline void* getPinnedMemory(size_t aNumOfBytes) {
diff --git a/src/misc/CudaTools.cuh b/src/misc/CudaTools.cuh
index 3f9b5fca..10e4cb73 100644
--- a/src/misc/CudaTools.cuh
+++ b/src/misc/CudaTools.cuh
@@ -8,16 +8,26 @@
 
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
-//#include <math_functions.h>
 #include <cuda_runtime_api.h>
-//#include <cuda_runtime.h>
-
-
 #include <iostream>
 #include <chrono>
+
 #include "data_structures/Mesh/PixelData.hpp"
 
 
+#define checkCuda(ans) { cudaAssert((ans), __FILE__, __LINE__); }
+inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+#if defined(DEBUG) || defined(_DEBUG) || !defined(NDEBUG)
+    if (code != cudaSuccess)
+    {
+        fprintf(stderr,"GPUassert: (%d) %s %s %d\n", code, cudaGetErrorString(code), file, line);
+        assert(code == cudaSuccess); // If debugging it helps to see call tree somehow
+        if (abort) exit(code);
+    }
+#endif
+}
+
 inline void waitForCuda() {
     cudaDeviceSynchronize();
     cudaError_t err = cudaGetLastError();
@@ -29,12 +39,6 @@ inline void printCudaDims(const dim3 &threadsPerBlock, const dim3 &numBlocks) {
     std::cout << "Number of threads (x/y/z): " << threadsPerBlock.x << "/" << threadsPerBlock.y << "/" << threadsPerBlock.z << std::endl;
 }
 
-template<typename ImgType>
-inline void getDataFromKernel(PixelData<ImgType> &input, size_t inputSize, ImgType *cudaInput) {
-    cudaMemcpy(input.mesh.get(), cudaInput, inputSize, cudaMemcpyDeviceToHost);
-    cudaFree(cudaInput);
-}
-
 class CudaTimer {
     std::vector<std::chrono::system_clock::time_point> iStartTimes;
     std::vector<std::string> names;
@@ -85,12 +89,18 @@ public:
 
 
 // Useful type for keeping CUDA allocated memory (which is released with cudaFree)
-template <typename T, typename D=decltype(&cudaFree)>
+static cudaError_t CUDARTAPI deleter(void *devPtr) {
+    //std::cout << "cudaFree() called...\n";
+    return cudaFree(devPtr);
+}
+
+template <typename T, typename D=decltype(&deleter)>
 struct CudaMemoryUniquePtr : public std::unique_ptr<T[], D> {
     using std::unique_ptr<T[],D>::unique_ptr; // inheriting other constructors
-    explicit CudaMemoryUniquePtr(T *aMemory = nullptr) : std::unique_ptr<T[], D>(aMemory, &cudaFree) {}
+    explicit CudaMemoryUniquePtr(T *aMemory = nullptr) : std::unique_ptr<T[], D>(aMemory, &deleter) {}
 };
 
+
 /**
  * Directions for sending data between Host and Device
  */
@@ -211,6 +221,17 @@ public:
         initialize();
     }
 
+    ScopedCudaMemHandler (ScopedCudaMemHandler &&obj) {
+        iData = obj.iData;
+        obj.iData = nullptr;
+        iSize = obj.iSize;
+        obj.iSize = 0;
+        iBytes = obj.iBytes;
+        obj.iBytes = 0;
+        iStream = obj.iStream;
+        obj.iStream = nullptr;
+        iCudaMemory = std::move(obj.iCudaMemory);
+    }
 
     ~ScopedCudaMemHandler() {
         if (DIRECTION & D2H) {
@@ -223,15 +244,21 @@ public:
     size_t getNumOfBytes() const {return iBytes; }
 
     void copyH2D() {
-        cudaMemcpyAsync(iCudaMemory.get(), iData, iBytes, cudaMemcpyHostToDevice, iStream);
+        if (iData != nullptr) {
+            checkCuda(cudaMemcpyAsync(iCudaMemory.get(), iData, iBytes, cudaMemcpyHostToDevice, iStream));
+        }
     }
 
     void copyH2D(const size_t numElements) {
-        cudaMemcpyAsync(iCudaMemory.get(), iData, numElements*DataSize, cudaMemcpyHostToDevice, iStream);
+        if (iData != nullptr) {
+            checkCuda(cudaMemcpyAsync(iCudaMemory.get(), iData, numElements*DataSize, cudaMemcpyHostToDevice, iStream));
+        }
     }
 
     void copyD2H() {
-        cudaMemcpyAsync((void*)iData, iCudaMemory.get(), iBytes, cudaMemcpyDeviceToHost, iStream);
+        if (iData != nullptr) {
+            checkCuda(cudaMemcpyAsync((void *) iData, iCudaMemory.get(), iBytes, cudaMemcpyDeviceToHost, iStream));
+        }
     }
 
 private:
@@ -240,7 +267,7 @@ private:
 
     void initialize() {
         ElementType *mem = nullptr;
-        cudaMalloc(&mem, iBytes);
+        checkCuda(cudaMalloc(&mem, iBytes));
         iCudaMemory.reset(mem);
         if (DIRECTION & H2D) {
             copyH2D();
diff --git a/test/APRTest.cpp b/test/APRTest.cpp
index 33ea37d6..83071a7f 100644
--- a/test/APRTest.cpp
+++ b/test/APRTest.cpp
@@ -134,7 +134,7 @@ bool compare_two_iterators(Iterator1& it1, Iterator2& it2, int maxNumOfErrPrinte
     uint64_t counter_1 = 0;
     uint64_t counter_2 = 0;
 
-    uint64_t errors = 0;
+    int64_t errors = 0;
 
     for (int level = it1.level_min(); level <= it1.level_max(); ++level) {
         for (int z = 0; z < it1.z_num(level); z++) {
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index dc3e5a11..d3377fb0 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -11,13 +11,18 @@ buildTarget(testComputeGradient ComputeGradientTest.cpp)
 buildTarget(testLocalIntensityScale LocalIntensityScaleTest.cpp)
 buildTarget(testPullingScheme PullingSchemeTest.cpp)
 buildTarget(testAPRParameters APRParametersTest.cpp)
+buildTarget(testLinearAccess LinearAccessTest.cpp)
 
 #APR GPU Tests
 if(APR_USE_CUDA)
    buildTarget(testAPRCuda APRTestCuda.cpp)
+   buildTarget(testComputeGradientCuda ComputeGradientCudaTest.cpp)
+   buildTarget(testLocalIntensityScaleCuda LocalIntensityScaleCudaTest.cpp)
+   buildTarget(testFullPipelineCuda FullPipelineCudaTest.cpp)
+   buildTarget(testPullingSchemeCuda PullingSchemeCudaTest.cpp)
+   buildTarget(testLinearAccessCuda LinearAccessCudaTest.cpp)
 endif()
 
-
 if(APR_BUILD_EXAMPLES)
     buildTarget(testExamples ExamplesTest.cpp)
 endif()
diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp
new file mode 100644
index 00000000..588c5ea3
--- /dev/null
+++ b/test/ComputeGradientCudaTest.cpp
@@ -0,0 +1,369 @@
+
+#include <gtest/gtest.h>
+
+#include "data_structures/Mesh/PixelData.hpp"
+#include "algorithm/ComputeGradient.hpp"
+#include "algorithm/ComputeGradientCuda.hpp"
+#include "TestTools.hpp"
+
+namespace {
+
+#ifdef APR_USE_CUDA
+
+
+    // ========================================================================
+    // BSPLINE tests
+    // ========================================================================
+
+    template <typename T>
+    class BsplineTest : public testing::Test {};
+    TYPED_TEST_SUITE_P(BsplineTest);
+
+    TYPED_TEST_P(BsplineTest, testBsplineInXdirCUDA) {
+        APRTimer timer(false);
+
+        std::vector<std::pair<int, int>> yzSizes = {{1,   1},
+                                                    {32,  32},
+                                                    {33,  33},
+                                                    {44,  35},
+                                                    {35,  44},
+                                                    {255, 129}};
+
+        for (auto &p: yzSizes) {
+            int yLen = p.first;
+            int zLen = p.second;
+            // Run test with dimension in range much shorter than filter length to longer than filter length
+            // (for lambda=3 and tolerance=0.00001 expected filter length k0=18)
+            for (int xLen = 2; xLen < 22; ++xLen) {
+                // Generate random mesh
+                using ImgType = TypeParam;
+                PixelData<ImgType> m = getRandInitializedMesh<ImgType>(yLen, xLen, zLen, 30, 10);
+
+                // Filter parameters
+                const float lambda = 3;
+                const float tolerance = 0.0001;
+
+                // Calculate bspline on CPU
+                PixelData<ImgType> mCpu(m, true);
+                timer.start_timer("CPU bspline");
+                ComputeGradient().bspline_filt_rec_x(mCpu, lambda, tolerance);
+                timer.stop_timer();
+
+                // Calculate bspline on GPU
+                PixelData<ImgType> mGpu(m, true);
+                timer.start_timer("GPU bspline");
+                cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_X_DIR);
+                timer.stop_timer();
+
+                // Compare GPU vs CPU
+                EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+            }
+        }
+    }
+
+    TYPED_TEST_P(BsplineTest, testBsplineInZdirCUDA) {
+        APRTimer timer(false);
+
+        std::vector<std::pair<int, int>> xySizes = {{1,   1},
+                                                    {32,  32},
+                                                    {33,  33},
+                                                    {44,  35},
+                                                    {35,  44},
+                                                    {255, 129}};
+
+        for (auto &p : xySizes) {
+            int xLen = p.first;
+            int yLen = p.second;
+            // Run test with dimension in range much shorter than filter length to longer than filter length
+            // (for lambda=3 and tolerance=0.00001 expected filter length k0=18)
+            for (int zLen = 2; zLen < 22; ++zLen) {
+                // Generate random mesh
+                using ImgType = TypeParam;
+                PixelData<ImgType> m = getRandInitializedMesh<ImgType>(yLen, xLen, zLen, 30, 10);
+
+                // Filter parameters
+                const float lambda = 3;
+                const float tolerance = 0.0001;
+
+                // Calculate bspline on CPU
+                PixelData<ImgType> mCpu(m, true);
+                timer.start_timer("CPU bspline");
+                ComputeGradient().bspline_filt_rec_z(mCpu, lambda, tolerance);
+                timer.stop_timer();
+
+                // Calculate bspline on GPU
+                PixelData<ImgType> mGpu(m, true);
+                timer.start_timer("GPU bspline");
+                cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Z_DIR);
+                timer.stop_timer();
+
+                // Compare GPU vs CPU
+                EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+            }
+        }
+    }
+
+    TYPED_TEST_P(BsplineTest, testBsplineInYdirCUDA) {
+        APRTimer timer(false);
+
+        std::vector<std::pair<int, int>> xzSizes = {{1,   1},
+                                                    {32,  32},
+                                                    {33,  33},
+                                                    {44,  35},
+                                                    {35,  44},
+                                                    {255, 129}};
+
+        for (auto &p : xzSizes) {
+            int xLen = p.first;
+            int zLen = p.second;
+            // Run test with dimension in range much shorter than filter length to longer than filter length
+            // (for lambda=3 and tolerance=0.00001 expected filter length k0=18)
+            for (int yLen = 2; yLen < 22; ++yLen) {
+                // Generate random mesh
+                using ImgType = TypeParam;
+                PixelData<ImgType> m = getRandInitializedMesh<ImgType>(yLen, xLen, zLen, 30, 10);
+
+                // Filter parameters
+                const float lambda = 3;
+                const float tolerance = 0.0001;
+
+                // Calculate bspline on CPU
+                PixelData<ImgType> mCpu(m, true);
+                timer.start_timer("CPU bspline");
+                ComputeGradient().bspline_filt_rec_y(mCpu, lambda, tolerance);
+                timer.stop_timer();
+
+                // Calculate bspline on GPU
+                PixelData<ImgType> mGpu(m, true);
+                timer.start_timer("GPU bspline");
+                cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Y_DIR);
+                timer.stop_timer();
+
+                //Compare GPU vs CPU
+                EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+            }
+        }
+    }
+
+    REGISTER_TYPED_TEST_SUITE_P(BsplineTest, testBsplineInXdirCUDA, testBsplineInZdirCUDA, testBsplineInYdirCUDA);
+    using ImgTypes = ::testing::Types< float, uint16_t, int16_t, uint8_t>;
+    INSTANTIATE_TYPED_TEST_SUITE_P(Testing, BsplineTest, ImgTypes);
+
+    TEST(ComputeBspineTest, BSPLINE_FULL_XYZ_DIR_CUDA) {
+        APRTimer timer(false);
+
+        // Generate random mesh
+        using ImgType = float;
+        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(127, 128, 129, 100, 10);
+
+        // Filter parameters
+        const float lambda = 3;
+        const float tolerance = 0.0001; // as defined in get_smooth_bspline_3D
+
+        // Calculate bspline on CPU
+        PixelData<ImgType> mCpu(m, true);
+        timer.start_timer("CPU bspline");
+        ComputeGradient().get_smooth_bspline_3D(mCpu, lambda);
+        timer.stop_timer();
+
+        // Calculate bspline on GPU
+        PixelData<ImgType> mGpu(m, true);
+        timer.start_timer("GPU bspline");
+        cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_ALL_DIR);
+        timer.stop_timer();
+
+        // Compare GPU vs CPU
+        EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+    }
+
+
+    // ========================================================================
+    // INV. BSPLINE tests
+    // ========================================================================
+
+    TEST(ComputeInverseBspline, CALC_INV_BSPLINE_X_RND_CUDA) {
+        APRTimer timer(false);
+
+        // Generate random mesh
+        using ImgType = float;
+        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(127, 61, 66, 100, 10);
+
+        // Calculate bspline on CPU
+        PixelData<ImgType> mCpu(m, true);
+        timer.start_timer("CPU inv bspline");
+        ComputeGradient().calc_inv_bspline_x(mCpu);
+        timer.stop_timer();
+
+        // Calculate bspline on GPU
+        PixelData<ImgType> mGpu(m, true);
+        timer.start_timer("GPU inv bspline");
+        cudaInverseBspline(mGpu,  INV_BSPLINE_X_DIR);
+        timer.stop_timer();
+
+        // Compare GPU vs CPU
+        EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+    }
+
+    TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Z_RND_CUDA) {
+        APRTimer timer(false);
+
+        // Generate random mesh
+        using ImgType = float;
+        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(128, 61, 66, 100, 10);
+
+        // Calculate bspline on CPU
+        PixelData<ImgType> mCpu(m, true);
+        timer.start_timer("CPU inv bspline");
+        ComputeGradient().calc_inv_bspline_z(mCpu);
+        timer.stop_timer();
+
+        // Calculate bspline on GPU
+        PixelData<ImgType> mGpu(m, true);
+        timer.start_timer("GPU inv bspline");
+        cudaInverseBspline(mGpu,  INV_BSPLINE_Z_DIR);
+        timer.stop_timer();
+
+        // Compare GPU vs CPU
+        EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+    }
+
+    TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_RND_CUDA) {
+        APRTimer timer(false);
+
+        // Generate random mesh
+        using ImgType = float;
+        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(127, 61, 71, 100, 10);
+
+        // Calculate bspline on CPU
+        PixelData<ImgType> mCpu(m, true);
+        timer.start_timer("CPU inv bspline");
+        ComputeGradient().calc_inv_bspline_y(mCpu);
+        timer.stop_timer();
+
+        // Calculate bspline on GPU
+        PixelData<ImgType> mGpu(m, true);
+        timer.start_timer("GPU inv bspline");
+        cudaInverseBspline(mGpu,  INV_BSPLINE_Y_DIR);
+        timer.stop_timer();
+
+        // Compare GPU vs CPU
+        EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+    }
+
+    TEST(ComputeInverseBspline, CALC_INV_BSPLINE_FULL_XYZ_DIR_RND_CUDA) {
+        APRTimer timer(false);
+
+        // Generate random mesh
+        using ImgType = float;
+        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(32,32,32,100, 10);
+
+        // Calculate bspline on CPU
+        PixelData<ImgType> mCpu(m, true);
+        timer.start_timer("CPU inv bspline");
+        ComputeGradient().calc_inv_bspline_y(mCpu);
+        ComputeGradient().calc_inv_bspline_x(mCpu);
+        ComputeGradient().calc_inv_bspline_z(mCpu);
+        timer.stop_timer();
+
+        // Calculate bspline on GPU
+        PixelData<ImgType> mGpu(m, true);
+        timer.start_timer("GPU inv bspline");
+        cudaInverseBspline(mGpu,  INV_BSPLINE_ALL_DIR);
+        timer.stop_timer();
+
+        // Compare GPU vs CPU
+        EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+    }
+
+    // ========================================================================
+    // Downsampled gradient
+    // ========================================================================
+
+    TEST(ComputeGradientTest, GPU_VS_CPU_DOWNSAMPLE_GRADIENT_ON_RANDOM_VALUES) {
+        APRTimer timer(false);
+
+        // Generate random mesh
+        using ImgType = float;
+        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(31, 32, 33, 100);
+
+        // Calculate gradient on CPU
+        PixelData<ImgType> grad;
+        grad.initDownsampled(m, 0);
+        timer.start_timer("CPU gradient");
+        ComputeGradient().calc_bspline_fd_ds_mag(m, grad, 1, 1, 1);
+        timer.stop_timer();
+
+        // Calculate gradient on GPU
+        PixelData<ImgType> gradCuda;
+        gradCuda.initDownsampled(m, 0);
+        timer.start_timer("GPU gradient");
+        cudaDownsampledGradient(m, gradCuda, 1, 1, 1);
+        timer.stop_timer();
+
+        // Compare GPU vs CPU
+        EXPECT_EQ(compareMeshes(grad, gradCuda, 0), 0);
+    }
+
+
+    // ========================================================================
+    // Full pipeline/gradient tests
+    // ========================================================================
+
+    TEST(ComputeThreshold, FULL_GRADIENT_TEST) {
+        APRTimer timer(false);
+
+        // Generate random mesh
+        using ImageType = uint16_t;
+        PixelData<ImageType> input_image = getRandInitializedMesh<ImageType>(33, 35, 37, 15, 20);
+        PixelData<ImageType> &image_temp = input_image;
+
+        PixelData<ImageType> grad_temp; // should be a down-sampled image
+        grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false);
+        PixelData<float> local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors
+        local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false);
+        PixelData<float> local_scale_temp2;
+        local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false);
+
+        PixelData<ImageType> grad_temp_GPU; // should be a down-sampled image
+        grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false);
+        PixelData<float> local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors
+        local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, true);
+        PixelData<float> local_scale_temp2_GPU;
+        local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false);
+
+        APRParameters par;
+        par.lambda = 3;
+        par.Ip_th = 10;
+        par.dx = 1;
+        par.dy = 1;
+        par.dz = 1;
+
+        // Calculate bspline on CPU
+        PixelData<ImageType> mCpuImage(image_temp, true);
+
+        ComputeGradient computeGradient;
+
+        timer.start_timer(">>>>>>>>>>>>>>>>> CPU gradient");
+        computeGradient.get_gradient(mCpuImage, grad_temp, local_scale_temp, par);
+        timer.stop_timer();
+
+        // Calculate bspline on GPU
+        PixelData<ImageType> mGpuImage(image_temp, true);
+        timer.start_timer(">>>>>>>>>>>>>>>>> GPU gradient");
+        getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par);
+        timer.stop_timer();
+
+        // Compare GPU vs CPU
+        EXPECT_EQ(compareMeshes(mCpuImage, mGpuImage, 0), 0);
+        EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0), 0);
+        EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0);
+    }
+
+#endif // APR_USE_CUDA
+
+}
+
+int main(int argc, char **argv) {
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp
index 0b2fc17e..0d822357 100644
--- a/test/ComputeGradientTest.cpp
+++ b/test/ComputeGradientTest.cpp
@@ -2,99 +2,13 @@
  * Created by Krzysztof Gonciarz 2018
  */
 #include <array>
-#include <cmath>
 #include <gtest/gtest.h>
 #include "data_structures/Mesh/PixelData.hpp"
 #include "algorithm/ComputeGradient.hpp"
-#include "algorithm/ComputeGradientCuda.hpp"
 #include <random>
-#include "algorithm/APRConverter.hpp"
+#include "TestTools.hpp"
 
 namespace {
-    /**
-     * Compares mesh with provided data
-     * @param mesh
-     * @param data - data with [Z][Y][X] structure
-     * @return true if same
-     */
-    template<typename T>
-    bool compare(PixelData<T> &mesh, const float *data, const float epsilon) {
-        size_t dataIdx = 0;
-        for (int z = 0; z < mesh.z_num; ++z) {
-            for (int y = 0; y < mesh.y_num; ++y) {
-                for (int x = 0; x < mesh.x_num; ++x) {
-                    bool v = std::abs(mesh(y, x, z) - data[dataIdx]) < epsilon;
-                    if (v == false) {
-                        std::cerr << "Mesh and expected data differ. First place at (Y, X, Z) = " << y << ", " << x
-                                  << ", " << z << ") " << mesh(y, x, z) << " vs " << data[dataIdx] << std::endl;
-                        return false;
-                    }
-                    ++dataIdx;
-                }
-            }
-        }
-        return true;
-    }
-
-    /**
-     * Compares two meshes
-     * @param expected
-     * @param tested
-     * @param maxNumOfErrPrinted - how many error values should be printed (-1 for all)
-     * @return number of errors detected
-     */
-    template <typename T>
-    int compareMeshes(const PixelData<T> &expected, const PixelData<T> &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) {
-        int cnt = 0;
-        for (size_t i = 0; i < expected.mesh.size(); ++i) {
-            if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError || std::isnan(expected.mesh[i]) ||
-                std::isnan(tested.mesh[i])) {
-                if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) {
-                    std::cout << "ERROR expected vs tested mesh: " << expected.mesh[i] << " vs " << tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl;
-                }
-                cnt++;
-            }
-        }
-        std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl;
-        return cnt;
-    }
-
-    /**
-     * Generates mesh with provided dims with random values in range [0, 1] * multiplier
-     * @param y
-     * @param x
-     * @param z
-     * @param multiplier
-     * @return
-     */
-    template <typename T>
-    PixelData<T> getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, bool useIdxNumbers = false) {
-        PixelData<T> m(y, x, z);
-        std::cout << "Mesh info: " << m << std::endl;
-        std::random_device rd;
-        std::mt19937 mt(rd());
-        std::uniform_real_distribution<double> dist(0.0, 1.0);
-        for (size_t i = 0; i < m.mesh.size(); ++i) {
-            m.mesh[i] = useIdxNumbers ? i : dist(mt) * multiplier;
-        }
-        return m;
-    }
-
-    template<typename T>
-    bool initFromZYXarray(PixelData<T> &mesh, const float *data) {
-        size_t dataIdx = 0;
-        for (int z = 0; z < mesh.z_num; ++z) {
-            for (int y = 0; y < mesh.y_num; ++y) {
-                for (int x = 0; x < mesh.x_num; ++x) {
-                    mesh(y, x, z) = data[dataIdx];
-                    ++dataIdx;
-                }
-            }
-        }
-        return true;
-    }
-
-
 
     TEST(ComputeGradientTest, 2D_XY) {
         {   // Corner points
@@ -455,7 +369,7 @@ namespace {
                     0.0000000000, 0.2193282992, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000,
                     0.0000000000, 0.2930246294, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000 };
             // put values in corners
-            m(1, 1, 4) = 1;
+            m(0, 1, 2) = 1;
 
             // Calculate bspline on CPU
             PixelData<float> mCpu(m, true);
@@ -724,496 +638,6 @@ namespace {
 
         ASSERT_TRUE(compare(m, expect, 0.01));
     }
-
-    // ======================= CUDA =======================================
-    // ======================= CUDA =======================================
-    // ======================= CUDA =======================================
-
-#ifdef APR_USE_CUDA
-
-    TEST(ComputeGradientTest, 2D_XY_CUDA) {
-        // Corner points
-        PixelData<float> m(6, 6, 1, 0);
-        // expect gradient is 3x3 X/Y plane
-        float expect[] = {1.41, 0, 4.24,
-                          0, 0, 0,
-                          2.82, 0, 5.65};
-        // put values in corners
-        m(0, 0, 0) = 2;
-        m(5, 0, 0) = 4;
-        m(0, 5, 0) = 6;
-        m(5, 5, 0) = 8;
-        PixelData<float> grad;
-        grad.initDownsampled(m, 0);
-        cudaDownsampledGradient(m, grad, 1, 1, 1);
-        ASSERT_TRUE(compare(grad, expect, 0.01));
-    }
-
-    TEST(ComputeGradientTest, Corners3D_CUDA) {
-        PixelData<float> m(6, 6, 4, 0);
-        // expect gradient is 3x3x2 X/Y/Z plane
-        float expect[] = {1.73, 0, 5.19,
-                          0, 0, 0,
-                          3.46, 0, 6.92,
-
-                          8.66, 0, 12.12,
-                          0, 0, 0,
-                          10.39, 0, 13.85};
-        // put values in corners
-        m(0, 0, 0) = 2;
-        m(5, 0, 0) = 4;
-        m(0, 5, 0) = 6;
-        m(5, 5, 0) = 8;
-        m(0, 0, 3) = 10;
-        m(5, 0, 3) = 12;
-        m(0, 5, 3) = 14;
-        m(5, 5, 3) = 16;
-
-        PixelData<float> grad;
-        grad.initDownsampled(m, 0);
-        cudaDownsampledGradient(m, grad, 1, 1, 1);
-        ASSERT_TRUE(compare(grad, expect, 0.01));
-    }
-
-    TEST(ComputeGradientTest, GPU_VS_CPU_ON_RANDOM_VALUES) {
-        // Generate random mesh
-        // Generate random mesh
-        using ImgType = float;
-        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(33, 31, 3);
-
-        APRTimer timer(true);
-
-        // Calculate gradient on CPU
-        PixelData<ImgType> grad;
-        grad.initDownsampled(m, 0);
-        timer.start_timer("CPU gradient");
-        ComputeGradient().calc_bspline_fd_ds_mag(m, grad, 1, 1, 1);
-        timer.stop_timer();
-
-        // Calculate gradient on GPU
-        PixelData<ImgType> gradCuda;
-        gradCuda.initDownsampled(m, 0);
-        timer.start_timer("GPU gradient");
-        cudaDownsampledGradient(m, gradCuda, 1, 1, 1);
-        timer.stop_timer();
-
-        // Compare GPU vs CPU
-        EXPECT_EQ(compareMeshes(grad, gradCuda), 0);
-    }
-
-    TEST(ComputeBspineTest, BSPLINE_Y_DIR_CUDA) {
-        APRTimer timer(true);
-
-        // Generate random mesh
-        using ImgType = float;
-        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(129,127,128);
-
-        // Filter parameters
-        const float lambda = 3;
-        const float tolerance = 0.0001;
-
-        // Calculate bspline on CPU
-        PixelData<ImgType> mCpu(m, true);
-        timer.start_timer("CPU bspline");
-        ComputeGradient().bspline_filt_rec_y(mCpu, lambda, tolerance);
-        timer.stop_timer();
-
-        // Calculate bspline on GPU
-        PixelData<ImgType> mGpu(m, true);
-        timer.start_timer("GPU bspline");
-        cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Y_DIR);
-        timer.stop_timer();
-
-        // Compare GPU vs CPU
-        EXPECT_EQ(compareMeshes(mCpu, mGpu), 0);
-    }
-
-    TEST(ComputeBspineTest, BSPLINE_X_DIR_CUDA) {
-        APRTimer timer(true);
-
-        // Generate random mesh
-        using ImgType = float;
-        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(129,127,128);
-
-        // Filter parameters
-        const float lambda = 3;
-        const float tolerance = 0.0001;
-
-        // Calculate bspline on CPU
-        PixelData<ImgType> mCpu(m, true);
-        timer.start_timer("CPU bspline");
-        ComputeGradient().bspline_filt_rec_x(mCpu, lambda, tolerance);
-        timer.stop_timer();
-
-        // Calculate bspline on GPU
-        PixelData<ImgType> mGpu(m, true);
-        timer.start_timer("GPU bspline");
-        cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_X_DIR);
-        timer.stop_timer();
-
-        // Compare GPU vs CPU
-        EXPECT_EQ(compareMeshes(mCpu, mGpu), 0);
-    }
-
-    TEST(ComputeBspineTest, BSPLINE_Z_DIR_CUDA) {
-        APRTimer timer(true);
-
-        // Generate random mesh
-        using ImgType = float;
-        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(129,127,128);
-
-        // Filter parameters
-        const float lambda = 3;
-        const float tolerance = 0.0001;
-
-        // Calculate bspline on CPU
-        PixelData<ImgType> mCpu(m, true);
-        timer.start_timer("CPU bspline");
-        ComputeGradient().bspline_filt_rec_z(mCpu, lambda, tolerance);
-        timer.stop_timer();
-
-        // Calculate bspline on GPU
-        PixelData<ImgType> mGpu(m, true);
-        timer.start_timer("GPU bspline");
-        cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Z_DIR);
-        timer.stop_timer();
-
-        // Compare GPU vs CPU
-        EXPECT_EQ(compareMeshes(mCpu, mGpu), 0);
-    }
-
-    TEST(ComputeBspineTest, BSPLINE_FULL_XYZ_DIR_CUDA) {
-        APRTimer timer(true);
-
-        // Generate random mesh
-        using ImgType = float;
-        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(127, 128, 129);
-
-        // Filter parameters
-        const float lambda = 3;
-        const float tolerance = 0.0001; // as defined in get_smooth_bspline_3D
-
-        // Calculate bspline on CPU
-        PixelData<ImgType> mCpu(m, true);
-        timer.start_timer("CPU bspline");
-        ComputeGradient().get_smooth_bspline_3D(mCpu, lambda);
-        timer.stop_timer();
-
-        // Calculate bspline on GPU
-        PixelData<ImgType> mGpu(m, true);
-        timer.start_timer("GPU bspline");
-        cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_ALL_DIR);
-        timer.stop_timer();
-
-        // Compare GPU vs CPU
-        EXPECT_EQ(compareMeshes(mCpu, mGpu), 0);
-    }
-
-    TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_CUDA) {
-        using ImgType = float;
-
-        ImgType init[] =   {1.00, 0.00, 0.00,
-                            1.00, 0.00, 6.00,
-                            0.00, 6.00, 0.00,
-                            6.00, 0.00, 0.00};
-
-        ImgType expect[] = {1.00, 0.00, 2.00,
-                            0.83, 1.00, 4.00,
-                            1.17, 4.00, 1.00,
-                            4.00, 2.00, 0.00};
-
-        PixelData<ImgType> m(4, 3, 1);
-        initFromZYXarray(m, init);
-
-        // Calculate and compare
-        m.printMesh(4,2);
-        cudaInverseBspline(m, INV_BSPLINE_Y_DIR);
-        m.printMesh(4,2);
-        ASSERT_TRUE(compare(m, expect, 0.01));
-    }
-
-    TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_RND_CUDA) {
-        APRTimer timer(true);
-
-        // Generate random mesh
-        using ImgType = float;
-        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(127, 33, 31);
-
-        // Calculate bspline on CPU
-        PixelData<ImgType> mCpu(m, true);
-        timer.start_timer("CPU inv bspline");
-        ComputeGradient().calc_inv_bspline_y(mCpu);
-        timer.stop_timer();
-
-        // Calculate bspline on GPU
-        PixelData<ImgType> mGpu(m, true);
-        timer.start_timer("GPU inv bspline");
-        cudaInverseBspline(mGpu,  INV_BSPLINE_Y_DIR);
-        timer.stop_timer();
-
-        // Compare GPU vs CPU
-        EXPECT_EQ(compareMeshes(mCpu, mGpu), 0);
-    }
-
-    TEST(ComputeInverseBspline, CALC_INV_BSPLINE_X_CUDA) {
-        using ImgType = float;
-
-        ImgType init[] =   {0.00, 6.00, 0.00,
-                            1.00, 0.00, 0.00,
-                            0.00, 0.00, 1.00};
-
-        ImgType expect[] = {2.00, 4.00, 2.00,
-                            0.67, 0.16, 0.00,
-                            0.00, 0.16, 0.67};
-
-        PixelData<ImgType> m(3, 3, 1);
-        initFromZYXarray(m, init);
-
-        // Calculate and compare
-        m.printMesh(4,2);
-        cudaInverseBspline(m, INV_BSPLINE_X_DIR);
-        m.printMesh(4,2);
-        ASSERT_TRUE(compare(m, expect, 0.01));
-    }
-
-    TEST(ComputeInverseBspline, CALC_INV_BSPLINE_X_RND_CUDA) {
-        APRTimer timer(true);
-
-        // Generate random mesh
-        using ImgType = float;
-        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(127, 61, 66);
-
-        // Calculate bspline on CPU
-        PixelData<ImgType> mCpu(m, true);
-        timer.start_timer("CPU inv bspline");
-        ComputeGradient().calc_inv_bspline_x(mCpu);
-        timer.stop_timer();
-
-        // Calculate bspline on GPU
-        PixelData<ImgType> mGpu(m, true);
-        timer.start_timer("GPU inv bspline");
-        cudaInverseBspline(mGpu,  INV_BSPLINE_X_DIR);
-        timer.stop_timer();
-
-        // Compare GPU vs CPU
-        EXPECT_EQ(compareMeshes(mCpu, mGpu), 0);
-    }
-
-    TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Z_RND_CUDA) {
-        APRTimer timer(true);
-
-        // Generate random mesh
-        using ImgType = float;
-        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(127, 61, 66);
-
-        // Calculate bspline on CPU
-        PixelData<ImgType> mCpu(m, true);
-        timer.start_timer("CPU inv bspline");
-        ComputeGradient().calc_inv_bspline_z(mCpu);
-        timer.stop_timer();
-
-        // Calculate bspline on GPU
-        PixelData<ImgType> mGpu(m, true);
-        timer.start_timer("GPU inv bspline");
-        cudaInverseBspline(mGpu,  INV_BSPLINE_Z_DIR);
-        timer.stop_timer();
-
-        // Compare GPU vs CPU
-        EXPECT_EQ(compareMeshes(mCpu, mGpu), 0);
-    }
-
-    TEST(ComputeInverseBspline, CALC_INV_BSPLINE_FULL_XYZ_DIR_RND_CUDA) {
-        APRTimer timer(true);
-
-        // Generate random mesh
-        using ImgType = float;
-        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(3,3,3,100);
-
-        // Calculate bspline on CPU
-        PixelData<ImgType> mCpu(m, true);
-        timer.start_timer("CPU inv bspline");
-        ComputeGradient().calc_inv_bspline_y(mCpu);
-        ComputeGradient().calc_inv_bspline_x(mCpu);
-        ComputeGradient().calc_inv_bspline_z(mCpu);
-        timer.stop_timer();
-
-        // Calculate bspline on GPU
-        PixelData<ImgType> mGpu(m, true);
-        timer.start_timer("GPU inv bspline");
-        cudaInverseBspline(mGpu,  INV_BSPLINE_ALL_DIR);
-        timer.stop_timer();
-
-        // Compare GPU vs CPU
-        EXPECT_EQ(compareMeshes(mCpu, mGpu), 0);
-    }
-
-    TEST(ComputeThreshold, CALC_THRESHOLD_RND_CUDA) {
-        APRTimer timer(true);
-
-        // Generate random mesh
-        using ImgType = float;
-        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(31, 33, 13);
-        PixelData<ImgType> g = getRandInitializedMesh<ImgType>(31, 33, 13);
-        float thresholdLevel = 1;
-
-        // Calculate bspline on CPU
-        PixelData<ImgType> mCpu(g, true);
-        timer.start_timer("CPU threshold");
-        ComputeGradient().threshold_gradient(mCpu, m, thresholdLevel);
-
-        timer.stop_timer();
-
-        // Calculate bspline on GPU
-        PixelData<ImgType> mGpu(g, true);
-        timer.start_timer("GPU threshold");
-        thresholdGradient(mGpu, m, thresholdLevel);
-        timer.stop_timer();
-
-        // Compare GPU vs CPU
-        EXPECT_EQ(compareMeshes(mCpu, mGpu), 0);
-    }
-
-    TEST(ComputeThreshold, CALC_THRESHOLD_IMG_RND_CUDA) {
-        APRTimer timer(true);
-
-        // Generate random mesh
-        using ImgType = float;
-        PixelData<ImgType> g = getRandInitializedMesh<ImgType>(31, 33, 13, 1, true);
-
-        float thresholdLevel = 10;
-
-        // Calculate bspline on CPU
-        PixelData<ImgType> mCpu(g, true);
-        timer.start_timer("CPU threshold");
-        for (size_t i = 0; i < mCpu.mesh.size(); ++i) {
-            if (mCpu.mesh[i] <= (thresholdLevel)) { mCpu.mesh[i] = thresholdLevel; }
-        }
-        timer.stop_timer();
-
-        // Calculate bspline on GPU
-        PixelData<ImgType> mGpu(g, true);
-        timer.start_timer("GPU threshold");
-        thresholdImg(mGpu, thresholdLevel);
-        timer.stop_timer();
-
-        // Compare GPU vs CPU
-        EXPECT_EQ(compareMeshes(mCpu, mGpu), 0);
-    }
-
-    // TODO: These two test will be fixed as soon as CUDA pipeline is updated.
-    //       Currently turning them off to have testable rest of CUDA impl.
-//    TEST(ComputeThreshold, FULL_GRADIENT_TEST) {
-//        APRTimer timer(true);
-//
-//        // Generate random mesh
-//        using ImageType = float;
-//        PixelData<ImageType> input_image = getRandInitializedMesh<ImageType>(310, 330, 13, 25);
-//        PixelData<ImageType> &image_temp = input_image;
-//
-//        PixelData<ImageType> grad_temp; // should be a down-sampled image
-//        grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false);
-//        PixelData<float> local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors
-//        local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false);
-//        PixelData<float> local_scale_temp2;
-//        local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false);
-//
-//        PixelData<ImageType> grad_temp_GPU; // should be a down-sampled image
-//        grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false);
-//        PixelData<float> local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors
-//        local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, true);
-//        PixelData<float> local_scale_temp2_GPU;
-//        local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false);
-//
-//        APRParameters par;
-//        par.lambda = 3;
-//        par.Ip_th = 10;
-//        par.dx = 1;
-//        par.dy = 1;
-//        par.dz = 1;
-//
-//        // Calculate bspline on CPU
-//        PixelData<ImageType> mCpuImage(image_temp, true);
-//
-//        ComputeGradient computeGradient;
-//
-//        timer.start_timer(">>>>>>>>>>>>>>>>> CPU gradient");
-//        computeGradient.get_gradient(mCpuImage, grad_temp, local_scale_temp, par);
-//        timer.stop_timer();
-//
-//        // Calculate bspline on GPU
-//        PixelData<ImageType> mGpuImage(image_temp, true);
-//        timer.start_timer(">>>>>>>>>>>>>>>>> GPU gradient");
-//        getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par);
-//        timer.stop_timer();
-//
-//        // Compare GPU vs CPU
-//        EXPECT_EQ(compareMeshes(mCpuImage, mGpuImage), 0);
-//        EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0.1), 0);
-//        EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU), 0);
-//    }
-//
-//    TEST(ComputeThreshold, FULL_PIPELINE_TEST) {
-//        APRTimer timer(true);
-//
-//        // Generate random mesh
-//        using ImageType = float;
-//        PixelData<ImageType> input_image = getRandInitializedMesh<ImageType>(310, 330, 32, 25);
-//        int maxLevel = ceil(std::log2(330));
-//
-//        PixelData<ImageType> &image_temp = input_image;
-//
-//        PixelData<ImageType> grad_temp; // should be a down-sampled image
-//        grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false);
-//        PixelData<float> local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors
-//        local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false);
-//        PixelData<float> local_scale_temp2;
-//        local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false);
-//
-//        PixelData<ImageType> grad_temp_GPU; // should be a down-sampled image
-//        grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false);
-//        PixelData<float> local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors
-//        local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false);
-//        PixelData<float> local_scale_temp2_GPU;
-//        local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false);
-//
-//
-//        APRParameters par;
-//        par.lambda = 3;
-//        par.Ip_th = 10;
-//        par.sigma_th = 0;
-//        par.sigma_th_max = 0;
-//        par.dx = 1;
-//        par.dy = 1;
-//        par.dz = 1;
-//
-//        ComputeGradient computeGradient;
-//        LocalIntensityScale localIntensityScale;
-//        LocalParticleCellSet localParticleSet;
-//
-//        // Calculate bspline on CPU
-//        PixelData<ImageType> mCpuImage(image_temp, true);
-//        timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE");
-//        computeGradient.get_gradient(mCpuImage, grad_temp, local_scale_temp, par);
-//        localIntensityScale.get_local_intensity_scale(local_scale_temp, local_scale_temp2, par);
-//        localParticleSet.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz);
-//        timer.stop_timer();
-//
-//        // Calculate bspline on GPU
-//        PixelData<ImageType> mGpuImage(image_temp, true);
-//        timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE");
-//        GpuProcessingTask<ImageType> gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel);
-//        gpt.doAll();
-//        timer.stop_timer();
-//
-//        // Compare GPU vs CPU
-//        // allow some differences since float point diffs
-//        // TODO: It would be much better to count number of diffs with delta==1 and allow some of these
-//        EXPECT_TRUE(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0.01) < 29);
-//    }
-
-
-#endif // APR_USE_CUDA
-
 }
 
 int main(int argc, char **argv) {
diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp
new file mode 100644
index 00000000..913b7e09
--- /dev/null
+++ b/test/FullPipelineCudaTest.cpp
@@ -0,0 +1,367 @@
+
+#include <gtest/gtest.h>
+
+#include "algorithm/LocalIntensityScaleCuda.h"
+#include "algorithm/LocalIntensityScale.hpp"
+#include "algorithm/ComputeGradient.hpp"
+#include "algorithm/ComputeGradientCuda.hpp"
+#include "algorithm/PullingSchemeCuda.hpp"
+#include "data_structures/APR/access/LinearAccessCuda.hpp"
+#include "TestTools.hpp"
+#include "data_structures/Mesh/PixelDataCuda.h"
+#include "algorithm/APRConverter.hpp"
+#include "misc/CudaTools.cuh"
+
+
+namespace {
+#ifdef APR_USE_CUDA
+
+    TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS) {
+        APRTimer timer(true);
+
+        // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors
+        using ImageType = float;
+        constexpr PixelDataDim dim1{4, 4, 3};
+        constexpr PixelDataDim dim2{163, 123, 555};
+        for (int d = 0; d <= 3; d++) {
+            auto &dim = (d % 2 == 0) ? dim1 : dim2;
+            PixelData<ImageType> input_image = (d/2 == 0) ? getRandInitializedMesh<ImageType>(dim, 13) :
+                                                            getMeshWithBlobInMiddle<ImageType>(dim);
+
+            // Initialize CPU data structures
+            PixelData<ImageType> mCpuImage(input_image, true);
+            PixelData<ImageType> grad_temp;
+            grad_temp.initDownsampled(dim, 0, false);
+            PixelData<float> local_scale_temp;
+            local_scale_temp.initDownsampled(dim, false);
+            PixelData<float> local_scale_temp2;
+            local_scale_temp2.initDownsampled(dim, false);
+
+            // Initialize GPU data structures to same values as CPU
+            PixelData<ImageType> mGpuImage(input_image, true, true);
+            PixelData<ImageType> grad_temp_GPU(grad_temp, true, true);
+            PixelData<float> local_scale_temp_GPU(local_scale_temp, true, true);
+            PixelData<float> local_scale_temp2_GPU(local_scale_temp2, true, true);
+
+            // Prepare parameters
+            APRParameters par;
+            par.lambda = 3;
+            par.Ip_th = 10;
+            par.sigma_th = 0;
+            par.sigma_th_max = 0;
+            par.dx = 1;
+            par.dy = 1;
+            par.dz = 1;
+
+            // Calculate pipeline on CPU
+            timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE");
+            ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par);
+            LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par);
+            timer.stop_timer();
+
+            // Calculate pipeline on GPU
+            timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE");
+            getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par);
+            getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par);
+            timer.stop_timer();
+
+            // Compare GPU vs CPU - expect exactly same result
+            EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0);
+            EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0), 0);
+        }
+    }
+
+    TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS) {
+        APRTimer timer(true);
+
+        // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors
+        using ImageType = float;
+        constexpr PixelDataDim dim1{4, 4, 3};
+        constexpr PixelDataDim dim2{163, 123, 555};
+        for (int d = 0; d <= 3; d++) {
+            auto &dim = (d%2 == 0) ? dim1 : dim2;
+            PixelData<ImageType> input_image = (d/2 == 0) ? getRandInitializedMesh<ImageType>(dim, 13) :
+                                               getMeshWithBlobInMiddle<ImageType>(dim);
+            int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize()));
+
+            // Initialize CPU data structures
+            PixelData<ImageType> mCpuImage(input_image, true);
+            PixelData<ImageType> grad_temp;
+            grad_temp.initDownsampled(dim, 0, false);
+            PixelData<float> local_scale_temp;
+            local_scale_temp.initDownsampled(dim, false);
+            PixelData<float> local_scale_temp2;
+            local_scale_temp2.initDownsampled(dim, false);
+
+            // Initialize GPU data structures to same values as CPU
+            PixelData<ImageType> mGpuImage(input_image, true, false);
+            PixelData<ImageType> grad_temp_GPU(grad_temp, true, false);
+            PixelData<float> local_scale_temp_GPU(local_scale_temp, true, false);
+            PixelData<float> local_scale_temp2_GPU(local_scale_temp2, true, false);
+
+            // Prepare parameters
+            APRParameters par;
+            par.lambda = 3;
+            par.Ip_th = 10;
+            par.sigma_th = 0;
+            par.sigma_th_max = 0;
+            par.dx = 1;
+            par.dy = 1;
+            par.dz = 1;
+
+            // Calculate pipeline on CPU
+            timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE");
+            ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par);
+            LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par);
+            LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz);
+            timer.stop_timer();
+
+            // Calculate pipeline on GPU
+            timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE");
+            getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par);
+            getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par);
+            computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz);
+            timer.stop_timer();
+
+            // Compare GPU vs CPU - expect exactly same result
+            EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0), 0);
+            EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0);
+        }
+    }
+
+    TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_PS) {
+        APRTimer timer(true);
+
+        // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors
+        using ImageType = float;
+        constexpr PixelDataDim dim1{4, 4, 3};
+        constexpr PixelDataDim dim2{163, 123, 555};
+        for (int d = 0; d <= 3; d++) {
+            auto &dim = (d % 2 == 0) ? dim1 : dim2;
+            PixelData<ImageType> input_image = (d / 2 == 0) ? getRandInitializedMesh<ImageType>(dim, 13) :
+                                                              getMeshWithBlobInMiddle<ImageType>(dim);
+            int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize()));
+
+            // Initialize CPU data structures
+            PixelData<ImageType> mCpuImage(input_image, true);
+            PixelData<ImageType> grad_temp;
+            grad_temp.initDownsampled(dim, 0, false);
+            PixelData<float> local_scale_temp;
+            local_scale_temp.initDownsampled(dim, false);
+            PixelData<float> local_scale_temp2;
+            local_scale_temp2.initDownsampled(dim, false);
+
+            // Initialize GPU data structures to same values as CPU
+            PixelData<ImageType> mGpuImage(input_image, true);
+            PixelData<ImageType> grad_temp_GPU(grad_temp, true);
+            PixelData<float> local_scale_temp_GPU(local_scale_temp, true);
+            PixelData<float> local_scale_temp2_GPU(local_scale_temp2, true);
+
+            // Prepare parameters and APR info structures
+            APRParameters par;
+            par.lambda = 3;
+            par.Ip_th = 10;
+            par.sigma_th = 0;
+            par.sigma_th_max = 0;
+            par.dx = 1;
+            par.dy = 1;
+            par.dz = 1;
+
+            GenInfo aprInfo;
+            aprInfo.init(input_image.getDimension());
+
+            // Calculate pipeline on CPU
+            timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE");
+            ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par);
+            LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par);
+            LocalParticleCellSet lpcs = LocalParticleCellSet();
+            lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz);
+            PullingScheme ps;
+            ps.initialize_particle_cell_tree(aprInfo);
+            lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par);
+            ps.pulling_scheme_main();
+            timer.stop_timer();
+
+            // Calculate pipeline on GPU
+            timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE");
+            getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par);
+            getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par);
+            computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz);
+            auto pct = computeOvpcCuda(local_scale_temp_GPU, aprInfo);
+            timer.stop_timer();
+
+            // Compare GPU vs CPU - expect exactly same result
+            ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), pct), 0);
+        }
+    }
+
+
+
+
+    TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_PS_LINEARACCESS) {
+        APRTimer timer(true);
+
+        // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors
+        using ImageType = float;
+        constexpr PixelDataDim dim1{4, 4, 3};
+        constexpr PixelDataDim dim2{163, 123, 555};
+        for (int d = 0; d <= 3; d++) {
+            auto &dim = (d % 2 == 0) ? dim1 : dim2;
+            PixelData<ImageType> input_image = (d / 2 == 0) ? getRandInitializedMesh<ImageType>(dim, 13) :
+                                                              getMeshWithBlobInMiddle<ImageType>(dim);
+
+            int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize()));
+
+            // Initialize CPU data structures
+            PixelData<ImageType> mCpuImage(input_image, true);
+            PixelData<ImageType> grad_temp;
+            grad_temp.initDownsampled(dim, 0, false);
+            PixelData<float> local_scale_temp;
+            local_scale_temp.initDownsampled(dim, false);
+            PixelData<float> local_scale_temp2;
+            local_scale_temp2.initDownsampled(dim, false);
+
+            // Initialize GPU data structures to same values as CPU
+            PixelData<ImageType> mGpuImage(input_image, true);
+            PixelData<ImageType> grad_temp_GPU(grad_temp, true);
+            PixelData<float> local_scale_temp_GPU(local_scale_temp, true);
+            PixelData<float> local_scale_temp2_GPU(local_scale_temp2, true);
+
+            // Prepare parameters and APR info structures
+            APRParameters par;
+            par.lambda = 3;
+            par.Ip_th = 10;
+            par.sigma_th = 0;
+            par.sigma_th_max = 0;
+            par.dx = 1;
+            par.dy = 1;
+            par.dz = 1;
+            par.neighborhood_optimization = true;
+
+            GenInfo aprInfo(input_image.getDimension());
+            GenInfo giGpu(input_image.getDimension());
+
+            // Calculate pipeline on CPU
+            timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE");
+            ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par);
+            LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par);
+            LocalParticleCellSet lpcs = LocalParticleCellSet();
+            lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz);
+            PullingScheme ps;
+            ps.initialize_particle_cell_tree(aprInfo);
+            lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par);
+            ps.pulling_scheme_main();
+            LinearAccess linearAccess;
+            linearAccess.genInfo = &aprInfo;
+
+            linearAccess.initialize_linear_structure(par, ps.getParticleCellTree());
+            timer.stop_timer();
+
+            // Calculate pipeline on GPU
+            timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE");
+            getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par);
+            getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par);
+            computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz);
+            auto pct = computeOvpcCuda(local_scale_temp_GPU, giGpu);
+            auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pct);
+            timer.stop_timer();
+
+            // Compare GPU vs CPU - expect exactly same result
+            // Test if returned structures have same data
+            EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0);
+            EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0);
+            EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0);
+
+            EXPECT_EQ(aprInfo.total_number_particles, giGpu.total_number_particles);
+            EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size());
+        }
+    }
+
+    TEST(ComputeThreshold, FULL_PIPELINE_TEST_CPU_vs_GpuProcessingTask) {
+        APRTimer timer(true);
+
+        // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors
+        using ImageType = float;
+        constexpr PixelDataDim dim1{4, 4, 3};
+        constexpr PixelDataDim dim2{1024,512,512};
+        for (int d = 0; d <= 3; d++) {
+            auto &dim = (d % 2 == 0) ? dim1 : dim2;
+            PixelData<ImageType> input_image = (d / 2 == 0) ? getRandInitializedMesh<ImageType>(dim, 13) :
+                                               getMeshWithBlobInMiddle<ImageType>(dim);
+
+            int maxLevel = ceil(std::log2(dim.maxDimSize()));
+
+            // Initialize CPU data structures
+            PixelData<ImageType> mCpuImage(input_image, true);
+            PixelData<ImageType> grad_temp;
+            grad_temp.initDownsampled(dim, 0, false);
+            PixelData<float> local_scale_temp;
+            local_scale_temp.initDownsampled(dim, false);
+            PixelData<float> local_scale_temp2;
+            local_scale_temp2.initDownsampled(dim, false);
+
+            // Initialize GPU data structures to same values as CPU
+            PixelData<ImageType> mGpuImage(input_image, true);
+            PixelData<float> local_scale_temp_GPU(local_scale_temp, false);
+
+            // Prepare parameters
+            APRParameters par;
+            par.lambda = 3;
+            par.Ip_th = 10;
+            par.sigma_th = 0;
+            par.sigma_th_max = 0;
+            par.dx = 1;
+            par.dy = 1;
+            par.dz = 1;
+            par.neighborhood_optimization = true;
+
+            float bspline_offset = 0;
+
+            GenInfo aprInfo(input_image.getDimension());
+            GenInfo giGpu(input_image.getDimension());
+
+            // Calculate pipeline on CPU
+            timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE");
+            ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par);
+            LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par);
+            LocalParticleCellSet lpcs = LocalParticleCellSet();
+            ComputeGradient().applyParameters(grad_temp, local_scale_temp, local_scale_temp2, par, bspline_offset);
+            lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz);
+            PullingScheme ps;
+            ps.initialize_particle_cell_tree(aprInfo);
+            lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par);
+            ps.pulling_scheme_main();
+            LinearAccess linearAccess;
+            linearAccess.genInfo = &aprInfo;
+            linearAccess.initialize_linear_structure(par, ps.getParticleCellTree());
+            timer.stop_timer();
+
+
+            // Calculate pipeline on GPU
+            timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE");
+            GpuProcessingTask<ImageType> gpt(mGpuImage, local_scale_temp_GPU, par, bspline_offset, maxLevel);
+            gpt.sendDataToGpu();
+            gpt.processOnGpu();
+            auto linearAccessGpu = gpt.getDataFromGpu();
+            giGpu.total_number_particles = linearAccessGpu.y_vec.size();
+            cudaDeviceSynchronize();
+            timer.stop_timer();
+
+            // Compare GPU vs CPU - expect exactly same result
+            EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0);
+            EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0);
+            EXPECT_EQ(compareParticles(linearAccessGpu.xz_end_vec, linearAccess.xz_end_vec), 0);
+
+            EXPECT_EQ(aprInfo.total_number_particles, giGpu.total_number_particles);
+            EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size());
+
+        }
+    }
+
+#endif // APR_USE_CUDA
+}
+
+int main(int argc, char **argv) {
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp
new file mode 100644
index 00000000..eb91e7bd
--- /dev/null
+++ b/test/LinearAccessCudaTest.cpp
@@ -0,0 +1,388 @@
+#include <gtest/gtest.h>
+
+#include "algorithm/LocalParticleCellSet.hpp"
+#include "algorithm/PullingScheme.hpp"
+#include "algorithm/APRConverter.hpp"
+#include "data_structures/APR/access/LinearAccessCuda.hpp"
+
+#include "TestTools.hpp"
+
+namespace {
+    template<typename DataType>
+    void fillPS(PullingScheme &aPS, PixelData<DataType> &levels) {
+        PixelData<DataType> levelsDS(ceil(levels.y_num / 2.0), ceil(levels.x_num / 2.0), ceil(levels.z_num / 2.0));
+        LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters());
+    }
+
+/**
+ * Prints PCT
+ * @param particleCellTree
+ */
+    template<typename T>
+    void printParticleCellTree(const std::vector<PixelData<T>> &particleCellTree) {
+        for (uint64_t l = 0; l < particleCellTree.size(); ++l) {
+            auto &tree = particleCellTree[l];
+            tree.printMeshT(3, 0);
+        }
+    }
+
+    /**
+     * Create PCT with provided data
+     * @param aprInfo
+     * @param levels complete list of values from level min to level max in form { {level, min, values}, ..., {level, max, values} }
+     *               if levels are not provided PCT with EMPTY values is returned
+     * @return Particle Cell Tree with values (or with EMPTY if levels are not provided)
+     */
+    auto makePCT(const GenInfo &aprInfo, std::initializer_list<std::initializer_list<int>> levels) {
+        auto pct = PullingScheme::generateParticleCellTree(aprInfo);
+
+        // Fill particle cell tree only if levels provided - otherwise return tree with EMPTY values
+        if (levels.size() != 0) {
+
+            int l = aprInfo.l_min;
+            // PS levels range is [l_max - 1, l_min]
+            if (((aprInfo.l_max - 1) - aprInfo.l_min + 1) != (int) levels.size()) {
+                throw std::runtime_error("Wrong number of level data provided!");
+            }
+            for (auto &level: levels) {
+                if (pct[l].getDimension().size() != level.size()) {
+                    std::cerr << "Provided data for level=" << l << " differs from level size "
+                              << pct[l].getDimension().size() << " vs. " << level.size() << std::endl;
+                    std::cerr << aprInfo << std::endl;
+                    throw std::runtime_error("Not this time...");
+                }
+                std::copy(level.begin(), level.end(), pct[l].mesh.begin());
+                l++;
+            }
+        }
+        return pct;
+    }
+
+    // Copy PCT - copies only existing levels of it.
+    auto copyPCT(const std::vector<PixelData<uint8_t>> &pct) {
+        std::vector<PixelData<uint8_t>> copy;
+        copy.resize(pct.size());
+
+        for (size_t l = 0; l < pct.size(); ++l) {
+            copy[l].initWithResize(pct[l].y_num, pct[l].x_num, pct[l].z_num);
+            // Copy only existing levels
+            if (pct[l].z_num > 0) copy[l].copyFromMesh(pct[l]);
+        }
+
+        return copy;
+    }
+
+    // Create random Particle Cell Tree with dimensions specified in 'gi' with given number of particles.
+    auto makeRandomPCT(const GenInfo &gi, int numOfParticles = 3) {
+        PullingScheme ps;
+        ps.initialize_particle_cell_tree(gi);
+
+        // Generate random levels for PS and OVPC
+        PixelData<float> levels(std::ceil(gi.org_dims[0]/2.0),
+                                std::ceil(gi.org_dims[1]/2.0),
+                                std::ceil(gi.org_dims[2]/2.0),
+                                0);
+        int seed = std::time(nullptr);
+        std::srand(seed);
+        for (int i = 0; i < numOfParticles; ++i) {
+            int modulo = (gi.l_max - gi.l_min);
+            if (modulo == 0) modulo = 1;
+            levels(std::rand() % levels.y_num, std::rand() % levels.x_num, std::rand() % levels.z_num) = std::rand() % modulo + gi.l_min;
+        }
+        fillPS(ps, levels);
+        ps.pulling_scheme_main();
+
+        return copyPCT(ps.getParticleCellTree());
+    }
+
+}
+
+// TODO: There are still problems with computing of small (like 1D images in pipeline)
+//       belows test can be used to trigger those errors - should be fixed
+
+//TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_fullAprPipeline) {
+//    // TODO: delete me after development
+//    // Full 'get apr' pipeline to test imp. on different stages
+//    // Useful during debugging and can be removed once finished
+//
+//    // Prepare input data (image)
+//    int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
+//    // PS input values = 5  0  0  0  0  0  0  0
+//
+////         int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, };
+////         PullingScheme input values (local_scale_temp) for above 'image' = {6  0  0  0  0  0  0  0  6  0  0  0  0  0  0  0};
+//
+//    int len = sizeof(values)/sizeof(int);
+//    PixelData<int> data(len, 1, 1);
+//    initFromZYXarray(data, values);
+//    std::cout << "----- Input image:\n";
+//    data.printMeshT(3, 1);
+//
+//    // Produce APR
+//    APR apr;
+//    APRConverter<uint16_t> aprConverter;
+//    aprConverter.par.rel_error = 0.1;
+//    aprConverter.par.lambda = 0.1;
+//    aprConverter.par.sigma_th = 0.0001;
+//    aprConverter.par.neighborhood_optimization = true;
+//    aprConverter.get_apr(apr, data);
+//
+//    // Print information about APR and all particles
+//    std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl;
+//    for (int l = apr.level_min(); l <= apr.level_max(); ++l) {
+//        std::cout << "    level[" << l << "] size: " << apr.level_size(l) << std::endl;
+//    }
+//    std::cout << "APR particles z x y level:\n";
+//    auto it = apr.iterator();
+//    for (int level = it.level_min(); level <= it.level_max(); ++level) {
+//        for (int z = 0; z < it.z_num(level); z++) {
+//            for (int x = 0; x < it.x_num(level); ++x) {
+//                for (it.begin(level, z, x); it < it.end(); it++) {
+//                    std::cout << "              " << z << " " << x << " " << it.y() << " " << level << std::endl;
+//                }
+//            }
+//        }
+//    }
+//    std::cout << std::endl;
+//
+//    // Sample input
+//    ParticleData<uint16_t> particleIntensities;
+//    particleIntensities.sample_image(apr, data);
+//
+//    // Reconstruct image from particles
+//    PixelData<uint16_t> reconstructImg;
+//    APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities);
+//    std::cout << "----- Reconstructed image:"<<std::endl;
+//    reconstructImg.printMeshT(3, 1);
+//
+//    // Show level assigned to each pixel in reconstructed image
+//    PixelData<uint16_t> levelImg;
+//    APRReconstruction::reconstruct_level(apr, levelImg);
+//    std::cout << "----- Image levels:" << std::endl;
+//    levelImg.printMeshT(3, 1);
+//
+//    // Show intensities and levels of each particle
+//    std::cout << "----- Particle intensities:\n";
+//    for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " ";
+//    std::cout << std::endl;
+//
+//    particleIntensities.fill_with_levels(apr);
+//
+//    std::cout << "----- Particle levels:\n";
+//    for (uint64_t  i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " ";
+//    std::cout << std::endl;
+//
+//    // Show some general information about generated APR
+//    double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles());
+//    std::cout << std::endl;
+//    std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl;
+//    std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl;
+//}
+
+
+//TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_PS) {
+//    // TODO: delete me after development
+//    // Runs PS to test imp. on different stages
+//    // Useful during debugging and can be removed once finished
+////    int values[] = {0,0,0,5, 0,0,0,0};
+////    int len = sizeof(values)/sizeof(int);
+//
+//    PixelData<int> levels(8, 1, 1, 0);
+//    levels(5,0,0) = 1;
+//
+////    initFromZYXarray(levels, values);
+//    std::cout << "---------------\n";
+//    levels.printMeshT(3, 1);
+//    std::cout << "---------------\n";
+//
+//    GenInfo gi;
+//    const PixelDataDim dim = levels.getDimension();
+//    std::cout << "Levels dim: " << dim << std::endl;
+//    gi.init(dim.y * 2, dim.x * 1, dim.z * 1); // time two in y-direction since PS container is downsized.
+//    std::cout << gi << std::endl;
+//
+//    APRTimer t(false);
+//
+//    t.start_timer("PS1");
+//    PullingScheme ps;
+//    ps.initialize_particle_cell_tree(gi);
+//    int l_max = gi.l_max - 1;
+//    int l_min = gi.l_min;
+//    std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << "  " << l_min << " " << ps.pct_level_min() << std::endl;
+//
+//    fillPS(ps, levels);
+//
+//    std::cout << "---------- Filled PS tree\n";
+//    printParticleCellTree(ps.getParticleCellTree());
+//    std::cout << "---------------\n";
+//
+//    ps.pulling_scheme_main();
+//    t.stop_timer();
+//
+//    // Useful during debugging and can be removed once finished
+//    std::cout << "----------PS:\n";
+//    printParticleCellTree(ps.getParticleCellTree());
+//    std::cout << "-------------\n";
+//
+//    LinearAccess linearAccess;
+//    linearAccess.genInfo = &gi;
+//    APRParameters par;
+//    par.neighborhood_optimization = true;
+//    linearAccess.initialize_linear_structure(par, ps.getParticleCellTree());
+//
+//    std::cout << gi << std::endl;
+//    auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; };
+//    prt(linearAccess.y_vec);
+//    prt(linearAccess.xz_end_vec);
+//    prt(linearAccess.level_xz_vec);
+//
+//    LinearIterator it(linearAccess, gi);
+//    for (int l = 0; l <= 3; l++) {
+//        std::cout << it.particles_level_begin(l) << " " << it.particles_level_end(l) << std::endl;
+//    }
+//    std::cout << "NumOfParticles: " << gi.total_number_particles << std::endl;
+//
+//    std::cout << "===========================\n";
+//    for (int level = it.level_min(); level <= it.level_max(); ++level) {
+//        for (int z = 0; z < it.z_num(level); z++) {
+//            for (int x = 0; x < it.x_num(level); ++x) {
+//                for (it.begin(level, z, x); it < it.end(); it++) {
+//                    std::cout << "              " << z << " " << x << " " << it.y() << " " << level << std::endl;
+//                }
+//            }
+//        }
+//    }
+//    std::cout << std::endl;
+//}
+
+// *********************************************************************************************************************
+// Tests of CUDA implementation of LinearAccess
+// *********************************************************************************************************************
+
+
+TEST(LinearAccessCudaTest, optimizationForSmallLevels) {
+    // Tests optimized part of LinearAccess returning full-resolution for levels <= 2
+
+    // --- Create input data structures and objects
+    GenInfo gi;
+    gi.init(4, 3, 2);
+    auto pct = makePCT(gi, {}); // In that case values of PCT are not important  (all dense particle data will be generated anyway)
+
+    APRParameters par;
+    par.neighborhood_optimization = true;
+
+    // --- Method under test
+    auto linearAccess = initializeLinearStructureCuda(gi, par, pct);
+
+    // ---- Verify output
+    std::vector<uint16_t> expected_y_vec = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; // all 'y' particles for each xz
+    std::vector<uint64_t> expected_xz_end_vec = {0, 0, 0, 4, 8, 12, 16, 20, 24};
+    std::vector<uint64_t> expected_level_xz_vec = {1, 1, 3, 9};
+
+    EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0);
+    EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0);
+    // Useful during debugging and can be removed once finished
+    EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0);
+
+    EXPECT_EQ(gi.total_number_particles, expected_y_vec.size());
+    EXPECT_EQ(gi.total_number_particles, 4 * 3 * 2);
+}
+
+TEST(LinearAccessCudaTest, optimizationForSmallLevelsVScpu) {
+    // Tests optimized part of LinearAccess returning full-resolution for levels <= 2 for all possible combination of xyz
+    // For bigger xyz 'optimized' part of code is not used
+
+    for (int x = 1; x <= 4; ++x) {
+        for (int y = 1; y <= 4; ++y) {
+            for (int z = 1; z <= 4; ++z) {
+//                std::cout << "< ============================================= " << x << " " << y << " "<< z << std::endl;
+                // --- Create input data structures and objects
+                GenInfo gi;
+                gi.init(y, x, z);
+
+                auto pct = makePCT(gi, {}); // In that case values of PCT are not important  (all dense particle data will be generated anyway)
+                GenInfo giGpu;
+                giGpu.init(y, x, z);
+                auto pctGpu = makePCT(giGpu, {}); // In that case values of PCT are not important  (all dense particle data will be generated anyway)
+
+                LinearAccess linearAccess;
+                linearAccess.genInfo = &gi;
+                APRParameters par;
+                par.neighborhood_optimization = true;
+
+                // --- Method under test
+                linearAccess.initialize_linear_structure(par, pct);
+                auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pctGpu);
+
+                EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0);
+                EXPECT_EQ(compareParticles(linearAccessGpu.xz_end_vec, linearAccess.xz_end_vec), 0);
+                EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0);
+
+                EXPECT_EQ(giGpu.total_number_particles, gi.total_number_particles);
+                EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size());
+            }
+        }
+    }
+
+}
+
+TEST(LinearAccessCudaTest, testGPUvsCPUforDifferentSizes) {
+
+    for (int x : {1, 2, 4, 100, 255}) {
+        for (int y : {1, 2, 4, 100, 256}) {
+            for (int z : {1, 2, 4, 100, 257}) {
+//                std::cout << "< ============================================= " << y << " " << x << " "<< z << std::endl;
+
+                // ----------- Create input data structures and objects
+                GenInfo gi;
+                gi.init(y, x, z);
+
+                auto pct = makeRandomPCT(gi, 133);
+
+                auto pctCpu = copyPCT(pct);
+                auto pctGpu = copyPCT(pct);
+
+                GenInfo giGpu;
+                giGpu.init(y, x, z);
+
+                LinearAccess linearAccess;
+                linearAccess.genInfo = &gi;
+                APRParameters par;
+                par.neighborhood_optimization = true;
+
+
+                // --------- methods under test
+                APRTimer t(false);
+                t.start_timer("__________________________ CPU");
+                // --- Method under test
+                linearAccess.initialize_linear_structure(par, pctCpu);
+                t.stop_timer();
+
+                t.start_timer("_________________________ GPU");
+                auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pctGpu);
+                t.stop_timer();
+
+
+                // ----------- verify results
+
+                // LinearAccess changes PCT - compare if changes in CPU and GPU side are same
+                EXPECT_EQ(compareParticleCellTrees(pctCpu, pctGpu), 0);
+
+                // Test if returned structures have same data
+                EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0);
+                EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0);
+                EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0);
+
+                EXPECT_EQ(giGpu.total_number_particles, gi.total_number_particles);
+                EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size());
+            }
+        }
+    }
+
+}
+
+
+int main(int argc, char **argv) {
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/test/LinearAccessTest.cpp b/test/LinearAccessTest.cpp
new file mode 100644
index 00000000..b6c67db8
--- /dev/null
+++ b/test/LinearAccessTest.cpp
@@ -0,0 +1,248 @@
+#include <gtest/gtest.h>
+
+#include "algorithm/PullingScheme.hpp"
+#include "algorithm/LocalParticleCellSet.hpp"
+
+#include "TestTools.hpp"
+
+
+/**
+ * Create PCT with provided data
+ * @param aprInfo
+ * @param levels complete list of values from level min to level max in form { {level, min, values}, ..., {level, max, values} }
+ * @return Particle Cell Tree with values
+ */
+auto makePCT(const GenInfo &aprInfo, std::initializer_list<std::initializer_list<int>> levels) {
+    auto pct = PullingScheme::generateParticleCellTree(aprInfo);
+
+
+    int l = aprInfo.l_min;
+
+    // PS levels range is [l_max - 1, l_min]
+    if (((aprInfo.l_max - 1) - aprInfo.l_min + 1) != (int) levels.size()) {
+        throw std::runtime_error("Wrong number of level data provided!");
+    }
+    for (auto &level : levels) {
+        if (pct[l].getDimension().size() != level.size()) {
+            std::cerr << "Provided data for level=" << l << " differs from level size " << pct[l].getDimension().size() << " vs. " << level.size() << std::endl;
+            std::cerr << aprInfo << std::endl;
+            throw std::runtime_error("Not this time...");
+        }
+        std::copy(level.begin(), level.end(), pct[l].mesh.begin());
+        l++;
+    }
+
+    return pct;
+}
+
+TEST(LinearAccessTest, optimizationForSmallLevels) {
+
+    // --- Create input data structures and objects
+    GenInfo gi;
+    gi.init(4, 3, 2);
+    auto pct = makePCT(gi, {{1, 2, 3, 4}}); // In that case values of PCT are not important  (all dense particle data will be generated anyway)
+
+    LinearAccess linearAccess;
+    linearAccess.genInfo = &gi;
+    APRParameters par;
+    par.neighborhood_optimization = true;
+
+    // --- Method under test
+    linearAccess.initialize_linear_structure(par, pct);
+
+    // ---- Verify output
+    std::vector<uint16_t> expected_y_vec = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; // all 'y' particles for each xz
+    std::vector<uint64_t> expected_xz_end_vec = {0, 0, 0, 4, 8, 12, 16, 20, 24};
+    std::vector<uint64_t> expected_level_xz_vec = {1, 1, 3, 9};
+
+    EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0);
+    EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0);
+    EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0);
+
+    EXPECT_EQ(gi.total_number_particles, expected_y_vec.size());
+}
+
+TEST(LinearAccessTest, yDirNeighbourhoodOptTrue) {
+
+    // --- Create input data structures and objects
+    GenInfo gi;
+    gi.init(16, 1, 1);
+
+    auto pct = makePCT(gi, {{0, 0},
+                            {0, 0, 3, 3},
+                            {1, 2, 3, 3, 0, 0, 0, 0}});
+
+    LinearAccess linearAccess;
+    linearAccess.genInfo = &gi;
+    APRParameters par;
+    par.neighborhood_optimization = true;
+
+    // --- Method under test
+    linearAccess.initialize_linear_structure(par, pct);
+
+    // ---- Verify output
+    std::vector<uint16_t> expected_y_vec = {2, 3, 1, 2, 3, 0, 1};
+    std::vector<uint64_t> expected_xz_end_vec = {0, 0, 2, 5, 7};
+    std::vector<uint64_t> expected_level_xz_vec = {1, 1, 2, 3, 4, 5};
+
+    EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0);
+    EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0);
+    EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0);
+
+    EXPECT_EQ(gi.total_number_particles, expected_y_vec.size());
+}
+
+TEST(LinearAccessTest, yDirNeighbourhoodOptFalse) {
+
+    // --- Create input data structures and objects
+    GenInfo gi;
+    gi.init(16, 1, 1);
+
+    auto pct = makePCT(gi, {{0, 0},
+                            {0, 0, 3, 3},
+                            {1, 2, 3, 3, 0, 0, 0, 0}});
+
+    LinearAccess linearAccess;
+    linearAccess.genInfo = &gi;
+    APRParameters par;
+    par.neighborhood_optimization = false;
+
+    // --- Method under test
+    linearAccess.initialize_linear_structure(par, pct);
+
+    // ---- Verify output
+    std::vector<uint16_t> expected_y_vec = {2, 3, 2, 3, 0, 1, 2, 3};
+    std::vector<uint64_t> expected_xz_end_vec = {0, 0, 2, 4, 8};
+    std::vector<uint64_t> expected_level_xz_vec = {1, 1, 2, 3, 4, 5};
+
+    EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0);
+    EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0);
+    EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0);
+
+    EXPECT_EQ(gi.total_number_particles, expected_y_vec.size());
+}
+
+TEST(LinearAccessTest, xDirNeighbourhoodOptTrue) {
+
+    // --- Create input data structures and objects
+    GenInfo gi;
+    gi.init(1, 16, 1);
+
+    auto pct = makePCT(gi, {{0, 0},
+                            {0, 0, 3, 3},
+                            {1, 2, 3, 3, 0, 0, 0, 0}});
+
+    LinearAccess linearAccess;
+    linearAccess.genInfo = &gi;
+    APRParameters par;
+    par.neighborhood_optimization = true;
+
+    // --- Method under test
+    linearAccess.initialize_linear_structure(par, pct);
+
+    // ---- Verify output
+    std::vector<uint16_t> expected_y_vec = {0, 0, 0, 0, 0, 0, 0};
+    std::vector<uint64_t> expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
+    std::vector<uint64_t> expected_level_xz_vec = {1, 1, 3, 7, 15, 31};
+
+    EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0);
+    EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0);
+    EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0);
+
+    EXPECT_EQ(gi.total_number_particles, expected_y_vec.size());
+}
+
+TEST(LinearAccessTest, xDirNeighbourhoodOptFalse) {
+
+    // --- Create input data structures and objects
+    GenInfo gi;
+    gi.init(1, 16, 1);
+
+    auto pct = makePCT(gi, {{0, 0},
+                            {0, 0, 3, 3},
+                            {1, 2, 3, 3, 0, 0, 0, 0}});
+
+    LinearAccess linearAccess;
+    linearAccess.genInfo = &gi;
+    APRParameters par;
+    par.neighborhood_optimization = false;
+
+    // --- Method under test
+    linearAccess.initialize_linear_structure(par, pct);
+
+    // ---- Verify output
+    std::vector<uint16_t> expected_y_vec = {0, 0, 0, 0, 0, 0, 0, 0};
+    std::vector<uint64_t> expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+    std::vector<uint64_t> expected_level_xz_vec = {1, 1, 3, 7, 15, 31};
+
+    EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0);
+    EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0);
+    EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0);
+
+    EXPECT_EQ(gi.total_number_particles, expected_y_vec.size());
+}
+
+TEST(LinearAccessTest, zDirNeighbourhoodOptTrue) {
+
+    // --- Create input data structures and objects
+    GenInfo gi;
+    gi.init(1, 1, 16);
+
+    auto pct = makePCT(gi, {{0, 0},
+                            {0, 0, 3, 3},
+                            {1, 2, 3, 3, 0, 0, 0, 0}});
+
+    LinearAccess linearAccess;
+    linearAccess.genInfo = &gi;
+    APRParameters par;
+    par.neighborhood_optimization = true;
+
+    // --- Method under test
+    linearAccess.initialize_linear_structure(par, pct);
+
+    // ---- Verify output
+    std::vector<uint16_t> expected_y_vec = {0, 0, 0, 0, 0, 0, 0};
+    std::vector<uint64_t> expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
+    std::vector<uint64_t> expected_level_xz_vec = {1, 1, 3, 7, 15, 31};
+
+    EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0);
+    EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0);
+    EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0);
+
+    EXPECT_EQ(gi.total_number_particles, expected_y_vec.size());
+}
+
+TEST(LinearAccessTest, zDirNeighbourhoodOptFalse) {
+
+    // --- Create input data structures and objects
+    GenInfo gi;
+    gi.init(1, 1, 16);
+
+    auto pct = makePCT(gi, {{0, 0},
+                            {0, 0, 3, 3},
+                            {1, 2, 3, 3, 0, 0, 0, 0}});
+
+    LinearAccess linearAccess;
+    linearAccess.genInfo = &gi;
+    APRParameters par;
+    par.neighborhood_optimization = false;
+
+    // --- Method under test
+    linearAccess.initialize_linear_structure(par, pct);
+
+    // ---- Verify output
+    std::vector<uint16_t> expected_y_vec = {0, 0, 0, 0, 0, 0, 0, 0};
+    std::vector<uint64_t> expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+    std::vector<uint64_t> expected_level_xz_vec = {1, 1, 3, 7, 15, 31};
+
+    EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0);
+    EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0);
+    EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0);
+
+    EXPECT_EQ(gi.total_number_particles, expected_y_vec.size());
+}
+
+int main(int argc, char **argv) {
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp
new file mode 100644
index 00000000..39f8ff22
--- /dev/null
+++ b/test/LocalIntensityScaleCudaTest.cpp
@@ -0,0 +1,643 @@
+
+#include <gtest/gtest.h>
+
+#include "algorithm/LocalIntensityScaleCuda.h"
+#include "algorithm/LocalIntensityScale.hpp"
+#include "TestTools.hpp"
+#include "data_structures/Mesh/PixelDataCuda.h"
+
+namespace {
+
+#ifdef APR_USE_CUDA
+
+    TEST(LocalIntensityScaleCudaTest, CPU_AND_GPU_TEST_X_DIR_VS_MANUALLY_CALCULATED_VALUES) {
+        // Belows data is precomputed for x-len = 5 (and maximum offset = 4) so do not change these numbers!
+        constexpr PixelDataDim const dim{1, 5, 1};
+        float expectedData[2][5][dim.x] = {
+                        {   // with no boundary values
+                            {1.00, 2.00, 3.00, 4.00, 5.00},  // offset = 0
+                            {1.50, 2.00, 3.00, 4.00, 4.50},  // offset = 1
+                            {2.00, 2.50, 3.00, 3.50, 4.00},  // offset = 2
+                            {2.50, 3.00, 3.00, 3.00, 3.50},  // offset = 3
+                            {3.00, 3.00, 3.00, 3.00, 3.00}   // offset = 4
+                        },
+                        {   // with boundary values
+                            {1.00, 2.00, 3.00, 4.00, 5.00},
+                            {1.66, 2.00, 3.00, 4.00, 4.33},
+                            {2.20, 2.40, 3.00, 3.60, 3.80},
+                            {2.71, 2.85, 3.00, 3.14, 3.28},
+                            {3.22, 3.11, 3.00, 2.88, 2.77}
+                        }
+                    };
+
+        APRTimer timer(false); // set to true to see timings
+
+        PixelData<float> m(dim);
+        float dataIn[] = {1, 2, 3, 4, 5};
+        initFromZYXarray(m, dataIn);
+
+        LocalIntensityScale lis;
+
+        for (int boundary = 0; boundary <= 1; ++ boundary) {
+            // boundary = 0 there is no reflected boundary
+            // boudnary = 1 there is boundary reflect
+            for (int offset = 0; offset <= 4; ++offset) {
+//                std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl;
+
+                // Run on CPU
+                PixelData<float> mCpu(m, true);
+                timer.start_timer("CPU mean X-DIR");
+                lis.calc_sat_mean_x(mCpu, offset, (boundary > 0));
+                timer.stop_timer();
+
+                // Run on GPU
+                PixelData<float> mGpu(m, true);
+                timer.start_timer("GPU mean X-DIR");
+                calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0));
+                timer.stop_timer();
+
+                // Compare results
+                PixelData<float> expected(dim);
+                initFromZYXarray(expected, expectedData[boundary][offset]);
+                EXPECT_EQ(compareMeshes(expected, mGpu, 0.01), 0);
+                EXPECT_EQ(compareMeshes(expected, mCpu, 0.01), 0);
+
+                // Also GPU and CPU should give exactly same output
+                EXPECT_EQ(compareMeshes(mGpu, mCpu, 0), 0);
+            }
+        }
+    }
+
+    TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_X_DIR_RANDOM_VALUES) {
+        APRTimer timer(false);
+
+        constexpr PixelDataDim const dim{49, 53, 51};
+        PixelData<float> m = getRandInitializedMesh<float>(dim, 50, 10);
+
+        LocalIntensityScale lis;
+
+        for (int boundary = 0; boundary <= 1; ++ boundary) {
+            // boundary = 0 there is no reflected boundary
+            // boudnary = 1 there is boundary reflect
+            for (int offset = 0; offset <= 6; ++offset) {
+//                std::cout << "------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl;
+
+                PixelData<float> mCpu;
+                mCpu.init(m);
+                mCpu.copyFromMesh(m);
+                timer.start_timer("CPU mean X-DIR");
+                lis.calc_sat_mean_x(mCpu, offset, (boundary > 0));
+                timer.stop_timer();
+
+                // Run on GPU
+                PixelData<float> mGpu(m, true);
+                timer.start_timer("GPU mean X-DIR");
+                calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0));
+                timer.stop_timer();
+
+                // Compare results
+                EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); // Expect exactly same results
+            }
+        }
+    }
+
+    /**
+     * Generate input and expected output using easy brute force approach.
+     * When comparing vs CPU or GPU outputs there is small error expected since little difference in order of float
+     * operations.
+     * @tparam T - type of generated data
+     * @param len - length
+     * @param offset - offset for which expected output should be calculated
+     * @param boundary - use boundary?
+     * @param useRandomNumbers - use random numbers or if false then index numbers in buffers [1..len]
+     * @return tuple of [input, expectedOutput]
+     */
+    template <typename T>
+    auto generateInputAndExpected(int len, int offset, bool boundary, bool useRandomNumbers) {
+        std::vector<T> input(len);
+        std::vector<T> expected(len);
+
+        std::random_device rd;
+        std::mt19937 mt(rd());
+        std::uniform_real_distribution<double> dist(0.0, 10.0);
+
+        // Feel input and calculate expected data
+        for (int i = 0; i < len; ++i) input[i] = useRandomNumbers ? dist(mt) : i + 1;
+
+        for (int i = 0; i < len; ++i) {
+            int count = 0;
+            T sum = 0;
+            for (int x = i - offset; x <= i + offset; ++x) {
+                int currIdx = x;
+                if (boundary) {
+                    currIdx = abs(x);
+                    if (currIdx > len - 1) currIdx = (len - 1) - (currIdx - (len - 1));
+                }
+
+                if (currIdx < 0 || currIdx >= len) continue;
+
+                sum += input[currIdx];
+                count++;
+            }
+            expected[i] = sum / count;
+        }
+        return std::make_tuple(input, expected);
+    }
+
+    TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_PRECOMPUTED_VALUES_X_DIR) {
+        // Input params
+        using T = float;
+
+        for (int b = 0; b <= 1; b++) {
+            for (int len = 5; len <= 45; len += 20) {
+                for (int offset = 0; offset <= 6 && offset < len; offset++) {
+                    for (int r = 0; r <= 1; r++) {
+                        bool hasBoundary = b > 0;
+                        bool useRandomNumbers = r > 0;
+//                        std::cout << "========================> len=" << len << " offset=" << offset << " hasBoundary=" << hasBoundary << " useRandomNumbers=" << useRandomNumbers << std::endl;
+
+                        auto t = generateInputAndExpected<T>(len, offset, hasBoundary, useRandomNumbers);
+                        auto input = std::get<0>(t);
+                        auto expected = std::get<1>(t);
+                        PixelData<T> m(1, len, 1, 0);
+                        initFromZYXarray(m, input.data());
+                        PixelData<T> expectedMesh(1, len, 1, 0);
+                        initFromZYXarray(expectedMesh, expected.data());
+
+                        APRTimer timer(false);
+                        LocalIntensityScale lis;
+
+                        // Run on CPU old-impl
+                        timer.start_timer("CPU X-DIR");
+                        PixelData<T> mCpu(m, true);
+                        lis.calc_sat_mean_x(mCpu, offset, hasBoundary);
+                        timer.stop_timer();
+
+                        // Run on GPU
+                        PixelData<T> mGpu(m, true);
+                        timer.start_timer("GPU X-DIR");
+                        calcMean(mGpu, offset, MEAN_X_DIR, hasBoundary);
+                        timer.stop_timer();
+
+                        // expectedMesh because of different order of calculation will have small floating-point differences
+                        // comparing to CPU or GPU fast implementation, anyway GPU and CPU should have exactly same values!
+                        EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match";
+                        EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match";
+                        EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0) << "---!!!!!!--- CPU vs GPU values does not match";
+                    }
+                }
+            }
+        }
+    }
+
+    TEST(LocalIntensityScaleCudaTest, CPU_AND_GPU_TEST_Z_DIR_VS_MANUALLY_CALCULATED_VALUES) {
+        // Belows data is precomputed for x-len = 5 (and maximum offset = 4) so do not change these numbers!
+        constexpr PixelDataDim const dim{1, 1, 5};
+        float expectedData[2][5][dim.z] = {
+                {   // with no boundary values
+                        {1.00, 2.00, 3.00, 4.00, 5.00},  // offset = 0
+                        {1.50, 2.00, 3.00, 4.00, 4.50},  // offset = 1
+                        {2.00, 2.50, 3.00, 3.50, 4.00},  // offset = 2
+                        {2.50, 3.00, 3.00, 3.00, 3.50},  // offset = 3
+                        {3.00, 3.00, 3.00, 3.00, 3.00}   // offset = 4
+                },
+                {   // with boundary values
+                        {1.00, 2.00, 3.00, 4.00, 5.00},
+                        {1.66, 2.00, 3.00, 4.00, 4.33},
+                        {2.20, 2.40, 3.00, 3.60, 3.80},
+                        {2.71, 2.85, 3.00, 3.14, 3.28},
+                        {3.22, 3.11, 3.00, 2.88, 2.77}
+                }
+        };
+
+        APRTimer timer(false); // set to true to see timings
+
+        PixelData<float> m(dim);
+        float dataIn[] = {1, 2, 3, 4, 5};
+        initFromZYXarray(m, dataIn);
+
+        LocalIntensityScale lis;
+
+        for (int boundary = 0; boundary <= 1; ++ boundary) {
+            // boundary = 0 there is no reflected boundary
+            // boudnary = 1 there is boundary reflect
+            for (int offset = 0; offset <= 4; ++offset) {
+//                std::cout << "------------------ OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl;
+
+                // Run on CPU
+                PixelData<float> mCpu(m, true);
+                timer.start_timer("CPU mean X-DIR");
+                lis.calc_sat_mean_z(mCpu, offset, (boundary > 0));
+                timer.stop_timer();
+
+                // Run on GPU
+                PixelData<float> mGpu(m, true);
+                timer.start_timer("GPU mean X-DIR");
+                calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0));
+                timer.stop_timer();
+
+                // Compare results
+                PixelData<float> expected(dim);
+                initFromZYXarray(expected, expectedData[boundary][offset]);
+                EXPECT_EQ(compareMeshes(expected, mGpu, 0.01), 0);
+                EXPECT_EQ(compareMeshes(expected, mCpu, 0.01), 0);
+
+                // Also GPU and CPU should give exactly same output
+                EXPECT_EQ(compareMeshes(mGpu, mCpu, 0), 0);
+            }
+        }
+    }
+
+    TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_Z_DIR_RANDOM_VALUES) {
+        APRTimer timer(false);
+
+        constexpr PixelDataDim const dim{49, 51, 53};
+        PixelData<float> m = getRandInitializedMesh<float>(dim, 50, 10);
+
+        LocalIntensityScale lis;
+
+        for (int boundary = 0; boundary <= 1; ++ boundary) {
+            // boundary = 0 there is no reflected boundary
+            // boudnary = 1 there is boundary reflect
+            for (int offset = 0; offset <= 6; ++offset) {
+//                std::cout << "---------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl;
+
+                PixelData<float> mCpu;
+                mCpu.init(m);
+                mCpu.copyFromMesh(m);
+                timer.start_timer("CPU mean Z-DIR");
+                lis.calc_sat_mean_z(mCpu, offset, (boundary > 0));
+                timer.stop_timer();
+
+                // Run on GPU
+                PixelData<float> mGpu(m, true);
+                timer.start_timer("GPU mean Z-DIR");
+                calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0));
+                timer.stop_timer();
+
+                // Compare results
+                EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+            }
+        }
+    }
+
+    TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_PRECOMPUTED_VALUES_Z_DIR) {
+        // Input params
+        using T = float;
+
+        for (int b = 0; b <= 1; b++) {
+            for (int len = 5; len <= 45; len += 20) {
+                for (int offset = 0; offset <= 6 && offset < len; offset++) {
+                    for (int r = 0; r <= 1; r++) {
+                        bool hasBoundary = b > 0;
+                        bool useRandomNumbers = r > 0;
+//                        std::cout << "========================> len=" << len << " offset=" << offset << " hasBoundary=" << hasBoundary << " useRandomNumbers=" << useRandomNumbers << std::endl;
+
+                        auto t = generateInputAndExpected<T>(len, offset, hasBoundary, useRandomNumbers);
+                        auto input = std::get<0>(t);
+                        auto expected = std::get<1>(t);
+                        PixelData<T> m(1, 1, len, 0);
+                        initFromZYXarray(m, input.data());
+                        PixelData<T> expectedMesh(1, 1, len, 0);
+                        initFromZYXarray(expectedMesh, expected.data());
+
+                        APRTimer timer(false);
+                        LocalIntensityScale lis;
+
+                        // Run on CPU old-impl
+                        timer.start_timer("CPU Z-DIR");
+                        PixelData<T> mCpu(m, true);
+                        lis.calc_sat_mean_z(mCpu, offset, hasBoundary);
+                        timer.stop_timer();
+
+                        // Run on GPU
+                        PixelData<T> mGpu(m, true);
+                        timer.start_timer("GPU Z-DIR");
+                        calcMean(mGpu, offset, MEAN_Z_DIR, hasBoundary);
+                        timer.stop_timer();
+
+                        // expectedMesh because of different order of calculation will have small floating-point differences
+                        // comparing to CPU or GPU fast implementation, anyway GPU and CPU should have exactly same values!
+                        EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match";
+                        EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match";
+                        EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0), 0) << "---!!!!!!--- CPU vs GPU values does not match";
+                    }
+                }
+            }
+        }
+    }
+
+    TEST(LocalIntensityScaleCudaTest, CPU_AND_GPU_TEST_Y_DIR_VS_MANUALLY_CALCULATED_VALUES) {
+        // Belows data is precomputed for y_len = 5 (and maximum offset = 4) so do not change these numbers!
+        constexpr PixelDataDim const dim{5, 1, 1};
+        float expectedData[2][5][dim.y] = {
+                {   // with no boundary values
+                        {1.00, 2.00, 3.00, 4.00, 5.00},  // offset = 0
+                        {1.50, 2.00, 3.00, 4.00, 4.50},  // offset = 1
+                        {2.00, 2.50, 3.00, 3.50, 4.00},  // offset = 2
+                        {2.50, 3.00, 3.00, 3.00, 3.50},  // offset = 3
+                        {3.00, 3.00, 3.00, 3.00, 3.00}   // offset = 4
+                },
+                {   // with boundary values
+                        {1.00, 2.00, 3.00, 4.00, 5.00},
+                        {1.66, 2.00, 3.00, 4.00, 4.33},
+                        {2.20, 2.40, 3.00, 3.60, 3.80},
+                        {2.71, 2.85, 3.00, 3.14, 3.28},
+                        {3.22, 3.11, 3.00, 2.88, 2.77}
+                }
+        };
+
+        APRTimer timer(false); // set to true to see timings
+
+        PixelData<float> m(dim);
+        float dataIn[] = {1, 2, 3, 4, 5};
+        initFromZYXarray(m, dataIn);
+
+        LocalIntensityScale lis;
+
+        for (int boundary = 0; boundary <= 1; ++ boundary) {
+            // boundary = 0 there is no reflected boundary
+            // boudnary = 1 there is boundary reflect
+            for (int offset = 0; offset <= 4; ++offset) {
+                // std::cout << "------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl;
+
+                // Run on CPU
+                PixelData<float> mCpu(m, true);
+                timer.start_timer("CPU mean Y-DIR");
+                lis.calc_sat_mean_y(mCpu, offset, (boundary > 0));
+                timer.stop_timer();
+
+                // Run on GPU
+                PixelData<float> mGpu(m, true);
+                timer.start_timer("GPU mean Y-DIR");
+                calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0));
+                timer.stop_timer();
+
+                // Compare results
+                PixelData<float> expected(dim);
+                initFromZYXarray(expected, expectedData[boundary][offset]);
+                EXPECT_EQ(compareMeshes(expected, mGpu, 0.01), 0);
+                EXPECT_EQ(compareMeshes(expected, mCpu, 0.01), 0);
+
+                // Also GPU and CPU should give exactly same output
+                EXPECT_EQ(compareMeshes(mGpu, mCpu, 0), 0);
+            }
+        }
+    }
+
+    TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_Y_DIR_RANDOM_VALUES) {
+        APRTimer timer(false);
+
+        constexpr PixelDataDim const dim{49, 51, 53};
+        PixelData<float> m = getRandInitializedMesh<float>(dim, 2, 0,false);
+
+        LocalIntensityScale lis;
+
+        for (int boundary = 0; boundary <= 1; ++ boundary) {
+            // boundary = 0 there is no reflected boundary
+            // boudnary = 1 there is boundary reflect
+            for (int offset = 0; offset <= 6; ++offset) {
+//                std::cout << "---------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl;
+
+                PixelData<float> mCpu(m, true);
+                timer.start_timer("CPU mean Y-DIR");
+                lis.calc_sat_mean_y(mCpu, offset, (boundary > 0));
+                timer.stop_timer();
+
+                // Run on GPU
+                PixelData<float> mGpu(m, true);
+                timer.start_timer("GPU mean Y-DIR");
+                calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0));
+                timer.stop_timer();
+
+                // Compare results
+                EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+            }
+        }
+    }
+
+    TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_PRECOMPUTED_VALUES_Y_DIR) {
+        // Input params
+        using T = float;
+
+        for (int b = 0; b <= 1; b++) {
+            for (int len = 5; len <= 45; len += 20) {
+                for (int offset = 0; offset <= 6 && offset < len; offset++) {
+                    for (int r = 0; r <= 1; r++) {
+                        bool hasBoundary = b > 0;
+                        bool useRandomNumbers = r > 0;
+//                        std::cout << "========================> len=" << len << " offset=" << offset << " hasBoundary=" << hasBoundary << " useRandomNumbers=" << useRandomNumbers << std::endl;
+
+                        auto t = generateInputAndExpected<T>(len, offset, hasBoundary, useRandomNumbers);
+                        auto input = std::get<0>(t);
+                        auto expected = std::get<1>(t);
+                        PixelData<T> m(len, 1, 1, 0);
+                        initFromZYXarray(m, input.data());
+                        PixelData<T> expectedMesh(len, 1, 1, 0);
+                        initFromZYXarray(expectedMesh, expected.data());
+
+                        APRTimer timer(false);
+                        LocalIntensityScale lis;
+
+                        // Run on CPU old-impl
+                        timer.start_timer("CPU Y-DIR");
+                        PixelData<T> mCpu(m, true);
+                        lis.calc_sat_mean_y(mCpu, offset, hasBoundary);
+                        timer.stop_timer();
+
+                        // Run on GPU
+                        PixelData<T> mGpu(m, true);
+                        timer.start_timer("GPU Y-DIR");
+                        calcMean(mGpu, offset, MEAN_Y_DIR, hasBoundary);
+                        timer.stop_timer();
+
+                        // expectedMesh because of different order of calculation will have small floating-point differences
+                        // comparing to CPU or GPU fast implementation, anyway GPU and CPU should have exactly same values!
+                        EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match";
+                        EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match";
+                        EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0) << "---!!!!!!--- CPU vs GPU values does not match";
+                    }
+                }
+            }
+        }
+    }
+
+    TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) {
+        APRTimer timer(false);
+        PixelData<float> m = getRandInitializedMesh<float>(33, 32, 31);
+
+        LocalIntensityScale lis;
+        for (int boundary = 0; boundary <= 1; boundary++) {
+            for (int offset = 0; offset <= 6; ++offset) {
+                bool hasBoundary = (boundary > 0);
+//                std::cout << "========================> " << " offset=" << offset << " hasBoundary=" << hasBoundary << std::endl;
+
+                // Run on CPU
+                PixelData<float> mCpu(m, true);
+                timer.start_timer("CPU mean ALL-DIR");
+                lis.calc_sat_mean_y(mCpu, offset, hasBoundary);
+                lis.calc_sat_mean_x(mCpu, offset, hasBoundary);
+                lis.calc_sat_mean_z(mCpu, offset, hasBoundary);
+                timer.stop_timer();
+
+                // Run on GPU
+                PixelData<float> mGpu(m, true);
+                timer.start_timer("GPU mean ALL-DIR");
+                calcMean(mGpu, offset, MEAN_ALL_DIR, hasBoundary);
+                timer.stop_timer();
+
+                // Compare results
+                EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+            }
+        }
+    }
+
+    TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS_UINT16) {
+        APRTimer timer(false);
+        PixelData<uint16_t> m = getRandInitializedMesh<uint16_t>(33, 31, 13);
+
+        LocalIntensityScale lis;
+        for (int boundary = 0; boundary <= 1; boundary++) {
+            for (int offset = 0; offset <= 6; ++offset) {
+                bool hasBoundary = (boundary > 0);
+//                std::cout << "========================> " << " offset=" << offset << " hasBoundary=" << hasBoundary << std::endl;
+
+                // Run on CPU
+                PixelData<uint16_t> mCpu(m, true);
+                timer.start_timer("CPU mean ALL-DIR");
+                lis.calc_sat_mean_y(mCpu, offset, hasBoundary);
+                lis.calc_sat_mean_x(mCpu, offset, hasBoundary);
+                lis.calc_sat_mean_z(mCpu, offset, hasBoundary);
+                timer.stop_timer();
+
+                // Run on GPU
+                PixelData<uint16_t> mGpu(m, true);
+                timer.start_timer("GPU mean ALL-DIR");
+                calcMean(mGpu, offset, MEAN_ALL_DIR, hasBoundary);
+                timer.stop_timer();
+
+                // Compare results
+                EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+            }
+        }
+    }
+
+    TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) {
+        APRTimer timer(false);
+
+        for (int boundary = 0; boundary <= 1; ++boundary) {
+            for (int r = 0; r <= 1; r++) {
+                bool hasBoundary = (boundary > 0);
+                bool useRandomNumbers = (r > 0);
+
+                PixelData<float> m = getRandInitializedMesh<float>(31, 33, 32, 25, 10, !useRandomNumbers);
+
+                APRParameters params;
+                params.sigma_th = 1;
+                params.sigma_th_max = 2;
+                params.reflect_bc_lis = hasBoundary;
+
+                // Run on CPU
+                PixelData<float> mCpu(m, true);
+                PixelData<float> mCpuTemp(m, false);
+                timer.start_timer("CPU LIS FULL");
+                LocalIntensityScale().get_local_intensity_scale(mCpu, mCpuTemp, params);
+                timer.stop_timer();
+
+                // Run on GPU
+                PixelData<float> mGpu(m, true);
+                PixelData<float> mGpuTemp(m, false);
+                timer.start_timer("GPU LIS FULL");
+                getLocalIntensityScale(mGpu, mGpuTemp, params);
+                timer.stop_timer();
+
+                // Compare results
+                EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0), 0);
+                EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+            }
+        }
+    }
+
+    TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE_SUPER_SMALL) {
+        // In case of very small input image like 2x2x2 constant scale is being used
+        APRTimer timer(false);
+
+        for (int boundary = 0; boundary <= 1; ++boundary) {
+            for (int r = 0; r <= 1; r++) {
+                bool hasBoundary = (boundary > 0);
+                bool useRandomNumbers = (r > 0);
+
+                PixelData<float> m = getRandInitializedMesh<float>(2,2,2, 25, 10, !useRandomNumbers);
+
+                APRParameters params;
+                params.sigma_th = 1;
+                params.sigma_th_max = 2;
+                params.reflect_bc_lis = hasBoundary;
+
+                // Run on CPU
+                PixelData<float> mCpu(m, true);
+                PixelData<float> mCpuTemp(m, false);
+                timer.start_timer("CPU LIS FULL");
+                LocalIntensityScale().get_local_intensity_scale(mCpu, mCpuTemp, params);
+                mCpu.printMesh(3,2);
+                timer.stop_timer();
+
+                // Run on GPU
+                PixelData<float> mGpu(m, true);
+                PixelData<float> mGpuTemp(m, false);
+                timer.start_timer("GPU LIS FULL");
+                getLocalIntensityScale(mGpu, mGpuTemp, params);
+                timer.stop_timer();
+
+                // Compare results - only mGPU mattters since mGpuTemp in case of constant scale is not modified
+                EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+            }
+        }
+    }
+
+    TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE_CONSTANT_SCALE) {
+        APRTimer timer(false);
+
+        for (int boundary = 0; boundary <= 1; ++boundary) {
+            for (int r = 0; r <= 1; r++) {
+                bool hasBoundary = (boundary > 0);
+                bool useRandomNumbers = (r > 0);
+
+                PixelData<float> m = getRandInitializedMesh<float>(31, 33, 32, 25, 10, !useRandomNumbers);
+
+                APRParameters params;
+                params.sigma_th = 1;
+                params.sigma_th_max = 2;
+                params.reflect_bc_lis = hasBoundary;
+                params.constant_intensity_scale = true;
+
+                // Run on CPU
+                PixelData<float> mCpu(m, true);
+                PixelData<float> mCpuTemp(m, false);
+                timer.start_timer("CPU LIS FULL");
+                LocalIntensityScale().get_local_intensity_scale(mCpu, mCpuTemp, params);
+                timer.stop_timer();
+
+                // Run on GPU
+                PixelData<float> mGpu(m, true);
+                PixelData<float> mGpuTemp(m, false);
+                timer.start_timer("GPU LIS FULL");
+                getLocalIntensityScale(mGpu, mGpuTemp, params);
+                timer.stop_timer();
+
+                // Compare results
+                // NOTE: mCpuTemp and mGpuTemp are not checked since in case of
+                //       constant_intensity_scale they are not set to any value
+                EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0);
+            }
+        }
+    }
+
+#endif // APR_USE_CUDA
+}
+
+
+int main(int argc, char **argv) {
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/test/LocalIntensityScaleTest.cpp b/test/LocalIntensityScaleTest.cpp
index a9f1558b..09a6466b 100644
--- a/test/LocalIntensityScaleTest.cpp
+++ b/test/LocalIntensityScaleTest.cpp
@@ -5,9 +5,6 @@
 #include <gtest/gtest.h>
 #include "data_structures/Mesh/PixelData.hpp"
 #include "algorithm/LocalIntensityScale.hpp"
-#include "algorithm/LocalIntensityScaleCuda.h"
-#include "data_structures/APR/APR.hpp"
-#include "algorithm/APRConverter.hpp"
 #include "TestTools.hpp"
 
 
@@ -24,7 +21,7 @@ namespace {
             LocalIntensityScale lis;
             lis.calc_sat_mean_y(m, 0);
 
-            ASSERT_TRUE(compare(m, expect, 0.05));
+            ASSERT_TRUE(compare(m, expect, 0.000001));
         }
         {   // OFFSET=1
 
@@ -37,7 +34,7 @@ namespace {
             LocalIntensityScale lis;
             lis.calc_sat_mean_y(m, 1);
 
-            ASSERT_TRUE(compare(m, expect, 0.05));
+            ASSERT_TRUE(compare(m, expect, 0.000001));
         }
         {   // OFFSET=2 (+symmetricity check)
 
@@ -50,7 +47,7 @@ namespace {
             LocalIntensityScale lis;
             lis.calc_sat_mean_y(m, 2);
 
-            ASSERT_TRUE(compare(m, expect, 0.05));
+            ASSERT_TRUE(compare(m, expect, 0.000001));
 
             // check if data in opposite order gives same result
             float dataIn2[] = {24,21,18,15,12,9,6,3};
@@ -60,7 +57,7 @@ namespace {
 
             lis.calc_sat_mean_y(m, 2);
 
-            ASSERT_TRUE(compare(m, expect2, 0.05));
+            ASSERT_TRUE(compare(m, expect2, 0.000001));
         }
     }
 
@@ -76,7 +73,7 @@ namespace {
             LocalIntensityScale lis;
             lis.calc_sat_mean_x(m, 0);
 
-            ASSERT_TRUE(compare(m, expect, 0.05));
+            ASSERT_TRUE(compare(m, expect, 0.000001));
         }
         {   // OFFSET=1
 
@@ -89,7 +86,7 @@ namespace {
             LocalIntensityScale lis;
             lis.calc_sat_mean_x(m, 1);
 
-            ASSERT_TRUE(compare(m, expect, 0.05));
+            ASSERT_TRUE(compare(m, expect, 0.000001));
         }
         {   // OFFSET=2 (+symmetricity check)
 
@@ -102,7 +99,7 @@ namespace {
             LocalIntensityScale lis;
             lis.calc_sat_mean_x(m, 2);
 
-            ASSERT_TRUE(compare(m, expect, 0.05));
+            ASSERT_TRUE(compare(m, expect, 0.000001));
 
             // check if data in opposite order gives same result
             float dataIn2[] = {24,21,18,15,12,9,6,3};
@@ -112,7 +109,7 @@ namespace {
 
             lis.calc_sat_mean_x(m, 2);
 
-            ASSERT_TRUE(compare(m, expect2, 0.05));
+            ASSERT_TRUE(compare(m, expect2, 0.000001));
         }
     }
 
@@ -128,7 +125,7 @@ namespace {
             LocalIntensityScale lis;
             lis.calc_sat_mean_z(m, 0);
 
-            ASSERT_TRUE(compare(m, expect, 0.05));
+            ASSERT_TRUE(compare(m, expect, 0.000001));
         }
         {   // OFFSET=1
 
@@ -141,7 +138,7 @@ namespace {
             LocalIntensityScale lis;
             lis.calc_sat_mean_z(m, 1);
 
-            ASSERT_TRUE(compare(m, expect, 0.05));
+            ASSERT_TRUE(compare(m, expect, 0.000001));
         }
         {   // OFFSET=2 (+symmetricity check)
 
@@ -154,7 +151,7 @@ namespace {
             LocalIntensityScale lis;
             lis.calc_sat_mean_z(m, 2);
 
-            ASSERT_TRUE(compare(m, expect, 0.05));
+            ASSERT_TRUE(compare(m, expect, 0.000001));
 
             // check if data in opposite order gives same result
             float dataIn2[] = {24,21,18,15,12,9,6,3};
@@ -164,221 +161,10 @@ namespace {
 
             lis.calc_sat_mean_z(m, 2);
 
-            ASSERT_TRUE(compare(m, expect2, 0.05));
+            ASSERT_TRUE(compare(m, expect2, 0.000001));
         }
     }
 
-
-// ============================================================================
-// ====================       CUDA IMPL TESTS     =============================
-// ============================================================================
-
-#ifdef APR_USE_CUDA
-
-    TEST(LocalIntensityScaleCudaTest, 1D_Y_DIR) {
-        {   // OFFSET=0
-
-            PixelData<float> m(8, 1, 1, 0);
-            float dataIn[] = {3,6,9,12,15,18,21,24};
-            float expect[] = {3,6,9,12,15,18,21,24};
-
-            initFromZYXarray(m, dataIn);
-
-            calcMean(m, 0, MEAN_Y_DIR);
-
-            ASSERT_TRUE(compare(m, expect, 0.05));
-        }
-        {   // OFFSET=1
-
-            PixelData<float> m(8, 1, 1, 0);
-            float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8};
-            float expect[] = {1.5, 2, 3, 4, 5, 6, 7, 7.5};
-
-            initFromZYXarray(m, dataIn);
-
-            calcMean(m, 1, MEAN_Y_DIR);
-
-            ASSERT_TRUE(compare(m, expect, 0.05));
-        }
-        {   // OFFSET=2 (+symmetricity check)
-
-            PixelData<float> m(8, 1, 1, 0);
-            float dataIn[] = {3,6,9,12,15,18,21,24};
-            float expect[] = {6, 7.5, 9, 12, 15, 18, 19.5, 21};
-
-            initFromZYXarray(m, dataIn);
-
-            calcMean(m, 2, MEAN_Y_DIR);
-
-            ASSERT_TRUE(compare(m, expect, 0.05));
-
-            // check if data in opposite order gives same result
-            float dataIn2[] = {24,21,18,15,12,9,6,3};
-            float expect2[] = {21, 19.5, 18, 15,12, 9, 7.5, 6};
-
-            initFromZYXarray(m, dataIn2);
-
-            calcMean(m, 2, MEAN_Y_DIR);
-
-            ASSERT_TRUE(compare(m, expect2, 0.05));
-        }
-    }
-
-    TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Y_DIR) {
-        APRTimer timer(true);
-        PixelData<float> m = getRandInitializedMesh<float>(33, 31, 13);
-
-        LocalIntensityScale lis;
-        for (int offset = 0; offset < 6; ++offset) {
-            // Run on CPU
-            PixelData<float> mCpu(m, true);
-            timer.start_timer("CPU mean Y-DIR");
-            lis.calc_sat_mean_y(mCpu, offset);
-            timer.stop_timer();
-
-            // Run on GPU
-            PixelData<float> mGpu(m, true);
-            timer.start_timer("GPU mean Y-DIR");
-            calcMean(mGpu, offset, MEAN_Y_DIR);
-            timer.stop_timer();
-
-            // Compare results
-            EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0);
-        }
-    }
-
-    TEST(LocalIntensityScaleCudaTest, 1GPU_VS_CPU_X_DIR) {
-        APRTimer timer(true);
-        PixelData<float> m = getRandInitializedMesh<float>(33, 31, 13);
-
-        LocalIntensityScale lis;
-        for (int offset = 0; offset < 6; ++offset) {
-            // Run on CPU
-            PixelData<float> mCpu(m, true);
-            timer.start_timer("CPU mean X-DIR");
-            lis.calc_sat_mean_x(mCpu, offset);
-            timer.stop_timer();
-
-            // Run on GPU
-            PixelData<float> mGpu(m, true);
-            timer.start_timer("GPU mean X-DIR");
-            calcMean(mGpu, offset, MEAN_X_DIR);
-            timer.stop_timer();
-
-            // Compare results
-            EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0);
-        }
-    }
-
-    TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Z_DIR) {
-        APRTimer timer(true);
-        using ImgType = float;
-        PixelData<ImgType> m = getRandInitializedMesh<ImgType>(310, 330, 13, 255);
-
-        LocalIntensityScale lis;
-        for (int offset = 0; offset < 6; ++offset) {
-            // Run on CPU
-            PixelData<ImgType> mCpu(m, true);
-            timer.start_timer("CPU mean Z-DIR");
-            lis.calc_sat_mean_z(mCpu, offset);
-            timer.stop_timer();
-
-            // Run on GPU
-            PixelData<ImgType> mGpu(m, true);
-            timer.start_timer("GPU mean Z-DIR");
-            calcMean(mGpu, offset, MEAN_Z_DIR);
-            timer.stop_timer();
-
-            // Compare results
-            EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0);
-        }
-    }
-
-    TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) {
-        APRTimer timer(true);
-        PixelData<float> m = getRandInitializedMesh<float>(33, 31, 13);
-
-        LocalIntensityScale lis;
-        for (int offset = 0; offset < 6; ++offset) {
-            // Run on CPU
-            PixelData<float> mCpu(m, true);
-            timer.start_timer("CPU mean ALL-DIR");
-            lis.calc_sat_mean_y(mCpu, offset);
-            lis.calc_sat_mean_x(mCpu, offset);
-            lis.calc_sat_mean_z(mCpu, offset);
-            timer.stop_timer();
-
-            // Run on GPU
-            PixelData<float> mGpu(m, true);
-            timer.start_timer("GPU mean ALL-DIR");
-            calcMean(mGpu, offset);
-            timer.stop_timer();
-
-            // Compare results
-            EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0);
-        }
-    }
-
-    //@KG: The CPU code doesn't work for uint16 --> overflow will likely result.
-
-//    TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS_UINT16) {
-//        APRTimer timer(true);
-//        PixelData<uint16_t> m = getRandInitializedMesh<uint16_t>(33, 31, 13);
-//
-//        LocalIntensityScale lis;
-//        for (int offset = 0; offset < 6; ++offset) {
-//            // Run on CPU
-//            PixelData<uint16_t> mCpu(m, true);
-//            timer.start_timer("CPU mean ALL-DIR");
-//            lis.calc_sat_mean_y(mCpu, offset);
-//            lis.calc_sat_mean_x(mCpu, offset);
-//            lis.calc_sat_mean_z(mCpu, offset);
-//            timer.stop_timer();
-//
-//            // Run on GPU
-//            PixelData<uint16_t> mGpu(m, true);
-//            timer.start_timer("GPU mean ALL-DIR");
-//            calcMean(mGpu, offset);
-//            timer.stop_timer();
-//
-//            // Compare results
-//            EXPECT_EQ(compareMeshes(mCpu, mGpu, 1), 0);
-//        }
-//    }
-
-    TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) {
-        APRTimer timer(true);
-        PixelData<float> m = getRandInitializedMesh<float>(310, 330, 13, 25);
-
-        APRParameters params;
-        params.sigma_th = 1;
-        params.sigma_th_max = 2;
-        params.reflect_bc_lis = false; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented.
-
-        // Run on CPU
-        PixelData<float> mCpu(m, true);
-        PixelData<float> mCpuTemp(m, false);
-        timer.start_timer("CPU LIS FULL");
-
-        LocalIntensityScale localIntensityScale;
-
-        localIntensityScale.get_local_intensity_scale(mCpu, mCpuTemp, params);
-        timer.stop_timer();
-
-        // Run on GPU
-        PixelData<float> mGpu(m, true);
-        PixelData<float> mGpuTemp(m, false);
-        timer.start_timer("GPU LIS ALL-DIR");
-        getLocalIntensityScale(mGpu, mGpuTemp, params);
-        timer.stop_timer();
-
-        // Compare results
-        //EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0.01), 0); //this is not needed these values are not required.
-        EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0);
-    }
-
-#endif // APR_USE_CUDA
-
 }
 
 int main(int argc, char **argv) {
diff --git a/test/MeshDataTest.cpp b/test/MeshDataTest.cpp
index 869229e3..20b1bbe3 100644
--- a/test/MeshDataTest.cpp
+++ b/test/MeshDataTest.cpp
@@ -5,6 +5,7 @@
 #include "data_structures/Mesh/PixelData.hpp"
 #include "data_structures/Mesh/PixelDataCuda.h"
 #include <random>
+#include "TestTools.hpp"
 
 namespace {
 
@@ -34,6 +35,7 @@ namespace {
             ASSERT_EQ(d.x, 20);
             ASSERT_EQ(d.z, 30);
             ASSERT_EQ(d.size(), 10*20*30);
+            ASSERT_EQ(d.maxDimSize(), 30);
         }
         { // adding int to all dims
 
@@ -80,6 +82,16 @@ namespace {
             ASSERT_FALSE(x == z);
             ASSERT_TRUE(x != z);
         }
+        {  // number of dimensions
+            const PixelDataDim x = {2, 3, 5};
+            const PixelDataDim y = {2, 1, 5};
+            const PixelDataDim z = {1, 4, 1};
+            const PixelDataDim w = {1, 1, 1};
+            ASSERT_EQ(x.numOfDimensions(), 3);
+            ASSERT_EQ(y.numOfDimensions(), 2);
+            ASSERT_EQ(z.numOfDimensions(), 1);
+            ASSERT_EQ(w.numOfDimensions(), 0);
+        }
     }
 
     TEST_F(VectorDataTest, InitTest) {
@@ -337,6 +349,16 @@ namespace {
             ASSERT_EQ(md.mesh.size(), 100*200*300);
         }
 
+        // size provided
+        {
+            PixelDataDim dim(100, 200, 300);
+            PixelData<int> md(dim);
+            ASSERT_EQ(md.x_num, 200);
+            ASSERT_EQ(md.y_num, 100);
+            ASSERT_EQ(md.z_num, 300);
+            ASSERT_EQ(md.mesh.size(), 100*200*300);
+        }
+
         // mesh provided
         {
             // generate some data
@@ -675,51 +697,7 @@ namespace {
 }
 
 #ifdef APR_USE_CUDA
-namespace {
-    /**
-     * Compares two meshes
-     * @param expected
-     * @param tested
-     * @param maxNumOfErrPrinted - how many error values should be printed (-1 for all)
-     * @return number of errors detected
-     */
-    template <typename T>
-    int compareMeshes(const PixelData<T> &expected, const PixelData<T> &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) {
-        int cnt = 0;
-        for (size_t i = 0; i < expected.mesh.size(); ++i) {
-            if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError || std::isnan(expected.mesh[i]) ||
-                std::isnan(tested.mesh[i])) {
-                if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) {
-                    std::cout << "ERROR expected vs tested mesh: " << expected.mesh[i] << " vs " << tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl;
-                }
-                cnt++;
-            }
-        }
-        std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl;
-        return cnt;
-    }
 
-    /**
- * Generates mesh with provided dims with random values in range [0, 1] * multiplier
- * @param y
- * @param x
- * @param z
- * @param multiplier
- * @return
- */
-    template <typename T>
-    PixelData<T> getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, bool useIdxNumbers = false) {
-        PixelData<T> m(y, x, z);
-        std::cout << "Mesh info: " << m << std::endl;
-        std::random_device rd;
-        std::mt19937 mt(rd());
-        std::uniform_real_distribution<double> dist(0.0, 1.0);
-        for (size_t i = 0; i < m.mesh.size(); ++i) {
-            m.mesh[i] = useIdxNumbers ? i : dist(mt) * multiplier;
-        }
-        return m;
-    }
-}
 TEST(MeshDataSimpleTest, DownSampleCuda) {
     {   // reduce/constant_operator calculate maximum value when downsampling
         PixelData<float> m(5, 6, 4);
@@ -773,10 +751,10 @@ TEST(MeshDataSimpleTest, DownSampleCuda) {
         EXPECT_EQ(compareMeshes(mCpu, mGpu), 0);
     }
     {
-        APRTimer timer(true);
+        APRTimer timer(false);
 
         // reduce/constant_operator calculate average value of pixels when downsampling
-        PixelData<float> m =  getRandInitializedMesh<float>(33, 22, 21);
+        PixelData<float> m =  getRandInitializedMesh<float>(33, 22, 21, 100, 5);
         for (size_t i = 0; i < m.mesh.size(); ++i) m.mesh[i] = 27 - i;
 
         PixelData<float> mCpu; mCpu.initDownsampled(m);
@@ -792,7 +770,7 @@ TEST(MeshDataSimpleTest, DownSampleCuda) {
         downsampleMeanCuda(m, mGpu);
         timer.stop_timer();
 
-        EXPECT_EQ(compareMeshes(mCpu, mGpu), 0);
+        EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0);
     }
 }
 #endif
diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp
new file mode 100644
index 00000000..bd24156e
--- /dev/null
+++ b/test/PullingSchemeCudaTest.cpp
@@ -0,0 +1,254 @@
+#include <gtest/gtest.h>
+
+#include "algorithm/PullingScheme.hpp"
+#include "algorithm/OVPC.h"
+
+#include "algorithm/PullingSchemeCuda.hpp"
+#include "algorithm/ComputeGradientCuda.hpp"
+#include "algorithm/LocalParticleCellSet.hpp"
+
+#include "TestTools.hpp"
+
+// Class for storing expected values for one element of Particle Cell Tree (output of Pulling Scheme)
+class LevelData  {
+public:
+    int level;
+    int y;
+    int x;
+    int z;
+    uint8_t expectedType; // seed, boundary, filler...
+};
+
+/**
+ * Verify computed Particle Cell Tree (PCT) vs expected values
+ * Expected values should list all data for types=1,2,3 (seed, boundary filler) which are used to generate particles:
+ * {levels, y,x,z(position), type}
+ * All other values are ignored (and used by Pulling Scheme (PS) only for intermediate calculations)
+ * @param aPCT - PCT produces by PS (note: values in PCT will be changed during verification!)
+ * @param expectedValues expected values
+ * @return true if correct, false otherwise
+ */
+template<typename ElementType>
+bool verifyParticleCellTree(std::vector<PixelData<ElementType>> &aPCT, const std::vector<LevelData> &expectedValues) {
+
+    const uint8_t AlreadyCheckedMark = 255;
+    const uint8_t MaxValueOfImportantType = FILLER_TYPE; // All types above are used by PS during computation phase only
+
+    for (const auto &r : expectedValues) {
+        // std::cout << r.level << " " << r.y << "," << r.x << "," << r.z << " " << (int)r.expectedType << std::endl;
+
+        auto &v = aPCT[r.level](r.y, r.x, r.z);
+        // Add dim. checks for accessing pct
+        if (v == r.expectedType) {
+            v = AlreadyCheckedMark;
+        }
+        else {
+            std::cout << "Error! Data on level=" << r.level << " at (" << r.y << "," << r.x << "," << r.z << ") expected=" << (int)r.expectedType << " got=" << (int)v << std::endl;
+            return false;
+        }
+    }
+
+    for (size_t level = 0; level < aPCT.size(); level++) {
+        auto &d = aPCT[level];
+        auto y_num = d.y_num;
+        auto x_num = d.x_num;
+        auto z_num = d.z_num;
+
+        for (int j = 0; j < z_num; j++) {
+            for (int i = 0; i < x_num; i++) {
+                for (int k = 0; k < y_num; k++) {
+                    const auto &v = d(k, i, j);
+                    if (v != AlreadyCheckedMark && v <= MaxValueOfImportantType && v > 0) {
+                        std::cout << "Error! Data on level = " << level << " at (" << k << "," << i << "," << j << ") with value = " << (int)v << " not verified or bad!" << std::endl;
+                        return false;
+                    }
+                }
+            }
+        }
+    }
+
+    return true;
+}
+
+/**
+ * Prints PCT
+ * @param particleCellTree
+ */
+template <typename T>
+void printParticleCellTree(const std::vector<PixelData<T>> &particleCellTree) {
+    for (uint64_t  l = 0; l < particleCellTree.size(); ++l) {
+        auto &tree = particleCellTree[l];
+//            std::cout << "-- level = " << l << ",  " << tree << std::endl;
+        tree.printMeshT(3,0);
+    }
+}
+
+template<typename DataType>
+void fillPS(PullingScheme &aPS, PixelData<DataType> &levels) {
+    PixelData<DataType> levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0));
+    LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters());
+}
+
+
+// -------------------------------------------------------------------------------------------------------------------------------------------
+
+TEST(PullingSchemeTest, PSvsOVPCCUDA) {
+    // Generates random levels in a 3D cube and then compares generated output levels in PS and OVPC
+    GenInfo gi;
+    gi.init(255, 157, 257);
+
+    // Generate random levels for PS and OVPC
+    PixelData<float> levels(std::ceil(gi.org_dims[0]/2.0),
+                            std::ceil(gi.org_dims[1]/2.0),
+                            std::ceil(gi.org_dims[2]/2.0),
+                            0);
+    // Add a few particles only - it will end up with Pulling Scheme generate particles on (almost) all
+    // levels - good case to compare with OVPC
+    const int numOfParticles = 3;
+    std::srand(std::time(nullptr));
+    for (int i = 0; i < numOfParticles; ++i) {
+        levels(std::rand() % levels.y_num, std::rand() % levels.x_num, std::rand() % levels.z_num) = gi.l_max;
+    }
+    PixelData<float> levelsOVPC(levels, true); // just copy 'levels'
+    PixelData<float> levelsPS(levels, true);
+
+    // Initialize all needed objects
+    APRTimer t(true);
+
+    t.start_timer("PS - init");
+    PullingScheme ps;
+    ps.initialize_particle_cell_tree(gi);
+    fillPS(ps, levelsPS);
+    t.stop_timer();
+    t.start_timer("PS - compute");
+    ps.pulling_scheme_main();
+    t.stop_timer();
+
+    // Run test methods and compare results
+    t.start_timer("OVPCCUDA - compute");
+    auto pct = computeOvpcCuda(levelsOVPC, gi);
+    t.stop_timer();
+
+    // -------------- Verify result
+    ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), pct), 0);
+}
+
+
+TEST(PullingSchemeTest, OVPCCUDA_Ydir) {
+    // Prepare input data for PS
+    float values[] = {9,0,0,0, 0,0,0,0};
+    int len = sizeof(values)/sizeof(int);
+    PixelData<float> levels(len, 1, 1);  // <-- Y-dir
+    initFromZYXarray(levels, values); // <-- Y-dir
+
+    // Prepare GenInfo structure -
+    // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2
+    GenInfo gi;
+    const PixelDataDim dim = levels.getDimension();
+    gi.init(2 * dim.y, dim.x, dim.z); // <-- Y-dir
+
+    int levelMax = gi.l_max - 1;
+    int levelMin = gi.l_min;
+
+    // Initialize all needed objects
+    APRTimer t(false);
+
+    t.start_timer("OVPCCUDA - compute");
+    auto pct = computeOvpcCuda(levels, gi);
+    t.stop_timer();
+
+    // List of expected types
+    std::vector<LevelData> ev = {
+            {3, 0,0,0, 1},
+            {3, 1,0,0, 2},
+            {3, 2,0,0, 3},
+            {3, 3,0,0, 3},
+
+            {2, 2,0,0, 3},
+            {2, 3,0,0, 3}
+    };
+
+    // -------------- Verify result
+    EXPECT_TRUE(verifyParticleCellTree(pct, ev));
+}
+
+TEST(PullingSchemeTest, OVPCCUDA_Xdir) {
+    // Prepare input data for PS
+    int values[] = {9,0,0,0, 0,0,0,0};
+    int len = sizeof(values)/sizeof(int);
+    PixelData<int> levels(1, len, 1);  // <-- X-dir
+    initFromZYXarray(levels, values);
+
+    // Prepare GenInfo structure -
+    // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2
+    GenInfo gi;
+    const PixelDataDim dim = levels.getDimension();
+    gi.init(dim.y, 2 * dim.x, dim.z); // <-- X-dir
+
+    int levelMax = gi.l_max - 1;
+    int levelMin = gi.l_min;
+
+    // Initialize all needed objects
+    APRTimer t(false);
+
+    t.start_timer("OVPCCUDA - compute");
+    auto pct = computeOvpcCuda(levels, gi);
+    t.stop_timer();
+
+    // List of expected types
+    std::vector<LevelData> ev = {
+            {3, 0,0,0, 1},
+            {3, 0,1,0, 2},
+            {3, 0,2,0, 3},
+            {3, 0,3,0, 3}  ,
+
+            {2, 0,2,0, 3},
+            {2, 0,3,0, 3}
+    };
+
+    // -------------- Verify result
+    EXPECT_TRUE(verifyParticleCellTree(pct, ev));
+}
+
+TEST(PullingSchemeTest, OVPCCUDA_Zdir) {
+    // Prepare input data for PS
+    int values[] = {9,0,0,0, 0,0,0,0};
+    int len = sizeof(values)/sizeof(int);
+    PixelData<int> levels(1, 1, len);  // <-- Z-dir
+    initFromZYXarray(levels, values);
+
+    // Prepare GenInfo structure -
+    // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2
+    GenInfo gi;
+    const PixelDataDim dim = levels.getDimension();
+    gi.init(dim.y, dim.x, 2 * dim.z); // <-- Z-dir
+
+    int levelMax = gi.l_max - 1;
+    int levelMin = gi.l_min;
+
+    // Initialize all needed objects
+    APRTimer t(false);
+
+    t.start_timer("OVPCCUDA - compute");
+    auto pct = computeOvpcCuda(levels, gi);
+    t.stop_timer();
+
+    // List of expected types
+    std::vector<LevelData> ev = {
+            {3, 0,0,0, 1},
+            {3, 0,0,1, 2},
+            {3, 0,0,2, 3},
+            {3, 0,0,3, 3}  ,
+
+            {2, 0,0,2, 3},
+            {2, 0,0,3, 3}
+    };
+
+    // -------------- Verify result
+    EXPECT_TRUE(verifyParticleCellTree(pct, ev));
+}
+
+int main(int argc, char **argv) {
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp
index f72897cd..eeee9718 100644
--- a/test/PullingSchemeTest.cpp
+++ b/test/PullingSchemeTest.cpp
@@ -4,109 +4,509 @@
 
 #include <gtest/gtest.h>
 #include "data_structures/Mesh/PixelData.hpp"
-//TODO: only APRAccess.hpp should be included here but currently because of dependencies it does not work :(
-#include "data_structures/APR/APR.hpp"
-#include "algorithm/APRConverter.hpp"
-//#include "data_structures/APR/APRAccess.hpp"
+#include "data_structures/APR/access/APRAccessStructures.hpp"
 #include "algorithm/PullingScheme.hpp"
+#include "algorithm/OVPC.h"
 #include "TestTools.hpp"
-#ifdef APR_USE_CUDA
-#include "algorithm/ComputeGradientCuda.hpp"
-#endif
+#include "algorithm/LocalParticleCellSet.hpp"
+
 
 namespace {
+
+    // =================================================================================================================
+    // ======== Some test helpers
+    // =================================================================================================================
+
+    /**
+     * Prints PCT
+     * @param particleCellTree
+     */
     template <typename T>
-    PixelData<float> generateLevels(const PixelData<T> &dimsMesh, int maxLevel) {
-        PixelData<float> levels(dimsMesh, false);
-        for (size_t i = 0; i < levels.mesh.size(); ++i) {
-            levels.mesh[i] = ( i/2 ) % (maxLevel + 2);
+    void printParticleCellTree(const std::vector<PixelData<T>> &particleCellTree) {
+        for (uint64_t  l = 0; l < particleCellTree.size(); ++l) {
+            auto &tree = particleCellTree[l];
+//            std::cout << "-- level = " << l << ",  " << tree << std::endl;
+            tree.printMeshT(3,0);
         }
-//        std::cout << "LEVELS: " << std::endl;
-        levels.printMesh(3, 0);
-        return levels;
     }
 
-//    void printParticleCellTree(const std::vector<PixelData<uint8_t>> &particleCellTree) {
-//        for (int l = 0; l < particleCellTree.size(); ++l) {
-//            auto &tree = particleCellTree[l];
-//            std::cout << "------ 1level=" << l << " " << tree << std::endl;
-//            tree.printMesh(3,0);
-//        }
-//    }
+    // Class for storing expected values for one element of Particle Cell Tree (output of Pulling Scheme)
+    class LevelData  {
+    public:
+        int level;
+        int y;
+        int x;
+        int z;
+        uint8_t expectedType; // seed, boundary, filler...
+    };
 
-    TEST(PullingSchemeTest, Init) {
+    /**
+     * Verify computed Particle Cell Tree (PCT) vs expected values
+     * Expected values should list all data for types=1,2,3 (seed, boundary filler) which are used to generate particles:
+     * {levels, y,x,z(position), type}
+     * All other values are ignored (and used by Pulling Scheme (PS) only for intermediate calculations)
+     * @param aPCT - PCT produces by PS (note: values in PCT will be changed during verification!)
+     * @param expectedValues expected values
+     * @return true if correct, false otherwise
+     */
+     template<typename ElementType>
+    bool verifyParticleCellTree(std::vector<PixelData<ElementType>> &aPCT, const std::vector<LevelData> &expectedValues) {
 
-        GenInfo aprInfo;
+        const uint8_t AlreadyCheckedMark = 255;
+        const uint8_t MaxValueOfImportantType = FILLER_TYPE; // All types above are used by PS during computation phase only
 
-        aprInfo.l_max = 4;
-        aprInfo.l_min = 2;
-        aprInfo.org_dims[0] = 8;
-        aprInfo.org_dims[1] = 16;
-        aprInfo.org_dims[2] = 1;
+        for (const auto &r : expectedValues) {
+            // std::cout << r.level << " " << r.y << "," << r.x << "," << r.z << " " << (int)r.expectedType << std::endl;
 
-        PullingScheme ps;
-        ps.initialize_particle_cell_tree(aprInfo);
-        std::vector<PixelData<uint8_t>> &pctree = ps.getParticleCellTree();
-
-        // TEST: check if zeroed and correct number of levels
-        ASSERT_EQ(aprInfo.l_max, pctree.size()); // all levels [0, access.level_max - 1]
-        for (size_t l = 0; l < pctree.size(); ++l) {
-            auto &tree = pctree[l];
-            for (auto &e : tree.mesh) {
-                ASSERT_EQ(0, e);
+            auto &v = aPCT[r.level](r.y, r.x, r.z);
+            // Add dim. checks for accessing pct
+            if (v == r.expectedType) {
+                v = AlreadyCheckedMark;
+            }
+            else {
+                std::cout << "Error! Data on level=" << r.level << " at (" << r.y << "," << r.x << "," << r.z << ") expected=" << (int)r.expectedType << " got=" << (int)v << std::endl;
+                return false;
             }
         }
 
-        // Generate mesh with test levels
-        PixelData<float> levels = generateLevels(pctree[aprInfo.l_max - 1], aprInfo.l_max);
-
-        // Fill particle cell tree with levels
-        int l_max = aprInfo.l_max - 1;
-        int l_min = aprInfo.l_min;
-        ps.fill(l_max, levels);
-
-        PixelData<float> levelsDS;
-        for(int l_ = l_max - 1; l_ >= l_min; l_--){
-            //down sample the resolution level k, using a max reduction
-            downsample(levels, levelsDS,
-                       [](const float &x, const float &y) -> float { return std::max(x, y); },
-                       [](const float &x) -> float { return x; }, true);
-            ps.fill(l_,levelsDS);
-            levels.swap(levelsDS);
+        for (size_t level = 0; level < aPCT.size(); level++) {
+            auto &d = aPCT[level];
+            auto y_num = d.y_num;
+            auto x_num = d.x_num;
+            auto z_num = d.z_num;
+
+            for (int j = 0; j < z_num; j++) {
+                for (int i = 0; i < x_num; i++) {
+                    for (int k = 0; k < y_num; k++) {
+                        const auto &v = d(k, i, j);
+                        if (v != AlreadyCheckedMark && v <= MaxValueOfImportantType && v > 0) {
+                            std::cout << "Error! Data on level = " << level << " at (" << k << "," << i << "," << j << ") with value = " << (int)v << " not verified or bad!" << std::endl;
+                            return false;
+                        }
+                    }
+                }
+            }
         }
-//
-//        printParticleCellTree(pctree);
-//        ps.fill_neighbours(l_max);
-//        pctree[l_max].printMesh(3, 0);
-//        ps.pulling_scheme_main();
-//        printParticleCellTree(pctree);
+
+        return true;
+    }
+
+    template<typename DataType>
+    void fillPS(PullingScheme &aPS, PixelData<DataType> &levels) {
+        PixelData<DataType> levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0));
+        LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters());
+    }
+
+    // =================================================================================================================
+    // ======== Pulling Scheme algorithm tests
+    // =================================================================================================================
+    TEST(PullingSchemeTest, PullingScheme1D_Ydir) {
+        // Prepare input data for PS
+        int values[] = {9,0,0,0, 0,0,0,0};
+        int len = sizeof(values)/sizeof(int);
+        PixelData<int> levels(len, 1, 1);  // <-- Y-dir
+        initFromZYXarray(levels, values); // <-- Y-dir
+
+        // Prepare GenInfo structure -
+        // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2
+        GenInfo gi;
+        const PixelDataDim dim = levels.getDimension();
+        gi.init(2 * dim.y, dim.x, dim.z); // <-- Y-dir
+
+        // Initialize all needed objects
+        APRTimer t(false);
+
+        t.start_timer("PS - initialize with data");
+        PullingScheme ps;
+        ps.initialize_particle_cell_tree(gi);
+        fillPS(ps, levels);
+        t.stop_timer();
+
+        t.start_timer("PS - compute");
+        ps.pulling_scheme_main();
+        t.stop_timer();
+
+        // List of expected types
+        std::vector<LevelData> ev = {
+                {3, 0,0,0, 1},
+                {3, 1,0,0, 2},
+                {3, 2,0,0, 3},
+                {3, 3,0,0, 3},
+
+                {2, 2,0,0, 3},
+                {2, 3,0,0, 3}
+        };
+
+        // -------------- Verify result
+        EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev));
+    }
+
+    TEST(PullingSchemeTest, PullingScheme1D_Xdir) {
+        // Prepare input data for PS
+        int values[] = {9,0,0,0, 0,0,0,0};
+        int len = sizeof(values)/sizeof(int);
+        PixelData<int> levels(1, len, 1);  // <-- X-dir
+        initFromZYXarray(levels, values);
+
+        // Prepare GenInfo structure -
+        // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2
+        GenInfo gi;
+        const PixelDataDim dim = levels.getDimension();
+        gi.init(dim.y, 2 * dim.x, dim.z); // <-- X-dir
+
+        // Initialize all needed objects
+        APRTimer t(false);
+
+        t.start_timer("PS - initialize with data");
+        PullingScheme ps;
+        ps.initialize_particle_cell_tree(gi);
+        fillPS(ps, levels);
+        t.stop_timer();
+
+        t.start_timer("PS - compute");
+        ps.pulling_scheme_main();
+        t.stop_timer();
+
+        // List of expected types
+        std::vector<LevelData> ev = {
+                {3, 0,0,0, 1},
+                {3, 0,1,0, 2},
+                {3, 0,2,0, 3},
+                {3, 0,3,0, 3}  ,
+
+                {2, 0,2,0, 3},
+                {2, 0,3,0, 3}
+        };
+
+        // -------------- Verify result
+        EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev));
+    }
+
+    TEST(PullingSchemeTest, PullingScheme1D_Zdir) {
+        // Prepare input data for PS
+        int values[] = {9,0,0,0, 0,0,0,0};
+        int len = sizeof(values)/sizeof(int);
+        PixelData<int> levels(1, 1, len);  // <-- Z-dir
+        initFromZYXarray(levels, values);
+
+        // Prepare GenInfo structure -
+        // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2
+        GenInfo gi;
+        const PixelDataDim dim = levels.getDimension();
+        gi.init(dim.y, dim.x, 2 * dim.z); // <-- Z-dir
+
+        // Initialize all needed objects
+        APRTimer t(false);
+
+        t.start_timer("PS - initialize with data");
+        PullingScheme ps;
+        ps.initialize_particle_cell_tree(gi);
+        fillPS(ps, levels);
+        t.stop_timer();
+
+        t.start_timer("PS - compute");
+        ps.pulling_scheme_main();
+        t.stop_timer();
+
+        // List of expected types
+        std::vector<LevelData> ev = {
+                {3, 0,0,0, 1},
+                {3, 0,0,1, 2},
+                {3, 0,0,2, 3},
+                {3, 0,0,3, 3}  ,
+
+                {2, 0,0,2, 3},
+                {2, 0,0,3, 3}
+        };
+
+        // -------------- Verify result
+        EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev));
+    }
+
+    TEST(PullingSchemeTest, PullingScheme3D_smallCube) {
+        // Prepare input data for PS
+        PixelData<int> levels(3, 3, 3, 0);
+        levels(2, 2, 2) = 3;
+
+        // Prepare GenInfo structure -
+        // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2
+        GenInfo gi;
+        const PixelDataDim dim = levels.getDimension();
+        gi.init(2 * dim.y, 2 * dim.x, 2 * dim.z);
+
+        // Initialize all needed objects
+        APRTimer t(false);
+
+        t.start_timer("PS - initialize with data");
+        PullingScheme ps;
+        ps.initialize_particle_cell_tree(gi);
+        fillPS(ps, levels);
+        t.stop_timer();
+
+        t.start_timer("PS - compute");
+        ps.pulling_scheme_main();
+        t.stop_timer();
+
+        // List of expected types
+        std::vector<LevelData> ev = {
+                {2, 0,0,0, 3},
+                {2, 0,1,0, 3},
+                {2, 0,2,0, 3},
+                {2, 1,0,0, 3},
+                {2, 1,1,0, 3},
+                {2, 1,2,0, 3},
+                {2, 2,0,0, 3},
+                {2, 2,1,0, 3},
+                {2, 2,2,0, 3},
+
+                {2, 0,0,1, 3},
+                {2, 0,1,1, 3},
+                {2, 0,2,1, 3},
+                {2, 1,0,1, 3},
+                {2, 1,1,1, 2},
+                {2, 1,2,1, 2},
+                {2, 2,0,1, 3},
+                {2, 2,1,1, 2},
+                {2, 2,2,1, 2},
+
+                {2, 0,0,2, 3},
+                {2, 0,1,2, 3},
+                {2, 0,2,2, 3},
+                {2, 1,0,2, 3},
+                {2, 1,1,2, 2},
+                {2, 1,2,2, 2},
+                {2, 2,0,2, 3},
+                {2, 2,1,2, 2},
+                {2, 2,2,2, 1},
+
+        };
+
+        // -------------- Verify result
+        EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev));
+    }
+
+    // =================================================================================================================
+    // ======== OVPC - Optimal Valid Particle Cell - alternative version of original Pulling Scheme algorithm
+    // =================================================================================================================
+    TEST(PullingSchemeTest, OVPC_Ydir) {
+        // Prepare input data for PS
+        int values[] = {9,0,0,0, 0,0,0,0};
+        int len = sizeof(values)/sizeof(int);
+        PixelData<int> levels(len, 1, 1);  // <-- Y-dir
+        initFromZYXarray(levels, values); // <-- Y-dir
+
+        // Prepare GenInfo structure -
+        // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2
+        GenInfo gi;
+        const PixelDataDim dim = levels.getDimension();
+        gi.init(2 * dim.y, dim.x, dim.z); // <-- Y-dir
+
+        // Initialize all needed objects
+        APRTimer t(false);
+
+        t.start_timer("OVPC - initialize");
+        OVPC ps(gi, levels);
+        t.stop_timer();
+        t.start_timer("OVPC - compute");
+        ps.generateTree();
+        t.stop_timer();
+
+        // List of expected types
+        std::vector<LevelData> ev = {
+                {3, 0,0,0, 1},
+                {3, 1,0,0, 2},
+                {3, 2,0,0, 3},
+                {3, 3,0,0, 3},
+
+                {2, 2,0,0, 3},
+                {2, 3,0,0, 3}
+        };
+
+        // -------------- Verify result
+        EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev));
+    }
+
+    TEST(PullingSchemeTest, OVPC_Xdir) {
+        // Prepare input data for PS
+        int values[] = {9,0,0,0, 0,0,0,0};
+        int len = sizeof(values)/sizeof(int);
+        PixelData<int> levels(1, len, 1);  // <-- X-dir
+        initFromZYXarray(levels, values);
+
+        // Prepare GenInfo structure -
+        // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2
+        GenInfo gi;
+        const PixelDataDim dim = levels.getDimension();
+        gi.init(dim.y, 2 * dim.x, dim.z); // <-- X-dir
+
+        // Initialize all needed objects
+        APRTimer t(false);
+
+        t.start_timer("OVPC - initialize");
+        OVPC ps(gi, levels);
+        t.stop_timer();
+        t.start_timer("OVPC - compute");
+        ps.generateTree();
+        t.stop_timer();
+
+        // List of expected types
+        std::vector<LevelData> ev = {
+                {3, 0,0,0, 1},
+                {3, 0,1,0, 2},
+                {3, 0,2,0, 3},
+                {3, 0,3,0, 3}  ,
+
+                {2, 0,2,0, 3},
+                {2, 0,3,0, 3}
+        };
+
+        // -------------- Verify result
+        EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev));
     }
-#ifdef APR_USE_CUDA
-    TEST(PullingSchemeTest, computeLevels) {
-        using ImgType = float;
-        const int maxLevel = 3;
-        const float relError = 0.1;
 
-        PixelData<ImgType> grad = getRandInitializedMesh<ImgType>(10, 20, 33);
-        PixelData<float> localIntensityScaleCpu = getRandInitializedMesh<float>(10, 20, 33);
+    TEST(PullingSchemeTest, OVPC_Zdir) {
+        // Prepare input data for PS
+        int values[] = {9,0,0,0, 0,0,0,0};
+        int len = sizeof(values)/sizeof(int);
+        PixelData<int> levels(1, 1, len);  // <-- Z-dir
+        initFromZYXarray(levels, values);
 
-        PixelData<float> localIntensityScaleGpu(localIntensityScaleCpu, true);
-        PixelData<float> elo(localIntensityScaleCpu, true);
-        APRTimer timer(true);
+        // Prepare GenInfo structure -
+        // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2
+        GenInfo gi;
+        const PixelDataDim dim = levels.getDimension();
+        gi.init(dim.y, dim.x, 2 * dim.z); // <-- Z-dir
 
-        LocalParticleCellSet localParticleCellSet;
+        // Initialize all needed objects
+        APRTimer t(false);
 
-        timer.start_timer("CPU PS FULL");
-        localParticleCellSet.computeLevels(grad, localIntensityScaleCpu, maxLevel, relError,1,1,1);
-        timer.stop_timer();
+        t.start_timer("OVPC - initialize");
+        OVPC ps(gi, levels);
+        t.stop_timer();
+        t.start_timer("OVPC - compute");
+        ps.generateTree();
+        t.stop_timer();
 
-        timer.start_timer("GPU PS FULL");
-        computeLevelsCuda(grad, localIntensityScaleGpu, maxLevel, relError);
-        timer.stop_timer();
+        // List of expected types
+        std::vector<LevelData> ev = {
+                {3, 0,0,0, 1},
+                {3, 0,0,1, 2},
+                {3, 0,0,2, 3},
+                {3, 0,0,3, 3}  ,
 
-        EXPECT_EQ(compareMeshes(localIntensityScaleCpu, localIntensityScaleGpu), 0);
+                {2, 0,0,2, 3},
+                {2, 0,0,3, 3}
+        };
+
+        // -------------- Verify result
+        EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev));
+    }
+
+    TEST(PullingSchemeTest, OVPC_smallCube) {
+        // Prepare input data for PS
+        PixelData<int> levels(3, 3, 3, 0);
+        levels(2, 2, 2) = 3;
+
+        // Prepare GenInfo structure -
+        // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2
+        GenInfo gi;
+        const PixelDataDim dim = levels.getDimension();
+        gi.init(2 * dim.y, 2 * dim.x, 2 * dim.z);
+
+        // Initialize all needed objects
+        APRTimer t(false);
+
+        t.start_timer("OVPC - initialize");
+        OVPC ps(gi, levels);
+        t.stop_timer();
+        t.start_timer("OVPC - compute");
+        ps.generateTree();
+        t.stop_timer();
+
+        // List of expected types
+        std::vector<LevelData> ev = {
+                {2, 0,0,0, 3},
+                {2, 0,1,0, 3},
+                {2, 0,2,0, 3},
+                {2, 1,0,0, 3},
+                {2, 1,1,0, 3},
+                {2, 1,2,0, 3},
+                {2, 2,0,0, 3},
+                {2, 2,1,0, 3},
+                {2, 2,2,0, 3},
+
+                {2, 0,0,1, 3},
+                {2, 0,1,1, 3},
+                {2, 0,2,1, 3},
+                {2, 1,0,1, 3},
+                {2, 1,1,1, 2},
+                {2, 1,2,1, 2},
+                {2, 2,0,1, 3},
+                {2, 2,1,1, 2},
+                {2, 2,2,1, 2},
+
+                {2, 0,0,2, 3},
+                {2, 0,1,2, 3},
+                {2, 0,2,2, 3},
+                {2, 1,0,2, 3},
+                {2, 1,1,2, 2},
+                {2, 1,2,2, 2},
+                {2, 2,0,2, 3},
+                {2, 2,1,2, 2},
+                {2, 2,2,2, 1},
+
+        };
+
+        // -------------- Verify result
+        EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev));
     }
-#endif
+
+
+    // =================================================================================================================
+    // ======== PS vs OVPC
+    // =================================================================================================================
+
+    TEST(PullingSchemeTest, PSvsOVPC) {
+        // Generates random levels in a 3D cube and then compares generated output levels in PS and OVPC
+        GenInfo gi;
+        gi.init(255, 257, 199);
+
+        // Generate random levels for PS and OVPC
+        PixelData<int> levels(std::ceil(gi.org_dims[0]/2.0),
+                              std::ceil(gi.org_dims[1]/2.0),
+                              std::ceil(gi.org_dims[2]/2.0),
+                              0);
+        // Add a few particles only - it will end up with Pulling Scheme generate particles on (almost) all
+        // levels - good case to compare with OVPC
+        const int numOfParticles = 3;
+        std::srand(std::time(nullptr));
+        for (int i = 0; i < numOfParticles; ++i) {
+            levels(std::rand() % levels.y_num, std::rand() % levels.x_num, std::rand() % levels.z_num) = gi.l_max;
+        }
+        PixelData<int> levelsOVPC(levels, true); // just copy 'levels'
+        APRTimer t(false);
+
+        // Run test methods and compare results
+        t.start_timer("OVPC - init");
+        OVPC nps(gi, levelsOVPC);
+        t.stop_timer();
+        t.start_timer("OVPC compute");
+        nps.generateTree();
+        t.stop_timer();
+
+
+        t.start_timer("PS - init");
+        PullingScheme ps;
+        ps.initialize_particle_cell_tree(gi);
+        fillPS(ps, levels);
+        t.stop_timer();
+        t.start_timer("PS - compute");
+        ps.pulling_scheme_main();
+        t.stop_timer();
+
+        ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), nps.getParticleCellTree()), 0);
+    }
+
 }
 
 int main(int argc, char **argv) {
diff --git a/test/TestTools.hpp b/test/TestTools.hpp
index f323d2bb..158bf2ea 100644
--- a/test/TestTools.hpp
+++ b/test/TestTools.hpp
@@ -8,6 +8,8 @@
 
 #include "data_structures/Mesh/PixelData.hpp"
 #include <random>
+#include "data_structures/APR/particles/ParticleData.hpp"
+
 
 std::string get_source_directory_apr(){
   // returns path to the directory where utils.cpp is stored
@@ -44,7 +46,7 @@ inline bool compare(PixelData<T> &mesh, const float *data, const float epsilon)
 }
 
 template<typename T>
-inline bool initFromZYXarray(PixelData<T> &mesh, const float *data) {
+inline bool initFromZYXarray(PixelData<T> &mesh, const T *data) {
     size_t dataIdx = 0;
     for (int z = 0; z < mesh.z_num; ++z) {
         for (int y = 0; y < mesh.y_num; ++y) {
@@ -65,17 +67,42 @@ inline bool initFromZYXarray(PixelData<T> &mesh, const float *data) {
  * @return number of errors detected
  */
 template <typename T>
-inline int compareMeshes(const PixelData<T> &expected, const PixelData<T> &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) {
+inline int compareMeshes(const PixelData<T> &expected, const PixelData<T> &tested, double maxError = 0, int maxNumOfErrPrinted = 3) {
+    if (expected.getDimension() != tested.getDimension()) {
+        std::stringstream errMsg;
+        errMsg << "Dimensions of expected and tested meshes differ! " << expected.getDimension() << " vs " << tested.getDimension();
+        throw std::runtime_error(errMsg.str());
+    }
+
     int cnt = 0;
+    double maxErrorFound = 0;
+    T maxErrorExpectedValue = 0;
+    T maxErrorTestedValue = 0;
+    std::string maxErrorIdx = "";
+
     for (size_t i = 0; i < expected.mesh.size(); ++i) {
-        if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError) {
+        auto diff = std::abs(expected.mesh[i] - tested.mesh[i]);
+        if (diff > maxError) {
             if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) {
-                std::cout << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl;
+                std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested mesh: "
+                          << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i]
+                          << " error = " << (float)expected.mesh[i] - (float)tested.mesh[i] << " IDX:" << i << "=" << tested.getStrIndex(i) << std::endl;
             }
             cnt++;
         }
+        if (diff > maxErrorFound) {
+            maxErrorFound = diff;
+            maxErrorExpectedValue = expected.mesh[i];
+            maxErrorTestedValue = tested.mesh[i];
+            maxErrorIdx = tested.getStrIndex(i);
+        }
+    }
+    if (cnt != 0) {
+        std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size()
+                  << ", maxErrorFound = " << maxErrorFound << " at IDX: " << maxErrorIdx << " "
+                  << maxErrorExpectedValue << " vs " << maxErrorTestedValue
+                  << "(" << (100*(long double)maxErrorFound/(long double)maxErrorExpectedValue) << "%)"<<std::endl;
     }
-    std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl;
     return cnt;
 }
 
@@ -93,48 +120,137 @@ inline int compareMeshes(const PixelData<T> &expected, const PixelData<T> &teste
 template <typename ParticleTypeA, typename ParticleTypeB>
 inline int64_t compareParticles(const ParticleTypeA &expected, const ParticleTypeB &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 10) {
     int64_t cnt = 0;
-    if(expected.size() != tested.size()) {
-        std::cerr << "ERROR compareParticles: sizes differ!" << std::endl;
-        cnt++;
+    if (expected.size() != tested.size()) {
+        std::cerr << "ERROR compareParticles: sizes differs! " << expected.size() << " vs. " << tested.size() << std::endl;
+        return 1; // Return any number > 0 to indicate an error
     }
 
     for (size_t i = 0; i < expected.size(); ++i) {
-        if (std::abs(expected[i] - tested[i]) > maxError) {
+        if (std::abs((double)(expected[i] - tested[i])) > maxError) {
             if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) {
-                std::cout << "ERROR expected vs tested particle: " << (float)expected[i] << " vs " << (float)tested[i] << " IDX:" << i << std::endl;
+                std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested particle: " << (float)expected[i] << " vs " << (float)tested[i] << " IDX:" << i << std::endl;
             }
             cnt++;
         }
     }
-    std::cout << "Number of errors / all points: " << cnt << " / " << expected.size() << std::endl;
+    if (cnt != 0) {
+        std::cout << "Number of errors / all points: " << cnt << " / " << expected.size() << std::endl;
+    }
     return cnt;
 }
 
+/**
+ * Compares two Particle Cell Trees
+ * @param expected - expected levels
+ * @param tested - levels to verify
+ * @param maxError
+ * @param maxNumOfErrPrinted - how many error outputs should be printed
+ * @param maxTypeCompared - maximum type to be compared
+ * @return
+ */
+template <typename T, typename W>
+int compareParticleCellTrees(const std::vector<PixelData<T>> &expected, const std::vector<PixelData<W>> &tested, bool printErrors = true, int maxNumOfErrPrinted = 3, uint8_t maxTypeCompared = FILLER_TYPE) {
+    int cntGlobal = 0;
+    for (size_t level = 0; level < expected.size(); level++) {
+        int cnt = 0;
+        int numOfParticles = 0;
+        for (size_t i = 0; i < expected[level].mesh.size(); ++i) {
+            if (expected[level].mesh[i] < 8 && tested[level].mesh[i] <= maxTypeCompared) {
+                if (std::abs(expected[level].mesh[i] - tested[level].mesh[i]) > 0 || std::isnan(expected[level].mesh[i]) ||
+                    std::isnan(tested[level].mesh[i])) {
+                    if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) {
+                        std::cout << "Level: " << level <<" ERROR expected vs tested mesh: " << (float) expected[level].mesh[i] << " vs "
+                                  << (float) tested[level].mesh[i] << " IDX:" << tested[level].getStrIndex(i) << std::endl;
+                    }
+                    cnt++;
+                }
+                if (expected[level].mesh[i] > 0) numOfParticles++;
+            }
+        }
+        cntGlobal += cnt;
+        if (cnt > 0 && printErrors) std::cout << "Level: " << level << ", Number of errors / all points: " << cnt << " / " << expected[level].mesh.size() << " Particles:" << numOfParticles << std::endl;
+    }
+    return cntGlobal;
+}
 
 /**
- * Generates mesh with provided dims with random values in range [0, 1] * multiplier
+ * Generates mesh with provided dims with random values in range [0, 1] * multiplier + offset
  * @param y
  * @param x
  * @param z
  * @param multiplier
+ * @param offset
+ * @param useIdxNumbers - instead of random values put values from 0..sizeof(mesh)-1
  * @return
  */
 template <typename T>
-inline PixelData<T> getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, bool useIdxNumbers = false) {
+inline PixelData<T> getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, float offset=0.0, bool useIdxNumbers = false) {
     PixelData<T> m(y, x, z);
-    std::cout << "Mesh info: " << m << std::endl;
+//    std::cout << "Mesh info: " << m << std::endl;
     std::random_device rd;
     std::mt19937 mt(rd());
     std::uniform_real_distribution<double> dist(0.0, 1.0);
+
 #ifdef HAVE_OPENMP
 #pragma omp parallel for default(shared)
 #endif
     for (size_t i = 0; i < m.mesh.size(); ++i) {
-        m.mesh[i] = useIdxNumbers ? i : dist(mt) * multiplier;
+        m.mesh[i] = useIdxNumbers ? i + 1 : dist(mt) * multiplier + offset;
+    }
+    return m;
+}
+
+/**
+ * Generates mesh with provided dims with random values in range [0, 1] * multiplier + offset
+ * @param dim - dimension of generated mesh
+ * @param multiplier
+ * @param offset
+ * @param useIdxNumbers - instead of random values put values from 0..sizeof(mesh)-1
+ * @return
+ */
+template <typename T>
+inline PixelData<T> getRandInitializedMesh(PixelDataDim dim, float multiplier = 2.0f, float offset=0.0, bool useIdxNumbers = false) {
+    return getRandInitializedMesh<T>(dim.y, dim.x, dim.z, multiplier, offset, useIdxNumbers);
+}
+
+/**
+ * Generate mesh with square blob in the center of it with values randomly chosen from [20,40] range. Zero values outside.
+ * @tparam T
+ * @param y
+ * @param x
+ * @param z
+ * @return
+ */
+template <typename T>
+inline PixelData<T> getMeshWithBlobInMiddle(int y, int x, int z) {
+    PixelData<T> m(y, x, z, 0);
+
+    std::random_device rd;
+    std::mt19937 mt(rd());
+    std::uniform_real_distribution<double> dist(0.0, 1.0);
+
+    for (int yi = (1.0/3 * y); yi < (2.0/3 * y); yi++) {
+        for (int xi = (1.0/3 * x); xi < (2.0/3 * x); xi++) {
+            for (int zi = (1.0/3 * z); zi < (2.0/3 * z); zi++) {
+                m(yi, xi, zi) = 30 + dist(mt) * 10;
+            }
+        }
     }
+
     return m;
 }
 
+/**
+ * Generate mesh with square blob in the center of it with values randomly chosen from [20,40] range. Zero values outside.
+ * @tparam T
+ * @param dim
+ * @return
+ */
+template <typename T>
+inline PixelData<T> getMeshWithBlobInMiddle(const PixelDataDim &dim) {
+    return getMeshWithBlobInMiddle<T>(dim.y, dim.x, dim.z);
+}
+
 struct TestBenchStats{
 
         double inf_norm=0;