CUDA pipeline for computing APR #185

Open · wants to merge 65 commits into base: master

Commits (65)
e6aa9c9
Bspline filters fixed for CUDA pipeline
krzysg Aug 1, 2022
b563da4
Debug messages turned off
krzysg Aug 1, 2022
3db510f
Fixed Inv Bspline in X direction (CUDA pipeline)
krzysg Aug 1, 2022
18fce44
Inverse Bspline pipeline for CUDA fixed
krzysg Aug 2, 2022
ad5f194
Downsample and downsample gradient corrected to match GPU
krzysg Aug 3, 2022
557eff3
GPU pipeline fixes - Full Gradient test is working now
krzysg Aug 9, 2022
57765a7
Merge branch 'develop' into cuda
krzysg Aug 9, 2022
3da13ba
Merge branch 'develop' into cuda
krzysg Aug 9, 2022
d958161
GPU and CPU give same results in Release mode - turned off unsafe opt…
krzysg Aug 10, 2022
4ace238
Quick fix of processOnGpu() - now it gets correct bspline data for ea…
krzysg Aug 10, 2022
b050e07
Added new test file for LIS CUDA, GPU now handles boundary (without p…
krzysg Nov 14, 2022
570ab20
Local Intensity Scale (LIS) now works in X-dir as expected. GPU and C…
krzysg Jan 31, 2023
17e5d8e
Local Intensity Scale (LIS) now works in Z-dir as expected. GPU and C…
krzysg Feb 1, 2023
5ad9865
Updated compareMeshes to show maximum error found
krzysg Feb 17, 2023
af1c3ac
LIS in X-dir redesigned so code is clearer and faster. Also new test …
krzysg Feb 17, 2023
521d826
LIS in Z-dir redesigned so code is clearer and faster. Also new test …
krzysg Feb 24, 2023
b297adf
Local Intensity Scale (LIS) now works in Y-dir as expected. GPU and C…
krzysg Mar 13, 2023
2cdf3fe
Whole LIS pipeline is matching exactly CPU implementation + tests upd…
krzysg Mar 16, 2023
e093c01
Quick fix of linking error
krzysg Mar 16, 2023
053380d
maximum error diff. GPU vs CPU for compute gradient set to 0
krzysg Mar 16, 2023
97cf75e
rescaleAndThreshold is now only rescaling (to reflect changes in CPU …
krzysg Mar 17, 2023
83c2a31
rescaleAndThreshold is now only rescaling (to reflect changes in CPU …
krzysg Mar 17, 2023
5b5a719
constant_intensity_scale handling in LIS added for GPU
krzysg Mar 17, 2023
5d0375a
Removed unused threshold functions
krzysg Mar 20, 2023
53ef94b
FullPipeline test moved to new file
krzysg Mar 20, 2023
ac2c22e
PixelDataDim updated with maximum dimension length and number of dimen…
krzysg Mar 20, 2023
122a96a
GradLisLevels test working now
krzysg Mar 20, 2023
6a5db35
full pipeline tests fixed
krzysg Mar 24, 2023
4088e9d
Changes from old branches added + modified to GenInfo instead of APRA…
krzysg Jul 20, 2023
b8f2504
Added debug printout to GenInfo
krzysg Jul 21, 2023
6400a9a
Moved old CUDA tests to new file
krzysg Aug 11, 2023
4b35b8e
Moved old CUDA tests to new file
krzysg Aug 11, 2023
1ed5d4f
Added CUDA_ARCHITECTURES set to OFF (keep current behaviour) to suppr…
krzysg Oct 30, 2023
93ac120
Temporary test updated to print particles using LinearAccess iterator
krzysg Nov 8, 2023
09bf86a
Merge branch 'master' into cuda
krzysg Nov 8, 2023
b7ae1cb
Merge branch 'master' into cuda
krzysg Nov 10, 2023
6181da6
Merge branch 'master' into cuda
krzysg Nov 10, 2023
ed09686
Merge branch 'master' into cuda
krzysg Nov 13, 2023
70543d2
TODO about some problems with edge case
krzysg Nov 30, 2023
dd3d448
Fixed test where out of range idx was given
krzysg Dec 6, 2023
1a112ec
Pulling Scheme tests (and OVPC on CPU) finished.
krzysg Dec 13, 2023
64ca641
Fixes for tests
krzysg Dec 14, 2023
9f31bfd
Fixed OVPC - clamping values of input levels is necessary
krzysg Jan 9, 2024
2707207
Updated OVPC (PS) for CUDA - now it gives correct and same results as…
krzysg Feb 5, 2024
3cb4529
PullingSchemeCudaTest finished, added init file for LinearAccess tests
krzysg Feb 16, 2024
027e52a
Finished LinearAccess tests (for linear structure only), added draft …
krzysg Feb 21, 2024
e83b952
Check also total_number_particles in LinearAccess test
krzysg Feb 23, 2024
2cc5bca
LinearAccessCuda implemented (it is not used yet in CUDA pipeline)
krzysg Aug 2, 2024
e1b63d7
Compiler warnings fixed
krzysg Aug 2, 2024
4c88fae
Removed debug outputs from LinearAccessCuda test.
krzysg Aug 6, 2024
169cd9d
Added two more tests for full pipeline (including PS and LinearAccess)
krzysg Aug 6, 2024
dadf92f
-ffast-math must be removed - some optimizations still make GPU and C…
krzysg Aug 8, 2024
27a8dc3
(nasty) fix for computeLevels in CUDA - added TODO to make it more re…
krzysg Aug 8, 2024
bb3b3f4
Fix for bsplineYdir for very small input images + test for full pipel…
krzysg Aug 9, 2024
a8c4d77
Fixed Local Intensity Scale (LIS) for super small inputs
krzysg Aug 14, 2024
e6e4327
ParticleCellTreeCuda is now the main structure for CUDA
krzysg Aug 19, 2024
00aac97
computeOvpcCuda now using 'stream' instead of hardcoded values
krzysg Aug 20, 2024
1fba1bc
ParticleCellTreeCuda moved and now handles cpu2gpu transfer
krzysg Aug 20, 2024
3474250
LinearAccessCuda is now using ParticleCellTreeCuda
krzysg Aug 20, 2024
1d4e549
OVPC added to GpuTask
krzysg Aug 21, 2024
9ff0580
Full GPU pipeline works!
krzysg Aug 21, 2024
c10225d
Some debug prints removed
krzysg Aug 21, 2024
6b7a87d
Test for full pipeline cleaned up
krzysg Aug 21, 2024
3c601be
doAll() removed from Gpu pipeline
krzysg Aug 21, 2024
d2fd1d0
GPU pipeline now works for APRConverter!
krzysg Aug 22, 2024
12 changes: 7 additions & 5 deletions CMakeLists.txt
@@ -174,14 +174,14 @@ else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 ")

if(CMAKE_COMPILER_IS_GNUCC)
set(CMAKE_CXX_FLAGS_RELEASE "-O4 -ffast-math")
set(CMAKE_CXX_FLAGS_RELEASE "-O4")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -pedantic")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Bdynamic")
if(NOT WIN32)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ldl -lz")
endif()
elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -ffast-math")
set(CMAKE_CXX_FLAGS_RELEASE "-O3")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -pedantic")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lz")
endif()
@@ -211,8 +211,8 @@ if(APR_USE_CUDA)
message(STATUS "APR: Building CUDA for APR")
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_RUNTIME_LIBRARY "Static")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --default-stream per-thread -Xptxas -v -DAPR_USE_CUDA")
set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math") # -lineinfo for profiling
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=false --default-stream per-thread -Xptxas -v -DAPR_USE_CUDA")
set(CMAKE_CUDA_FLAGS_RELEASE "-O3") # -lineinfo for profiling
set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g -G")
if(APR_BENCHMARK)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DAPR_BENCHMARK")
@@ -226,6 +226,7 @@ if(APR_USE_CUDA)
src/algorithm/LocalIntensityScale.cu
src/algorithm/OVPC.cu
src/data_structures/APR/access/GPUAccess.cu
src/data_structures/APR/access/LinearAccessCuda.cu
src/numerics/miscCuda.cu
src/numerics/APRDownsampleGPU.cu
src/numerics/PixelNumericsGPU.cu
@@ -241,6 +242,7 @@ if(APR_BUILD_STATIC_LIB)
# generate static library used as a intermediate step in generating fat lib
set(STATIC_TARGET_NAME staticLib)
add_library(${STATIC_TARGET_NAME} STATIC $<TARGET_OBJECTS:aprObjLib> ${APR_CUDA_SOURCE_FILES})
set_property(TARGET ${STATIC_TARGET_NAME} PROPERTY CUDA_ARCHITECTURES OFF)
target_compile_features(${STATIC_TARGET_NAME} PUBLIC cxx_std_14)
set_target_properties(${STATIC_TARGET_NAME} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME})
set_target_properties(${STATIC_TARGET_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION OFF)
@@ -262,7 +264,7 @@ if(APR_BUILD_SHARED_LIB)
# generate fat shared library
set(SHARED_TARGET_NAME sharedLib)
add_library(${SHARED_TARGET_NAME} SHARED $<TARGET_OBJECTS:aprObjLib> ${APR_CUDA_SOURCE_FILES})

set_property(TARGET ${SHARED_TARGET_NAME} PROPERTY CUDA_ARCHITECTURES OFF)
target_include_directories(${SHARED_TARGET_NAME} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src> $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}>)
set_target_properties(${SHARED_TARGET_NAME} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME})
set_target_properties(${SHARED_TARGET_NAME} PROPERTIES LIBRARY_OUTPUT_NAME ${LIBRARY_NAME})
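Note on the flag changes above: -ffast-math on the host permits reassociation and other value-changing optimisations, and on the device --use_fast_math plus nvcc's default FMA contraction (hence the added --fmad=false) fuse a*b + c into a single fused multiply-add. Either is enough for CPU and GPU results to differ in the last bit, which breaks the exact GPU-vs-CPU comparisons used by the tests (see the "-ffast-math must be removed" commit). The snippet below is a minimal illustration of the fused-versus-separate rounding effect and is not part of the PR; build it with -ffp-contract=off so the host compiler does not fuse the plain expression itself.

    #include <cmath>
    #include <cstdio>
    #include <random>

    // Compares a separately rounded multiply-add against std::fma (single rounding).
    int main() {
        std::mt19937 gen(42);
        std::uniform_real_distribution<float> dist(0.5f, 2.0f);
        int mismatches = 0;
        for (int i = 0; i < 1000000; ++i) {
            const float a = dist(gen), b = dist(gen), c = -dist(gen);
            const float separate = a * b + c;          // two roundings (no contraction)
            const float fused    = std::fma(a, b, c);  // one rounding (what FMA-contracted GPU code emits)
            if (separate != fused) ++mismatches;
        }
        // A non-zero count is exactly the situation that prevents exact GPU-vs-CPU equality
        // when one side contracts to FMA and the other does not.
        std::printf("multiply-adds differing in the last ulp: %d / 1000000\n", mismatches);
        return 0;
    }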
2 changes: 1 addition & 1 deletion examples/Example_get_apr.h
@@ -30,7 +30,7 @@ struct cmdLineOptions{
bool auto_parameters = false;

float Ip_th = 0;
float lambda = -1;
float lambda = 3.0;
float sigma_th = 0;
float rel_error = 0.1;
float grad_th = 1;
166 changes: 40 additions & 126 deletions src/algorithm/APRConverter.hpp
@@ -117,7 +117,7 @@ class APRConverter {
PixelData<float> local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors
PixelData<float> local_scale_temp2;

void applyParameters(APR& aAPR,APRParameters& aprParameters);
void applyParameters(APRParameters& aprParameters);

template<typename T>
void computeL(APR& aAPR,PixelData<T>& input_image);
@@ -184,7 +184,7 @@ void APRConverter<ImageType>::get_apr_custom_grad_scale(APR& aAPR,PixelData<Imag
}

aAPR.parameters = par;
applyParameters(aAPR,par);
applyParameters(par);
solveForAPR(aAPR);
generateDatastructures(aAPR);

@@ -215,6 +215,10 @@ void APRConverter<ImageType>::computeL(APR& aAPR,PixelData<T>& input_image){

fine_grained_timer.start_timer("offset image");

// offset image by factor (this is required if there are zero areas in the background with
// uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
// Warning both of these could result in over-flow!

if (std::is_floating_point<ImageType>::value) {
image_temp.copyFromMesh(input_image);
} else {
@@ -247,7 +251,7 @@ void APRConverter<ImageType>::computeL(APR& aAPR,PixelData<T>& input_image){
}

template<typename ImageType>
void APRConverter<ImageType>::applyParameters(APR& aAPR,APRParameters& aprParameters) {
void APRConverter<ImageType>::applyParameters(APRParameters& aprParameters) {
//
// Apply the main parameters
//
@@ -261,39 +265,7 @@ void APRConverter<ImageType>::applyParameters(APR& aAPR,APRParameters& aprParame
}
fine_grained_timer.stop_timer();

fine_grained_timer.start_timer("threshold");
iComputeGradient.threshold_gradient(grad_temp,local_scale_temp2,aprParameters.Ip_th + bspline_offset);
fine_grained_timer.stop_timer();

float max_th = 60000;

#ifdef HAVE_OPENMP
#pragma omp parallel for default(shared)
#endif
for (size_t i = 0; i < grad_temp.mesh.size(); ++i) {

float rescaled = local_scale_temp.mesh[i];
if (rescaled < aprParameters.sigma_th) {
rescaled = (rescaled < aprParameters.sigma_th_max) ? max_th : par.sigma_th;
local_scale_temp.mesh[i] = rescaled;
}
}

#ifdef HAVE_LIBTIFF
if(par.output_steps) {
TiffUtils::saveMeshAsTiff(par.output_dir + "local_intensity_scale_rescaled.tif", local_scale_temp);
}
#endif

#ifdef HAVE_OPENMP
#pragma omp parallel for default(shared)
#endif
for (size_t i = 0; i < grad_temp.mesh.size(); ++i) {

if(grad_temp.mesh[i] < aprParameters.grad_th){
grad_temp.mesh[i] = 0;
}
}
iComputeGradient.applyParameters(grad_temp, local_scale_temp, local_scale_temp2, aprParameters, bspline_offset);
}
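The single iComputeGradient.applyParameters(...) call above presumably bundles the steps this diff removes from APRConverter: an intensity threshold on the gradient at Ip_th + bspline_offset, clamping of the local intensity scale against sigma_th / sigma_th_max, and zeroing of gradients below grad_th. The following is a self-contained sketch of that removed logic only, written against plain vectors instead of PixelData, so the names and the exact behaviour of threshold_gradient are approximations rather than the real ComputeGradient member:

    #include <cstddef>
    #include <vector>

    // Field names mirror the APRParameters members used in the removed block.
    struct ParamsSketch {
        float Ip_th = 0.0f, sigma_th = 0.0f, sigma_th_max = 0.0f, grad_th = 1.0f;
    };

    // Approximation of the logic deleted above (one fused loop instead of three passes).
    void applyParametersSketch(std::vector<float>& grad,
                               std::vector<float>& local_scale,
                               const std::vector<float>& offset_image,  // stands in for local_scale_temp2
                               const ParamsSketch& par, float bspline_offset) {
        const float max_th = 60000.0f;  // constant taken from the removed code
        for (std::size_t i = 0; i < grad.size(); ++i) {
            // threshold_gradient: presumably suppresses the gradient wherever the offset
            // image falls below the intensity threshold
            if (offset_image[i] <= par.Ip_th + bspline_offset) grad[i] = 0.0f;

            // rescale the local intensity scale: very small scales are either pushed to
            // max_th (so those regions resolve at the coarsest level) or raised to sigma_th
            const float rescaled = local_scale[i];
            if (rescaled < par.sigma_th) {
                local_scale[i] = (rescaled < par.sigma_th_max) ? max_th : par.sigma_th;
            }

            // finally, drop gradients below the user-supplied gradient threshold
            if (grad[i] < par.grad_th) grad[i] = 0.0f;
        }
    }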


@@ -401,7 +373,7 @@ inline bool APRConverter<ImageType>::get_lrf(APR &aAPR, PixelData<T>& input_imag
template<typename ImageType>
inline bool APRConverter<ImageType>::get_ds(APR &aAPR) {

applyParameters(aAPR,par);
applyParameters(par);
aAPR.parameters = par;

solveForAPR(aAPR);
@@ -422,103 +394,45 @@ inline bool APRConverter<ImageType>::get_ds(APR &aAPR) {
*/
template<typename ImageType> template<typename T>
inline bool APRConverter<ImageType>::get_apr_cuda(APR &aAPR, PixelData<T>& input_image) {
if (!initPipelineAPR(aAPR, input_image)) return false;

if (!initPipelineAPR(aAPR, input_image)) return false;

initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num);

method_timer.start_timer("compute_gradient_magnitude_using_bsplines and local instensity scale CUDA");
APRTimer t(true);
APRTimer d(true);
t.start_timer(" =========== ALL");
{

computation_timer.start_timer("init_mem");
PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image)

/////////////////////////////////
/// Pipeline
////////////////////////
//offset image by factor (this is required if there are zero areas in the background with uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
// Warning both of these could result in over-flow (if your image is non zero, with a 'buffer' and has intensities up to uint16_t maximum value then set image_type = "", i.e. uncomment the following line)

if (std::is_same<uint16_t, ImageType>::value) {
bspline_offset = 100;
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
} else if (std::is_same<uint8_t, ImageType>::value) {
bspline_offset = 5;
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
} else {
image_temp.copyFromMesh(input_image);
}

computation_timer.stop_timer();

std::vector<GpuProcessingTask<ImageType>> gpts;

int numOfStreams = 1;
int repetitionsPerStream = 1;

computation_timer.start_timer("compute_L");
// Create streams and send initial task to do
for (int i = 0; i < numOfStreams; ++i) {
gpts.emplace_back(GpuProcessingTask<ImageType>(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()));
gpts.back().sendDataToGpu();
gpts.back().processOnGpu();
}
computation_timer.stop_timer();


for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) {
int c = i % numOfStreams;

computation_timer.start_timer("apply_parameters");
// get data from previous task
gpts[c].getDataFromGpu();

computation_timer.stop_timer();

// in theory we get new data and send them to task
if (i < numOfStreams * (repetitionsPerStream - 1)) {
gpts[c].sendDataToGpu();
gpts[c].processOnGpu();
}

// Postprocess on CPU
std::cout << "--------- start CPU processing ---------- " << i << std::endl;

computation_timer.start_timer("solve_for_apr");
iPullingScheme.initialize_particle_cell_tree(aAPR.aprInfo);

PixelData<float> lst(local_scale_temp, true);

#ifdef HAVE_LIBTIFF
if (par.output_steps){
TiffUtils::saveMeshAsTiff(par.output_dir + "local_intensity_scale_step.tif", lst);
}
#endif
computation_timer.start_timer("init_mem");
PixelData<ImageType> image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image)

#ifdef HAVE_LIBTIFF
if (par.output_steps){
TiffUtils::saveMeshAsTiff(par.output_dir + "gradient_step.tif", grad_temp);
}
#endif
/////////////////////////////////
/// Pipeline
////////////////////////
// offset image by factor (this is required if there are zero areas in the background with
// uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!)
// Warning both of these could result in over-flow!

iLocalParticleSet.get_local_particle_cell_set(iPullingScheme,lst, local_scale_temp2,par);
if (std::is_same<uint16_t, ImageType>::value) {
bspline_offset = 100;
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
} else if (std::is_same<uint8_t, ImageType>::value) {
bspline_offset = 5;
image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); });
} else {
image_temp.copyFromMesh(input_image);
}

iPullingScheme.pulling_scheme_main();
GpuProcessingTask<ImageType> gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max());
gpt.sendDataToGpu();
gpt.processOnGpu();
auto linearAccessGpu = gpt.getDataFromGpu();

computation_timer.stop_timer();
aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size();

computation_timer.start_timer("generate_data_structures");
generateDatastructures(aAPR);
computation_timer.stop_timer();
}
std::cout << "Total n ENDED" << std::endl;
// generateDatastructures(aAPR) for linearAcceess for CUDA
aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec);
aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec);
aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec);
aAPR.apr_initialized = true;

}
t.stop_timer();
method_timer.stop_timer();
std::cout << "CUDA pipeline finished!\n";

return true;
}
@@ -560,7 +474,7 @@ inline bool APRConverter<ImageType>::get_apr_cpu(APR &aAPR, PixelData<T> &input_
method_timer.stop_timer();
}

applyParameters(aAPR,par);
applyParameters(par);

computation_timer.stop_timer();

@@ -592,7 +506,7 @@ template<typename ImageType> template<typename T>
inline bool APRConverter<ImageType>::get_apr(APR &aAPR, PixelData<T> &input_image) {
// TODO: CUDA pipeline is temporarily turned off and CPU version is always chosen.
// After revising a CUDA pipeline remove "#if true // " part.
#if true // #ifndef APR_USE_CUDA
#ifndef APR_USE_CUDA
return get_apr_cpu(aAPR, input_image);
#else
return get_apr_cuda(aAPR, input_image);
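With the #ifndef APR_USE_CUDA guard restored in get_apr() above, callers do not change: the same APRConverter::get_apr() call runs the CUDA pipeline in a build configured with -DAPR_USE_CUDA=ON and the CPU pipeline otherwise. A hedged usage sketch follows; the include paths, the PixelData constructor argument order, and public access to the converter's par member (with fields mirrored from the example's cmdLineOptions) are assumptions based on this diff and the example header, not verified against the full LibAPR tree:

    #include <cstdint>
    #include "algorithm/APRConverter.hpp"          // assumed include path
    #include "data_structures/APR/APR.hpp"         // assumed include path
    #include "data_structures/Mesh/PixelData.hpp"  // assumed include path

    int main() {
        PixelData<uint16_t> input(128, 128, 64);  // y, x, z order assumed; fill with image data here

        APRConverter<uint16_t> converter;
        converter.par.rel_error = 0.1;  // defaults as in examples/Example_get_apr.h
        converter.par.lambda = 3.0;
        converter.par.Ip_th = 0;
        converter.par.grad_th = 1;

        APR apr;
        // Built with -DAPR_USE_CUDA=ON this dispatches to get_apr_cuda(); otherwise get_apr_cpu().
        const bool ok = converter.get_apr(apr, input);
        return ok ? 0 : 1;
    }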