diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a55fa38..9cf047e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -174,14 +174,14 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 ") if(CMAKE_COMPILER_IS_GNUCC) - set(CMAKE_CXX_FLAGS_RELEASE "-O4 -ffast-math") + set(CMAKE_CXX_FLAGS_RELEASE "-O4") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -pedantic") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Bdynamic") if(NOT WIN32) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ldl -lz") endif() elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -ffast-math") + set(CMAKE_CXX_FLAGS_RELEASE "-O3") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -pedantic") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lz") endif() @@ -211,8 +211,8 @@ if(APR_USE_CUDA) message(STATUS "APR: Building CUDA for APR") set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_RUNTIME_LIBRARY "Static") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --default-stream per-thread -Xptxas -v -DAPR_USE_CUDA") - set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math") # -lineinfo for profiling + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=false --default-stream per-thread -Xptxas -v -DAPR_USE_CUDA") + set(CMAKE_CUDA_FLAGS_RELEASE "-O3") # -lineinfo for profiling set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g -G") if(APR_BENCHMARK) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DAPR_BENCHMARK") @@ -226,6 +226,7 @@ if(APR_USE_CUDA) src/algorithm/LocalIntensityScale.cu src/algorithm/OVPC.cu src/data_structures/APR/access/GPUAccess.cu + src/data_structures/APR/access/LinearAccessCuda.cu src/numerics/miscCuda.cu src/numerics/APRDownsampleGPU.cu src/numerics/PixelNumericsGPU.cu @@ -241,6 +242,7 @@ if(APR_BUILD_STATIC_LIB) # generate static library used as a intermediate step in generating fat lib set(STATIC_TARGET_NAME staticLib) add_library(${STATIC_TARGET_NAME} STATIC $ ${APR_CUDA_SOURCE_FILES}) + set_property(TARGET ${STATIC_TARGET_NAME} PROPERTY CUDA_ARCHITECTURES OFF) 
target_compile_features(${STATIC_TARGET_NAME} PUBLIC cxx_std_14) set_target_properties(${STATIC_TARGET_NAME} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) set_target_properties(${STATIC_TARGET_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION OFF) @@ -262,7 +264,7 @@ if(APR_BUILD_SHARED_LIB) # generate fat shared library set(SHARED_TARGET_NAME sharedLib) add_library(${SHARED_TARGET_NAME} SHARED $ ${APR_CUDA_SOURCE_FILES}) - + set_property(TARGET ${SHARED_TARGET_NAME} PROPERTY CUDA_ARCHITECTURES OFF) target_include_directories(${SHARED_TARGET_NAME} PUBLIC $ $) set_target_properties(${SHARED_TARGET_NAME} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) set_target_properties(${SHARED_TARGET_NAME} PROPERTIES LIBRARY_OUTPUT_NAME ${LIBRARY_NAME}) diff --git a/examples/Example_get_apr.h b/examples/Example_get_apr.h index c1be9d2b..6d787811 100644 --- a/examples/Example_get_apr.h +++ b/examples/Example_get_apr.h @@ -30,7 +30,7 @@ struct cmdLineOptions{ bool auto_parameters = false; float Ip_th = 0; - float lambda = -1; + float lambda = 3.0; float sigma_th = 0; float rel_error = 0.1; float grad_th = 1; diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp index b34e1b74..91858629 100644 --- a/src/algorithm/APRConverter.hpp +++ b/src/algorithm/APRConverter.hpp @@ -117,7 +117,7 @@ class APRConverter { PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors PixelData local_scale_temp2; - void applyParameters(APR& aAPR,APRParameters& aprParameters); + void applyParameters(APRParameters& aprParameters); template void computeL(APR& aAPR,PixelData& input_image); @@ -184,7 +184,7 @@ void APRConverter::get_apr_custom_grad_scale(APR& aAPR,PixelData::computeL(APR& aAPR,PixelData& input_image){ fine_grained_timer.start_timer("offset image"); + // offset image by factor (this is required if there are zero areas in the background with + // uint16_t and uint8_t images, as the Bspline 
co-efficients otherwise may be negative!) + // Warning both of these could result in over-flow! + if (std::is_floating_point::value) { image_temp.copyFromMesh(input_image); } else { @@ -247,7 +251,7 @@ void APRConverter::computeL(APR& aAPR,PixelData& input_image){ } template -void APRConverter::applyParameters(APR& aAPR,APRParameters& aprParameters) { +void APRConverter::applyParameters(APRParameters& aprParameters) { // // Apply the main parameters // @@ -261,39 +265,7 @@ void APRConverter::applyParameters(APR& aAPR,APRParameters& aprParame } fine_grained_timer.stop_timer(); - fine_grained_timer.start_timer("threshold"); - iComputeGradient.threshold_gradient(grad_temp,local_scale_temp2,aprParameters.Ip_th + bspline_offset); - fine_grained_timer.stop_timer(); - - float max_th = 60000; - -#ifdef HAVE_OPENMP -#pragma omp parallel for default(shared) -#endif - for (size_t i = 0; i < grad_temp.mesh.size(); ++i) { - - float rescaled = local_scale_temp.mesh[i]; - if (rescaled < aprParameters.sigma_th) { - rescaled = (rescaled < aprParameters.sigma_th_max) ? 
max_th : par.sigma_th; - local_scale_temp.mesh[i] = rescaled; - } - } - -#ifdef HAVE_LIBTIFF - if(par.output_steps) { - TiffUtils::saveMeshAsTiff(par.output_dir + "local_intensity_scale_rescaled.tif", local_scale_temp); - } -#endif - -#ifdef HAVE_OPENMP -#pragma omp parallel for default(shared) -#endif - for (size_t i = 0; i < grad_temp.mesh.size(); ++i) { - - if(grad_temp.mesh[i] < aprParameters.grad_th){ - grad_temp.mesh[i] = 0; - } - } + iComputeGradient.applyParameters(grad_temp, local_scale_temp, local_scale_temp2, aprParameters, bspline_offset); } @@ -401,7 +373,7 @@ inline bool APRConverter::get_lrf(APR &aAPR, PixelData& input_imag template inline bool APRConverter::get_ds(APR &aAPR) { - applyParameters(aAPR,par); + applyParameters(par); aAPR.parameters = par; solveForAPR(aAPR); @@ -422,103 +394,45 @@ inline bool APRConverter::get_ds(APR &aAPR) { */ template template inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input_image) { - if (!initPipelineAPR(aAPR, input_image)) return false; + if (!initPipelineAPR(aAPR, input_image)) return false; initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num); - method_timer.start_timer("compute_gradient_magnitude_using_bsplines and local instensity scale CUDA"); - APRTimer t(true); - APRTimer d(true); - t.start_timer(" =========== ALL"); - { - - computation_timer.start_timer("init_mem"); - PixelData image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image) - - ///////////////////////////////// - /// Pipeline - //////////////////////// - //offset image by factor (this is required if there are zero areas in the background with uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) 
- // Warning both of these could result in over-flow (if your image is non zero, with a 'buffer' and has intensities up to uint16_t maximum value then set image_type = "", i.e. uncomment the following line) - - if (std::is_same::value) { - bspline_offset = 100; - image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); - } else if (std::is_same::value) { - bspline_offset = 5; - image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); - } else { - image_temp.copyFromMesh(input_image); - } - - computation_timer.stop_timer(); - - std::vector> gpts; - - int numOfStreams = 1; - int repetitionsPerStream = 1; - - computation_timer.start_timer("compute_L"); - // Create streams and send initial task to do - for (int i = 0; i < numOfStreams; ++i) { - gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); - gpts.back().sendDataToGpu(); - gpts.back().processOnGpu(); - } - computation_timer.stop_timer(); - - - for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) { - int c = i % numOfStreams; - - computation_timer.start_timer("apply_parameters"); - // get data from previous task - gpts[c].getDataFromGpu(); - - computation_timer.stop_timer(); - - // in theory we get new data and send them to task - if (i < numOfStreams * (repetitionsPerStream - 1)) { - gpts[c].sendDataToGpu(); - gpts[c].processOnGpu(); - } - - // Postprocess on CPU - std::cout << "--------- start CPU processing ---------- " << i << std::endl; - - computation_timer.start_timer("solve_for_apr"); - iPullingScheme.initialize_particle_cell_tree(aAPR.aprInfo); - - PixelData lst(local_scale_temp, true); - -#ifdef HAVE_LIBTIFF - if (par.output_steps){ - TiffUtils::saveMeshAsTiff(par.output_dir + "local_intensity_scale_step.tif", lst); - } -#endif + computation_timer.start_timer("init_mem"); + PixelData image_temp(input_image, false /* don't copy */, true /* pinned memory 
*/); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image) -#ifdef HAVE_LIBTIFF - if (par.output_steps){ - TiffUtils::saveMeshAsTiff(par.output_dir + "gradient_step.tif", grad_temp); - } -#endif + ///////////////////////////////// + /// Pipeline + //////////////////////// + // offset image by factor (this is required if there are zero areas in the background with + // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) + // Warning both of these could result in over-flow! - iLocalParticleSet.get_local_particle_cell_set(iPullingScheme,lst, local_scale_temp2,par); + if (std::is_same::value) { + bspline_offset = 100; + image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); + } else if (std::is_same::value) { + bspline_offset = 5; + image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); + } else { + image_temp.copyFromMesh(input_image); + } - iPullingScheme.pulling_scheme_main(); + GpuProcessingTask gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()); + gpt.sendDataToGpu(); + gpt.processOnGpu(); + auto linearAccessGpu = gpt.getDataFromGpu(); - computation_timer.stop_timer(); + aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); - computation_timer.start_timer("generate_data_structures"); - generateDatastructures(aAPR); - computation_timer.stop_timer(); - } - std::cout << "Total n ENDED" << std::endl; + // generateDatastructures(aAPR) for linearAcceess for CUDA + aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec); + aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec); + aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec); + aAPR.apr_initialized = true; - } - t.stop_timer(); - method_timer.stop_timer(); + std::cout << "CUDA pipeline finished!\n"; return true; } @@ -560,7 +474,7 @@ inline bool 
APRConverter::get_apr_cpu(APR &aAPR, PixelData &input_ method_timer.stop_timer(); } - applyParameters(aAPR,par); + applyParameters(par); computation_timer.stop_timer(); @@ -592,7 +506,7 @@ template template inline bool APRConverter::get_apr(APR &aAPR, PixelData &input_image) { // TODO: CUDA pipeline is temporarily turned off and CPU version is always chosen. // After revising a CUDA pipeline remove "#if true // " part. -#if true // #ifndef APR_USE_CUDA +#ifndef APR_USE_CUDA return get_apr_cpu(aAPR, input_image); #else return get_apr_cuda(aAPR, input_image); diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp index 53c3d7cd..6b682fdf 100644 --- a/src/algorithm/ComputeGradient.hpp +++ b/src/algorithm/ComputeGradient.hpp @@ -38,6 +38,35 @@ class ComputeGradient { template void calc_inv_bspline_z(PixelData &input); + template + void applyParameters(PixelData &grad_temp, PixelData &local_scale_temp, PixelData &local_scale_temp2, APRParameters &aprParameters, float bspline_offset) { + threshold_gradient(grad_temp,local_scale_temp2,aprParameters.Ip_th + bspline_offset); + + float max_th = 60000; + +#ifdef HAVE_OPENMP +#pragma omp parallel for default(shared) +#endif + for (size_t i = 0; i < grad_temp.mesh.size(); ++i) { + + float rescaled = local_scale_temp.mesh[i]; + if (rescaled < aprParameters.sigma_th) { + rescaled = (rescaled < aprParameters.sigma_th_max) ? 
max_th : aprParameters.sigma_th; + local_scale_temp.mesh[i] = rescaled; + } + } + +#ifdef HAVE_OPENMP +#pragma omp parallel for default(shared) +#endif + for (size_t i = 0; i < grad_temp.mesh.size(); ++i) { + + if(grad_temp.mesh[i] < aprParameters.grad_th){ + grad_temp.mesh[i] = 0; + } + } + } + struct three_temps { float temp_1, temp_2, temp_3; }; @@ -65,6 +94,20 @@ class ComputeGradient { inline float impulse_resp_back(float k, float rho, float omg, float gamma, float c0); + typedef struct { + std::vector bc1_vec; + std::vector bc2_vec; + std::vector bc3_vec; + std::vector bc4_vec; + size_t k0; + float b1; + float b2; + float norm_factor; + size_t minLen; + } BsplineParams; + + BsplineParams prepareBSplineParams(size_t dimLen, float lambda, float tol, int maxFilterLen = -1); + }; template @@ -115,7 +158,6 @@ inline void ComputeGradient::get_gradient(PixelData &image_temp, Pixe timer.stop_timer(); } } - } @@ -208,81 +250,45 @@ void ComputeGradient::get_smooth_bspline_3D(PixelData& input, float lambda) { inline float ComputeGradient::impulse_resp(float k,float rho,float omg){ // Impulse Response Function - return (pow(rho,(std::abs(k)))*sin((std::abs(k) + 1)*omg)) / sin(omg); + return (powf(rho,(std::abs(k)))*sinf((std::abs(k) + 1)*omg)) / sinf(omg); } inline float ComputeGradient::impulse_resp_back(float k,float rho,float omg,float gamma,float c0){ // Impulse Response Function (nominator eq. 4.8, denominator from eq. 
4.7) - return c0*pow(rho,std::abs(k))*(cos(omg*std::abs(k)) + gamma*sin(omg*std::abs(k)))*(1.0/(pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2))); + return c0*powf(rho,std::abs(k))*(cosf(omg*std::abs(k)) + gamma*sinf(omg*std::abs(k)))*(1.0/(powf((1 - 2.0*rho*cosf(omg) + pow(rho,2)),2))); } - -/** - * floating point output -> no rounding or under-/overflow check - */ -template -std::enable_if_t::value, T> -round(float val, size_t &errCount) { - return val; -} - -/** - * integer output -> check for under-/overflow and round - */ -template -std::enable_if_t::value, T> -round(float val, size_t &errCount) { - - val = std::round(val); - - if(val < std::numeric_limits::min() || val > std::numeric_limits::max()) { - errCount++; - } - return val; -} - - - -template -void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float tol, int k0Len) { - // - // Bevan Cheeseman 2016 - // - // Recursive Filter Implimentation for Smoothing BSplines +ComputeGradient::BsplineParams ComputeGradient::prepareBSplineParams(size_t dimLen, float lambda, float tol, int maxFilterLen) { + // Recursive Filter Implementation for Smoothing BSplines // B-Spline Signal Processing: Part 11-Efficient Design and Applications, Unser 1993 - float xi = 1 - 96*lambda + 24*lambda*sqrt(3 + 144*lambda); // eq 4.6 - float rho = (24*lambda - 1 - sqrt(xi))/(24*lambda)*sqrt((1/xi)*(48*lambda + 24*lambda*sqrt(3 + 144*lambda))); // eq 4.5 - float omg = atan(sqrt((1/xi)*(144*lambda - 1))); // eq 4.6 + float xi = 1 - 96*lambda + 24*lambda * sqrtf(3 + 144*lambda); + float rho = (24*lambda - 1 - sqrtf(xi)) / (24*lambda) * sqrtf((1/xi) * (48*lambda + 24*lambda * sqrtf(3 + 144*lambda))); + float omg = atan(sqrtf((1/xi) * (144*lambda - 1))); + float c0 = (1 + powf(rho,2)) / (1-powf(rho,2)) * (1 - 2*rho * cosf(omg) + powf(rho,2)) / (1 + 2*rho*cosf(omg) + powf(rho,2)); + float gamma = (1 - powf(rho,2)) / (1+powf(rho,2)) * (1 / tan(omg)); - float c0 = (1+ pow(rho,2))/(1-pow(rho,2)) * (1 - 2*rho*cos(omg) + 
pow(rho,2))/(1 + 2*rho*cos(omg) + pow(rho,2)); // eq 4.8 - float gamma = (1-pow(rho,2))/(1+pow(rho,2)) * (1/tan(omg)); // eq 4.8 + const float b1 = 2*rho*cosf(omg); + const float b2 = -powf(rho,2.0); - const float b1 = 2*rho*cos(omg); - const float b2 = -pow(rho,2.0); + const size_t idealK0Len = ceil(std::abs(logf(tol) / logf(rho))); + const size_t k0 = maxFilterLen > 0 ? maxFilterLen : idealK0Len; + const size_t minLen = maxFilterLen > 0 ? maxFilterLen : std::min(idealK0Len, dimLen); - const size_t z_num = image.z_num; - const size_t x_num = image.x_num; - const size_t y_num = image.y_num; -// const size_t minLen = y_num; - const size_t minLen = k0Len > 0 ? k0Len : std::min((size_t)(ceil(std::abs(log(tol)/log(rho)))),y_num); - - const size_t k0 = k0Len > 0 ? k0Len : (size_t)(ceil(std::abs(log(tol)/log(rho)))); + const float norm_factor = powf((1 - 2.0*rho*cosf(omg) + powf(rho,2)),2); + // std::cout << std::fixed << std::setprecision(9) << "CPU xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << std::endl; - const float norm_factor = pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2); -// std::cout << "CPUy xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl; // for boundaries - std::vector impulse_resp_vec_f(k0+3); //forward - for (size_t k = 0; k < (k0+3); ++k) { - impulse_resp_vec_f[k] = impulse_resp(k,rho,omg); + std::vector impulse_resp_vec_f(k0+1); //forward + for (size_t k = 0; k < (k0+1); ++k) { + impulse_resp_vec_f[k] = impulse_resp(k, rho, omg); } - - std::vector impulse_resp_vec_b(k0+3); //backward - for (size_t k = 0; k < (k0+3); ++k) { - impulse_resp_vec_b[k] = impulse_resp_back(k,rho,omg,gamma,c0); + std::vector impulse_resp_vec_b(k0+1); //backward + for (size_t k = 0; k < (k0+1); ++k) { + impulse_resp_vec_b[k] = 
impulse_resp_back(k, rho, omg, gamma, c0); } std::vector bc1_vec(k0, 0); //forward @@ -291,9 +297,8 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (size_t k = 0; k < k0; ++k) { bc1_vec[k] += impulse_resp_vec_f[k+1]; } - //assumes a constant value at the end of the filter when the required ghost is bigger then the image - for(size_t k = (minLen); k < k0;k++){ + for (size_t k = minLen; k < k0; k++) { bc1_vec[minLen-1] += bc1_vec[k]; } @@ -302,8 +307,7 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (size_t k = 0; k < k0; ++k) { bc2_vec[k] = impulse_resp_vec_f[k]; } - - for(size_t k = (minLen); k < k0;k++){ + for (size_t k = minLen; k < k0; k++) { bc2_vec[minLen-1] += bc2_vec[k]; } @@ -313,8 +317,7 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (size_t k = 0; k < (k0-1); ++k) { bc3_vec[k+1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k+2]; } - - for(size_t k = (minLen); k < k0;k++){ + for (size_t k = minLen; k < k0;k++) { bc3_vec[minLen-1] += bc3_vec[k]; } @@ -324,11 +327,64 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (size_t k = 1; k < k0; ++k) { bc4_vec[k] += 2*impulse_resp_vec_b[k]; } - - for(size_t k = (minLen); k < k0;k++){ + for (size_t k = minLen; k < k0; k++) { bc4_vec[minLen-1] += bc4_vec[k]; } + return BsplineParams { + std::move(bc1_vec), + std::move(bc2_vec), + std::move(bc3_vec), + std::move(bc4_vec), + k0, + b1, + b2, + norm_factor, + minLen + }; +} + +/** + * floating point output -> no rounding or under-/overflow check + */ +template +std::enable_if_t::value, T> +round(float val, size_t &errCount) { + return val; +} + +/** + * integer output -> check for under-/overflow and round + */ +template +std::enable_if_t::value, T> +round(float val, size_t &errCount) { + + val = std::round(val); + + if(val < std::numeric_limits::min() || val > std::numeric_limits::max()) { + errCount++; + std::cout << val << " " 
<< (float)std::numeric_limits::min() << " " << (float)std::numeric_limits::max() << std::endl; + } + return val; +} + + + +template +void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float tol, int k0Len) { + // + // Bevan Cheeseman 2016 + // + // Recursive Filter Implementation for Smoothing BSplines + // B-Spline Signal Processing: Part 11-Efficient Design and Applications, Unser 1993 + + const size_t z_num = image.z_num; + const size_t x_num = image.x_num; + const size_t y_num = image.y_num; + + auto p = prepareBSplineParams(y_num, lambda, tol, k0Len); + APRTimer btime; btime.verbose_flag = false; @@ -350,37 +406,35 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float const size_t iynum = x * y_num; //boundary conditions - for (size_t k = 0; k < minLen; ++k) { - temp1 += bc1_vec[k]*image.mesh[jxnumynum + iynum + k]; - temp2 += bc2_vec[k]*image.mesh[jxnumynum + iynum + k]; + for (size_t k = 0; k < p.minLen; ++k) { + temp1 += p.bc1_vec[k]*image.mesh[jxnumynum + iynum + k]; + temp2 += p.bc2_vec[k]*image.mesh[jxnumynum + iynum + k]; } //boundary conditions - for (size_t k = 0; k < minLen; ++k) { - temp3 += bc3_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k]; - temp4 += bc4_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k]; + for (size_t k = 0; k < p.minLen; ++k) { + temp3 += p.bc3_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k]; + temp4 += p.bc4_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k]; } //initialize the sequence - image.mesh[jxnumynum + iynum + 0] = temp2; - image.mesh[jxnumynum + iynum + 1] = temp1; + image.mesh[jxnumynum + iynum + 0] = round(temp2, error_count); + image.mesh[jxnumynum + iynum + 1] = round(temp1, error_count); for (auto it = (image.mesh.begin()+jxnumynum + iynum + 2); it != (image.mesh.begin()+jxnumynum + iynum + y_num); ++it) { - float temp = temp1*b1 + temp2*b2 + *it; + + float temp = temp1*p.b1 + temp2*p.b2 + *it; *it = round(temp, error_count); temp2 = temp1; temp1 = 
temp; } - image.mesh[jxnumynum + iynum + y_num - 2] = round(temp3*norm_factor, error_count); - image.mesh[jxnumynum + iynum + y_num - 1] = round(temp4*norm_factor, error_count); - - + image.mesh[jxnumynum + iynum + y_num - 2] = round(temp3*p.norm_factor, error_count); + image.mesh[jxnumynum + iynum + y_num - 1] = round(temp4*p.norm_factor, error_count); } } btime.stop_timer(); - btime.start_timer("backward_loop_y"); #ifdef HAVE_OPENMP #pragma omp parallel for default(shared) reduction(+: error_count) @@ -391,13 +445,12 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (int64_t i = x_num - 1; i >= 0; --i) { const size_t iynum = i * y_num; - float temp2 = image.mesh[jxnumynum + iynum + y_num - 1]/norm_factor; - float temp1 = image.mesh[jxnumynum + iynum + y_num - 2]/norm_factor; + float temp2 = image.mesh[jxnumynum + iynum + y_num - 1]/p.norm_factor; + float temp1 = image.mesh[jxnumynum + iynum + y_num - 2]/p.norm_factor; for (auto it = (image.mesh.begin()+jxnumynum + iynum + y_num-3); it != (image.mesh.begin()+jxnumynum + iynum-1); --it) { - float temp = temp1*b1 + temp2*b2 + *it; - - *it = round(temp*norm_factor, error_count); + float temp = temp1*p.b1 + temp2*p.b2 + *it; + *it = round(temp*p.norm_factor, error_count); temp2 = temp1; temp1 = temp; @@ -417,90 +470,13 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float // // Bevan Cheeseman 2016 // - // Recursive Filter Implimentation for Smoothing BSplines - - float xi = 1 - 96*lambda + 24*lambda*sqrt(3 + 144*lambda); - float rho = (24*lambda - 1 - sqrt(xi))/(24*lambda)*sqrt((1/xi)*(48*lambda + 24*lambda*sqrt(3 + 144*lambda))); - float omg = atan(sqrt((1/xi)*(144*lambda - 1))); - float c0 = (1+ pow(rho,2))/(1-pow(rho,2)) * (1 - 2*rho*cos(omg) + pow(rho,2))/(1 + 2*rho*cos(omg) + pow(rho,2)); - float gamma = (1-pow(rho,2))/(1+pow(rho,2)) * (1/tan(omg)); - - const float b1 = 2*rho*cos(omg); - const float b2 = -pow(rho,2.0); + // Recursive Filter 
Implementation for Smoothing BSplines const size_t z_num = image.z_num; const size_t x_num = image.x_num; const size_t y_num = image.y_num; - //const size_t minLen = std::min(z_num, std::min(x_num, y_num)); - //const size_t minLen = z_num; - - const size_t minLen = k0Len > 0 ? k0Len : std::min((size_t)(ceil(std::abs(log(tol)/log(rho)))), z_num); - - const size_t k0 = k0Len > 0 ? k0Len :(size_t)(ceil(std::abs(log(tol)/log(rho)))); - - const float norm_factor = pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2); -// std::cout << "CPUz xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl; - - ////////////////////////////////////////////////////////////// - // - // Setting up boundary conditions - // - ////////////////////////////////////////////////////////////// - - std::vector impulse_resp_vec_f(k0+3); //forward - for (size_t k = 0; k < (k0+3);k++){ - impulse_resp_vec_f[k] = impulse_resp(k,rho,omg); - } - - std::vector impulse_resp_vec_b(k0+3); //backward - for (size_t k = 0; k < (k0+3);k++){ - impulse_resp_vec_b[k] = impulse_resp_back(k,rho,omg,gamma,c0); - } - - std::vector bc1_vec(k0, 0); //forward - //y(1) init - bc1_vec[1] = impulse_resp_vec_f[0]; - for(size_t k = 0; k < k0; k++){ - bc1_vec[k] += impulse_resp_vec_f[k+1]; - } - - //assumes a constant value at the end of the filter when the required ghost is bigger then the image - for(size_t k = (minLen); k < k0;k++){ - bc1_vec[minLen-1] += bc1_vec[k]; - } - - - std::vector bc2_vec(k0, 0); //backward - //y(0) init - for(size_t k = 0; k < k0; k++){ - bc2_vec[k] = impulse_resp_vec_f[k]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc2_vec[minLen-1] += bc2_vec[k]; - } - - std::vector bc3_vec(k0, 0); //forward - //y(N-1) init - bc3_vec[0] = impulse_resp_vec_b[1]; - for(size_t k = 0; k < (k0-1); k++){ - bc3_vec[k+1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k+2]; - } - - for(size_t k = (minLen); k < k0;k++){ 
- bc3_vec[minLen-1] += bc3_vec[k]; - } - - std::vector bc4_vec(k0, 0); //backward - //y(N) init - bc4_vec[0] = impulse_resp_vec_b[0]; - for(size_t k = 1; k < k0; k++){ - bc4_vec[k] += 2*impulse_resp_vec_b[k]; - } - for(size_t k = (minLen); k < k0;k++){ - bc4_vec[minLen-1] += bc4_vec[k]; - } + auto p = prepareBSplineParams(z_num, lambda, tol, k0Len); //forwards direction std::vector temp_vec1(y_num,0); @@ -523,18 +499,18 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float size_t iynum = i * y_num; - for (size_t j = 0; j < minLen; ++j) { + for (size_t j = 0; j < p.minLen; ++j) { size_t index = j * x_num * y_num + iynum; #ifdef HAVE_OPENMP #pragma omp simd #endif for (int64_t k = y_num - 1; k >= 0; k--) { //forwards boundary condition - temp_vec1[k] += bc1_vec[j] * image.mesh[index + k]; - temp_vec2[k] += bc2_vec[j] * image.mesh[index + k]; + temp_vec1[k] += p.bc1_vec[j] * image.mesh[index + k]; + temp_vec2[k] += p.bc2_vec[j] * image.mesh[index + k]; //backwards boundary condition - temp_vec3[k] += bc3_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k]; - temp_vec4[k] += bc4_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k]; + temp_vec3[k] += p.bc3_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k]; + temp_vec4[k] += p.bc4_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k]; } } @@ -557,7 +533,7 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float #pragma omp simd #endif for (size_t k = 0; k < y_num; ++k) { - temp_vec2[k] = round(1.0f*image.mesh[index + k] + b1*temp_vec1[k] + b2*temp_vec2[k], error_count); + temp_vec2[k] = round(image.mesh[index + k] + p.b1*temp_vec1[k] + p.b2*temp_vec2[k], error_count); } std::swap(temp_vec1, temp_vec2); @@ -568,12 +544,12 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float //initialization for (int64_t k = y_num - 1; k >= 0; --k) { //y(N) - image.mesh[(z_num - 1)*x_num*y_num + iynum + k] = 
round(temp_vec4[k]*norm_factor, error_count); + image.mesh[(z_num - 1)*x_num*y_num + iynum + k] = round(temp_vec4[k]*p.norm_factor, error_count); } for (int64_t k = y_num - 1; k >= 0; --k) { //y(N-1) - image.mesh[(z_num - 2)*x_num*y_num + iynum + k] = round(temp_vec3[k]*norm_factor, error_count); + image.mesh[(z_num - 2)*x_num*y_num + iynum + k] = round(temp_vec3[k]*p.norm_factor, error_count); } //main loop @@ -584,8 +560,8 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float #pragma omp simd #endif for (int64_t k = y_num - 1; k >= 0; --k) { - float temp = (image.mesh[index + k] + b1*temp_vec3[k] + b2*temp_vec4[k]); - image.mesh[index + k] = round(temp*norm_factor, error_count); + float temp = (image.mesh[index + k] + p.b1*temp_vec3[k] + p.b2*temp_vec4[k]); + image.mesh[index + k] = round(temp*p.norm_factor, error_count); temp_vec4[k] = temp_vec3[k]; temp_vec3[k] = temp; } @@ -605,85 +581,11 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& image,float lambda,float // // Recursive Filter Implimentation for Smoothing BSplines - float xi = 1 - 96*lambda + 24*lambda*sqrt(3 + 144*lambda); - float rho = (24*lambda - 1 - sqrt(xi))/(24*lambda)*sqrt((1/xi)*(48*lambda + 24*lambda*sqrt(3 + 144*lambda))); - float omg = atan(sqrt((1/xi)*(144*lambda - 1))); - float c0 = (1+ pow(rho,2))/(1-pow(rho,2)) * (1 - 2*rho*cos(omg) + pow(rho,2))/(1 + 2*rho*cos(omg) + pow(rho,2)); - float gamma = (1-pow(rho,2))/(1+pow(rho,2)) * (1/tan(omg)); - - const float b1 = 2*rho*cos(omg); - const float b2 = -pow(rho,2.0); - const size_t z_num = image.z_num; const size_t x_num = image.x_num; const size_t y_num = image.y_num; -// const size_t minLen = x_num; - const size_t minLen = k0Len > 0 ? k0Len : std::min((size_t)(ceil(std::abs(log(tol)/log(rho)))), x_num); - const size_t k0 = k0Len > 0 ? 
k0Len : ((size_t)(ceil(std::abs(log(tol)/log(rho))))); - const float norm_factor = pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2); - -// std::cout << "CPUx xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl; - - ////////////////////////////////////////////////////////////// - // - // Setting up boundary conditions - // - ////////////////////////////////////////////////////////////// - - std::vector impulse_resp_vec_f(k0+3); //forward - for (size_t k = 0; k < (k0+3);k++){ - impulse_resp_vec_f[k] = impulse_resp(k,rho,omg); - } - - std::vector impulse_resp_vec_b(k0+3); //backward - for (size_t k = 0; k < (k0+3);k++){ - impulse_resp_vec_b[k] = impulse_resp_back(k,rho,omg,gamma,c0); - } - - std::vector bc1_vec(k0, 0); //forward - //y(1) init - bc1_vec[1] = impulse_resp_vec_f[0]; - for(size_t k = 0; k < k0;k++){ - bc1_vec[k] += impulse_resp_vec_f[k+1]; - } - - //assumes a constant value at the end of the filter when the required ghost is bigger then the image - for(size_t k = (minLen); k < k0;k++){ - bc1_vec[minLen-1] += bc1_vec[k]; - } - - std::vector bc2_vec(k0, 0); //backward - //y(0) init - for(size_t k = 0; k < k0;k++){ - bc2_vec[k] = impulse_resp_vec_f[k]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc2_vec[minLen-1] += bc2_vec[k]; - } - - std::vector bc3_vec(k0, 0); //forward - //y(N-1) init - bc3_vec[0] = impulse_resp_vec_b[1]; - for(size_t k = 0; k < (k0-1);k++){ - bc3_vec[k+1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k+2]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc3_vec[minLen-1] += bc3_vec[k]; - } - - std::vector bc4_vec(k0, 0); //backward - //y(N) init - bc4_vec[0] = impulse_resp_vec_b[0]; - for(size_t k = 1; k < k0;k++){ - bc4_vec[k] += 2*impulse_resp_vec_b[k]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc4_vec[minLen-1] += bc4_vec[k]; - } + auto p = prepareBSplineParams(x_num, lambda, tol, k0Len); //forwards direction @@ -705,15 
+607,15 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& image,float lambda,float size_t jxnumynum = j * y_num * x_num; - for (size_t i = 0; i < minLen; ++i) { + for (size_t i = 0; i < p.minLen; ++i) { for (size_t k = 0; k < y_num; ++k) { //forwards boundary condition - temp_vec1[k] += bc1_vec[i]*image.mesh[jxnumynum + i*y_num + k]; - temp_vec2[k] += bc2_vec[i]*image.mesh[jxnumynum + i*y_num + k]; + temp_vec1[k] += p.bc1_vec[i]*image.mesh[jxnumynum + i*y_num + k]; + temp_vec2[k] += p.bc2_vec[i]*image.mesh[jxnumynum + i*y_num + k]; //backwards boundary condition - temp_vec3[k] += bc3_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k]; - temp_vec4[k] += bc4_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k]; + temp_vec3[k] += p.bc3_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k]; + temp_vec4[k] += p.bc4_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k]; } } @@ -735,7 +637,7 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& image,float lambda,float #pragma omp simd #endif for (int64_t k = y_num - 1; k >= 0; k--) { - temp_vec2[k] = round(image.mesh[index + k] + b1*temp_vec1[k] + b2*temp_vec2[k], error_count); + temp_vec2[k] = round(image.mesh[index + k] + p.b1*temp_vec1[k] + p.b2*temp_vec2[k], error_count); } std::swap(temp_vec1, temp_vec2); @@ -748,12 +650,12 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& image,float lambda,float //initialization for (int64_t k = y_num - 1; k >= 0; --k) { //y(N) - image.mesh[jxnumynum + (x_num - 1)*y_num + k] = round(temp_vec4[k]*norm_factor, error_count); + image.mesh[jxnumynum + (x_num - 1)*y_num + k] = round(temp_vec4[k]*p.norm_factor, error_count); } for (int64_t k = y_num - 1; k >= 0; --k) { //y(N-1) - image.mesh[jxnumynum + (x_num - 2)*y_num + k] = round(temp_vec3[k]*norm_factor, error_count); + image.mesh[jxnumynum + (x_num - 2)*y_num + k] = round(temp_vec3[k]*p.norm_factor, error_count); } //main loop @@ -764,8 +666,8 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& 
image,float lambda,float #pragma omp simd #endif for (int64_t k = y_num - 1; k >= 0; k--){ - float temp = (image.mesh[index + k] + b1*temp_vec3[ k]+ b2*temp_vec4[ k]); - image.mesh[index + k] = round(temp*norm_factor, error_count); + float temp = (image.mesh[index + k] + p.b1*temp_vec3[ k]+ p.b2*temp_vec4[ k]); + image.mesh[index + k] = round(temp*p.norm_factor, error_count); temp_vec4[k] = temp_vec3[k]; temp_vec3[k] = temp; } @@ -813,8 +715,7 @@ void ComputeGradient::calc_inv_bspline_y(PixelData& input){ } //LHS boundary condition - input.mesh[j*x_num*y_num + i*y_num] = a2*temp_vec[0]; - input.mesh[j*x_num*y_num + i*y_num] += (a1+a3)*temp_vec[1]; + input.mesh[j*x_num*y_num + i*y_num] = a1*temp_vec[1] + a2*temp_vec[0] + a3 * temp_vec[1]; for (int64_t k = 1; k < (y_num-1);k++){ const int64_t idx = j * x_num * y_num + i * y_num + k; @@ -822,8 +723,7 @@ void ComputeGradient::calc_inv_bspline_y(PixelData& input){ } //RHS boundary condition - input.mesh[j*x_num*y_num + i*y_num + y_num - 1] = (a1+a3)*temp_vec[y_num - 2]; - input.mesh[j*x_num*y_num + i*y_num + y_num - 1] += a2*temp_vec[y_num - 1]; + input.mesh[j*x_num*y_num + i*y_num + y_num - 1] = a1*temp_vec[y_num - 2] + a2*temp_vec[y_num - 1] + a3*temp_vec[y_num - 2]; } } } @@ -1015,11 +915,15 @@ void ComputeGradient::calc_bspline_fd_ds_mag(const PixelData &input, PixelDat //compute the boundary values if (y_num >= 2) { - temp[0] = sqrt(pow((right[0] - left[0]) / (2 * hx), 2.0) + pow((down[0] - up[0]) / (2 * hz), 2.0) + - pow((center[1] - center[0 /* boundary */]) / (2 * hy), 2.0)); - temp[y_num - 1] = sqrt(pow((right[y_num - 1] - left[y_num - 1]) / (2 * hx), 2.0) + - pow((down[y_num - 1] - up[y_num - 1]) / (2 * hz), 2.0) + - pow((center[y_num - 1 /* boundary */] - center[y_num - 2]) / (2 * hy), 2.0)); + float dx = (right[0] - left[0]) / (2 * hx); + float dz = (down[0] - up[0]) / (2 * hz); + float dy = (center[1] - center[0 /* boundary */]) / (2 * hy); + temp[0] = sqrtf(dx*dx + dz*dz + dy*dy); + + dx = (right[y_num - 
1] - left[y_num - 1]) / (2 * hx); + dz = (down[y_num - 1] - up[y_num - 1]) / (2 * hz); + dy = (center[y_num - 1 /* boundary */] - center[y_num - 2]) / (2 * hy); + temp[y_num - 1] = sqrtf(dx*dx + dz*dz + dy*dy); } else { temp[0] = 0; // same values minus same values in x/y/z } @@ -1029,8 +933,10 @@ void ComputeGradient::calc_bspline_fd_ds_mag(const PixelData &input, PixelDat #pragma omp simd #endif for (size_t y = 1; y < y_num - 1; ++y) { - temp[y] = sqrt(pow((right[y] - left[y]) / (2 * hx), 2.0) + pow((down[y] - up[y]) / (2 * hz), 2.0) + - pow((center[y + 1] - center[y - 1]) / (2 * hy), 2.0)); + float dx = (right[y] - left[y]) / (2 * hx); + float dz = (down[y] - up[y]) / (2 * hz); + float dy = (center[y + 1] - center[y - 1]) / (2 * hy); + temp[y] = sqrtf(dx*dx + dz*dz + dy*dy); } // Set as a downsampled gradient maximum from 2x2x2 gradient cubes diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index cf636d5f..c4f0e849 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -1,28 +1,30 @@ -#include "ComputeGradientCuda.hpp" -#include "APRParameters.hpp" #include -#include +#include +#include +#include #include -#include +#include "ComputeGradientCuda.hpp" +#include "APRParameters.hpp" #include "data_structures/Mesh/PixelData.hpp" -#include "dsGradient.cuh" - -#include "invBspline.cuh" -#include -#include -#include "bsplineXdir.cuh" -#include "bsplineYdir.cuh" -#include "bsplineZdir.cuh" #include "data_structures/Mesh/downsample.cuh" #include "algorithm/ComputePullingScheme.cuh" -#include "algorithm/LocalIntensityScaleCuda.h" #include "algorithm/LocalIntensityScale.cuh" #include "misc/CudaTools.cuh" #include "misc/CudaMemory.cuh" -#include -#include +#include "algorithm/ParticleCellTreeCuda.cuh" +#include "algorithm/PullingSchemeCuda.hpp" +#include "data_structures/APR/access/LinearAccessCuda.hpp" + +#include "dsGradient.cuh" +#include "invBspline.cuh" +#include "bsplineParams.h" 
+#include "bsplineXdir.cuh" +#include "bsplineYdir.cuh" +#include "bsplineZdir.cuh" + + namespace { typedef struct { @@ -36,47 +38,52 @@ namespace { float norm_factor; } BsplineParams; + struct BsplineParamsCudaMemoryHandlers { + ScopedCudaMemHandler bc1; + ScopedCudaMemHandler bc2; + ScopedCudaMemHandler bc3; + ScopedCudaMemHandler bc4; + }; + float impulse_resp(float k, float rho, float omg) { // Impulse Response Function - return (pow(rho, (std::abs(k))) * sin((std::abs(k) + 1) * omg)) / sin(omg); + return (powf(rho, (std::abs(k))) * sinf((std::abs(k) + 1) * omg)) / sinf(omg); } float impulse_resp_back(float k, float rho, float omg, float gamma, float c0) { // Impulse Response Function (nominator eq. 4.8, denominator from eq. 4.7) - return c0 * pow(rho, std::abs(k)) * (cos(omg * std::abs(k)) + gamma * sin(omg * std::abs(k))) * - (1.0 / (pow((1 - 2.0 * rho * cos(omg) + pow(rho, 2)), 2))); + return c0 * powf(rho, std::abs(k)) * (cosf(omg * std::abs(k)) + gamma * sinf(omg * std::abs(k))) * + (1.0 / (powf((1 - 2.0 * rho * cosf(omg) + powf(rho, 2)), 2))); } - template - BsplineParams prepareBsplineStuff(const PixelData &image, float lambda, float tol, int maxFilterLen = -1) { + BsplineParams prepareBsplineStuff(size_t dimLen, float lambda, float tol, int maxFilterLen = -1) { + // Recursive Filter Implimentation for Smoothing BSplines // B-Spline Signal Processing: Part II - Efficient Design and Applications, Unser 1993 - float xi = 1 - 96 * lambda + 24 * lambda * sqrt(3 + 144 * lambda); // eq 4.6 - float rho = (24 * lambda - 1 - sqrt(xi)) / (24 * lambda) * - sqrt((1 / xi) * (48 * lambda + 24 * lambda * sqrt(3 + 144 * lambda))); // eq 4.5 - float omg = atan(sqrt((1 / xi) * (144 * lambda - 1))); // eq 4.6 + float xi = 1 - 96 * lambda + 24 * lambda * sqrtf(3 + 144 * lambda); // eq 4.6 + float rho = (24 * lambda - 1 - sqrtf(xi)) / (24 * lambda) * + sqrtf((1 / xi) * (48 * lambda + 24 * lambda * sqrtf(3 + 144 * lambda))); // eq 4.5 - float c0 = (1 + pow(rho, 2)) / (1 - 
pow(rho, 2)) * (1 - 2 * rho * cos(omg) + pow(rho, 2)) / - (1 + 2 * rho * cos(omg) + pow(rho, 2)); // eq 4.8 - float gamma = (1 - pow(rho, 2)) / (1 + pow(rho, 2)) * (1 / tan(omg)); // eq 4.8 + float omg = atan(sqrtf((1 / xi) * (144 * lambda - 1))); // eq 4.6 - const float b1 = 2 * rho * cos(omg); - const float b2 = -pow(rho, 2.0); + float c0 = (1 + powf(rho, 2)) / (1 - powf(rho, 2)) * (1 - 2 * rho * cosf(omg) + powf(rho, 2)) / + (1 + 2 * rho * cosf(omg) + powf(rho, 2)); // eq 4.8 + float gamma = (1 - powf(rho, 2)) / (1 + powf(rho, 2)) * (1 / tan(omg)); // eq 4.8 - const size_t idealK0Len = ceil(std::abs(log(tol) / log(rho))); - const size_t minDimension = std::min(image.z_num, std::min(image.x_num, image.y_num)); - const size_t k0 = maxFilterLen > 0 ? maxFilterLen : std::min(idealK0Len, minDimension); + const float b1 = 2 * rho * cosf(omg); + const float b2 = -powf(rho, 2.0); - const float norm_factor = pow((1 - 2.0 * rho * cos(omg) + pow(rho, 2)), 2); - std::cout << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 - << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl; + const size_t idealK0Len = ceil(std::abs(logf(tol) / logf(rho))); + const size_t k0 = maxFilterLen > 0 ? maxFilterLen : idealK0Len; + const size_t minLen = maxFilterLen > 0 ? 
maxFilterLen : std::min(idealK0Len, dimLen); - // ------- Calculating boundary conditions + const float norm_factor = powf((1 - 2.0 * rho * cosf(omg) + powf(rho, 2)), 2); + +// std::cout << std::fixed << std::setprecision(9) << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 +// << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << " lambda=" << lambda << " tol=" << tol << std::endl; - // forward boundaries - std::vector impulse_resp_vec_f(k0 + 1); - for (size_t k = 0; k < impulse_resp_vec_f.size(); ++k) impulse_resp_vec_f[k] = impulse_resp(k, rho, omg); + // ------- Calculating boundary conditions size_t boundaryLen = sizeof(float) * k0; PinnedMemoryUniquePtr bc1{(float*)getPinnedMemory(boundaryLen)}; @@ -84,11 +91,19 @@ namespace { PinnedMemoryUniquePtr bc3{(float*)getPinnedMemory(boundaryLen)}; PinnedMemoryUniquePtr bc4{(float*)getPinnedMemory(boundaryLen)}; + // forward boundaries + std::vector impulse_resp_vec_f(k0 + 1); + for (size_t k = 0; k < impulse_resp_vec_f.size(); ++k) impulse_resp_vec_f[k] = impulse_resp(k, rho, omg); + //y(0) init for (size_t k = 0; k < k0; ++k) bc1[k] = impulse_resp_vec_f[k]; + for (size_t k = minLen; k < k0; ++k) bc1[minLen - 1] += bc1[k]; + //y(1) init + for (size_t k = 0; k < k0; ++k) bc2[k] = 0; bc2[1] = impulse_resp_vec_f[0]; for (size_t k = 0; k < k0; ++k) bc2[k] += impulse_resp_vec_f[k + 1]; + for (size_t k = minLen; k < k0; ++k) bc2[minLen - 1] += bc2[k]; // backward boundaries std::vector impulse_resp_vec_b(k0 + 1); @@ -96,11 +111,16 @@ namespace { impulse_resp_vec_b[k] = impulse_resp_back(k, rho, omg, gamma, c0); //y(N-1) init + for (size_t k = 0; k < k0; ++k) bc3[k] = 0; bc3[0] = impulse_resp_vec_b[1]; for (size_t k = 0; k < (k0 - 1); ++k) bc3[k + 1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k + 2]; + for (size_t k = minLen; k < k0; ++k) bc3[minLen - 1] += bc3[k]; + //y(N) init + for (size_t k = 0; k < k0; ++k) bc4[k] = 0; bc4[0] = 
impulse_resp_vec_b[0]; for (size_t k = 1; k < k0; ++k) bc4[k] += 2 * impulse_resp_vec_b[k]; + for (size_t k = minLen; k < k0; ++k) bc4[minLen - 1] += bc4[k]; return BsplineParams{ std::move(bc1), @@ -113,72 +133,55 @@ namespace { norm_factor }; } -} -/** - * Thresholds output basing on input values. When input is <= thresholdLevel then output is set to 0 and is not changed otherwise. - * @param input - * @param output - * @param length - len of input/output arrays - * @param thresholdLevel - */ -template -__global__ void threshold(const T *input, S *output, size_t length, float thresholdLevel) { - size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; - if (idx < length) { - if (input[idx] <= thresholdLevel) { output[idx] = 0; } - } -} - -template -void runThreshold(ImgType *cudaImage, T *cudaGrad, size_t x_num, size_t y_num, size_t z_num, float Ip_th, cudaStream_t aStream) { - dim3 threadsPerBlock(64); - dim3 numBlocks((x_num * y_num * z_num + threadsPerBlock.x - 1)/threadsPerBlock.x); - threshold<<>>(cudaImage, cudaGrad, x_num * y_num * z_num, Ip_th); -}; - -/** - * Thresholds input array to have minimum thresholdLevel. 
- * @param input - * @param length - len of input/output arrays - * @param thresholdLevel - */ -template -__global__ void thresholdImg(T *input, size_t length, float thresholdLevel) { - size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; - if (idx < length) { - if (input[idx] < thresholdLevel) { input[idx] = thresholdLevel; } + auto transferSpline(BsplineParams &aParams, cudaStream_t aStream) { + ScopedCudaMemHandler bc1(aParams.bc1.get(), aParams.k0, aStream); + ScopedCudaMemHandler bc2(aParams.bc2.get(), aParams.k0, aStream); + ScopedCudaMemHandler bc3(aParams.bc3.get(), aParams.k0, aStream); + ScopedCudaMemHandler bc4(aParams.bc4.get(), aParams.k0, aStream); + + return std::pair { + BsplineParamsCuda { + bc1.get(), + bc2.get(), + bc3.get(), + bc4.get(), + aParams.k0, + aParams.b1, + aParams.b2, + aParams.norm_factor + }, + + BsplineParamsCudaMemoryHandlers { + std::move(bc1), + std::move(bc2), + std::move(bc3), + std::move(bc4) + } + }; } } -template -void runThresholdImg(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, float Ip_th_offset, cudaStream_t aStream) { - dim3 threadsPerBlock(64); - dim3 numBlocks((x_num * y_num * z_num + threadsPerBlock.x - 1) / threadsPerBlock.x); - thresholdImg<<< numBlocks, threadsPerBlock, 0, aStream >>> (cudaImage, x_num * y_num * z_num, Ip_th_offset); -}; - template void getGradientCuda(const PixelData &image, PixelData &local_scale_temp, ImgType *cudaImage, ImgType *cudaGrad, float *cudalocal_scale_temp, - BsplineParams &p, float *bc1, float *bc2, float *bc3, float *bc4, float *boundary, + BsplineParamsCuda &px, BsplineParamsCuda &py, BsplineParamsCuda &pz, float *boundary, float bspline_offset, const APRParameters &par, cudaStream_t aStream) { - runThresholdImg(cudaImage, image.x_num, image.y_num, image.z_num, par.Ip_th + bspline_offset, aStream); + // TODO: Used PixelDataDim in all methods below and change input parameter from image to imageDim - runBsplineYdir(cudaImage, image.x_num, image.y_num, image.z_num, 
bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, boundary, aStream); - runBsplineXdir(cudaImage, image.x_num, image.y_num, image.z_num, bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); - runBsplineZdir(cudaImage, image.x_num, image.y_num, image.z_num, bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); + if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream); + if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, aStream); + if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream); - runKernelGradient(cudaImage, cudaGrad, image.x_num, image.y_num, image.z_num, local_scale_temp.x_num, local_scale_temp.y_num, par.dx, par.dy, par.dz, aStream); - runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream); + runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); - runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream); - runThreshold(cudalocal_scale_temp, cudaGrad, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, par.Ip_th, aStream); + if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, 
local_scale_temp.y_num, local_scale_temp.z_num, aStream); } class CurrentTime { @@ -199,6 +202,50 @@ public: } }; + +/** + * Thresholds output basing on input values. When input is <= thresholdLevel then output is set to 0 and is not changed otherwise. + * @param input + * @param output + * @param length - len of input/output arrays + * @param thresholdLevel + */ +template +__global__ void threshold(const T *input, S *output, size_t length, float thresholdLevel) { + size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; + if (idx < length) { + if (input[idx] <= thresholdLevel) { output[idx] = 0; } + } +} + +template +void runThreshold(ImgType *cudaImage, T *cudaGrad, size_t x_num, size_t y_num, size_t z_num, float Ip_th, cudaStream_t aStream) { + dim3 threadsPerBlock(64); + dim3 numBlocks((x_num * y_num * z_num + threadsPerBlock.x - 1)/threadsPerBlock.x); + threshold<<>>(cudaImage, cudaGrad, x_num * y_num * z_num, Ip_th); +}; + +template +__global__ void rescaleAndThreshold(T *data, size_t len, float sigmaThreshold, float sigmaThresholdMax) { + const float max_th = 60000.0; + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (idx < len) { + float rescaled = data[idx]; + if (rescaled < sigmaThreshold) { + rescaled = (rescaled < sigmaThresholdMax) ? 
max_th : sigmaThreshold; + } + data[idx] = rescaled; + } +} + +template +void runRescaleAndThreshold(T *data, size_t len, float sigma, float sigmaMax, cudaStream_t aStream) { + dim3 threadsPerBlock(64); + dim3 numBlocks((len + threadsPerBlock.x - 1) / threadsPerBlock.x); + rescaleAndThreshold <<< numBlocks, threadsPerBlock, 0, aStream >>> (data, len, sigma, sigmaMax); +} + + template template class GpuProcessingTask::GpuProcessingTaskImpl { @@ -207,6 +254,7 @@ class GpuProcessingTask::GpuProcessingTaskImpl { const PixelData &iCpuImage; PixelData &iCpuLevels; const APRParameters &iParameters; + GenInfo iAprInfo; float iBsplineOffset; int iMaxLevel; @@ -227,6 +275,11 @@ class GpuProcessingTask::GpuProcessingTaskImpl { const size_t boundaryLen; ScopedCudaMemHandler boundary; + ParticleCellTreeCuda pctc; + + ScopedCudaMemHandler y_vec; // for LinearAccess + LinearAccessCudaStructs lacs; + /** * @return newly created stream */ @@ -238,61 +291,90 @@ class GpuProcessingTask::GpuProcessingTaskImpl { public: - GpuProcessingTaskImpl(const PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) : - iCpuImage(image), + // TODO: Remove need for passing 'levels' to GpuProcessingTask + // It was used during development to control internal computation like filters, gradient, levels etc. 
but + // once all is done there is no need for it anymore + GpuProcessingTaskImpl(const PixelData &inputImage, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) : + iCpuImage(inputImage), iCpuLevels(levels), iStream(getStream()), - image (image, iStream), + image (inputImage, iStream), gradient (levels, iStream), local_scale_temp (levels, iStream), local_scale_temp2 (levels, iStream), iParameters(parameters), + iAprInfo(iCpuImage.getDimension()), iBsplineOffset(bspline_offset), iMaxLevel(maxLevel), - params(prepareBsplineStuff(image, parameters.lambda, tolerance)), - bc1(params.bc1.get(), params.k0, iStream), - bc2(params.bc2.get(), params.k0, iStream), - bc3(params.bc3.get(), params.k0, iStream), - bc4(params.bc4.get(), params.k0, iStream), - boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)image.x_num * (size_t)image.z_num}, - boundary{nullptr, boundaryLen, iStream} + // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. + // Should be fixed when other parts of pipeline are ready. 
+// params(prepareBsplineStuff((size_t)inputImage.x_num, parameters.lambda, tolerance)), +// bc1(params.bc1.get(), params.k0, iStream), +// bc2(params.bc2.get(), params.k0, iStream), +// bc3(params.bc3.get(), params.k0, iStream), +// bc4(params.bc4.get(), params.k0, iStream), + boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num}, + boundary{nullptr, boundaryLen, iStream}, + pctc(iAprInfo, iStream), + y_vec(nullptr, iAprInfo.getSize(), iStream) { // std::cout << "\n=============== GpuProcessingTaskImpl ===================\n\n"; - std::cout << iCpuImage << std::endl; - std::cout << iCpuLevels << std::endl; - std::cout << "\n\n\n"; - +// std::cout << iCpuImage << std::endl; +// std::cout << iCpuLevels << std::endl; } void sendDataToGpu() { - CurrentTime ct; - uint64_t start = ct.microseconds(); +// CurrentTime ct; +// uint64_t start = ct.microseconds(); image.copyH2D(); - std::cout << "SEND time: " << ct.microseconds() - start << std::endl; +// checkCuda(cudaStreamSynchronize(iStream)); +// std::cout << "SEND time: " << ct.microseconds() - start << std::endl; } - void getDataFromGpu() { - CurrentTime ct; - uint64_t start = ct.microseconds(); - local_scale_temp.copyD2H(); - cudaStreamSynchronize(iStream); - std::cout << "RCV time: " << ct.microseconds() - start << std::endl; + LinearAccessCudaStructs getDataFromGpu() { +// CurrentTime ct; +// uint64_t start = ct.microseconds(); +// local_scale_temp.copyD2H(); +// checkCuda(cudaStreamSynchronize(iStream)); +// std::cout << "RCV time: " << ct.microseconds() - start << std::endl; + return std::move(lacs); } void processOnGpu() { CurrentTime ct; uint64_t start = ct.microseconds(); + + // TODO: temporarily bspline params are generated here + // In principle this is OK and correct but would be faster (for processing series of same size images) if + // they would be calculated in constructor of GpuProcessingTaskImpl class (once). 
+ BsplineParams px = prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance); + auto cudax = transferSpline(px, iStream); + auto splineCudaX = cudax.first; + BsplineParams py = prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance); + auto cuday = transferSpline(py, iStream); + auto splineCudaY = cuday.first; + BsplineParams pz = prepareBsplineStuff(iCpuImage.z_num, iParameters.lambda, tolerance); + auto cudaz = transferSpline(pz, iStream); + auto splineCudaZ = cudaz.first; + getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), - params, bc1.get(), bc2.get(), bc3.get(), bc4.get(), boundary.get(), + splineCudaX, splineCudaY, splineCudaZ, boundary.get(), iBsplineOffset, iParameters, iStream); - std::cout << "1: " << ct.microseconds() - start << std::endl; runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream); - std::cout << "2: " << ct.microseconds() - start << std::endl; + + // Apply parameters from APRConverter: + runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream); + runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream); + runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream); + // TODO: automatic parameters are not implemented for GPU pipeline (yet) + float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz)); float level_factor = pow(2, iMaxLevel) * min_dim; const float mult_const = level_factor/iParameters.rel_error; runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream); - std::cout << "3: " << ct.microseconds() - start << std::endl; + + computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream); + 
computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream); } ~GpuProcessingTaskImpl() { @@ -302,11 +384,11 @@ public: }; template -GpuProcessingTask::GpuProcessingTask(PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) -: impl{new GpuProcessingTaskImpl(image, levels, parameters, bspline_offset, maxLevel)} {std::cout << "GpuProcessingTask\n";} +GpuProcessingTask::GpuProcessingTask(const PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) +: impl{new GpuProcessingTaskImpl(image, levels, parameters, bspline_offset, maxLevel)} { } template -GpuProcessingTask::~GpuProcessingTask() {std::cout << "~GpuProcessingTask\n";} +GpuProcessingTask::~GpuProcessingTask() { } template GpuProcessingTask::GpuProcessingTask(GpuProcessingTask&&) = default; @@ -315,18 +397,11 @@ template void GpuProcessingTask::sendDataToGpu() {impl->sendDataToGpu();} template -void GpuProcessingTask::getDataFromGpu() {impl->getDataFromGpu();} +LinearAccessCudaStructs GpuProcessingTask::getDataFromGpu() {return impl->getDataFromGpu();} template void GpuProcessingTask::processOnGpu() {impl->processOnGpu();} -template -void GpuProcessingTask::doAll() { - sendDataToGpu(); - processOnGpu(); - getDataFromGpu(); -} - // explicit instantiation of handled types template class GpuProcessingTask; template class GpuProcessingTask; @@ -336,29 +411,39 @@ template class GpuProcessingTask; // explicit instantiation of handled types template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); +template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); +template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); +template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); + + + template void cudaFilterBsplineFull(PixelData &input, float lambda, float tolerance, 
TypeOfRecBsplineFlags flags, int maxFilterLen) { cudaStream_t aStream = 0; - BsplineParams p = prepareBsplineStuff(input, lambda, tolerance, maxFilterLen); - ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); - ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); - ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); - ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); - ScopedCudaMemHandler, D2H | H2D> cudaInput(input); + ScopedCudaMemHandler, D2H | H2D> cudaInput(input, aStream); - APRTimer timer(true); + APRTimer timer(false); timer.start_timer("GpuDeviceTimeFull"); if (flags & BSPLINE_Y_DIR) { + BsplineParams p = prepareBsplineStuff((size_t)input.y_num, lambda, tolerance, maxFilterLen); + auto cuda = transferSpline(p, aStream); + auto splineCuda = cuda.first; int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * input.x_num * input.z_num; - ScopedCudaMemHandler boundary(nullptr, boundaryLen); // allocate memory on device - runBsplineYdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, boundary.get(), aStream); + ScopedCudaMemHandler boundary(nullptr, boundaryLen, aStream); // allocate memory on device + runBsplineYdir(cudaInput.get(), input.getDimension(), splineCuda, boundary.get(), aStream); } if (flags & BSPLINE_X_DIR) { - runBsplineXdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream); + BsplineParams p = prepareBsplineStuff((size_t)input.x_num, lambda, tolerance, maxFilterLen); + auto cuda = transferSpline(p, aStream); + auto splineCuda = cuda.first; + runBsplineXdir(cudaInput.get(), input.getDimension(), splineCuda, aStream); } if (flags & BSPLINE_Z_DIR) { - runBsplineZdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream); + BsplineParams p = prepareBsplineStuff((size_t)input.z_num, lambda, 
tolerance, maxFilterLen); + auto cuda = transferSpline(p, aStream); + auto splineCuda = cuda.first; + runBsplineZdir(cudaInput.get(), input.getDimension(), splineCuda, aStream); } timer.stop_timer(); } @@ -367,16 +452,18 @@ void cudaFilterBsplineFull(PixelData &input, float lambda, float tolera template void cudaInverseBspline(PixelData &, TypeOfInvBsplineFlags); template void cudaInverseBspline(PixelData &input, TypeOfInvBsplineFlags flags) { - ScopedCudaMemHandler, H2D | D2H> cudaInput(input); + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D | D2H> cudaInput(input, aStream); if (flags & INV_BSPLINE_Y_DIR) { - runInvBsplineYdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, 0); + runInvBsplineYdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, aStream); } if (flags & INV_BSPLINE_X_DIR) { - runInvBsplineXdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, 0); + runInvBsplineXdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, aStream); } if (flags & INV_BSPLINE_Z_DIR) { - runInvBsplineZdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, 0); + runInvBsplineZdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, aStream); } } @@ -384,62 +471,59 @@ void cudaInverseBspline(PixelData &input, TypeOfInvBsplineFlags flags) template void computeLevelsCuda(const PixelData &, PixelData &, int, float, float, float, float); template void computeLevelsCuda(const PixelData &grad_temp, PixelData &local_scale_temp, int maxLevel, float relError, float dx, float dy, float dz) { - ScopedCudaMemHandler, H2D> cudaGrad(grad_temp); - ScopedCudaMemHandler, D2H | H2D> cudaLis(local_scale_temp); + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D> cudaGrad(grad_temp, aStream); + ScopedCudaMemHandler, D2H | H2D> cudaLis(local_scale_temp, aStream); float min_dim = std::min(dy, std::min(dx, dz)); float level_factor = pow(2, maxLevel) * min_dim; const float mult_const = level_factor/relError; - cudaStream_t aStream = 0; 
runComputeLevels(cudaGrad.get(), cudaLis.get(), grad_temp.mesh.size(), mult_const, aStream); } // explicit instantiation of handled types template void getGradient(PixelData &, PixelData &, PixelData &, PixelData &, float, const APRParameters &); +template void getGradient(PixelData &, PixelData &, PixelData &, PixelData &, float, const APRParameters &); + template void getGradient(PixelData &image, PixelData &grad_temp, PixelData &local_scale_temp, PixelData &local_scale_temp2, float bspline_offset, const APRParameters &par) { - ScopedCudaMemHandler, D2H | H2D> cudaImage(image); - ScopedCudaMemHandler, D2H | H2D> cudaGrad(grad_temp); - ScopedCudaMemHandler, D2H> cudalocal_scale_temp(local_scale_temp); - ScopedCudaMemHandler, D2H> cudalocal_scale_temp2(local_scale_temp2); - - float tolerance = 0.0001; - BsplineParams p = prepareBsplineStuff(image, par.lambda, tolerance); + cudaStream_t aStream = 0; + ScopedCudaMemHandler, D2H | H2D> cudaImage(image, aStream); + ScopedCudaMemHandler, D2H | H2D> cudaGrad(grad_temp, aStream); + ScopedCudaMemHandler, D2H> cudalocal_scale_temp(local_scale_temp, aStream); + ScopedCudaMemHandler, D2H> cudalocal_scale_temp2(local_scale_temp2, aStream); - ScopedCudaMemHandler bc1 (p.bc1.get(), p.k0); - ScopedCudaMemHandler bc2 (p.bc2.get(), p.k0); - ScopedCudaMemHandler bc3 (p.bc3.get(), p.k0); - ScopedCudaMemHandler bc4 (p.bc4.get(), p.k0); int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * image.x_num * image.z_num; - ScopedCudaMemHandler boundary(nullptr, boundaryLen); + ScopedCudaMemHandler boundary(nullptr, boundaryLen, aStream); - getGradientCuda(image, local_scale_temp, cudaImage.get(), cudaGrad.get(), cudalocal_scale_temp.get(), - p, bc1.get(), bc2.get(), bc3.get(), bc4.get(), boundary.get(), - bspline_offset, par, 0); -} + float tolerance = 0.0001; -// explicit instantiation of handled types -template void thresholdImg(PixelData &, const float); -template -void thresholdImg(PixelData &image, const float 
threshold) { - ScopedCudaMemHandler, H2D | D2H> cudaImage(image); + // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. + // Should be fixed when other parts of pipeline are ready. - runThresholdImg(cudaImage.get(), image.x_num, image.y_num, image.z_num, threshold, 0); -} + // FIX BSPLINE PARAMS !!!!!!!! to get full gradient pipeline test working !!!!!!!!!!!!!!!!!!!!!!!!!1 -// explicit instantiation of handled types -template void thresholdGradient(PixelData &, const PixelData &, const float); -template -void thresholdGradient(PixelData &output, const PixelData &input, const float Ip_th) { - ScopedCudaMemHandler, H2D> cudaInput(input); - ScopedCudaMemHandler, H2D | D2H> cudaOutput(output); - runThreshold(cudaInput.get(), cudaOutput.get(), input.x_num, input.y_num, input.z_num, Ip_th, 0); + BsplineParams px = prepareBsplineStuff(image.x_num, par.lambda, tolerance); + auto cudax = transferSpline(px, aStream); + auto splineCudaX = cudax.first; + BsplineParams py = prepareBsplineStuff(image.y_num, par.lambda, tolerance); + auto cuday = transferSpline(py, aStream); + auto splineCudaY = cuday.first; + BsplineParams pz = prepareBsplineStuff(image.z_num, par.lambda, tolerance); + auto cudaz = transferSpline(pz, aStream); + auto splineCudaZ = cudaz.first; + + getGradientCuda(image, local_scale_temp, cudaImage.get(), cudaGrad.get(), cudalocal_scale_temp.get(), + splineCudaX, splineCudaY, splineCudaZ, boundary.get(), bspline_offset, par, aStream); } void cudaDownsampledGradient(PixelData &input, PixelData &grad, const float hx, const float hy, const float hz) { - ScopedCudaMemHandler, H2D | D2H> cudaInput(input); - ScopedCudaMemHandler, D2H> cudaGrad(grad); + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D | D2H> cudaInput(input, aStream); + ScopedCudaMemHandler, D2H> cudaGrad(grad, aStream); - runKernelGradient(cudaInput.get(), cudaGrad.get(), input.x_num, input.y_num, input.z_num, grad.x_num, grad.y_num, 
hx, hy, hz, 0); + runKernelGradient(cudaInput.get(), cudaGrad.get(), input.getDimension(), grad.getDimension(), hx, hy, hz, aStream); } diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp index 36bb70b1..837d29f5 100644 --- a/src/algorithm/ComputeGradientCuda.hpp +++ b/src/algorithm/ComputeGradientCuda.hpp @@ -7,7 +7,7 @@ #include "data_structures/Mesh/PixelData.hpp" #include "algorithm/APRParameters.hpp" - +#include "data_structures/APR/access/LinearAccessCuda.hpp" // Test helpers and definitions using TypeOfRecBsplineFlags = uint16_t; @@ -32,10 +32,6 @@ template void computeLevelsCuda(const PixelData &grad_temp, PixelData &local_scale_temp, int maxLevel, float relError, float dx = 1, float dy = 1, float dz = 1); template void getGradient(PixelData &image, PixelData &grad_temp, PixelData &local_scale_temp, PixelData &local_scale_temp2, float bspline_offset, const APRParameters &par); -template -void thresholdImg(PixelData &image, const float threshold); -template -void thresholdGradient(PixelData &output, const PixelData &input, const float Ip_th); void cudaDownsampledGradient(PixelData &input, PixelData &grad, const float hx, const float hy, const float hz); template @@ -46,14 +42,13 @@ class GpuProcessingTask { public: - GpuProcessingTask(PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel); + GpuProcessingTask(const PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel); ~GpuProcessingTask(); GpuProcessingTask(GpuProcessingTask&&); void sendDataToGpu(); - void getDataFromGpu(); + LinearAccessCudaStructs getDataFromGpu(); void processOnGpu(); - void doAll(); }; #endif //LIBAPR_COMPUTEGRADIENTCUDA_HPP diff --git a/src/algorithm/ComputePullingScheme.cuh b/src/algorithm/ComputePullingScheme.cuh index 28450f30..51b88143 100644 --- a/src/algorithm/ComputePullingScheme.cuh +++ b/src/algorithm/ComputePullingScheme.cuh @@ -9,8 +9,13 @@ 
template __global__ void computeLevels(const T *grad, float *lis, size_t len, float mult_const) { size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { - //divide gradient magnitude by Local Intensity Scale (first step in calculating the Local Resolution Estimate L(y), minus constants) - uint32_t d = (grad[idx] / lis[idx]) * mult_const; + // divide gradient magnitude by Local Intensity Scale (first step in calculating the Local Resolution Estimate L(y), minus constants) + // TODO: This part is using a "trick" to convert first to int and then to uint32_t + // Without that some numbers on CPU and GPU are converted to different values... + // For example -6507.28 without conversion to int is converted to 0 but in CPU we got huge value. + // Anyway - both CPU & GPU sides should be checked and maybe some better way of it should be + // used - currently we've got undefined result of such operation. + uint32_t d = (int)((grad[idx] / lis[idx]) * mult_const); //incorporate other factors and compute the level of the Particle Cell, effectively construct LPC L_n lis[idx] = (d == 0) ? 0 : 31 - __clz(d); // fast log2 } diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 5539baef..1593b5ab 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -11,24 +11,17 @@ //#include #include "misc/CudaTools.cuh" - +#include "data_structures/Mesh/paddPixelData.cuh" /** + * Calculates mean in Y direction * - * How it works along y-dir (let's suppose offset = 2 and number of workers = 8 for simplicity): - * - * image idx: 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 - * - * loop #1 - * workersIdx 0 1 2 3 4 5 6 7 - * loop #2 - * workersIdx 6 7 0 1 2 3 4 5 - * loop #3 - * workersIdx 4 5 6 7 0 1 2 3 - * .............. - * - * so #offset workers must wait in each loop to have next elements to sum - * + * NOTE: This is not optimal implementation but.. 
correct and more or less fast as previous one. + * The reason for change was to have results exactly same as in CPU side. + * Currently after reading whole y-dir line of data mean calculation is done only by one from all threads in block + * so here is some room for improvements. + * If needed may be optimized in future. The main limitation is size of shared memory needed which + * limits number of CUDA blocks that can run in parallel. * @tparam T * @param image * @param offset @@ -37,44 +30,117 @@ * @param z_num */ template -__global__ void meanYdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num) { +__global__ void meanYdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num, bool boundaryReflect) { // NOTE: Block size in x/z direction must be 1 const size_t workersOffset = (blockIdx.z * x_num + blockIdx.x) * y_num; const int numOfWorkers = blockDim.y; - const unsigned int active = __activemask(); const int workerIdx = threadIdx.y; + + extern __shared__ char sharedMemChar[]; + T *buffer = (T*) sharedMemChar; + T *data = (T*) &buffer[y_num]; + + // Read whole line of data from y-direction int workerOffset = workerIdx; + while (workerOffset < y_num) { + buffer[workerOffset] = image[workersOffset + workerOffset]; + workerOffset += numOfWorkers; + } + + const int divisor = 2 * offset + 1; + size_t currElementOffset = 0; + size_t saveElementOffset = 0; + size_t nextElementOffset = 1; + + if (workerIdx == 0) { + // clear shared mem + for (int i = offset; i < divisor; ++i) data[i] = 0; - int offsetInTheLoop = 0; - T sum = 0; - T v = 0; - bool waitForNextLoop = false; - int countNumOfSumElements = 1; - while(workerOffset < y_num) { - if (!waitForNextLoop) v = image[workersOffset + workerOffset]; - bool waitForNextValues = (workerIdx + offsetInTheLoop) % numOfWorkers >= (numOfWorkers - offset); - for (int off = 1; off <= offset; ++off) { - T prevElement = __shfl_sync(active, v, workerIdx + blockDim.y - off, blockDim.y); - T nextElement = 
__shfl_sync(active, v, workerIdx + off, blockDim.y); - // LHS boundary check + don't add previous values if they were added in a previous loop execution - if (workerOffset >= off && !waitForNextLoop) {sum += prevElement; ++countNumOfSumElements;} - // RHS boundary check + don't read next values since they are not read yet - if (!waitForNextValues && workerOffset + off < y_num) {sum += nextElement; ++countNumOfSumElements;} + // saturate cache with #offset elements since it will allow to calculate first element value on LHS + float sum = 0; + int count = 0; + while (count <= offset) { + T v = buffer[currElementOffset]; + sum += v; + data[count] = v; + if (boundaryReflect && count > 0) { + data[2 * offset - count + 1] = v; + sum += v; + } + currElementOffset += nextElementOffset; + ++count; } - waitForNextLoop = waitForNextValues; - if (!waitForNextLoop) { + + if (boundaryReflect) { + count += offset; // elements in above loop in range [1, offset] were summed twice + } + + // Pointer in circular buffer + int beginPtr = (offset + 1) % divisor; + + // main loop going through all elements in range [0, y_num - 1 - offset], so till last element that + // does not need handling RHS for offset '^' + // x x x x ... 
x x x x x x x + // o o ^ o o + // + const int lastElement = y_num - 1 - offset; + for (int y = 0; y <= lastElement; ++y) { + // Calculate and save currently processed element and move to the new one + buffer[saveElementOffset] = sum / count; + saveElementOffset += nextElementOffset; + + // There is no more elements to process in that loop, all stuff left to be processed is already in 'data' buffer + if (y == lastElement) break; + + // Read new element + T v = buffer[currElementOffset]; + + // Update sum to cover [-offset, offset] of currently processed element + sum -= data[beginPtr]; sum += v; - image[workersOffset + workerOffset] = sum / countNumOfSumElements; - // workere is done with current element - move to next one - sum = 0; - countNumOfSumElements = 1; - workerOffset += numOfWorkers; + // Store new element in circularBuffer + data[beginPtr] = v; + + // Move to next elements to read and in circular buffer + count = min(count + 1, divisor); + beginPtr = (beginPtr + 1) % divisor; + currElementOffset += nextElementOffset; + } + + // Handle last #offset elements on RHS + int boundaryPtr = (beginPtr - 1 - 1 + (2 * offset + 1)) % divisor; + + while (saveElementOffset < currElementOffset) { + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' and do not remove first element from moving filter + // since 'sum' of filter elements contains all elements from processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + // In such a case first 'o' element should not be removed when filter moves right. 
+ if (y_num - (currElementOffset - saveElementOffset) / nextElementOffset > offset || boundaryReflect) { + if (!boundaryReflect) count = count - 1; + sum -= data[beginPtr]; + } + + if (boundaryReflect) { + sum += data[boundaryPtr]; + boundaryPtr = (boundaryPtr - 1 + (2 * offset + 1)) % divisor; + } + + buffer[saveElementOffset] = sum / count; + beginPtr = (beginPtr + 1) % divisor; + saveElementOffset += nextElementOffset; } - offsetInTheLoop += offset; } -} + // Save whole line of data + workerOffset = workerIdx; + while (workerOffset < y_num) { + image[workersOffset + workerOffset] = buffer[workerOffset]; + workerOffset += numOfWorkers; + } +} constexpr int NumberOfWorkers = 32; // Cannot be greater than 32 since there is no inter-warp communication implemented. /** @@ -93,7 +159,7 @@ constexpr int NumberOfWorkers = 32; // Cannot be greater than 32 since there is * read/write operations for given element. */ template -__global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num) { +__global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num, bool boundaryReflect = false) { const size_t workerOffset = blockIdx.y * blockDim.y + threadIdx.y + (blockIdx.z * blockDim.z + threadIdx.z) * y_num * x_num; const int workerYoffset = blockIdx.y * blockDim.y + threadIdx.y ; const int workerIdx = threadIdx.y; @@ -113,43 +179,72 @@ __global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_ // saturate cache with #offset elements since it will allow to calculate first element value on LHS float sum = 0; int count = 0; - while (count < offset) { + while (count <= offset) { T v = image[workerOffset + currElementOffset]; sum += v; data[count][workerIdx] = v; + if (boundaryReflect && count > 0) {data[2 * offset - count + 1][workerIdx] = v; sum += v;} currElementOffset += nextElementOffset; ++count; } + if (boundaryReflect) { + count += offset; // elements in above loop in range [1, offset] were summed twice + 
} + // Pointer in circular buffer - int beginPtr = offset; + int beginPtr = (offset + 1) % divisor; + + // main loop going through all elements in range [0, x_num - 1 - offset], so till last element that + // does not need handling RHS for offset '^' + // x x x x ... x x x x x x x + // o o ^ o o + // + const int lastElement = x_num - 1 - offset; + for (int x = 0; x <= lastElement; ++x) { + // Calculate and save currently processed element and move to the new one + image[workerOffset + saveElementOffset] = sum / count; + saveElementOffset += nextElementOffset; + + // There is no more elements to process in that loop, all stuff left to be processed is already in 'data' buffer + if (x == lastElement) break; - // main loop going through all elements in range [0, x_num-offset) - for (int x = 0; x < x_num - offset; ++x) { // Read new element T v = image[workerOffset + currElementOffset]; // Update sum to cover [-offset, offset] of currently processed element - sum += v; sum -= data[beginPtr][workerIdx]; + sum += v; - // Save and move pointer + // Store new element in circularBuffer data[beginPtr][workerIdx] = v; - beginPtr = (beginPtr + 1) % divisor; - // Update count and save currently processed element + // Move to next elements to read and in circular buffer count = min(count + 1, divisor); - image[workerOffset + saveElementOffset] = sum / count; - - // Move to next elements + beginPtr = (beginPtr + 1) % divisor; currElementOffset += nextElementOffset; - saveElementOffset += nextElementOffset; } // Handle last #offset elements on RHS + int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; + while (saveElementOffset < currElementOffset) { - count = count - 1; - sum -= data[beginPtr][workerIdx]; + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' and do not remove first element from moving filter + // since 'sum' of filter elements contains all elements from processed dimension: + // dim elements: xxxxxx + // 
filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + // In such a case first 'o' element should not be removed when filter moves right. + if (x_num - (currElementOffset - saveElementOffset)/nextElementOffset > offset || boundaryReflect) { + if (!boundaryReflect) count = count - 1; + sum -= data[beginPtr][workerIdx]; + } + + if (boundaryReflect) { + sum += data[boundaryPtr][workerIdx]; + boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor; + } + image[workerOffset + saveElementOffset] = sum / count; beginPtr = (beginPtr + 1) % divisor; saveElementOffset += nextElementOffset; @@ -173,7 +268,7 @@ __global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_ * read/write operations for given element. */ template -__global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num) { +__global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num, bool boundaryReflect = false) { const size_t workerOffset = blockIdx.y * blockDim.y + threadIdx.y + (blockIdx.z * blockDim.z + threadIdx.z) * y_num; // *.z is 'x' const int workerYoffset = blockIdx.y * blockDim.y + threadIdx.y ; const int workerIdx = threadIdx.y; @@ -193,43 +288,72 @@ __global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_ // saturate cache with #offset elements since it will allow to calculate first element value on LHS float sum = 0; int count = 0; - while (count < offset) { + while (count <= offset) { T v = image[workerOffset + currElementOffset]; sum += v; data[count][workerIdx] = v; + if (boundaryReflect && count > 0) {data[2 * offset - count + 1][workerIdx] = v; sum += v;} currElementOffset += nextElementOffset; ++count; } + if (boundaryReflect) { + count += offset; // elements in above loop in range [1, offset] were summed twice + } + // Pointer in circular buffer - int beginPtr = offset; + int beginPtr = (offset + 1) % divisor; + + // main loop going through all elements in range [0, 
z_num - 1 - offset], so till last element that + // does not need handling RHS for offset '^' + // x x x x ... x x x x x x x + // o o ^ o o + // + const int lastElement = z_num - 1 - offset; + for (int z = 0; z <= lastElement; ++z) { + // Calculate and save currently processed element and move to the new one + image[workerOffset + saveElementOffset] = sum / count; + saveElementOffset += nextElementOffset; + + // There is no more elements to process in that loop, all stuff left to be processed is already in 'data' buffer + if (z == lastElement) break; - // main loop going through all elements in range [0, z_num-offset) - for (int z = 0; z < z_num - offset; ++z) { // Read new element T v = image[workerOffset + currElementOffset]; // Update sum to cover [-offset, offset] of currently processed element - sum += v; sum -= data[beginPtr][workerIdx]; + sum += v; - // Save and move pointer + // Store new element in circularBuffer data[beginPtr][workerIdx] = v; - beginPtr = (beginPtr + 1) % divisor; - // Update count and save currently processed element + // Move to next elements to read and in circular buffer count = min(count + 1, divisor); - image[workerOffset + saveElementOffset] = sum / count; - - // Move to next elements + beginPtr = (beginPtr + 1) % divisor; currElementOffset += nextElementOffset; - saveElementOffset += nextElementOffset; } // Handle last #offset elements on RHS + int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; + while (saveElementOffset < currElementOffset) { - count = count - 1; - sum -= data[beginPtr][workerIdx]; + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' and do not remove first element from moving filter + // since 'sum' of filter elements contains all elements from processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + // In such a case first 'o' element should not be removed when filter moves right. 
+ if (z_num - (currElementOffset - saveElementOffset)/nextElementOffset > offset || boundaryReflect) { + if (!boundaryReflect) count = count - 1; + sum -= data[beginPtr][workerIdx]; + } + + if (boundaryReflect) { + sum += data[boundaryPtr][workerIdx]; + boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor; + } + image[workerOffset + saveElementOffset] = sum / count; beginPtr = (beginPtr + 1) % divisor; saveElementOffset += nextElementOffset; @@ -238,48 +362,49 @@ __global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_ } template -void runMeanYdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) { +void runMeanYdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream, bool boundaryReflect) { dim3 threadsPerBlock(1, NumberOfWorkers, 1); dim3 numBlocks((x_num + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (z_num + threadsPerBlock.z - 1)/threadsPerBlock.z); - meanYdir<<>>(cudaImage, offset, x_num, y_num, z_num); + const int sharedMemorySize = sizeof(T) * y_num + (offset * 2 + 1) * sizeof(float); + meanYdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); } template -void runMeanXdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) { +void runMeanXdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream, bool boundaryReflect) { dim3 threadsPerBlock(1, NumberOfWorkers, 1); dim3 numBlocks(1, (y_num + threadsPerBlock.y - 1) / threadsPerBlock.y, (z_num + threadsPerBlock.z - 1) / threadsPerBlock.z); // Shared memory size - it is able to keep filter len elements for each worker. 
const int sharedMemorySize = (offset * 2 + 1) * sizeof(float) * NumberOfWorkers; - meanXdir<<>>(cudaImage, offset, x_num, y_num, z_num); + meanXdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); } template -void runMeanZdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) { +void runMeanZdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream, bool boundaryReflect) { dim3 threadsPerBlock(1, NumberOfWorkers, 1); dim3 numBlocks(1, (y_num + threadsPerBlock.y - 1) / threadsPerBlock.y, (x_num + threadsPerBlock.x - 1) / threadsPerBlock.x); // intentionally here for better memory readings // Shared memory size - it is able to keep filter len elements for each worker. const int sharedMemorySize = (offset * 2 + 1) * sizeof(float) * NumberOfWorkers; - meanZdir<<>>(cudaImage, offset, x_num, y_num, z_num); + meanZdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); } -template -void runMean(T *cudaImage, const PixelData &image, int offsetX, int offsetY, int offsetZ, TypeOfMeanFlags flags, cudaStream_t aStream) { +template +void runMean(T *cudaImage, const PixelDataDim dim, int offsetX, int offsetY, int offsetZ, TypeOfMeanFlags flags, cudaStream_t aStream, bool boundaryReflect = false) { if (flags & MEAN_Y_DIR) { - runMeanYdir(cudaImage, offsetY, image.x_num, image.y_num, image.z_num, aStream); + runMeanYdir(cudaImage, offsetY, dim.x, dim.y, dim.z, aStream, boundaryReflect); } if (flags & MEAN_X_DIR) { - runMeanXdir(cudaImage, offsetX, image.x_num, image.y_num, image.z_num, aStream); + runMeanXdir(cudaImage, offsetX, dim.x, dim.y, dim.z, aStream, boundaryReflect); } if (flags & MEAN_Z_DIR) { - runMeanZdir(cudaImage, offsetZ, image.x_num, image.y_num, image.z_num, aStream); + runMeanZdir(cudaImage, offsetZ, dim.x, dim.y, dim.z, aStream, boundaryReflect); } } @@ -314,30 +439,58 @@ void runAbsDiff1D(T *data, const T *reference, size_t len, cudaStream_t aStream) } template 
-__global__ void rescaleAndThreshold(T *data, size_t len, float varRescale, float sigmaThreshold, float sigmaThresholdMax) { - const float max_th = 60000.0; +__global__ void rescale(T *data, size_t len, float varRescale) { size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { float rescaled = varRescale * data[idx]; - if (rescaled < sigmaThreshold) { - rescaled = (rescaled < sigmaThresholdMax) ? max_th : sigmaThreshold; - } data[idx] = rescaled; } } template -void runRescaleAndThreshold(T *data, size_t len, float varRescale, float sigma, float sigmaMax, cudaStream_t aStream) { +void runRescale(T *data, size_t len, float varRescale, cudaStream_t aStream) { dim3 threadsPerBlock(64); dim3 numBlocks((len + threadsPerBlock.x - 1) / threadsPerBlock.x); - rescaleAndThreshold <<< numBlocks, threadsPerBlock, 0, aStream >>> (data, len, varRescale, sigma, sigmaMax); + rescale <<< numBlocks, threadsPerBlock, 0, aStream >>>(data, len, varRescale); +} + +template +__global__ void constantScale(S *image, size_t len) { + // This is totally naive and slow implementation (only 1 thread is used) just to have CPU + // code implemented in CUDA. This code will not be run in any normal usage of APR + // and it is just here for sanity check and or super small images cases (like few pixels) + // so DO NOT TRY TO OPTIMIZE IT - use your time for something more productive or have + // some beers... still better than writing fast version of this code. + + float min_val = 660000; + double sum = 0; + + for (size_t i = 0; i < len; ++i) { + float tmp = image[i]; + + sum += tmp; + if (tmp < min_val) min_val = tmp; + } + + float scale_val = (float) (sum / (float)len - min_val); + + for (size_t i = 0; i < len; ++i) { + image[i] = scale_val; + } +} + +template +void runConstantScale(S *image, PixelDataDim &dim, cudaStream_t aStream) { + // Check kernel description for further info! 
+ constantScale<<<1, 1, 0, aStream>>>(image, dim.size()); } template void runLocalIntensityScalePipeline(const PixelData &image, const APRParameters &par, S *cudaImage, S *cudaTemp, cudaStream_t aStream) { float var_rescale; std::vector var_win; - LocalIntensityScale().get_window_alt(var_rescale, var_win, par,image); + auto lis = LocalIntensityScale(); + lis.get_window_alt(var_rescale, var_win, par, image); size_t win_y = var_win[0]; size_t win_x = var_win[1]; size_t win_z = var_win[2]; @@ -345,12 +498,61 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete size_t win_x2 = var_win[4]; size_t win_z2 = var_win[5]; - // --------- CUDA ---------------- - runCopy1D(cudaImage, cudaTemp, image.mesh.size(), aStream); - runMean(cudaImage, image, win_x, win_y, win_z, MEAN_ALL_DIR, aStream); - runAbsDiff1D(cudaImage, cudaTemp, image.mesh.size(), aStream); - runMean(cudaImage, image, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream); - runRescaleAndThreshold(cudaImage, image.mesh.size(), var_rescale, par.sigma_th, par.sigma_th_max, aStream); + + + bool constant_scale = false; + + if (par.constant_intensity_scale || (lis.number_active_dimensions == 0)) { + // include the case where the local intensity scale doesn't make sense due to the image being to small. 
+ // (This is for just edge cases and sanity checking) + constant_scale = true; + } + + PixelDataDim imageSize = image.getDimension(); + + if (!constant_scale) { + CudaMemoryUniquePtr paddedImage; + CudaMemoryUniquePtr paddedTemp; + PixelDataDim paddSize(std::max(win_y, win_y2), std::max(win_x, win_x2), std::max(win_z, win_z2)); + PixelDataDim paddedImageSize = imageSize + paddSize + paddSize; // padding on both ends of each dimension + + S *ci = cudaImage; + S *ct = cudaTemp; + PixelDataDim dim = image.getDimension(); + + if (par.reflect_bc_lis) { + // padd + S *mem = nullptr; + checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); + paddedImage.reset(mem); + mem = nullptr; + checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); + paddedTemp.reset(mem); + + runPaddPixels(cudaImage, paddedImage.get(), imageSize, paddedImageSize, paddSize, aStream); + runPaddPixels(cudaTemp, paddedTemp.get(), imageSize, paddedImageSize, paddSize, aStream); + + ci = paddedImage.get(); + ct = paddedTemp.get(); + dim = paddedImageSize; + } + + // Run LIS pipeline + runCopy1D(ci, ct, dim.size(), aStream); + runMean(ci, dim, win_x, win_y, win_z, MEAN_ALL_DIR, aStream, false); + runAbsDiff1D(ci, ct, dim.size(), aStream); + runMean(ci, dim, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream, false); + runRescale(ci, dim.size(), var_rescale, aStream); + + if (par.reflect_bc_lis) { + // unpadd + runUnpaddPixels(ci, cudaImage, paddedImageSize, imageSize, paddSize, aStream); + runUnpaddPixels(ct, cudaTemp, paddedImageSize, imageSize, paddSize, aStream); + } + } + else { + runConstantScale(cudaImage, imageSize, aStream); + } } template void runLocalIntensityScalePipeline(const PixelData&, const APRParameters&, float*, float*, cudaStream_t); @@ -360,24 +562,26 @@ template void runLocalIntensityScalePipeline(const PixelData // =================================================== TEST helpers // TODO: should be moved somewhere template -void calcMean(PixelData &image, int offset, 
TypeOfMeanFlags flags) { - ScopedCudaMemHandler, H2D | D2H> cudaImage(image); - APRTimer timer(true); - timer.start_timer("GpuDeviceTimeFull"); - runMean(cudaImage.get(), image, offset, offset, offset, flags, 0); - timer.stop_timer(); +void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags, bool boundaryReflect) { + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D | D2H> cudaImage(image, aStream); + + runMean(cudaImage.get(), image.getDimension(), offset, offset, offset, flags, 0, boundaryReflect); } // explicit instantiation of handled types -template void calcMean(PixelData&, int, TypeOfMeanFlags); -template void calcMean(PixelData&, int, TypeOfMeanFlags); +template void calcMean(PixelData&, int, TypeOfMeanFlags, bool); +template void calcMean(PixelData&, int, TypeOfMeanFlags, bool); template void getLocalIntensityScale(PixelData &image, PixelData &temp, const APRParameters &par) { - ScopedCudaMemHandler, H2D | D2H> cudaImage(image); - ScopedCudaMemHandler, D2H> cudaTemp(temp); + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D | D2H> cudaImage(image, aStream); + ScopedCudaMemHandler, D2H> cudaTemp(temp, aStream); - runLocalIntensityScalePipeline(image, par, cudaImage.get(), cudaTemp.get(), 0); + runLocalIntensityScalePipeline(image, par, cudaImage.get(), cudaTemp.get(), aStream); } template void getLocalIntensityScale(PixelData&, PixelData&, const APRParameters&); diff --git a/src/algorithm/LocalIntensityScale.hpp b/src/algorithm/LocalIntensityScale.hpp index 3d5942c2..e576efd5 100644 --- a/src/algorithm/LocalIntensityScale.hpp +++ b/src/algorithm/LocalIntensityScale.hpp @@ -16,6 +16,8 @@ class LocalIntensityScale { bool active_x = true; bool active_z = true; +public: + int number_active_dimensions = 3; @@ -153,13 +155,13 @@ void get_local_intensity_scale(PixelData &local_scale_temp, PixelData &input_image, PixelData &var); template - void calc_sat_mean_z(PixelData &input, const size_t offset); + void calc_sat_mean_z(PixelData &input, 
const size_t offset, bool boundaryReflect = false); template - void calc_sat_mean_x(PixelData &input, const size_t offset); + void calc_sat_mean_x(PixelData &input, const size_t offset, bool boundaryReflect = false); template - void calc_sat_mean_y(PixelData &input, const size_t offset); + void calc_sat_mean_y(PixelData &input, const size_t offset, bool boundaryReflect = false); void get_window(float &var_rescale, std::vector &var_win, const APRParameters &par); @@ -302,195 +304,337 @@ inline void LocalIntensityScale::get_window_alt(float& var_rescale, std::vector< } } -/** - * Calculates a O(1) recursive mean using SAT. - * @tparam T - * @param input - * @param offset - */ template -inline void LocalIntensityScale::calc_sat_mean_y(PixelData& input, const size_t offset){ +inline void LocalIntensityScale::calc_sat_mean_y(PixelData& input, const size_t offset, bool boundaryReflect) { const size_t z_num = input.z_num; const size_t x_num = input.x_num; const size_t y_num = input.y_num; - std::vector temp_vec(y_num); - float divisor = 2 * offset + 1; + const size_t divisor = offset + 1 + offset; + + auto &mesh = input.mesh; + const size_t dimLen = y_num; #ifdef HAVE_OPENMP - #pragma omp parallel for default(shared) firstprivate(temp_vec) + #pragma omp parallel for default(shared) #endif - for(size_t j = 0; j < z_num; ++j) { - for(size_t i = 0; i < x_num; ++i){ - size_t index = j * x_num*y_num + i * y_num; - - //first pass over and calculate cumsum - float temp = 0; - for (size_t k = 0; k < y_num; ++k) { - temp += input.mesh[index + k]; - temp_vec[k] = temp; + for (size_t j = 0; j < z_num; ++j) { + for (size_t i = 0; i < x_num; ++i) { + size_t index = j * x_num * y_num + i * y_num; + + size_t count = 0; + size_t currElementOffset = 0; + size_t nextElementOffset = 1; + size_t saveElementOffset = 0; + + std::vector circularBuffer(divisor, 0); + T sum = 0; + + while (count <= offset) { + auto v = mesh[index + currElementOffset]; + sum += v; + circularBuffer[count] = v; + if 
(boundaryReflect && count > 0) { circularBuffer[2 * offset - count + 1] = v; sum += v;} + + currElementOffset += nextElementOffset; + count++; } - //handling boundary conditions (LHS) - for (size_t k = 0; k <= offset; ++k) { - input.mesh[index + k] = 0; - } + if (boundaryReflect) count += offset; - //second pass calculate mean - for (size_t k = offset + 1; k < y_num; ++k) { - input.mesh[index + k] = -temp_vec[k - offset - 1]/divisor; - } + int beginPtr = (offset + 1) % divisor; - //second pass calculate mean - for (size_t k = 0; k < (y_num-offset); ++k) { - input.mesh[index + k] += temp_vec[k + offset]/divisor; - } + const int lastElement = dimLen - 1 - offset; + for (int i = 0; i <= lastElement; ++i) { + mesh[index + saveElementOffset] = sum / count; + saveElementOffset += nextElementOffset; + + if (i == lastElement) break; + + auto v = mesh[index + currElementOffset]; - float counter = 0; - //handling boundary conditions (RHS) - for (size_t k = (y_num - offset); k < (y_num); ++k) { - counter++; - input.mesh[index + k]*= divisor; - input.mesh[index + k]+= temp_vec[y_num-1]; - input.mesh[index + k]*= 1.0/(divisor - counter); + sum -= circularBuffer[beginPtr]; + sum += v; + + circularBuffer[beginPtr] = v; + + count = std::min(count + 1, divisor); + beginPtr = (beginPtr + 1) % divisor; + currElementOffset += nextElementOffset; } - //handling boundary conditions (LHS), need to rehandle the boundary - for (size_t k = 1; k <= offset; ++k) { - input.mesh[index + k] *= divisor/(k + offset + 1.0); + int boundaryPtr = (beginPtr - 1 - 1 + divisor) % divisor; + while(saveElementOffset < currElementOffset) { + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' since 'sum' of filter elements contains all elements from + // processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter + bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset) / 
nextElementOffset > offset; + + if (removeElementFromFilter) { + if (!boundaryReflect) count = count - 1; + } + if (removeElementFromFilter || boundaryReflect) { + sum -= circularBuffer[beginPtr]; + } + if (boundaryReflect) { + sum += circularBuffer[boundaryPtr]; + } + + mesh[index + saveElementOffset] = sum / count; + + boundaryPtr = (boundaryPtr - 1 + divisor) % divisor; + beginPtr = (beginPtr + 1) % divisor; + saveElementOffset += nextElementOffset; } - //end point boundary condition - input.mesh[index] *= divisor/(offset + 1.0); } } } template -inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size_t offset) { +inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size_t offset, bool boundaryReflect) { + const size_t z_num = input.z_num; const size_t x_num = input.x_num; const size_t y_num = input.y_num; - std::vector temp_vec(y_num*(2*offset + 1),0); + const size_t divisor = offset + 1 + offset; + std::vector circularBuffer(y_num * divisor, 0); + std::vector sum(y_num, 0); - #ifdef HAVE_OPENMP - #pragma omp parallel for default(shared) firstprivate(temp_vec) - #endif - for(size_t j = 0; j < z_num; j++) { + auto &mesh = input.mesh; + const size_t dimLen = x_num; + + if (dimLen < offset) { + throw std::runtime_error("offset cannot be bigger than processed dimension length!"); + } + +#ifdef HAVE_OPENMP +#pragma omp parallel for default(shared) firstprivate(circularBuffer, sum) +#endif + for (size_t j = 0; j < z_num; j++) { size_t jxnumynum = j * x_num * y_num; - for(size_t k = 0; k < y_num ; k++){ - temp_vec[k] = input.mesh[jxnumynum + k]; - } + size_t count = 0; // counts number of active elements in filter + size_t currElementOffset = 0; // offset of element in processed dimension + size_t nextElementOffset = 1; + size_t saveElementOffset = 0; // offset used to finish RHS boundary + + // Clear buffers so they can be reused in next 'z_num' loop + std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next 
loop + std::fill(circularBuffer.begin(), circularBuffer.end(), 0); - for(size_t i = 1; i < 2 * offset + 1; i++) { - for(size_t k = 0; k < y_num; k++) { - temp_vec[i*y_num + k] = input.mesh[jxnumynum + i*y_num + k] + temp_vec[(i-1)*y_num + k]; + // saturate circular buffer with #offset elements since it will allow to calculate first element value on LHS + while (count <= offset) { + for (size_t k = 0; k < y_num; ++k) { + auto v = mesh[jxnumynum + currElementOffset * y_num + k]; + sum[k] += v; + circularBuffer[count * y_num + k] = v; + if (boundaryReflect && count > 0) { circularBuffer[(2 * offset - count + 1) * y_num + k] = v; sum[k] += v;} } + + currElementOffset += nextElementOffset; + ++count; } - // LHS boundary - for(size_t i = 0; i < offset + 1; i++){ - for(size_t k = 0; k < y_num; k++) { - input.mesh[jxnumynum + i * y_num + k] = (temp_vec[(i + offset) * y_num + k]) / (i + offset + 1); - } + if (boundaryReflect) { + count += offset; // elements in above loop in range [1, offset] were summed twice } - // middle - size_t current_index = offset + 1; - size_t index_modulo = 0; - for(size_t i = offset + 1; i < x_num - offset; i++){ - // the current cumsum - index_modulo = (current_index + offset) % (2*offset + 1); // current_index - offset - 1 - size_t previous_modulo = (current_index + offset - 1) % (2*offset + 1); // the index of previous cumsum - - for(size_t k = 0; k < y_num; k++) { - float temp = input.mesh[jxnumynum + (i + offset)*y_num + k] + temp_vec[previous_modulo*y_num + k]; - input.mesh[jxnumynum + i*y_num + k] = (temp - temp_vec[index_modulo*y_num + k]) / - (2*offset + 1); - temp_vec[index_modulo*y_num + k] = temp; + // Pointer in circular buffer + int beginPtr = (offset + 1) % divisor; + + // main loop going through all elements in range [0, x_num - 1 - offset], so till last element that + // does not need handling RHS for offset '^' + // x x x x ... 
x x x x x x x + // o o ^ o o + // + const size_t lastElement = x_num - 1 - offset; + for (size_t x = 0; x <= lastElement; ++x) { + // Calculate and save currently processed element and move to the new one + for (size_t k = 0; k < y_num; ++k) { + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; + } + saveElementOffset += nextElementOffset; + + // There is no more elements to process in that loop, all stuff left to be processed is already in 'circularBuffer' buffer + if (x == lastElement) break; + + for (size_t k = 0; k < y_num; ++k) { + // Read new element + T v = mesh[jxnumynum + currElementOffset * y_num + k]; + + // Update sum to cover [-offset, offset] of currently processed element + sum[k] -= circularBuffer[beginPtr * y_num + k]; + sum[k] += v; + + // Store new element in circularBuffer + circularBuffer[beginPtr * y_num + k] = v; } - current_index = (current_index + 1) % (2*offset + 1); + // Move to next elements to read and in circular buffer + count = std::min(count + 1, divisor); + beginPtr = (beginPtr + 1) % divisor; + currElementOffset += nextElementOffset; } - // RHS boundary - current_index = (current_index + offset) % (2*offset + 1); - for(size_t i = x_num - offset; i < x_num; i++){ - for(size_t k = 0; k < y_num; k++){ - input.mesh[jxnumynum + i*y_num + k] = (temp_vec[index_modulo*y_num + k] - - temp_vec[current_index*y_num + k]) / (x_num - i + offset); + // boundaryPtr is used only in boundaryReflect mode, adding divisor makes it always non-negative value + int boundaryPtr = (beginPtr - 1 - 1 + divisor) % divisor; + + // Handle last #offset elements on RHS + while(saveElementOffset < currElementOffset) { + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' since 'sum' of filter elements contains all elements from + // processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + bool removeElementFromFilter = dimLen - 
(currElementOffset - saveElementOffset)/nextElementOffset > offset; + + if (removeElementFromFilter) { + if (!boundaryReflect) count = count - 1; + } + + for (size_t k = 0; k < y_num; ++k) { + if (removeElementFromFilter || boundaryReflect) { + sum[k] -= circularBuffer[beginPtr * y_num + k]; + } + + if (boundaryReflect) { + sum[k] += circularBuffer[boundaryPtr * y_num + k]; + } + + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; } - current_index = (current_index + 1) % (2*offset + 1); + + boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor; + beginPtr = (beginPtr + 1) % divisor; + saveElementOffset += nextElementOffset; } } } template -inline void LocalIntensityScale::calc_sat_mean_z(PixelData& input,const size_t offset) { +inline void LocalIntensityScale::calc_sat_mean_z(PixelData& input, const size_t offset, bool boundaryReflect) { + const size_t z_num = input.z_num; const size_t x_num = input.x_num; const size_t y_num = input.y_num; - std::vector temp_vec(y_num*(2*offset + 1),0); - size_t xnumynum = x_num * y_num; + const size_t divisor = offset + 1 + offset; + std::vector circularBuffer(y_num * divisor, 0); + std::vector sum(y_num, 0); - #ifdef HAVE_OPENMP - #pragma omp parallel for default(shared) firstprivate(temp_vec) - #endif - for(size_t i = 0; i < x_num; i++) { + auto &mesh = input.mesh; + size_t dimLen = z_num; - size_t iynum = i * y_num; + if (dimLen < offset) { + throw std::runtime_error("offset cannot be bigger than processed dimension length!"); + } - //prefetching - for(size_t k = 0; k < y_num ; k++){ - temp_vec[k] = input.mesh[iynum + k]; - } +#ifdef HAVE_OPENMP +#pragma omp parallel for default(shared) firstprivate(circularBuffer, sum) +#endif + for (size_t j = 0; j < x_num; j++) { + size_t jxnumynum = j * y_num; + + size_t count = 0; // counts number of active elements in filter + size_t currElementOffset = 0; // offset of element in processed dimension + size_t nextElementOffset = x_num; + size_t saveElementOffset = 0; // 
offset used to finish RHS boundary + + // Clear buffers so they can be reused in next 'x_num' loop + std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop + std::fill(circularBuffer.begin(), circularBuffer.end(), 0); - for(size_t j = 1; j < 2 * offset + 1; j++) { - for(size_t k = 0; k < y_num; k++) { - temp_vec[j*y_num + k] = input.mesh[j * xnumynum + iynum + k] + temp_vec[(j-1)*y_num + k]; + // saturate circular buffer with #offset elements since it will allow to calculate first element value on LHS + while(count <= offset) { + for (size_t k = 0; k < y_num; ++k) { + auto v = mesh[jxnumynum + currElementOffset * y_num + k]; + sum[k] += v; + circularBuffer[count * y_num + k] = v; + if (boundaryReflect && count > 0) { circularBuffer[(2 * offset - count + 1) * y_num + k] = v; sum[k] += v;} } + + currElementOffset += nextElementOffset; + ++count; } - // LHS boundary - for(size_t j = 0; j < offset + 1; j++){ - for(size_t k = 0; k < y_num; k++) { - input.mesh[j * xnumynum + iynum + k] = (temp_vec[(j + offset)*y_num + k]) / (j + offset + 1); - } + if (boundaryReflect) { + count += offset; // elements in above loop in range [1, offset] were summed twice } - // middle - size_t current_index = offset + 1; - size_t index_modulo = 0; - for(size_t j = offset + 1; j < z_num - offset; j++){ + // Pointer in circular buffer + int beginPtr = (offset + 1) % divisor; + + // main loop going through all elements in range [0, z_num - 1 - offset], so till last element that + // does not need handling RHS for offset '^' + // x x x x ... 
x x x x x x x + // o o ^ o o + // + const size_t lastElement = z_num - 1 - offset; + for (size_t z = 0; z <= lastElement; ++z) { + // Calculate and save currently processed element and move to the new one + for (size_t k = 0; k < y_num; ++k) { + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; + } + saveElementOffset += nextElementOffset; + + // There is no more elements to process in that loop, all stuff left to be processed is already in 'circularBuffer' buffer + if (z == lastElement) break; + + for (size_t k = 0; k < y_num; ++k) { + // Read new element + T v = mesh[jxnumynum + currElementOffset * y_num + k]; - index_modulo = (current_index + offset) % (2*offset + 1); // current_index - offset - 1 - size_t previous_modulo = (current_index + offset - 1) % (2*offset + 1); // the index of previous cumsum + // Update sum to cover [-offset, offset] of currently processed element + sum[k] -= circularBuffer[beginPtr * y_num + k]; + sum[k] += v; - for(size_t k = 0; k < y_num; k++) { - // the current cumsum - float temp = input.mesh[(j + offset) * xnumynum + iynum + k] + temp_vec[previous_modulo*y_num + k]; - input.mesh[j * xnumynum + iynum + k] = (temp - temp_vec[index_modulo*y_num + k]) / - (2*offset + 1); - temp_vec[index_modulo*y_num + k] = temp; + // Save new element + circularBuffer[beginPtr * y_num + k] = v; } - current_index = (current_index + 1) % (2*offset + 1); + // Move to next elements to read and in circular buffer + count = std::min(count + 1, divisor); + beginPtr = (beginPtr + 1) % divisor; + currElementOffset += nextElementOffset; } - // RHS boundary - current_index = (current_index + offset) % (2*offset + 1); - for(size_t j = z_num - offset; j < z_num; j++){ - for(size_t k = 0; k < y_num; k++){ - input.mesh[j * xnumynum + iynum + k] = (temp_vec[index_modulo*y_num + k] - - temp_vec[current_index*y_num + k]) / (z_num - j + offset); + // boundaryPtr is used only in boundaryReflect mode, adding divisor makes it always non-negative value + 
int boundaryPtr = (beginPtr - 1 - 1 + divisor) % divisor; + + // Handle last #offset elements on RHS + while(saveElementOffset < currElementOffset) { + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' since 'sum' of filter elements contains all elements from + // processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset)/nextElementOffset > offset; + + if (removeElementFromFilter) { + if (!boundaryReflect) count = count - 1; + } + + for (size_t k = 0; k < y_num; ++k) { + if (removeElementFromFilter || boundaryReflect) { + sum[k] -= circularBuffer[beginPtr * y_num + k]; + } + + if (boundaryReflect) { + sum[k] += circularBuffer[boundaryPtr * y_num + k]; + } + + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; } - current_index = (current_index + 1) % (2*offset + 1); + boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor; + beginPtr = (beginPtr + 1) % divisor; + saveElementOffset += nextElementOffset; } } } -#endif //PARTPLAY_LOCAL_INTENSITY_SCALE_HPP +#endif diff --git a/src/algorithm/LocalIntensityScaleCuda.h b/src/algorithm/LocalIntensityScaleCuda.h index a635a156..f572d5e5 100644 --- a/src/algorithm/LocalIntensityScaleCuda.h +++ b/src/algorithm/LocalIntensityScaleCuda.h @@ -16,7 +16,7 @@ constexpr TypeOfMeanFlags MEAN_Z_DIR = 0x04; constexpr TypeOfMeanFlags MEAN_ALL_DIR = MEAN_Y_DIR | MEAN_X_DIR | MEAN_Z_DIR; template -void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags = MEAN_ALL_DIR); +void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags = MEAN_ALL_DIR, bool boundaryReflect = false); template void getLocalIntensityScale(PixelData &image, PixelData &temp, const APRParameters &par); diff --git a/src/algorithm/LocalParticleCellSet.hpp b/src/algorithm/LocalParticleCellSet.hpp index 7935076b..f834805a 100644 --- 
a/src/algorithm/LocalParticleCellSet.hpp +++ b/src/algorithm/LocalParticleCellSet.hpp @@ -49,6 +49,10 @@ inline int __builtin_clz(unsigned int x) #endif +#include "algorithm/PullingScheme.hpp" +#include "algorithm/PullingSchemeSparse.hpp" +#include "io/TiffUtils.hpp" + class LocalParticleCellSet { public: diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index f568212b..80765bca 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -1,44 +1,34 @@ #include "PullingSchemeCuda.hpp" #include -#include -//#include -#include #include "misc/CudaTools.cuh" #include "data_structures/Mesh/downsample.cuh" +#include "algorithm/OVPC.h" +#include "algorithm/ParticleCellTreeCuda.cuh" -namespace { - using ElementType = uint8_t; - static constexpr int BIT_SHIFT = 6; - static constexpr ElementType OVPC_SEED = 1; - static constexpr ElementType OVPC_BOUNDARY = 2; - static constexpr ElementType OVPC_FILLER = 3; - - static constexpr ElementType SEED_MASK = OVPC_SEED << BIT_SHIFT; - static constexpr ElementType BOUNDARY_MASK = OVPC_BOUNDARY << BIT_SHIFT; - static constexpr ElementType FILLER_MASK = OVPC_FILLER << BIT_SHIFT; - static constexpr ElementType MASK = 0x03 << BIT_SHIFT; -} template -__global__ void copy1D(const T *input, S *output, size_t length) { +__global__ void copyAndClampLevels(const T *input, S *output, size_t length, int levelMin, int levelMax) { size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; if (idx < length) { - output[idx] = input[idx]; + T v = input[idx]; + if (v > levelMax) v = levelMax; + if (v < levelMin) v = levelMin; + output[idx] = v; } } template -void runCopy1D(T *inputData, S *outputData, size_t lenght, cudaStream_t aStream) { +void runCopyAndClampLevels(T *inputData, S *outputData, size_t lenght, int levelMin, int levelMax, cudaStream_t aStream) { dim3 threadsPerBlock(128); dim3 numBlocks((lenght + threadsPerBlock.x - 1)/threadsPerBlock.x); - copy1D<<>>(inputData, outputData, lenght); + copyAndClampLevels<<>>(inputData, 
outputData, lenght, levelMin, levelMax); }; template -__global__ void oneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int level) { +__global__ void firstStep(T *data, size_t xLen, size_t yLen, size_t zLen, int level) { const int xi = (blockIdx.x * blockDim.x) + threadIdx.x; const int yi = (blockIdx.y * blockDim.y) + threadIdx.y; const int zi = (blockIdx.z * blockDim.z) + threadIdx.z; @@ -51,39 +41,38 @@ __global__ void oneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int lev int zmin = zi > 0 ? zi - 1 : 0; int zmax = zi < zLen - 1 ? zi + 1 : zLen - 1; - bool ok = true; - bool neig = false; + bool hasNeighHigherLevel = false; + bool hasNeighSameLevel = false; for (int z = zmin; z <= zmax; ++z) { for (int x = xmin; x <= xmax; ++x) { for (int y = ymin; y <= ymax; ++y) { const size_t idx = z * xLen * yLen + x * yLen + y; - T currentLevel = ~MASK & data[idx]; - if (currentLevel > level) { ok = false; break; } - else if (currentLevel == level) neig = true; + T currentLevel = ~OVPC::MASK & data[idx]; + if (currentLevel > level) { hasNeighHigherLevel = true; break; } + else if (currentLevel == level) hasNeighSameLevel = true; } } } - if (ok) { + if (!hasNeighHigherLevel) { const size_t idx = zi * xLen * yLen + xi * yLen + yi; T status = data[idx]; - if (status == level) data[idx] |= SEED_MASK; - else if (neig) data[idx] |= BOUNDARY_MASK; - else data[idx] |= FILLER_MASK; + if (status == level) data[idx] |= OVPC::SEED; + else if (hasNeighSameLevel) data[idx] |= OVPC::BOUNDARY; + else data[idx] |= OVPC::FILLER; } } template -void runOneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int level, cudaStream_t aStream) { +void runFirstStep(T *data, size_t xLen, size_t yLen, size_t zLen, int level, cudaStream_t aStream) { dim3 threadsPerBlock(1, 128, 1); dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x, (yLen + threadsPerBlock.y - 1) / threadsPerBlock.y, (zLen + threadsPerBlock.z - 1) / threadsPerBlock.z); -// dim3 numBlocks((xLen * yLen * 
zLen + threadsPerBlock.x - 1)/threadsPerBlock.x); - oneLevel<<>>(data, xLen, yLen, zLen, level); + firstStep<<>>(data, xLen, yLen, zLen, level); }; template -__global__ void secondPhase(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMax) { +__global__ void secondStep(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMin) { const int xi = (blockIdx.x * blockDim.x) + threadIdx.x; const int yi = (blockIdx.y * blockDim.y) + threadIdx.y; const int zi = (blockIdx.z * blockDim.z) + threadIdx.z; @@ -103,73 +92,94 @@ __global__ void secondPhase(T *data, T *child, size_t xLen, size_t yLen, size_t for (int x = xmin; x <= xmax; ++x) { for (int y = ymin; y <= ymax; ++y) { size_t children_index = z * xLenc * yLenc + x * yLenc + y; - child[children_index] = status >= (OVPC_SEED << BIT_SHIFT) ? 0 : child[children_index] >> BIT_SHIFT; + child[children_index] = status >= (OVPC::OVPC_SEED << OVPC::BIT_SHIFT) ? 
0 : child[children_index] >> OVPC::BIT_SHIFT; } } } - if (isLevelMax) data[zi * xLen * yLen + xi * yLen + yi] = status >> BIT_SHIFT; + if (isLevelMin) data[zi * xLen * yLen + xi * yLen + yi] = status >> OVPC::BIT_SHIFT; } template -void runSecondPhase(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMax, cudaStream_t aStream) { +void runSecondStep(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMax, cudaStream_t aStream) { dim3 threadsPerBlock(1, 128, 1); dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x, (yLen + threadsPerBlock.y - 1) / threadsPerBlock.y, (zLen + threadsPerBlock.z - 1) / threadsPerBlock.z); - secondPhase<<>>(data, child, xLen, yLen, zLen, xLenc, yLenc, zLenc, isLevelMax); + secondStep<<>>(data, child, xLen, yLen, zLen, xLenc, yLenc, zLenc, isLevelMax); }; + // explicit instantiation of handled types -template void computeOVPC(const PixelData&, PixelData&, int, int); +template std::vector> computeOvpcCuda(const PixelData&, const GenInfo&); +template std::vector> computeOvpcCuda(const PixelData&, const GenInfo&); + +/** + * CUDA implementation of Pullin Scheme (OVPC - Optimal Valid Particle Cell set). 
+ * @tparam T - type of input levels + * @param input - input levels computed in earlier stages + * @param gi - GenInfo for given APR + * + * @return - PCT for CPU (copied from GPU) + */ +template +std::vector> computeOvpcCuda(const PixelData &input, const GenInfo &gi) { + // Copy input to CUDA mem and prepare CUDA representation of particle cell tree which will be filled after computing + // all steps -template -void computeOVPC(const PixelData &input, PixelData &output, int levelMin, int levelMax) { - ScopedCudaMemHandler, H2D> in(input); - ScopedCudaMemHandler, D2H> mem(output); - - // TODO: This is not needed later - just for having clear debug - //cudaMemset(mem.get(), 0, mem.getNumOfBytes()); - - // =============== Create pyramid - std::vector levels(levelMax + 1, nullptr); - std::vector xSize(levelMax + 1); - std::vector ySize(levelMax + 1); - std::vector zSize(levelMax + 1); - - int xDS = input.x_num; - int yDS = input.y_num; - int zDS = input.z_num; - - size_t offset = 0; - for (int l = levelMax; l >= levelMin; --l) { - levels[l] = reinterpret_cast(mem.get()) + offset; - xSize[l] = xDS; - ySize[l] = yDS; - zSize[l] = zDS; - - offset += xDS * yDS * zDS * sizeof(TreeElementType); - // round up to 16-bytes - const size_t alignemet = 16; - offset = ((offset + alignemet - 1) / alignemet ) * alignemet; - - xDS = ceil(xDS/2.0); - yDS = ceil(yDS/2.0); - zDS = ceil(zDS/2.0); - } + cudaStream_t stream = nullptr; + + ScopedCudaMemHandler, H2D> in(input, stream); + + ParticleCellTreeCuda pct(gi, stream); + int levelMin = gi.l_min; + int levelMax = gi.l_max - 1; - runCopy1D(in.get(), levels[levelMax], in.getSize(), 0); + // feel the highes level of PCT with provided levels and clamp values to be within [levelMin, levelMax] range + runCopyAndClampLevels(in.get(), pct[levelMax], in.getSize(), levelMin, levelMax, stream); + + // Downsample with max reduction to levelMin to fill rest of the tree + for (int l = levelMax - 1; l >= levelMin; --l) { + runDownsampleMax(pct[l + 
1], pct[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], stream); + } + + // ================== Phase 1 - top to down + for (int l = levelMin; l <= levelMax; ++l) { + runFirstStep(pct[l], gi.x_num[l], gi.y_num[l], gi.z_num[l], l, stream); + } + // ================== Phase 1 - down to top for (int l = levelMax - 1; l >= levelMin; --l) { - runDownsampleMax(levels[l + 1], levels[l], xSize[l + 1], ySize[l + 1], zSize[l + 1], 0); + runSecondStep(pct[l], pct[l+1], gi.x_num[l], gi.y_num[l], gi.z_num[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], l == levelMin, stream); } + return pct.getPCTcpu(); +} + +// explicit instantiation of handled types +template void computeOvpcCuda(float *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream); +template void computeOvpcCuda(int *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream); + + +template +void computeOvpcCuda(ImgType *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream) { + int levelMin = gi.l_min; + int levelMax = gi.l_max - 1; + + + // feel the highes level of PCT with provided levels and clamp values to be within [levelMin, levelMax] range + runCopyAndClampLevels(in, pct[levelMax], gi.y_num[levelMax]*gi.x_num[levelMax]*gi.z_num[levelMax], levelMin, levelMax, stream); + + // Downsample with max reduction to levelMin to fill rest of the tree + for (int l = levelMax - 1; l >= levelMin; --l) { + runDownsampleMax(pct[l + 1], pct[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], stream); + } // ================== Phase 1 - top to down for (int l = levelMin; l <= levelMax; ++l) { - runOneLevel(levels[l], xSize[l], ySize[l], zSize[l], l, 0); + runFirstStep(pct[l], gi.x_num[l], gi.y_num[l], gi.z_num[l], l, stream); } // ================== Phase 1 - down to top for (int l = levelMax - 1; l >= levelMin; --l) { - runSecondPhase(levels[l], levels[l+1], xSize[l], ySize[l], zSize[l], xSize[l+1], ySize[l+1], zSize[l+1], l == levelMin, 0); + runSecondStep(pct[l], 
pct[l+1], gi.x_num[l], gi.y_num[l], gi.z_num[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], l == levelMin, stream); } -}; +} \ No newline at end of file diff --git a/src/algorithm/OVPC.h b/src/algorithm/OVPC.h index f8e975ac..f55bfee3 100644 --- a/src/algorithm/OVPC.h +++ b/src/algorithm/OVPC.h @@ -9,11 +9,13 @@ #include #include "data_structures/Mesh/PixelData.hpp" -#include "data_structures/APR/APRAccess.hpp" +#include "data_structures/APR/GenInfo.hpp" #include "algorithm/PullingScheme.hpp" class OVPC { + +public: // Element big enouth to keep all the levels + 2 highest bits for type // for uint8_t we have [ 2 bit - type(empty, seed, boundary, filler) | 6 bit - level(0-63) ] using ElementType = uint8_t; @@ -31,9 +33,8 @@ class OVPC { int iLevelMin; std::vector> iParticleCellTree; -public: template - OVPC(const APRAccess &aAprAccess, const PixelData &aInputLevels) { + OVPC(const GenInfo &aAprAccess, const PixelData &aInputLevels) { // Level Max is one less since we are working on downsampled version iLevelMax = aAprAccess.l_max - 1; iLevelMin = aAprAccess.l_min; @@ -43,8 +44,8 @@ class OVPC { iParticleCellTree[iLevelMax].init(aInputLevels.y_num, aInputLevels.x_num, aInputLevels.z_num); fillLevel(iLevelMax, aInputLevels); - // Downsample with max reduction to levelMin to fill the rest of the tree - for(int level = iLevelMax - 1; level >= iLevelMin; --level) { + // Downsample with max reduction to levelMin to fill rest of the tree + for (int level = iLevelMax - 1; level >= iLevelMin; --level) { downsample(iParticleCellTree[level + 1], iParticleCellTree[level], [](const float &x, const float &y) -> float { return std::max(x, y); }, [](const float &x) -> float { return x; }, true); diff --git a/src/algorithm/ParticleCellTreeCuda.cuh b/src/algorithm/ParticleCellTreeCuda.cuh new file mode 100644 index 00000000..d3bc6160 --- /dev/null +++ b/src/algorithm/ParticleCellTreeCuda.cuh @@ -0,0 +1,77 @@ +#ifndef PARTICLE_CELL_TREE_CUDA_CUH +#define 
PARTICLE_CELL_TREE_CUDA_CUH + + +#include "data_structures/APR/GenInfo.hpp" +#include "algorithm/PullingScheme.hpp" +#include "misc/CudaTools.cuh" + +/* + * CUDA representation of PCT (Particle Cell Tree) + * Allocates memory and initialize it to EMPTY + * + * Allows acces to each level via subscription operator: + * ParticleCellTreeCuda pct(aprInfo); + * pct[level] + * + * getPCTcpu and uploadPCT2GPU handle interaction with CPU code (mainly for test/debug purposes). + */ +class ParticleCellTreeCuda { + ScopedCudaMemHandler mem; + std::vector startOffsets; + GenInfo gi; + size_t numOfElements = 0; + cudaStream_t stream = nullptr; + +public: + + ParticleCellTreeCuda(const GenInfo &aprInfo, const cudaStream_t aStream) : gi(aprInfo), stream(aStream) { + // Calculate size of needed memory for PCT and offsets for particular levels + int l_max = aprInfo.l_max - 1; + int l_min = aprInfo.l_min; + + startOffsets.resize(l_max + 1, 0); + + for (int l = l_min; l <= l_max; ++l) { + auto yLen = ceil(aprInfo.org_dims[0] / PullingScheme::powr(2.0, l_max - l + 1)); + auto xLen = ceil(aprInfo.org_dims[1] / PullingScheme::powr(2.0, l_max - l + 1)); + auto zLen = ceil(aprInfo.org_dims[2] / PullingScheme::powr(2.0, l_max - l + 1)); + size_t levelSize = yLen * xLen * zLen; + startOffsets[l] = numOfElements; + numOfElements += levelSize; + } + + // Initialize memory, it is not binded to any CPU memory so we provide nullptr + mem.initialize(nullptr, numOfElements, stream); + cudaMemsetAsync(mem.get(), EMPTY, numOfElements, stream); + } + + inline uint8_t* operator[](size_t level) { return mem.get() + startOffsets[level]; } + + auto getPCTcpu() { + std::vector> pct = PullingScheme::generateParticleCellTree(gi); + for (int i = gi.l_min; i < gi.l_max; ++i) { + checkCuda(cudaMemcpyAsync(pct[i].mesh.get(), (*this)[i], pct[i].mesh.size(), cudaMemcpyDeviceToHost, stream)); + } + checkCuda(cudaStreamSynchronize(stream)); + + return pct; + } + + void downloadPCTfromGPU(std::vector> &pct) { + for 
(int i = gi.l_min; i < gi.l_max; ++i) { + checkCuda(cudaMemcpyAsync(pct[i].mesh.get(), (*this)[i], pct[i].mesh.size(), cudaMemcpyDeviceToHost, stream)); + } + checkCuda(cudaStreamSynchronize(stream)); + } + + void uploadPCT2GPU(const std::vector> &pct) { + for (int i = gi.l_min; i < gi.l_max; ++i) { + checkCuda(cudaMemcpyAsync((*this)[i], pct[i].mesh.get(), pct[i].mesh.size(), cudaMemcpyHostToDevice, stream)); + } + checkCuda(cudaStreamSynchronize(stream)); + } +}; + + +#endif diff --git a/src/algorithm/PullingScheme.hpp b/src/algorithm/PullingScheme.hpp index 58ae9ee2..05b0b723 100644 --- a/src/algorithm/PullingScheme.hpp +++ b/src/algorithm/PullingScheme.hpp @@ -13,14 +13,21 @@ #include "data_structures/Mesh/ImagePatch.hpp" #include +// Main types #define EMPTY 0 #define SEED_TYPE 1 #define BOUNDARY_TYPE 2 #define FILLER_TYPE 3 + +// Type used in linear/random access +#define UPSAMPLING_SEED_TYPE 4 + +// Types specific for this implementation of Pulling Scheme (OVPC is not using them) #define ASCENDANT 8 #define PROPOGATE 15 #define ASCENDANTNEIGHBOUR 16 + #define NEIGHBOURLOOP(jn,in,kn, boundaries) \ for(jn = boundaries[0][0]; jn < boundaries[0][1]; jn++) \ for(in = boundaries[1][0]; in < boundaries[1][1]; in++) \ @@ -51,13 +58,13 @@ for(jn = j * 2; jn < j * 2 + children_boundaries[0]; jn++) \ class PullingScheme { - double powr(uint64_t num,uint64_t pow2){ +public: + + static double powr(uint64_t num,uint64_t pow2){ //return (uint64_t) std::round(std::pow(num,pow2)); return std::round(pow(num,pow2)); } - -public: template void fill(float k, const PixelData &input); @@ -65,6 +72,7 @@ class PullingScheme { void fill_patch(float level, const PixelData &input, ImagePatch& patch); void pulling_scheme_main(); + static std::vector> generateParticleCellTree(const GenInfo &aprInfo); void initialize_particle_cell_tree(const GenInfo &aprInfo); std::vector>& getParticleCellTree() { return particle_cell_tree; } @@ -86,6 +94,25 @@ class PullingScheme { int l_max; }; + 
+inline std::vector> PullingScheme::generateParticleCellTree(const GenInfo &aprInfo) { + int l_max = aprInfo.l_max - 1; + int l_min = aprInfo.l_min; + + std::vector> pct; + pct.resize(l_max + 1); + + for (int l = l_min; l <= l_max; ++l) { + pct[l].initWithValue(ceil(aprInfo.org_dims[0] / PullingScheme::powr(2.0, l_max - l + 1)), + ceil(aprInfo.org_dims[1] / PullingScheme::powr(2.0, l_max - l + 1)), + ceil(aprInfo.org_dims[2] / PullingScheme::powr(2.0, l_max - l + 1)), + EMPTY); + + } + + return pct; +} + /** * Initializes particle_cell_tree up to level (max - 1) */ @@ -93,14 +120,7 @@ inline void PullingScheme::initialize_particle_cell_tree(const GenInfo &aprInfo) l_max = aprInfo.l_max - 1; l_min = aprInfo.l_min; - particle_cell_tree.resize(l_max + 1); - - for (int l = l_min; l <= l_max; ++l) { - particle_cell_tree[l].initWithValue(ceil(aprInfo.org_dims[0] / powr(2.0, l_max - l + 1)), - ceil(aprInfo.org_dims[1] / powr(2.0, l_max - l + 1)), - ceil(aprInfo.org_dims[2] / powr(2.0, l_max - l + 1)), - EMPTY); - } + particle_cell_tree = generateParticleCellTree(aprInfo); } /** diff --git a/src/algorithm/PullingSchemeCuda.hpp b/src/algorithm/PullingSchemeCuda.hpp index 79a23560..12aa81d3 100644 --- a/src/algorithm/PullingSchemeCuda.hpp +++ b/src/algorithm/PullingSchemeCuda.hpp @@ -7,11 +7,15 @@ #include "data_structures/Mesh/PixelData.hpp" +#include "data_structures/APR/GenInfo.hpp" +#include "algorithm/ParticleCellTreeCuda.cuh" using TreeElementType = uint8_t; -template -void computeOVPC(const PixelData &input, PixelData &output, int levelMin, int levelMax); +template +std::vector> computeOvpcCuda(const PixelData &input, const GenInfo &gi); +template +void computeOvpcCuda(ImgType *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream); #endif //LIBAPR_PULLINGSCHEMECUDA_HPP diff --git a/src/algorithm/bsplineParams.h b/src/algorithm/bsplineParams.h new file mode 100644 index 00000000..44dbd1c1 --- /dev/null +++ b/src/algorithm/bsplineParams.h @@ -0,0 +1,19 
@@ +#ifndef APR_BSPLINEPARAMS_H +#define APR_BSPLINEPARAMS_H + + +#include + + +struct BsplineParamsCuda { + float *bc1; + float *bc2; + float *bc3; + float *bc4; + size_t k0; + float b1; + float b2; + float norm_factor; +}; + +#endif //APR_BSPLINEPARAMS_H diff --git a/src/algorithm/bsplineXdir.cuh b/src/algorithm/bsplineXdir.cuh index be0a5f78..1df52a80 100644 --- a/src/algorithm/bsplineXdir.cuh +++ b/src/algorithm/bsplineXdir.cuh @@ -5,9 +5,11 @@ #include #include #include +#include "cudaMisc.cuh" +#include "bsplineParams.h" /** - * Runs bspline recursive filter in X direction. Each processed 2D patch consist of number of workes + * Runs bspline recursive filter in X direction. Each processed 2D patch consist of number of workers * (distributed in Y direction) and each of them is handling the whole row in X-dir. * Next patches are build on a top of first (like patch1 in example below) and they cover * whole y-dimension. Such a setup should be run for every plane in z-direction. @@ -59,42 +61,44 @@ * @param norm_factor - filter norm factor */ template -__global__ void bsplineXdir(T *image, size_t x_num, size_t y_num, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, - float b1, float b2, float norm_factor) { +__global__ void bsplineXdir(T *image, PixelDataDim dim, BsplineParamsCuda p, bool *error) { const int yDirOffset = blockIdx.y * blockDim.y + threadIdx.y; - const size_t zDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * x_num * y_num; - const size_t nextElementXdirOffset = y_num; - const size_t dirLen = x_num; + const size_t zDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * dim.x * dim.y; + const size_t nextElementXdirOffset = dim.y; + const size_t dirLen = dim.x; + const size_t minLen = min(dirLen, p.k0); - if (yDirOffset < y_num) { + if (yDirOffset < dim.y) { float temp1 = 0; float temp2 = 0; float temp3 = 0; float temp4 = 0; + // calculate boundary values - for (int k = 0; k < k0; ++k) { + for (int k = 0; k < 
minLen; ++k) { T val = image[zDirOffset + k * nextElementXdirOffset + yDirOffset]; - temp1 += bc1[k] * val; - temp2 += bc2[k] * val; + temp1 += p.bc1[k] * val; + temp2 += p.bc2[k] * val; val = image[zDirOffset + (dirLen - 1 - k) * nextElementXdirOffset + yDirOffset]; - temp3 += bc3[k] * val; - temp4 += bc4[k] * val; + temp3 += p.bc3[k] * val; + temp4 += p.bc4[k] * val; } + size_t errorCnt = 0; + // set boundary values in two first and two last points processed direction - image[zDirOffset + 0 * nextElementXdirOffset + yDirOffset] = temp1; - image[zDirOffset + 1 * nextElementXdirOffset + yDirOffset] = temp2; - image[zDirOffset + (dirLen - 2) * nextElementXdirOffset + yDirOffset] = temp3 * norm_factor; - image[zDirOffset + (dirLen - 1) * nextElementXdirOffset + yDirOffset] = temp4 * norm_factor; + image[zDirOffset + 0 * nextElementXdirOffset + yDirOffset] = round(temp1, errorCnt); + image[zDirOffset + 1 * nextElementXdirOffset + yDirOffset] = round(temp2, errorCnt); + image[zDirOffset + (dirLen - 2) * nextElementXdirOffset + yDirOffset] = round(temp3 * p.norm_factor, errorCnt); + image[zDirOffset + (dirLen - 1) * nextElementXdirOffset + yDirOffset] = round(temp4 * p.norm_factor, errorCnt); // Causal Filter loop int64_t offset = zDirOffset + 2 * nextElementXdirOffset + yDirOffset; int64_t offsetLimit = zDirOffset + (dirLen - 2) * nextElementXdirOffset; while (offset < offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = temp1 * b2 + temp2 * b1 + image[offset]; + const float temp = round(image[offset] + p.b1 * temp2 + p.b2 * temp1, errorCnt); image[offset] = temp; temp1 = temp2; temp2 = temp; @@ -107,13 +111,15 @@ __global__ void bsplineXdir(T *image, size_t x_num, size_t y_num, offsetLimit = zDirOffset; while (offset >= offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = temp3 * b1 + temp4 * b2 + image[offset]; - image[offset] = temp * norm_factor; + 
const float temp = image[offset] + p.b1 * temp3 + p.b2 * temp4; + image[offset] = round(temp * p.norm_factor, errorCnt); temp4 = temp3; temp3 = temp; offset -= nextElementXdirOffset; } + + if (errorCnt > 0) *error = true; } } @@ -121,15 +127,24 @@ __global__ void bsplineXdir(T *image, size_t x_num, size_t y_num, * Function for launching a kernel */ template -void runBsplineXdir(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, - size_t k0, float b1, float b2, float norm_factor, cudaStream_t aStream) { +void runBsplineXdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaStream_t aStream) { constexpr int numOfWorkersYdir = 128; dim3 threadsPerBlockX(1, numOfWorkersYdir, 1); dim3 numBlocksX(1, - (y_num + threadsPerBlockX.y - 1) / threadsPerBlockX.y, - (z_num + threadsPerBlockX.z - 1) / threadsPerBlockX.z); - bsplineXdir <<>> (cudaImage, x_num, y_num, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor); + (dim.y + threadsPerBlockX.y - 1) / threadsPerBlockX.y, + (dim.z + threadsPerBlockX.z - 1) / threadsPerBlockX.z); + // In case of error this will be set to true by one of the kernels (CUDA does not guarantee which kernel will set the global variable if more than one kernel + // accesses it, but this is enough for us to know that somewhere in one or more kernels an overflow was detected).
+ bool isErrorDetected = false; + { + ScopedCudaMemHandler error(&isErrorDetected, 1, aStream); + bsplineXdir <<>>(cudaImage, dim, p, error.get()); + } + + if (isErrorDetected) { + throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineXdir - " + "try squashing the input image to a narrower range or use APRConverter"); + } } #endif diff --git a/src/algorithm/bsplineYdir.cuh b/src/algorithm/bsplineYdir.cuh index b9dc2f25..e9905b64 100644 --- a/src/algorithm/bsplineYdir.cuh +++ b/src/algorithm/bsplineYdir.cuh @@ -5,12 +5,15 @@ #include #include #include +#include "cudaMisc.cuh" +#include "bsplineParams.h" + /** * Runs bspline recursive filter in Y direction - divided into two phases: * 1. calculate boundary conditions * 2. run recursive filter as a set of 2D patches: - * Each processed 2D patch consist of number of workes + * Each processed 2D patch consist of number of workers * (distributed in Y direction) and each of them is handling the whole row in Y-dir. * Next patches are build on next to it in the x-dir to cover whole x * z domain. 
* @@ -57,44 +60,45 @@ template -__global__ void bsplineYdirBoundary(T *image, size_t x_num, size_t y_num, size_t z_num, - const float *bc1_vec, const float *bc2_vec, const float *bc3_vec, const float *bc4_vec, - size_t k0, float *boundary) { +__global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, BsplineParamsCuda p, float *boundary, bool *error) { const int xzIndexOfWorker = (blockIdx.x * blockDim.x) + threadIdx.x; const int xzIndexOfBlock = (blockIdx.x * blockDim.x); const int numOfWorkers = blockDim.x; const int currentWorkerId = threadIdx.x; - const size_t workersOffset = xzIndexOfBlock * y_num; // per each (x,z) coordinate we have y-row + const size_t workersOffset = xzIndexOfBlock * dim.y; // per each (x,z) coordinate we have y-row + + const int64_t maxXZoffset = dim.x * dim.z; - const int64_t maxXZoffset = x_num * z_num; + const size_t dirLen = dim.y; + const size_t minLen = min(dirLen, p.k0); extern __shared__ float sharedMem[]; float *bc1_vec2 = &sharedMem[0]; - float *bc2_vec2 = &bc1_vec2[k0]; - T *cache = (T*)&bc2_vec2[k0]; + float *bc2_vec2 = &bc1_vec2[p.k0]; + float *cache = (float*)&bc2_vec2[p.k0]; // Read from global mem to cache - for (int i = currentWorkerId; i < k0 * numOfWorkers; i += numOfWorkers) { - if (i < k0) { - bc1_vec2[i] = bc1_vec[i]; - bc2_vec2[i] = bc2_vec[i]; + for (int i = currentWorkerId; i < p.k0 * numOfWorkers; i += numOfWorkers) { + if (i < p.k0) { + bc1_vec2[i] = p.bc1[i]; + bc2_vec2[i] = p.bc2[i]; } - int offs = i % k0; - int work = i / k0; - if (work + xzIndexOfBlock < maxXZoffset) { - cache[work * k0 + offs] = image[workersOffset + y_num * work + offs]; + int offs = i % p.k0; + int work = i / p.k0; + if (work + xzIndexOfBlock < maxXZoffset && offs < dirLen) { + cache[work * p.k0 + offs] = image[workersOffset + dim.y * work + offs]; } } __syncthreads(); //forwards direction - if (xzIndexOfWorker < x_num * z_num) { + if (xzIndexOfWorker < dim.x * dim.z) { float temp1 = 0; float temp2 = 0; - for (size_t k = 0; k < k0; 
++k) { - temp1 += bc1_vec2[k] * cache[currentWorkerId * k0 + k]; - temp2 += bc2_vec2[k] * cache[currentWorkerId * k0 + k]; + for (size_t k = 0; k < minLen; ++k) { + temp1 += bc1_vec2[k] * (T)cache[currentWorkerId * p.k0 + k]; + temp2 += bc2_vec2[k] * (T)cache[currentWorkerId * p.k0 + k]; } boundary[xzIndexOfWorker*4 + 0] = temp1; boundary[xzIndexOfWorker*4 + 1] = temp2; @@ -103,57 +107,61 @@ __global__ void bsplineYdirBoundary(T *image, size_t x_num, size_t y_num, size_t // ----------------- second end __syncthreads(); - for (int i = currentWorkerId; i < k0 * numOfWorkers; i += numOfWorkers) { - if (i < k0) { - bc1_vec2[i] = bc3_vec[i]; - bc2_vec2[i] = bc4_vec[i]; + for (int i = currentWorkerId; i < p.k0 * numOfWorkers; i += numOfWorkers) { + if (i < p.k0) { + bc1_vec2[i] = p.bc3[i]; + bc2_vec2[i] = p.bc4[i]; } - int offs = i % k0; - int work = i / k0; - if (work + xzIndexOfBlock < maxXZoffset) { - cache[work * k0 + offs] = image[workersOffset + y_num * work + y_num - 1 - offs]; + int offs = i % p.k0; + int work = i / p.k0; + if (work + xzIndexOfBlock < maxXZoffset && offs < dirLen) { + cache[work * p.k0 + offs] = image[workersOffset + dim.y * work + dim.y - 1 - offs]; } } __syncthreads(); + size_t errorCnt = 0; + //forwards direction - if (xzIndexOfWorker < x_num * z_num) { + if (xzIndexOfWorker < dim.x * dim.z) { float temp3 = 0; float temp4 = 0; - for (size_t k = 0; k < k0; ++k) { - temp3 += bc1_vec2[k] * cache[currentWorkerId * k0 + k]; - temp4 += bc2_vec2[k] * cache[currentWorkerId * k0 + k]; + for (size_t k = 0; k < minLen; ++k) { + temp3 += bc1_vec2[k] * (T)cache[currentWorkerId * p.k0 + k]; + temp4 += bc2_vec2[k] * (T)cache[currentWorkerId * p.k0 + k]; } - boundary[xzIndexOfWorker*4 + 2] = temp3; - boundary[xzIndexOfWorker*4 + 3] = temp4; + boundary[xzIndexOfWorker*4 + 2] = round(temp3 * p.norm_factor, errorCnt); + boundary[xzIndexOfWorker*4 + 3] = round(temp4 * p.norm_factor, errorCnt); } + + if (errorCnt > 0) *error = true; } constexpr int blockWidth = 
32; constexpr int numOfThreads = 32; extern __shared__ char sharedMemProcess[]; template -__global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_num, const size_t z_num, size_t k0, - const float b1, const float b2, const float norm_factor, float *boundary) { +__global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, BsplineParamsCuda p, float *boundary, bool *error) { const int numOfWorkers = blockDim.x; const int currentWorkerId = threadIdx.x; const int xzOffset = blockIdx.x * blockDim.x; - const int64_t maxXZoffset = x_num * z_num; - const int64_t workersOffset = xzOffset * y_num; + const int64_t maxXZoffset = dim.x * dim.z; + const int64_t workersOffset = xzOffset * dim.y; - T (*cache)[blockWidth + 0] = (T (*)[blockWidth + 0]) &sharedMemProcess[0]; + float (*cache)[blockWidth + 0] = (float (*)[blockWidth + 0]) &sharedMemProcess[0]; float temp1, temp2; + size_t errorCnt = 0; // ---------------- forward direction ------------------------------------------- - for (int yBlockBegin = 0; yBlockBegin < y_num - 2; yBlockBegin += blockWidth) { + for (int yBlockBegin = 0; yBlockBegin < dim.y - 2; yBlockBegin += blockWidth) { // Read from global mem to cache for (int i = currentWorkerId; i < blockWidth * numOfWorkers; i += numOfWorkers) { int offs = i % blockWidth; int work = i / blockWidth; - if (offs + yBlockBegin < (y_num - 2) && work + xzOffset < maxXZoffset) { - cache[work][(offs + work)%blockWidth] = image[workersOffset + y_num * work + offs + yBlockBegin]; + if (offs + yBlockBegin < (dim.y - 2) && work + xzOffset < maxXZoffset) { + cache[work][(offs + work)%blockWidth] = image[workersOffset + dim.y * work + offs + yBlockBegin]; } } __syncthreads(); @@ -166,8 +174,8 @@ __global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_ cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = temp1; cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = temp2; } - for (size_t k = yBlockBegin == 0 ? 
2 : 0; k < blockWidth && k + yBlockBegin < y_num - 2; ++k) { - float temp = temp1*b2 + temp2*b1 + cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; + for (size_t k = yBlockBegin == 0 ? 2 : 0; k < blockWidth && k + yBlockBegin < dim.y - 2; ++k) { + float temp = temp2*p.b1 + temp1*p.b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp; temp1 = temp2; temp2 = temp; @@ -179,37 +187,37 @@ __global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_ for (int i = currentWorkerId; i < blockWidth * numOfWorkers; i += numOfWorkers) { int offs = i % blockWidth; int work = i / blockWidth; - if (offs + yBlockBegin < (y_num - 2) && work + xzOffset < maxXZoffset) { - image[workersOffset + y_num * work + offs + yBlockBegin] = cache[work][(offs + work)%blockWidth]; + if (offs + yBlockBegin < (dim.y - 2) && work + xzOffset < maxXZoffset) { + image[workersOffset + dim.y * work + offs + yBlockBegin] = round(cache[work][(offs + work)%blockWidth], errorCnt); } } __syncthreads(); } // ---------------- backward direction ------------------------------------------- - for (int yBlockBegin = y_num - 1; yBlockBegin >= 0; yBlockBegin -= blockWidth) { + for (int yBlockBegin = dim.y - 1; yBlockBegin >= 0; yBlockBegin -= blockWidth) { // Read from global mem to cache for (int i = currentWorkerId; i < blockWidth * numOfWorkers; i += numOfWorkers) { int offs = i % blockWidth; int work = i / blockWidth; if (yBlockBegin - offs >= 0 && work + xzOffset < maxXZoffset) { - cache[work][(offs + work)%blockWidth] = image[workersOffset + y_num * work - offs + yBlockBegin]; + cache[work][(offs + work)%blockWidth] = image[workersOffset + dim.y * work - offs + yBlockBegin]; } } __syncthreads(); // Do operations if (xzOffset + currentWorkerId < maxXZoffset) { - if (yBlockBegin == y_num - 1) { - temp1 = boundary[(xzOffset + currentWorkerId) * 4 + 3]; - temp2 = boundary[(xzOffset + currentWorkerId) * 4 + 2]; - 
cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = norm_factor * temp1; - cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = norm_factor * temp2; + if (yBlockBegin == dim.y - 1) { + temp1 = boundary[(xzOffset + currentWorkerId) * 4 + 3] / p.norm_factor; + temp2 = boundary[(xzOffset + currentWorkerId) * 4 + 2] / p.norm_factor; + cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = p.norm_factor * temp1; + cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = p.norm_factor * temp2; } - for (int64_t k = yBlockBegin == y_num - 1 ? 2 : 0; k < blockWidth && yBlockBegin - k >= 0; ++k) { - float temp = temp2*b1 + temp1*b2 + cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; - cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp * norm_factor; + for (int64_t k = yBlockBegin == dim.y - 1 ? 2 : 0; k < blockWidth && yBlockBegin - k >= 0; ++k) { + float temp = temp2*p.b1 + temp1*p.b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; + cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp * p.norm_factor; temp1 = temp2; temp2 = temp; } @@ -221,25 +229,35 @@ __global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_ int offs = i % blockWidth; int work = i / blockWidth; if (yBlockBegin - offs >= 0 && work + xzOffset < maxXZoffset) { - image[workersOffset + y_num * work - offs + yBlockBegin] = cache[work][(offs + work)%blockWidth]; + image[workersOffset + dim.y * work - offs + yBlockBegin] = round(cache[work][(offs + work)%blockWidth], errorCnt); } } __syncthreads(); } + + if (errorCnt > 0) *error = true; } /** * Function for launching a kernel */ template -void runBsplineYdir(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, - size_t k0, float b1, float b2, float norm_factor, float *boundary, cudaStream_t aStream) { +void runBsplineYdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, float *boundary, 
cudaStream_t aStream) { + dim3 threadsPerBlock(numOfThreads); - dim3 numBlocks((x_num * z_num + threadsPerBlock.x - 1) / threadsPerBlock.x); - size_t sharedMemSize = (2 /*bc vectors*/) * (k0) * sizeof(float) + numOfThreads * (k0) * sizeof(T); - bsplineYdirBoundary <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>> (cudaImage, x_num, y_num, z_num, bc1, bc2, bc3, bc4, k0, boundary); - sharedMemSize = numOfThreads * blockWidth * sizeof(T); - bsplineYdirProcess <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>> (cudaImage, x_num, y_num, z_num, k0, b1, b2, norm_factor, boundary); + dim3 numBlocks((dim.x * dim.z + threadsPerBlock.x - 1) / threadsPerBlock.x); + size_t sharedMemSize = (2 /*bc vectors*/) * (p.k0) * sizeof(float) + numOfThreads * (p.k0) * sizeof(float); + bool isErrorDetected = false; + { + ScopedCudaMemHandler error(&isErrorDetected, 1, aStream); + bsplineYdirBoundary <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get()); + sharedMemSize = numOfThreads * blockWidth * sizeof(float); + bsplineYdirProcess <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get()); + } + + if (isErrorDetected) { + throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineYdir - " + "try squashing the input image to a narrower range or use APRConverter"); + } } #endif diff --git a/src/algorithm/bsplineZdir.cuh b/src/algorithm/bsplineZdir.cuh index 33a5b420..43550ff8 100644 --- a/src/algorithm/bsplineZdir.cuh +++ b/src/algorithm/bsplineZdir.cuh @@ -5,6 +5,9 @@ #include #include #include +#include "cudaMisc.cuh" +#include "bsplineParams.h" + /** * Runs bspline recursive filter in Z direction. 
Each processed 2D patch consist of number of workes @@ -60,42 +63,44 @@ * @param norm_factor - filter norm factor */ template -__global__ void bsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_num, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, - float b1, float b2, float norm_factor) { +__global__ void bsplineZdir(T *image, PixelDataDim dim, BsplineParamsCuda p, bool *error) { const int yDirOffset = blockIdx.y * blockDim.y + threadIdx.y; - const size_t xDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * y_num; // x is in 'z' to have good memory coalescing - const size_t nextElementZdirOffset = x_num * y_num; - const size_t dirLen = z_num; + const size_t xDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * dim.y; // x is in 'z' to have good memory coalescing + const size_t nextElementZdirOffset = dim.x * dim.y; + const size_t dirLen = dim.z; + const size_t minLen = min(dirLen, p.k0); - if (yDirOffset < y_num) { + if (yDirOffset < dim.y) { float temp1 = 0; float temp2 = 0; float temp3 = 0; float temp4 = 0; + // calculate boundary values - for (int k = 0; k < k0; ++k) { + for (int k = 0; k < minLen; ++k) { T val = image[xDirOffset + k * nextElementZdirOffset + yDirOffset]; - temp1 += bc1[k] * val; - temp2 += bc2[k] * val; + temp1 += p.bc1[k] * val; + temp2 += p.bc2[k] * val; val = image[xDirOffset + (dirLen - 1 - k) * nextElementZdirOffset + yDirOffset]; - temp3 += bc3[k] * val; - temp4 += bc4[k] * val; + temp3 += p.bc3[k] * val; + temp4 += p.bc4[k] * val; } + size_t errorCnt = 0; + // set boundary values in two first and two last points processed direction - image[xDirOffset + 0 * nextElementZdirOffset + yDirOffset] = temp1; - image[xDirOffset + 1 * nextElementZdirOffset + yDirOffset] = temp2; - image[xDirOffset + (dirLen - 2) * nextElementZdirOffset + yDirOffset] = temp3 * norm_factor; - image[xDirOffset + (dirLen - 1) * nextElementZdirOffset + yDirOffset] = temp4 * norm_factor; + image[xDirOffset + 0 * 
nextElementZdirOffset + yDirOffset] = round(temp1, errorCnt); + image[xDirOffset + 1 * nextElementZdirOffset + yDirOffset] = round(temp2, errorCnt); + image[xDirOffset + (dirLen - 2) * nextElementZdirOffset + yDirOffset] = round(temp3 * p.norm_factor, errorCnt); + image[xDirOffset + (dirLen - 1) * nextElementZdirOffset + yDirOffset] = round(temp4 * p.norm_factor, errorCnt); // Causal Filter loop int64_t offset = xDirOffset + 2 * nextElementZdirOffset + yDirOffset; int64_t offsetLimit = xDirOffset + (dirLen - 2) * nextElementZdirOffset; while (offset < offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = temp1 * b2 + temp2 * b1 + image[offset]; + const float temp = round(image[offset] + p.b1 * temp2 + p.b2 * temp1, errorCnt); image[offset] = temp; temp1 = temp2; temp2 = temp; @@ -108,13 +113,15 @@ __global__ void bsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_num, offsetLimit = xDirOffset; while (offset >= offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = temp3 * b1 + temp4 * b2 + image[offset]; - image[offset] = temp * norm_factor; + const float temp = image[offset] + p.b1 * temp3 + p.b2 * temp4; + image[offset] = round(temp * p.norm_factor, errorCnt); temp4 = temp3; temp3 = temp; offset -= nextElementZdirOffset; } + + if (errorCnt > 0) *error = true; } } @@ -122,15 +129,24 @@ __global__ void bsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_num, * Function for launching a kernel */ template -void runBsplineZdir(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, - size_t k0, float b1, float b2, float norm_factor, cudaStream_t aStream) { +void runBsplineZdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaStream_t aStream) { constexpr int numOfWorkersYdir = 128; dim3 threadsPerBlockZ(1, numOfWorkersYdir, 1); dim3 numBlocksZ(1, - (y_num + 
threadsPerBlockZ.y - 1) / threadsPerBlockZ.y, - (x_num + threadsPerBlockZ.x - 1) / threadsPerBlockZ.x); - bsplineZdir <<>> (cudaImage, x_num, y_num, z_num, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor); + (dim.y + threadsPerBlockZ.y - 1) / threadsPerBlockZ.y, + (dim.x + threadsPerBlockZ.x - 1) / threadsPerBlockZ.x); + // In case of error this will be set to true by one of the kernels (CUDA does not guarantee which kernel will set the global variable if more than one kernel + // accesses it, but this is enough for us to know that somewhere in one or more kernels an overflow was detected). + bool isErrorDetected = false; + { + ScopedCudaMemHandler error(&isErrorDetected, 1, aStream); + bsplineZdir <<>> (cudaImage, dim, p, error.get()); + } + + if (isErrorDetected) { + throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineZdir - " + "try squashing the input image to a narrower range or use APRConverter"); + } } #endif diff --git a/src/algorithm/cudaMisc.cuh b/src/algorithm/cudaMisc.cuh new file mode 100644 index 00000000..7442c60b --- /dev/null +++ b/src/algorithm/cudaMisc.cuh @@ -0,0 +1,66 @@ +#ifndef CUDAMISC_CUH +#define CUDAMISC_CUH + + +#include + + +/** + * floating point output -> no rounding or under-/overflow check + */ +template +__device__ std::enable_if_t::value, T> round(float val, size_t &errCount) { + return val; +} + +/** + * integer output -> check for under-/overflow and round + * + * CUDA does not support std::numeric_limits, so this results in the manual checking below of the different + * data types' ranges. In theory we could use the --expt-relaxed-constexpr flag, but since it is experimental + * and not guaranteed to exist for long, for now it is better to stick to the definitions below.
+ */ +template +__device__ std::enable_if_t::value, uint8_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < 0 || val > 255) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, int8_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < -128 || val > 127) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, uint16_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < 0 || val > 65535) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, int16_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < -32768 || val > 32767) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, uint32_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < 0 || val > 4294967295) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, int32_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < -2147483648 || val > 2147483647) { errCount++; } + return val; +} + + +#endif diff --git a/src/algorithm/dsGradient.cuh b/src/algorithm/dsGradient.cuh index de4a2c77..8e2efc84 100644 --- a/src/algorithm/dsGradient.cuh +++ b/src/algorithm/dsGradient.cuh @@ -5,11 +5,14 @@ template __global__ void -gradient(const T *input, size_t x_num, size_t y_num, size_t z_num, T *grad, size_t x_num_ds, size_t y_num_ds, - float hx, float hy, float hz) { +gradient(const T *input, PixelDataDim inputDim, T *grad, PixelDataDim gradDim, float hx, float hy, float hz) { const int xi = ((blockIdx.x * blockDim.x) + threadIdx.x) * 2; const int yi = ((blockIdx.y * blockDim.y) + threadIdx.y) * 2; const int zi = ((blockIdx.z * blockDim.z) + threadIdx.z) * 2; + const auto x_num = inputDim.x; + const auto y_num = inputDim.y; + const auto z_num = inputDim.z; + if (xi >= x_num || yi >= y_num || zi >= z_num) return; const size_t xnumynum = x_num * y_num; @@ 
-33,28 +36,28 @@ gradient(const T *input, size_t x_num, size_t y_num, size_t z_num, T *grad, size for (int y = 1; y <= 2; ++y) { float xd = (temp[z][x - 1][y] - temp[z][x + 1][y]) / (2 * hx); xd = xd * xd; - float yd = (temp[z - 1][x][y] - temp[z + 1][x][y]) / (2 * hy); - yd = yd * yd; - float zd = (temp[z][x][y - 1] - temp[z][x][y + 1]) / (2 * hz); + float zd = (temp[z - 1][x][y] - temp[z + 1][x][y]) / (2 * hz); zd = zd * zd; - float gm = __fsqrt_rn(xd + yd + zd); + float yd = (temp[z][x][y - 1] - temp[z][x][y + 1]) / (2 * hy); + yd = yd * yd; + float gm = sqrtf(xd + zd + yd); if (gm > maxGrad) maxGrad = gm; } - const size_t idx = zi / 2 * x_num_ds * y_num_ds + xi / 2 * y_num_ds + yi / 2; + const size_t idx = zi / 2 * gradDim.x * gradDim.y + xi / 2 * gradDim.y + yi / 2; grad[idx] = maxGrad; } template void runKernelGradient(const T *cudaInput, T *cudaGrad, - size_t xLenInput, size_t yLenInput, size_t zLenInput, - size_t xLenGradient, size_t yLenGradient, + PixelDataDim inputDim, + PixelDataDim gradDim, float hx, float hy, float hz, cudaStream_t aStream) { dim3 threadsPerBlock(1, 64, 1); - dim3 numBlocks((xLenInput + threadsPerBlock.x - 1) / threadsPerBlock.x, - (yLenInput + threadsPerBlock.y - 1) / threadsPerBlock.y, - (zLenInput + threadsPerBlock.z - 1) / threadsPerBlock.z); - gradient <<>> (cudaInput, xLenInput, yLenInput, zLenInput, cudaGrad, xLenGradient, yLenGradient, hx, hy, hz); + dim3 numBlocks((inputDim.x + threadsPerBlock.x - 1) / threadsPerBlock.x, + (inputDim.y + threadsPerBlock.y - 1) / threadsPerBlock.y, + (inputDim.z + threadsPerBlock.z - 1) / threadsPerBlock.z); + gradient <<>> (cudaInput, inputDim, cudaGrad, gradDim, hx, hy, hz); } diff --git a/src/algorithm/invBspline.cuh b/src/algorithm/invBspline.cuh index d422abf1..7c27d853 100644 --- a/src/algorithm/invBspline.cuh +++ b/src/algorithm/invBspline.cuh @@ -9,14 +9,18 @@ __global__ void invBsplineYdir(T *image, size_t x_num, size_t y_num, size_t z_nu int workerOffset = workerIdx; int loopNum = 0; 
- T p = 0; - T v = 0; + const float a1 = 1.0/6.0; + const float a2 = 4.0/6.0; + const float a3 = 1.0/6.0; + + float p = 0; + float v = 0; bool notLastInRow = true; while (workerOffset < y_num) { if (notLastInRow) v = image[workersOffset + workerOffset]; - T temp = __shfl_sync(active, v, workerIdx + blockDim.y - 1, blockDim.y); + float temp = __shfl_sync(active, v, workerIdx + blockDim.y - 1, blockDim.y); p = notLastInRow ? temp : p; - T n = __shfl_sync(active, v, workerIdx + 1, blockDim.y); + float n = __shfl_sync(active, v, workerIdx + 1, blockDim.y); // handle boundary (reflective mode) if (workerOffset == 0) p = n; @@ -24,7 +28,7 @@ __global__ void invBsplineYdir(T *image, size_t x_num, size_t y_num, size_t z_nu notLastInRow = (workerIdx + 1 + loopNum) % blockDim.y != 0; if (notLastInRow) { - v = (p + v * 4 + n) / 6.0; + v = a1 * p + a2 * v + a3 * n; image[workersOffset + workerOffset] = v; workerOffset += blockDim.y; } @@ -49,21 +53,25 @@ __global__ void invBsplineXdir(T *image, size_t x_num, size_t y_num, size_t z_nu const int workerIdx = blockIdx.y * blockDim.y + threadIdx.y ; const int nextElementOffset = y_num; + const float a1 = 1.0/6.0; + const float a2 = 4.0/6.0; + const float a3 = 1.0/6.0; + if (workerIdx < y_num) { int currElementOffset = 0; T v1 = image[workerOffset + currElementOffset]; T v2 = image[workerOffset + currElementOffset + nextElementOffset]; - image[workerOffset + currElementOffset] = (2 * v2 + 4 * v1) / 6.0; + image[workerOffset + currElementOffset] = a1 * v2 + a2 * v1 + a3 * v2; for (int x = 2; x < x_num; ++x) { T v3 = image[workerOffset + currElementOffset + 2 * nextElementOffset]; - image[workerOffset + currElementOffset + nextElementOffset] = (v1 + 4 * v2 + v3) / 6.0; + image[workerOffset + currElementOffset + nextElementOffset] = (a1 * v1 + a2 * v2 + a3 * v3); v1 = v2; v2 = v3; currElementOffset += nextElementOffset; } - image[workerOffset + currElementOffset + nextElementOffset] = (2 * v1 + 4 * v2) / 6.0; + image[workerOffset + 
currElementOffset + nextElementOffset] = (a1 + a3) * v1 + a2 * v2; } } @@ -83,21 +91,25 @@ __global__ void invBsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_nu const int workerIdx = blockIdx.y * blockDim.y + threadIdx.y ; const int nextElementOffset = x_num * y_num; + const float a1 = 1.0/6.0; + const float a2 = 4.0/6.0; + const float a3 = 1.0/6.0; + if (workerIdx < y_num) { int currElementOffset = 0; T v1 = image[workerOffset + currElementOffset]; T v2 = image[workerOffset + currElementOffset + nextElementOffset]; - image[workerOffset + currElementOffset] = (2 * v2 + 4 * v1) / 6.0; + image[workerOffset + currElementOffset] = a1 * v2 + a2 * v1 + a1 * v2; for (int x = 2; x < z_num; ++x) { T v3 = image[workerOffset + currElementOffset + 2 * nextElementOffset]; - image[workerOffset + currElementOffset + nextElementOffset] = (v1 + 4 * v2 + v3) / 6.0; + image[workerOffset + currElementOffset + nextElementOffset] = a1 * v1 + a2 * v2 + a3 * v3; v1 = v2; v2 = v3; currElementOffset += nextElementOffset; } - image[workerOffset + currElementOffset + nextElementOffset] = (2 * v1 + 4 * v2) / 6.0; + image[workerOffset + currElementOffset + nextElementOffset] = (a1 + a3) * v1 + a2 * v2; } } diff --git a/src/data_structures/APR/GenInfo.hpp b/src/data_structures/APR/GenInfo.hpp index f8fd090e..8d5da2bd 100644 --- a/src/data_structures/APR/GenInfo.hpp +++ b/src/data_structures/APR/GenInfo.hpp @@ -5,6 +5,11 @@ #ifndef LIBAPR_GENINFO_HPP #define LIBAPR_GENINFO_HPP + +#include +#include +#include + //Note this function sets up the domain for the APR for a given input size. class GenInfo { @@ -29,6 +34,16 @@ class GenInfo { std::vector level_size; // precomputation of the size of each level, used by the iterators. 
+ GenInfo() {} + GenInfo(const PixelDataDim &dim) { init(dim); } + + size_t getSize() const { return (size_t)y_num[l_max] * x_num[l_max] * z_num[l_max]; } + + //initialize the information given the original dimensions + void init(const PixelDataDim &dim) { + init(dim.y, dim.x, dim.z); + } + //initialize the information given the original dimensions void init(uint64_t y_org,uint64_t x_org,uint64_t z_org){ @@ -64,6 +79,11 @@ class GenInfo { } } + //initialize the information given the original dimensions + void init_tree(const PixelDataDim &dim){ + init_tree(dim.y, dim.x, dim.z); + } + //initialize the information given the original dimensions void init_tree(uint64_t y_org,uint64_t x_org,uint64_t z_org){ @@ -97,6 +117,26 @@ class GenInfo { z_num[l] = ceil(z_org / cellSize); } } + + friend std::ostream & operator<<(std::ostream &os, const GenInfo &gi) { + os << "GenInfo {\n"; + os << " Original dimensions(y/x/z): [" << gi.org_dims[0] << ", " << gi.org_dims[1] << ", " << gi.org_dims[2] << "]\n"; + os << " Original size: " << gi.getSize() << "\n"; + os << " Number of dimensions: " << static_cast(gi.number_dimensions) << "\n"; + os << " l_min, l_max: {" << gi.l_min << " - " << gi.l_max << "}\n"; + os << " total number of particles: " << gi.total_number_particles << "\n"; + os << " y_num, x_num, z_num:\n"; + for (int l = gi.l_min; l <= gi.l_max; ++l) { + os << " level [" << l << "] = " << gi.y_num[l] << ", " << gi.x_num[l] << ", " << gi.z_num[l] << "\n"; + } + os << " level_size:\n"; + for (int l = gi.l_min; l <= gi.l_max; ++l) { + os << " level " << l << ": " << gi.level_size[l] << "\n"; + } + os << "}"; + + return os; + } }; diff --git a/src/data_structures/APR/access/LinearAccess.hpp b/src/data_structures/APR/access/LinearAccess.hpp index 5f92c0ef..b92476c2 100644 --- a/src/data_structures/APR/access/LinearAccess.hpp +++ b/src/data_structures/APR/access/LinearAccess.hpp @@ -11,6 +11,7 @@ #include "data_structures/Mesh/PixelData.hpp" #include 
"algorithm/APRParameters.hpp" +#include "algorithm/PullingScheme.hpp" #include "APRAccessStructures.hpp" @@ -225,44 +226,43 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet initialize_xz_linear(); + // ********************************************************************************************************************* + // FULL RESOLUTION + // ********************************************************************************************************************* //edge case if(level_max()<=2){ // For performance reasons and clarity of the code, it doesn't make sense here to handle these cases. Below assumes there is atleast levels <=2; //just initialize full resolution const auto level_start = level_xz_vec[level_max()]; - uint64_t counter = 0; + uint64_t particleCounter = 0; for (int z = 0; z < z_num(level_max()); ++z) { for (int x = 0; x < x_num(level_max()); ++x) { const size_t offset_pc_data = z * x_num(level_max()) + x; - for (int y = 0; y < y_num(level_max()); ++y) { - - counter++; - } - xz_end_vec[level_start + offset_pc_data] = counter; + particleCounter += y_num(level_max()); + xz_end_vec[level_start + offset_pc_data] = particleCounter; } } - y_vec.resize(counter); - counter = 0; + genInfo->total_number_particles = xz_end_vec.back(); + y_vec.resize(genInfo->total_number_particles); + size_t idx = 0; for (int z = 0; z < z_num(level_max()); ++z) { for (int x = 0; x < x_num(level_max()); ++x) { - for (int y = 0; y < y_num(level_max()); ++y) { - y_vec[counter] = y; - counter++; + y_vec[idx++] = y; } } } - return; } - // ======================================================================== + // ********************************************************************************************************************* + // FIRST STEP + // ********************************************************************************************************************* apr_timer.start_timer("first_step"); - const uint8_t UPSAMPLING_SEED_TYPE = 4; 
const uint8_t seed_us = UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization for (int level = level_min()+1; level < level_max(); ++level) { const size_t xLen = genInfo->x_num[level]; @@ -293,7 +293,9 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet } apr_timer.stop_timer(); - // ======================================================================== + // ********************************************************************************************************************* + // SECOND STEP + // ********************************************************************************************************************* apr_timer.start_timer("second_step"); @@ -328,14 +330,15 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet } } + +// ********************************************************************************************************************* +// SECOND STEP LAST LEVEL +// +// l_max - 1 is special as it also has the l_max information that then needs to be upsampled. +// ********************************************************************************************************************* std::vector temp_max_xz; temp_max_xz.resize(genInfo->z_num[genInfo->l_max - 1]*genInfo->x_num[genInfo->l_max - 1],0); - /* - * l_max - 1 is special as it also has the l_max information that then needs to be upsampled. 
- * - */ - size_t l_minus_1 = genInfo->l_max - 1; const size_t xLen = genInfo->x_num[l_minus_1]; const size_t zLen = genInfo->z_num[l_minus_1]; @@ -409,6 +412,11 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet apr_timer.stop_timer(); + + // ********************************************************************************************************************* + // THIRD STEP - Get Y values + // ********************************************************************************************************************* + apr_timer.start_timer("init y"); genInfo->total_number_particles = xz_end_vec.back(); @@ -452,10 +460,11 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet } } - /* - * l_max - 1 is special as it also has the l_max information that then needs to be upsampled. - * - */ + // ********************************************************************************************************************* + // 4th STEP LAST LEVEL + // + // l_max - 1 is special as it also has the l_max information that then needs to be upsampled. 
+ // ********************************************************************************************************************* #ifdef HAVE_OPENMP @@ -545,7 +554,6 @@ inline void LinearAccess::initialize_linear_structure_sparse(APRParameters& apr_ // ======================================================================== apr_timer.start_timer("first_step"); - const uint8_t UPSAMPLING_SEED_TYPE = 4; const uint8_t seed_us = UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization for (int level = level_min()+1; level < level_max(); ++level) { const size_t xLen = genInfo->x_num[level]; diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu new file mode 100644 index 00000000..1a876d0e --- /dev/null +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -0,0 +1,638 @@ +#include "LinearAccessCuda.hpp" + +#include "misc/CudaTools.cuh" +#include "algorithm/ParticleCellTreeCuda.cuh" + +// CUDA version of GenInfo structure +typedef struct GenInfoCuda_t { + int l_min; + int l_max; + + int *org_dims; // fixed size: [3] + + uint8_t number_dimensions; + + int *x_num; + int *y_num; + int *z_num; + + // this differs from original GenInfo structure + // since we need to be able to send data back from GPU to CPU + uint64_t *total_number_particles; + + int *level_size; + + __device__ uint64_t get_total_number_particles() const { return *total_number_particles; } // device pointer: may only be dereferenced in device code + + __device__ int level_max() const { return l_max; } + __device__ int level_min() const { return l_min; } + +} GenInfoCuda; + +// ----------------------------- + +/* + * Class for easy transferring to/from GPU of GenInfo structure.
+ */ +class GenInfoGpuAccess { + GenInfo &gi; + + cudaStream_t iStream; + + ScopedCudaMemHandler org_dims; + ScopedCudaMemHandler x_num; + ScopedCudaMemHandler y_num; + ScopedCudaMemHandler z_num; + ScopedCudaMemHandler total_number_particles; + ScopedCudaMemHandler level_size; + + +public: + GenInfoGpuAccess(GenInfo &genInfo, cudaStream_t cudaStream) : + gi(genInfo), + iStream(cudaStream), + org_dims(gi.org_dims, 3, iStream), + x_num(gi.x_num.data(), gi.x_num.size(), iStream), + y_num(gi.y_num.data(), gi.y_num.size(), iStream), + z_num(gi.z_num.data(), gi.z_num.size(), iStream), + total_number_particles(&gi.total_number_particles, 1, iStream), + level_size(gi.level_size.data(), gi.level_size.size(), iStream) + { + } + + GenInfoCuda getGenInfoCuda() { + GenInfoCuda gic; + + gic.l_min = gi.l_min; + gic.l_max = gi.l_max; + gic.org_dims = org_dims.get(); + gic.number_dimensions = gi.number_dimensions; + gic.x_num = x_num.get(); + gic.y_num = y_num.get(); + gic.z_num = z_num.get(); + gic.total_number_particles = total_number_particles.get(); + gic.level_size = level_size.get(); + + return gic; + } + + ~GenInfoGpuAccess() { + copyDtoH(); + } + + void copyHtoD() { + // The only data that can change between CPU & GPU (the rest values are fixed based on input image dimension) + total_number_particles.copyH2D(); + } + + void copyDtoH() { + // The only data that can change between CPU & GPU (the rest values are fixed based on input image dimension) + total_number_particles.copyD2H(); + } +}; + +// ********************************************************************************************************************* +// FULL RESOLUTION +// ********************************************************************************************************************* +/** + * Handle edge case for #levels <= 2 + * For performance reasons and clarity of the code, + * it doesn't make sense here to handle these cases. 
+ * Below assumes there is at least levels <=2; + * @param level_xz + * @param xz_end + * @param y + * @param gic - cuda version of GenInfo + */ +__global__ void fullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, GenInfoCuda gic) { + + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const unsigned levelMax = gic.level_max(); + const uint64_t xMax = gic.x_num[levelMax]; + const uint64_t yMax = gic.y_num[levelMax]; + const uint64_t zMax = gic.z_num[levelMax]; + + + if (x < xMax && z < zMax) { + const uint64_t levelStart = level_xz[levelMax]; + uint64_t offset_pc_data = z * xMax + x; + uint64_t particleCounter = (1 + x + z * xMax) * yMax; + + xz_end[levelStart + offset_pc_data] = particleCounter; + + for (int i = 0; i < yMax; ++i) { + uint64_t idx = (xMax * z + x) * yMax + i; + y[idx] = i; + } + } + + if (x == 0 && z == 0) { + *gic.total_number_particles = xMax * yMax * zMax; + } +} + +void runFullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, const GenInfo &gi, GenInfoGpuAccess &giga, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + + dim3 numBlocks( (gi.x_num[gi.l_max] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[gi.l_max] + threadsPerBlock.z - 1)/threadsPerBlock.z); + fullResolution<<>>(level_xz, xz_end, y, giga.getGenInfoCuda()); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runFullResolution failed"); + } +} + + +// ********************************************************************************************************************* +// FIRST STEP +// ********************************************************************************************************************* + +static constexpr uint8_t seed_us = UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization + + +__global__ 
void firstStep(const uint8_t *prevLevel, uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic) { + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const uint64_t xLen = gic.x_num[level]; + const uint64_t yLen = gic.y_num[level]; + const uint64_t zLen = gic.z_num[level]; + const uint64_t xLenDS = gic.x_num[level - 1]; + const uint64_t yLenDS = gic.y_num[level - 1]; + + if (x < xLen && z < zLen) { + const size_t offset_part_map_ds = (x / 2) * yLenDS + (z / 2) * yLenDS * xLenDS; + const size_t offset_part_map = x * yLen + z * yLen * xLen; + + for (size_t y = 0; y < yLenDS; ++y) { + uint8_t status = prevLevel[offset_part_map_ds + y]; + if (status > 0 && status <= min_type) { + currLevel[offset_part_map + 2 * y] = seed_us; // 2 * y + currLevel[offset_part_map + min(2 * y + 1, yLen - 1)] = seed_us; // 2 * y + 1 + } + } + } +} + +void runFirstStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + + for (int level = gi.l_min + 1; level < gi.l_max; ++level) { + dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); + auto *p_mapPrev = p_map[level - 1]; + auto *p_mapCurr = p_map[level]; + firstStep<<>>(p_mapPrev, p_mapCurr, level, min_type, giga.getGenInfoCuda()); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runFirstStep failed"); + } +} + + +// ********************************************************************************************************************* +// SECOND STEP +// ********************************************************************************************************************* + + +__global__ void secondStep(const uint8_t *currLevel, int 
level, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end) { + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const uint64_t xLen = gic.x_num[level]; + const uint64_t yLen = gic.y_num[level]; + const uint64_t zLen = gic.z_num[level]; + + const uint64_t level_start = level_xz[level]; + + if (x < xLen && z < zLen) { + const size_t offset_pc_data = z * xLen + x; + const size_t offset_part_map = yLen * offset_pc_data; + + uint64_t counter = 0; + + for (size_t y = 0; y < yLen; ++y) { + uint8_t status = currLevel[offset_part_map + y]; + if (status > min_type && status <= UPSAMPLING_SEED_TYPE) { + counter++; + } + } + + xz_end[level_start + offset_pc_data] = counter; + } +} + +void runSecondStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + + for (int level = gi.l_min; level < gi.l_max - 1; ++level) { + dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); + auto *p_mapCurr = p_map[level]; + secondStep<<>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runSecondStep failed"); + } +} + + +// ********************************************************************************************************************* +// SECOND STEP LAST LEVEL +// +// l_max - 1 is special as it also has the l_max information that then needs to be upsampled. 
+// ********************************************************************************************************************* + + +__global__ void secondStepLastLevel(const uint8_t *currLevel, int level_minus_1, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end) { + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const uint64_t xLen = gic.x_num[level_minus_1]; + const uint64_t yLen = gic.y_num[level_minus_1]; + const uint64_t zLen = gic.z_num[level_minus_1]; + + const uint64_t xLen_m = gic.x_num[level_minus_1 + 1]; // level max + const uint64_t yLen_m = gic.y_num[level_minus_1 + 1]; // level max + const uint64_t zLen_m = gic.z_num[level_minus_1 + 1]; // level max + + const uint64_t level_start = level_xz[level_minus_1]; + const uint64_t level_start_m = level_xz[level_minus_1 + 1]; // level max + + + if (x < xLen && z < zLen) { + const size_t offset_pc_data = z * xLen + x; + const size_t offset_part_map = yLen * offset_pc_data; + + uint64_t counter = 0; + uint64_t counter_l = 0; + + for (size_t y = 0; y < yLen; ++y) { + uint8_t status = currLevel[offset_part_map + y]; + if (status > min_type && status <= UPSAMPLING_SEED_TYPE) { + counter++; + } + else if (status > 0 && status <= min_type) { + counter_l++; + + if ((2 * y) < (yLen_m - 1)) { + counter_l++; + } + } + } + + xz_end[level_start + offset_pc_data] = counter; + + // In original CPU code value of counter_l is remembered in temporary buffer and later + // write down to xz_end vector. Here is the solution without need of temp. buffer. 
+ for (size_t dz = 0; dz <= 1; dz++) { + for (size_t dx = 0; dx <= 1; dx++) { + size_t uz = 2 * z + dz; // upsampled z + size_t ux = 2 * x + dx; // upsampled x + if (uz < zLen_m && ux < xLen_m) { + const size_t offset_pc_data_m = uz * xLen_m + ux; + xz_end[level_start_m + offset_pc_data_m] = counter_l; + } + } + } + + } +} + +__global__ void secondStepCountParticles(GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint64_t counter_total) { + // std::partial_sum on one CUDA core naive implementation + size_t sum = xz_end[0]; + for (size_t i = 1; i < counter_total; i++) { + sum += xz_end[i]; + xz_end[i] = sum; + } + + *gic.total_number_particles = xz_end[counter_total -1]; +} + +void runSecondStepLastLevel(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint64_t counter_total, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + dim3 numBlocks( (gi.x_num[gi.l_max - 1] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[gi.l_max - 1] + threadsPerBlock.z - 1)/threadsPerBlock.z); + + int level = gi.l_max - 1; + auto *p_mapCurr = p_map[level]; + secondStepLastLevel<<>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runSecondStepLastLevel #1 failed"); + } + + secondStepCountParticles<<<1, 1, 0, aStream>>>(giga.getGenInfoCuda(), level_xz, xz_end, counter_total); + + err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runSecondStepLastLevel #2 failed"); + } +} + + +// ********************************************************************************************************************* +// THIRD STEP - Get Y values +// 
********************************************************************************************************************* + + +__global__ void getYvalues(const uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec) { + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const uint64_t xLen = gic.x_num[level]; + const uint64_t yLen = gic.y_num[level]; + const uint64_t zLen = gic.z_num[level]; + + const uint64_t level_start = level_xz[level]; + + if (x < xLen && z < zLen) { + const size_t offset_pc_data = z * xLen + x; + const size_t offset_part_map = yLen * offset_pc_data; + + uint64_t counter = 0; + + uint64_t offset_y = xz_end[level_start + offset_pc_data - 1]; + + for (size_t y = 0; y < yLen; ++y) { + uint8_t status = currLevel[offset_part_map + y]; + if (status > min_type && status <= UPSAMPLING_SEED_TYPE) { + y_vec[counter + offset_y] = y; + counter++; + } + } + } +} + +void runGetYvalues(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + + for (int level = gi.l_min; level < gi.l_max - 1; ++level) { + dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); + auto *p_mapCurr = p_map[level]; + getYvalues<<>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runGetYvalues failed"); + } +} + + +// ********************************************************************************************************************* +// 4th STEP LAST LEVEL +// +// l_max - 1 is special as 
it also has the l_max information that then needs to be upsampled. +// ********************************************************************************************************************* + + +__global__ void fourthStep(const uint8_t *currLevel, int level_minus_1, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec) { + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const uint64_t xLen = gic.x_num[level_minus_1]; + const uint64_t yLen = gic.y_num[level_minus_1]; + const uint64_t zLen = gic.z_num[level_minus_1]; + + const uint64_t xLen_m = gic.x_num[level_minus_1 + 1]; // level max + const uint64_t yLen_m = gic.y_num[level_minus_1 + 1]; // level max + + const uint64_t level_start_minus_1 = level_xz[level_minus_1]; + const uint64_t level_start_m = level_xz[level_minus_1 + 1]; // level max + + + if (x < xLen && z < zLen) { + const size_t offset_pc_data = z * xLen + x; + + const size_t offset_pc_data_m = (z*2) * xLen_m + x * 2; + const size_t offset_part_map = yLen * offset_pc_data; // current level + + uint64_t counter = 0; + uint64_t counter_l = 0; + + uint64_t offset_y = xz_end[level_start_minus_1 + offset_pc_data - 1]; + uint64_t offset_y_m = xz_end[level_start_m + offset_pc_data_m -1]; + + for (size_t y = 0; y < yLen; ++y) { + uint8_t status = currLevel[offset_part_map + y]; + if (status > min_type && status <= UPSAMPLING_SEED_TYPE) { + y_vec[counter + offset_y] = y; + counter++; + } + else if (status > 0 && status <= min_type) { + y_vec[counter_l + offset_y_m] = 2*y; + counter_l++; + + if ((2 * y) < (yLen_m - 1)) { + y_vec[counter_l + offset_y_m] = 2*y + 1; + counter_l++; + } + } + } + } +} + +__global__ void fourthStepLastLevel(GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec) { + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + + int 
maxLevel = gic.level_max(); + const uint64_t xLen_m = gic.x_num[maxLevel]; // level max + const uint64_t zLen_m = gic.z_num[maxLevel]; // level max + + const uint64_t level_start_m = level_xz[maxLevel]; + + + if (x < xLen_m && z < zLen_m) { + + // first check if it's not already there + if ( ((z % 2) != 0) || ((x % 2) != 0) ) { + const size_t offset_pc_data_m = z * xLen_m + x; + const size_t offset_pc_data_m_f = (z/2) * 2 * xLen_m + (x/2) * 2; + + uint64_t offset_y_b_f = xz_end[level_start_m + offset_pc_data_m_f - 1]; + uint64_t offset_y_e_f = xz_end[level_start_m + offset_pc_data_m_f]; + uint64_t offset_y_b = xz_end[level_start_m + offset_pc_data_m - 1]; + + for (uint64_t idx = offset_y_b_f; idx < offset_y_e_f; ++idx) { + y_vec[offset_y_b++] = y_vec[idx]; + } + } + + } +} + +void runFourthStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, uint64_t counter_total, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + dim3 numBlocks( (gi.x_num[gi.l_max] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[gi.l_max] + threadsPerBlock.z - 1)/threadsPerBlock.z); + + int level = gi.l_max - 1; + auto *p_mapCurr = p_map[level]; + fourthStep<<>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runFourthStep #1 failed"); + } + + fourthStepLastLevel<<>>(giga.getGenInfoCuda(), level_xz, xz_end, y_vec); + + cudaError_t err2 = cudaGetLastError(); + if (err2 != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err2)); // report the status of the second launch, not the first + throw std::runtime_error("runFourthStep #2 failed"); + } +} + + +// ********************************************************************************************************************* +// MAIN FUNC
TO CALL - implements logic of LinearAccess::initialize_linear_structure CPU func. +// ********************************************************************************************************************* + + +/* + * This function does everything: + * - creates CPU structures + * - copies everything to GPU + * - run computation of all linear-structures + * - copy it back to CPU + * - returns all the structure + * + * In current shape it is a good function for testing implementation rather than using it in production code. + * Production code should use parts of it and work on pre-allocated memory - probably in GpuProcessingTask. + */ +LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct) { + + cudaStream_t aStream = nullptr; + + // Copy input to CUDA mem and prepare CUDA representation of particle cell tree which will be filled after computing + // all steps + ParticleCellTreeCuda p_map (gi, aStream); + p_map.uploadPCT2GPU(pct); + + uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2; + + VectorData y_vec(true); + VectorData xz_end_vec(true); + VectorData level_xz_vec(true); + + // initialize_xz_linear() - CPU impl. + uint64_t counter_total = 1; //the buffer val to allow -1 calls without checking. + level_xz_vec.resize(gi.l_max + 2, 0); //includes a buffer for -1 calls, and therefore needs to be called with level + 1; + level_xz_vec[0] = 1; //allowing for the offset. + for (int i = 0; i <= gi.l_max; ++i) { + counter_total += gi.x_num[i] * gi.z_num[i]; + level_xz_vec[i + 1] = counter_total; + } + xz_end_vec.resize(counter_total, 0); + +// auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; }; +// prt(y_vec); +// prt(xz_end_vec); +// prt(level_xz_vec); + + // TODO: This is temporary solution. 
+ // Since in CPU code size of y_vec is calculated 'on the fly' and in CUDA code it would be much better + // to have pre-allocated memory for that - currently y_vec is pre-allocated to have maximum size. This is not + // optimal but always working solution. If any better idea pop up - it will be changed. + size_t maxYvecSize = gi.getSize(); // size_t product of the l_max dimensions; multiplying the int dimensions directly can overflow on large volumes + y_vec.resize(maxYvecSize); + + + { + ScopedCudaMemHandler y_vec_cuda(y_vec.data(), y_vec.size()); + ScopedCudaMemHandler xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size()); + ScopedCudaMemHandler level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size()); + GenInfoGpuAccess giga(gi, aStream); + if (gi.l_max <= 2) { + runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), gi, giga, aStream); + } + else { + runFirstStep(gi, giga, p_map, min_type, aStream); + runSecondStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), aStream); + runSecondStepLastLevel(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), counter_total, aStream); + runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), aStream); + runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), counter_total, aStream); + } + } + + // TODO: Resized back to correct size, should it be initialized to this size in the first place or pre-allocation for + // full size is more than enough? (for example in case of computing particles for multiple frames with same resolution + // we can get different size of particles for each frame - with preallocated buffer we can do all of them on it).
+ y_vec.resize(gi.total_number_particles); + + // Transfer changes to PCT from GPU to CPU (this is needed only for tests) + p_map.downloadPCTfromGPU(pct); + + + LinearAccessCudaStructs lac; + lac.y_vec.swap(y_vec); + lac.xz_end_vec.swap(xz_end_vec); + lac.level_xz_vec.swap(level_xz_vec); + + return lac; +} + +void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, const APRParameters &apr_parameters, LinearAccessCudaStructs &lacs, cudaStream_t aStream) { + + uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2; + + VectorData xz_end_vec(true); + VectorData level_xz_vec(true); + + // initialize_xz_linear() - CPU impl. + uint64_t counter_total = 1; //the buffer val to allow -1 calls without checking. + level_xz_vec.resize(gi.l_max + 2, 0); //includes a buffer for -1 calls, and therefore needs to be called with level + 1; + level_xz_vec[0] = 1; //allowing for the offset. + for (int i = 0; i <= gi.l_max; ++i) { + counter_total += gi.x_num[i] * gi.z_num[i]; + level_xz_vec[i + 1] = counter_total; + } + xz_end_vec.resize(counter_total, 0); + + + { + ScopedCudaMemHandler xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size()); + ScopedCudaMemHandler level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size()); + GenInfoGpuAccess giga(gi, aStream); + if (gi.l_max <= 2) { + runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, gi, giga, aStream); + } + else { + runFirstStep(gi, giga, p_map, min_type, aStream); + runSecondStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), aStream); + runSecondStepLastLevel(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), counter_total, aStream); + runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, aStream); + runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, counter_total, aStream); + } + } + + VectorData 
y_vec(true); + y_vec.resize(gi.total_number_particles); + checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda, gi.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, aStream)); + checkCuda(cudaStreamSynchronize(aStream)); + + lacs.y_vec.swap(y_vec); + lacs.xz_end_vec.swap(xz_end_vec); + lacs.level_xz_vec.swap(level_xz_vec); +} diff --git a/src/data_structures/APR/access/LinearAccessCuda.hpp b/src/data_structures/APR/access/LinearAccessCuda.hpp new file mode 100644 index 00000000..27d56ab6 --- /dev/null +++ b/src/data_structures/APR/access/LinearAccessCuda.hpp @@ -0,0 +1,20 @@ +#ifndef APR_LINEARACCESSCUDA_HPP +#define APR_LINEARACCESSCUDA_HPP + +#include "algorithm/APRParameters.hpp" +#include "data_structures/Mesh/PixelData.hpp" +#include "data_structures/APR/GenInfo.hpp" +#include "algorithm/ParticleCellTreeCuda.cuh" + +typedef struct { + VectorData y_vec; + VectorData xz_end_vec; + VectorData level_xz_vec; +} LinearAccessCudaStructs; + +LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct); + +void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, const APRParameters &apr_parameters, LinearAccessCudaStructs &lacs, cudaStream_t aStream); + + +#endif //APR_LINEARACCESSCUDA_HPP diff --git a/src/data_structures/APR/access/RandomAccess.hpp b/src/data_structures/APR/access/RandomAccess.hpp index 0daf7a54..18366d99 100644 --- a/src/data_structures/APR/access/RandomAccess.hpp +++ b/src/data_structures/APR/access/RandomAccess.hpp @@ -1210,7 +1210,7 @@ inline void RandomAccess::initialize_tree_access(RandomAccess& APROwn_access, st } -void RandomAccess::init_data_structure_tree(RandomAccess& APROwn_access, SparseGaps>& y_begin){ +inline void RandomAccess::init_data_structure_tree(RandomAccess& APROwn_access, SparseGaps>& y_begin){ uint64_t cumsum = 0; APRTimer apr_timer(false); @@ -1423,7 +1423,7 @@ inline void 
RandomAccess::initialize_tree_access_sparse(RandomAccess& APROwn_acc } -void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(APRParameters& apr_parameters, SparseGaps &p_map) { +inline void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(APRParameters& apr_parameters, SparseGaps &p_map) { // // Initialize the new structure; // @@ -1513,7 +1513,7 @@ void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(APRParame gap.global_index_begin_offset = 0; uint64_t counter = 0; - uint16_t prev_y = -2; //init + uint16_t prev_y = 65534; // Originally = -2 which is 65534 when assigned to uint16 - removing compiler error //init auto& mesh = p_map.data[i][offset_pc_data][0].mesh; @@ -1577,7 +1577,7 @@ void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(APRParame auto& mesh = p_map.data[i][offset_pc_data1][0].mesh; - uint16_t prev_y = -2; //init + uint16_t prev_y = 65534; // Originally = -2 which is 65534 when assigned to uint16 - removing compiler error //init //SPARSE iteration for (auto it=mesh.begin(); it!=mesh.end(); ++it) { diff --git a/src/data_structures/Mesh/ImagePatch.hpp b/src/data_structures/Mesh/ImagePatch.hpp index a249efdd..01d27fd3 100644 --- a/src/data_structures/Mesh/ImagePatch.hpp +++ b/src/data_structures/Mesh/ImagePatch.hpp @@ -38,7 +38,7 @@ struct ImagePatch { }; -void initPatchGlobal(ImagePatch& patch, int z_begin_global, int z_end_global, int x_begin_global, int x_end_global, int y_begin_global, int y_end_global) { +inline void initPatchGlobal(ImagePatch& patch, int z_begin_global, int z_end_global, int x_begin_global, int x_end_global, int y_begin_global, int y_end_global) { patch.z_begin_global = z_begin_global; patch.x_begin_global = x_begin_global; patch.y_begin_global = y_begin_global; diff --git a/src/data_structures/Mesh/PixelData.cu b/src/data_structures/Mesh/PixelData.cu index fd27f4d5..35924482 100644 --- a/src/data_structures/Mesh/PixelData.cu +++ 
b/src/data_structures/Mesh/PixelData.cu @@ -10,11 +10,14 @@ #include "misc/CudaTools.cuh" #include "downsample.cuh" -#include +#include "paddPixelData.cuh" + // explicit instantiation of handled types template void downsampleMeanCuda(const PixelData&, PixelData&); template void downsampleMaxCuda(const PixelData&, PixelData&); +template void paddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize); +template void unpaddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize); template void downsampleMeanCuda(const PixelData &input, PixelData &output) { @@ -31,3 +34,19 @@ void downsampleMaxCuda(const PixelData &input, PixelData &output) { runDownsampleMax(in.get(), out.get(), input.x_num, input.y_num, input.z_num, 0); }; + +template +void paddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize) { + ScopedCudaMemHandler, H2D> inputData(input); + ScopedCudaMemHandler, D2H> outputData(output); + + runPaddPixels(inputData.get(), outputData.get(), input.getDimension(), output.getDimension(), padSize, 0); +}; + +template +void unpaddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize) { + ScopedCudaMemHandler, H2D> inputData(input); + ScopedCudaMemHandler, D2H> outputData(output); + + runUnpaddPixels(inputData.get(), outputData.get(), input.getDimension(), output.getDimension(), padSize, 0); +}; diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index 931b95a3..f0127920 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -34,9 +34,11 @@ struct PixelDataDim { size_t x; size_t z; - PixelDataDim(size_t y, size_t x, size_t z) : y(y), x(x), z(z) {} + constexpr PixelDataDim(size_t y, size_t x, size_t z) : y(y), x(x), z(z) {} size_t size() const { return y * x * z; } + size_t maxDimSize() const { return std::max(x, std::max(y, z)); } + int numOfDimensions() const { return 
(int)(x > 1) + (int)(y > 1) + (int)(z > 1); } PixelDataDim operator+(const PixelDataDim &rhs) const { return {y + rhs.y, x + rhs.x, z + rhs.z}; } PixelDataDim operator-(const PixelDataDim &rhs) const { return {y - rhs.y, x - rhs.x, z - rhs.z}; } @@ -147,10 +149,6 @@ public : usePinnedMemory = usePinned; } - void setUsePinnedMemory(bool usePinned){ - usePinnedMemory = usePinned; - } - inline uint64_t size() const{ return vec.size(); } @@ -281,8 +279,19 @@ public : std::swap(usePinnedMemory, aObj.usePinnedMemory); std::swap(vecMemory, aObj.vecMemory); vec.swap(aObj.vec); +#ifdef APR_USE_CUDA + std::swap(vecMemoryPinned, aObj.vecMemoryPinned); +#endif } + VectorData(VectorData &&aObj) { + usePinnedMemory = aObj.usePinnedMemory; + vecMemory.swap(aObj.vecMemory); + vec = std::move(aObj.vec); +#ifdef APR_USE_CUDA + vecMemoryPinned =std::move(aObj.vecMemoryPinned); +#endif + } /** * Apply unary operator to each element in parallel, writing the result to VectorData 'output'. @@ -436,6 +445,19 @@ public : */ PixelData(int aSizeOfY, int aSizeOfX, int aSizeOfZ, T aInitVal) { initWithValue(aSizeOfY, aSizeOfX, aSizeOfZ, aInitVal); } + /** + * Constructor - initialize initial size of mesh to provided values + * @param aDims - PixelDataDim with length of each dimension + */ + PixelData(PixelDataDim aDims) { init(aDims.y, aDims.x, aDims.z); } + + /** + * Constructor - creates mesh with provided dimentions initialized to aInitVal + * @param aDims - PixelDataDim with length of each dimension + * @param aInitVal - initial value of all elements + */ + PixelData(PixelDataDim aDims, T aInitVal) { initWithValue(aDims.y, aDims.x, aDims.z, aInitVal); } + /** * Move constructor * @param aObj mesh to be moved @@ -498,6 +520,16 @@ public : * @return element @(y, x, z) */ T& operator()(int y, int x, int z) { + // TODO: In number of places during running tests below check shows problems. + // Investigate and try to fix. 
Such check in future probably should be permanent + // to discover all problems rather than hiding them. +#ifndef NDEBUG // with Cmake we need to use double neg. condition since there is not ifdef DEBUG defined :( + if ((y < 0 || y >= y_num) || (x < 0 || x >= x_num) || (z < 0 || z >= z_num)) { +// std::cerr << "Provided coordinates=(" << y << ", " << x << ", " << z; +// std::cerr << ") while PixelData size=(" << y_num << ", " << x_num << ", " << z_num << ")" << std::endl; +// throw std::runtime_error("Provided (y,x,z) coordinates are out of range!"); + } +#endif y = std::min(y, y_num-1); x = std::min(x, x_num-1); z = std::min(z, z_num-1); @@ -710,6 +742,10 @@ public : init(y_num_ds, x_num_ds, z_num_ds, aUsePinnedMemory); } + void initDownsampled(const PixelDataDim &dim, bool aUsePinnedMemory) { + initDownsampled(dim.y, dim.x, dim.z, aUsePinnedMemory); + } + /** * Initializes mesh with size of half of provided dimensions (rounding up if not divisible by 2) and initialize values * @param aSizeOfY @@ -725,6 +761,10 @@ public : initWithValue(y_num_ds, x_num_ds, z_num_ds, aInitVal, aUsePinnedMemory); } + void initDownsampled(const PixelDataDim &dim, T aInitVal, bool aUsePinnedMemory) { + initDownsampled(dim.y, dim.x, dim.z, aInitVal, aUsePinnedMemory); + } + /** * Initializes mesh with size of half of provided mesh dimensions (rounding up if not divisible by 2) * @param aMesh - mesh used to get dimensions @@ -950,15 +990,16 @@ void downsample(const PixelData &aInput, PixelData &aOutput, R reduce, C c const size_t shy = std::min(2*y + 1, y_num - 1); const size_t idx = z * x_num_ds * y_num_ds + x * y_num_ds + y; outMesh[idx] = constant_operator( - reduce(reduce(reduce(reduce(reduce(reduce(reduce( // inMesh coordinates + reduce(reduce(reduce(reduce( // inMesh coordinates inMesh[2*z * x_num * y_num + 2*x * y_num + 2*y], // z, x, y - inMesh[2*z * x_num * y_num + 2*x * y_num + shy]), // z, x, y+1 inMesh[2*z * x_num * y_num + shx * y_num + 2*y]), // z, x+1, y - inMesh[2*z * 
x_num * y_num + shx * y_num + shy]), // z, x+1, y+1 inMesh[shz * x_num * y_num + 2*x * y_num + 2*y]), // z+1, x, y - inMesh[shz * x_num * y_num + 2*x * y_num + shy]), // z+1, x, y+1 inMesh[shz * x_num * y_num + shx * y_num + 2*y]), // z+1, x+1, y - inMesh[shz * x_num * y_num + shx * y_num + shy]) // z+1, x+1, y+1 + reduce(reduce(reduce( + inMesh[2*z * x_num * y_num + 2*x * y_num + shy], // z, x, y+1 + inMesh[2*z * x_num * y_num + shx * y_num + shy]), // z, x+1, y+1 + inMesh[shz * x_num * y_num + 2*x * y_num + shy]), // z+1, x, y+1 + inMesh[shz * x_num * y_num + shx * y_num + shy])) // z+1, x+1, y+1 ); } } diff --git a/src/data_structures/Mesh/PixelDataCuda.h b/src/data_structures/Mesh/PixelDataCuda.h index 34f7a56c..97f2144e 100644 --- a/src/data_structures/Mesh/PixelDataCuda.h +++ b/src/data_structures/Mesh/PixelDataCuda.h @@ -1,17 +1,35 @@ -// -// Created by Krzysztof Gonciarz on 4/9/18. -// - #ifndef LIBAPR_PIXELDATACUDA_H #define LIBAPR_PIXELDATACUDA_H #include "PixelData.hpp" + template void downsampleMeanCuda(const PixelData &aInput, PixelData &aOutput); template void downsampleMaxCuda(const PixelData &input, PixelData &output); -#endif //LIBAPR_PIXELDATACUDA_H +/** + * Copies data from input to output (which is bigger by pad size) reflecting around the edge pixels. + * @tparam T + * @param input + * @param output + * @param padSize + */ +template +void paddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize); + +/** + * Copies data from input to output (which is smaller by pad size). 
+ * @tparam T + * @param input + * @param output + * @param padSize + */ +template +void unpaddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize); + +#endif + diff --git a/src/data_structures/Mesh/downsample.cuh b/src/data_structures/Mesh/downsample.cuh index 947db945..a6548a52 100644 --- a/src/data_structures/Mesh/downsample.cuh +++ b/src/data_structures/Mesh/downsample.cuh @@ -24,14 +24,14 @@ __global__ void downsampleMean(const T *input, S *output, size_t x_num, size_t y size_t idx = (zi * x_num + xi) * y_num + yi; // Go through all elements in 2x2 - T v = input[idx]; + S v = input[idx]; v += input[idx + xs * y_num]; v += input[idx + zs * x_num * y_num]; v += input[idx + xs * y_num + zs * x_num * y_num]; // Get data from odd thread to even one const int workerIdx = threadIdx.y; - T a = __shfl_sync(__activemask(), v, workerIdx + 1); + S a = __shfl_sync(__activemask(), v, workerIdx + 1); // downsampled dimensions twice smaller (rounded up) diff --git a/src/data_structures/Mesh/paddPixelData.cuh b/src/data_structures/Mesh/paddPixelData.cuh new file mode 100644 index 00000000..dae96d79 --- /dev/null +++ b/src/data_structures/Mesh/paddPixelData.cuh @@ -0,0 +1,81 @@ +#ifndef LIBAPR_PADDPIXELDATA_CUH +#define LIBAPR_PADDPIXELDATA_CUH + + +#include "data_structures/Mesh/PixelData.hpp" + + +template +__global__ void paddPixels(const T* input, T *output, const PixelDataDim inputSize, const PixelDataDim outputSize, const PixelDataDim padSize) { + size_t yIdx = blockIdx.y * blockDim.y + threadIdx.y; + size_t xIdx = blockIdx.x * blockDim.x + threadIdx.x; + size_t zIdx = blockIdx.z * blockDim.z + threadIdx.z; + + // copy data to output (padded) cube + if (yIdx < outputSize.y && xIdx < outputSize.x && zIdx < outputSize.z) { + + // output cube index + size_t outputIdx = (zIdx * outputSize.x + xIdx) * outputSize.y + yIdx; + + // input cube index + int yIn = yIdx - padSize.y; + if (yIn < 0) yIn = -yIn; // reflected boundary on LHS + if (yIn >= 
inputSize.y) yIn -= 2 * (yIn - (inputSize.y - 1)); // reflected boundary on RHS + + int xIn = xIdx - padSize.x; + if (xIn < 0) xIn = -xIn; // reflected boundary on LHS + if (xIn >= inputSize.x) xIn -= 2 * (xIn - (inputSize.x - 1)); // reflected boundary on RHS + + int zIn = zIdx - padSize.z; + if (zIn < 0) zIn = -zIn; // reflected boundary on LHS + if (zIn >= inputSize.z) zIn -= 2 * (zIn - (inputSize.z - 1)); // reflected boundary on RHS + + size_t inputIdx = (zIn * inputSize.x + xIn) * inputSize.y + yIn; + + output[outputIdx] = input[inputIdx]; + } +} + +template +void runPaddPixels(const T* input, T *output, const PixelDataDim &inputSize, const PixelDataDim &outputSize, const PixelDataDim &padSize, cudaStream_t aStream) { + dim3 threadsPerBlock(1, 64, 1); + dim3 numBlocks((outputSize.x + threadsPerBlock.x - 1) / threadsPerBlock.x, + (outputSize.y + threadsPerBlock.y - 1) / threadsPerBlock.y, + (outputSize.z + threadsPerBlock.z - 1) / threadsPerBlock.z); + + paddPixels<<>>(input, output, inputSize, outputSize, padSize); +} + +template +__global__ void unpaddPixels(const T* input, T *output, const PixelDataDim inputSize, const PixelDataDim outputSize, const PixelDataDim padSize) { + size_t yIdx = blockIdx.y * blockDim.y + threadIdx.y; + size_t xIdx = blockIdx.x * blockDim.x + threadIdx.x; + size_t zIdx = blockIdx.z * blockDim.z + threadIdx.z; + + // copy data to output (unpadded) cube + if (yIdx < outputSize.y && xIdx < outputSize.x && zIdx < outputSize.z) { + + // output cube index + size_t outputIdx = (zIdx * outputSize.x + xIdx) * outputSize.y + yIdx; + + // input cube index (map coordinates of output cube to internal cube of padded cube) + int yIn = yIdx + padSize.y; + int xIn = xIdx + padSize.x; + int zIn = zIdx + padSize.z; + size_t inputIdx = (zIn * inputSize.x + xIn) * inputSize.y + yIn; + + output[outputIdx] = input[inputIdx]; + } +} + +template +void runUnpaddPixels(const T* input, T *output, const PixelDataDim &inputSize, const PixelDataDim &outputSize, 
const PixelDataDim &padSize, cudaStream_t aStream) { + dim3 threadsPerBlock(1, 64, 1); + dim3 numBlocks((outputSize.x + threadsPerBlock.x - 1) / threadsPerBlock.x, + (outputSize.y + threadsPerBlock.y - 1) / threadsPerBlock.y, + (outputSize.z + threadsPerBlock.z - 1) / threadsPerBlock.z); + + unpaddPixels<<>>(input, output, inputSize, outputSize, padSize); +} + +#endif diff --git a/src/misc/CudaMemory.cuh b/src/misc/CudaMemory.cuh index e237779f..fbe125e9 100644 --- a/src/misc/CudaMemory.cuh +++ b/src/misc/CudaMemory.cuh @@ -11,14 +11,20 @@ #include -inline cudaError_t checkCuda(cudaError_t result) { -#if defined(DEBUG) || defined(_DEBUG) - if (result != cudaSuccess) { - fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); - assert(result == cudaSuccess); + +// TODO: this method is duplicated in CudaTools.cuh +// Somehow including it here break compilation - fix it please. +#define checkCuda(ans) { cudaAssert2((ans), __FILE__, __LINE__); } +inline void cudaAssert2(cudaError_t code, const char *file, int line, bool abort=true) +{ +#if defined(DEBUG) || defined(_DEBUG) || !defined(NDEBUG) + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: (%d) %s %s %d\n", code, cudaGetErrorString(code), file, line); + assert(code == cudaSuccess); // If debugging it helps to see call tree somehow + if (abort) exit(code); } #endif - return result; } inline void* getPinnedMemory(size_t aNumOfBytes) { diff --git a/src/misc/CudaTools.cuh b/src/misc/CudaTools.cuh index 3f9b5fca..10e4cb73 100644 --- a/src/misc/CudaTools.cuh +++ b/src/misc/CudaTools.cuh @@ -8,16 +8,26 @@ #include #include -//#include #include -//#include - - #include #include + #include "data_structures/Mesh/PixelData.hpp" +#define checkCuda(ans) { cudaAssert((ans), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ +#if defined(DEBUG) || defined(_DEBUG) || !defined(NDEBUG) + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: 
(%d) %s %s %d\n", code, cudaGetErrorString(code), file, line); + assert(code == cudaSuccess); // If debugging it helps to see call tree somehow + if (abort) exit(code); + } +#endif +} + inline void waitForCuda() { cudaDeviceSynchronize(); cudaError_t err = cudaGetLastError(); @@ -29,12 +39,6 @@ inline void printCudaDims(const dim3 &threadsPerBlock, const dim3 &numBlocks) { std::cout << "Number of threads (x/y/z): " << threadsPerBlock.x << "/" << threadsPerBlock.y << "/" << threadsPerBlock.z << std::endl; } -template -inline void getDataFromKernel(PixelData &input, size_t inputSize, ImgType *cudaInput) { - cudaMemcpy(input.mesh.get(), cudaInput, inputSize, cudaMemcpyDeviceToHost); - cudaFree(cudaInput); -} - class CudaTimer { std::vector iStartTimes; std::vector names; @@ -85,12 +89,18 @@ public: // Useful type for keeping CUDA allocated memory (which is released with cudaFree) -template +static cudaError_t CUDARTAPI deleter(void *devPtr) { + //std::cout << "cudaFree() called...\n"; + return cudaFree(devPtr); +} + +template struct CudaMemoryUniquePtr : public std::unique_ptr { using std::unique_ptr::unique_ptr; // inheriting other constructors - explicit CudaMemoryUniquePtr(T *aMemory = nullptr) : std::unique_ptr(aMemory, &cudaFree) {} + explicit CudaMemoryUniquePtr(T *aMemory = nullptr) : std::unique_ptr(aMemory, &deleter) {} }; + /** * Directions for sending data between Host and Device */ @@ -211,6 +221,17 @@ public: initialize(); } + ScopedCudaMemHandler (ScopedCudaMemHandler &&obj) { + iData = obj.iData; + obj.iData = nullptr; + iSize = obj.iSize; + obj.iSize = 0; + iBytes = obj.iBytes; + obj.iBytes = 0; + iStream = obj.iStream; + obj.iStream = nullptr; + iCudaMemory = std::move(obj.iCudaMemory); + } ~ScopedCudaMemHandler() { if (DIRECTION & D2H) { @@ -223,15 +244,21 @@ public: size_t getNumOfBytes() const {return iBytes; } void copyH2D() { - cudaMemcpyAsync(iCudaMemory.get(), iData, iBytes, cudaMemcpyHostToDevice, iStream); + if (iData != nullptr) { + 
checkCuda(cudaMemcpyAsync(iCudaMemory.get(), iData, iBytes, cudaMemcpyHostToDevice, iStream)); + } } void copyH2D(const size_t numElements) { - cudaMemcpyAsync(iCudaMemory.get(), iData, numElements*DataSize, cudaMemcpyHostToDevice, iStream); + if (iData != nullptr) { + checkCuda(cudaMemcpyAsync(iCudaMemory.get(), iData, numElements*DataSize, cudaMemcpyHostToDevice, iStream)); + } } void copyD2H() { - cudaMemcpyAsync((void*)iData, iCudaMemory.get(), iBytes, cudaMemcpyDeviceToHost, iStream); + if (iData != nullptr) { + checkCuda(cudaMemcpyAsync((void *) iData, iCudaMemory.get(), iBytes, cudaMemcpyDeviceToHost, iStream)); + } } private: @@ -240,7 +267,7 @@ private: void initialize() { ElementType *mem = nullptr; - cudaMalloc(&mem, iBytes); + checkCuda(cudaMalloc(&mem, iBytes)); iCudaMemory.reset(mem); if (DIRECTION & H2D) { copyH2D(); diff --git a/test/APRTest.cpp b/test/APRTest.cpp index 33ea37d6..83071a7f 100644 --- a/test/APRTest.cpp +++ b/test/APRTest.cpp @@ -134,7 +134,7 @@ bool compare_two_iterators(Iterator1& it1, Iterator2& it2, int maxNumOfErrPrinte uint64_t counter_1 = 0; uint64_t counter_2 = 0; - uint64_t errors = 0; + int64_t errors = 0; for (int level = it1.level_min(); level <= it1.level_max(); ++level) { for (int z = 0; z < it1.z_num(level); z++) { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index dc3e5a11..d3377fb0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -11,13 +11,18 @@ buildTarget(testComputeGradient ComputeGradientTest.cpp) buildTarget(testLocalIntensityScale LocalIntensityScaleTest.cpp) buildTarget(testPullingScheme PullingSchemeTest.cpp) buildTarget(testAPRParameters APRParametersTest.cpp) +buildTarget(testLinearAccess LinearAccessTest.cpp) #APR GPU Tests if(APR_USE_CUDA) buildTarget(testAPRCuda APRTestCuda.cpp) + buildTarget(testComputeGradientCuda ComputeGradientCudaTest.cpp) + buildTarget(testLocalIntensityScaleCuda LocalIntensityScaleCudaTest.cpp) + buildTarget(testFullPipelineCuda FullPipelineCudaTest.cpp) + 
buildTarget(testPullingSchemeCuda PullingSchemeCudaTest.cpp) + buildTarget(testLinearAccessCuda LinearAccessCudaTest.cpp) endif() - if(APR_BUILD_EXAMPLES) buildTarget(testExamples ExamplesTest.cpp) endif() diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp new file mode 100644 index 00000000..588c5ea3 --- /dev/null +++ b/test/ComputeGradientCudaTest.cpp @@ -0,0 +1,369 @@ + +#include + +#include "data_structures/Mesh/PixelData.hpp" +#include "algorithm/ComputeGradient.hpp" +#include "algorithm/ComputeGradientCuda.hpp" +#include "TestTools.hpp" + +namespace { + +#ifdef APR_USE_CUDA + + + // ======================================================================== + // BSPLINE tests + // ======================================================================== + + template + class BsplineTest : public testing::Test {}; + TYPED_TEST_SUITE_P(BsplineTest); + + TYPED_TEST_P(BsplineTest, testBsplineInXdirCUDA) { + APRTimer timer(false); + + std::vector> yzSizes = {{1, 1}, + {32, 32}, + {33, 33}, + {44, 35}, + {35, 44}, + {255, 129}}; + + for (auto &p: yzSizes) { + int yLen = p.first; + int zLen = p.second; + // Run test with dimension in range much shorter than filter length to longer than filter length + // (for lambda=3 and tolerance=0.00001 expected filter length k0=18) + for (int xLen = 2; xLen < 22; ++xLen) { + // Generate random mesh + using ImgType = TypeParam; + PixelData m = getRandInitializedMesh(yLen, xLen, zLen, 30, 10); + + // Filter parameters + const float lambda = 3; + const float tolerance = 0.0001; + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU bspline"); + ComputeGradient().bspline_filt_rec_x(mCpu, lambda, tolerance); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU bspline"); + cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_X_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + 
} + } + } + + TYPED_TEST_P(BsplineTest, testBsplineInZdirCUDA) { + APRTimer timer(false); + + std::vector> xySizes = {{1, 1}, + {32, 32}, + {33, 33}, + {44, 35}, + {35, 44}, + {255, 129}}; + + for (auto &p : xySizes) { + int xLen = p.first; + int yLen = p.second; + // Run test with dimension in range much shorter than filter length to longer than filter length + // (for lambda=3 and tolerance=0.00001 expected filter length k0=18) + for (int zLen = 2; zLen < 22; ++zLen) { + // Generate random mesh + using ImgType = TypeParam; + PixelData m = getRandInitializedMesh(yLen, xLen, zLen, 30, 10); + + // Filter parameters + const float lambda = 3; + const float tolerance = 0.0001; + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU bspline"); + ComputeGradient().bspline_filt_rec_z(mCpu, lambda, tolerance); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU bspline"); + cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Z_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + + TYPED_TEST_P(BsplineTest, testBsplineInYdirCUDA) { + APRTimer timer(false); + + std::vector> xzSizes = {{1, 1}, + {32, 32}, + {33, 33}, + {44, 35}, + {35, 44}, + {255, 129}}; + + for (auto &p : xzSizes) { + int xLen = p.first; + int zLen = p.second; + // Run test with dimension in range much shorter than filter length to longer than filter length + // (for lambda=3 and tolerance=0.00001 expected filter length k0=18) + for (int yLen = 2; yLen < 22; ++yLen) { + // Generate random mesh + using ImgType = TypeParam; + PixelData m = getRandInitializedMesh(yLen, xLen, zLen, 30, 10); + + // Filter parameters + const float lambda = 3; + const float tolerance = 0.0001; + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU bspline"); + ComputeGradient().bspline_filt_rec_y(mCpu, lambda, tolerance); + timer.stop_timer(); + + // 
Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU bspline"); + cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Y_DIR); + timer.stop_timer(); + + //Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + + REGISTER_TYPED_TEST_SUITE_P(BsplineTest, testBsplineInXdirCUDA, testBsplineInZdirCUDA, testBsplineInYdirCUDA); + using ImgTypes = ::testing::Types< float, uint16_t, int16_t, uint8_t>; + INSTANTIATE_TYPED_TEST_SUITE_P(Testing, BsplineTest, ImgTypes); + + TEST(ComputeBspineTest, BSPLINE_FULL_XYZ_DIR_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(127, 128, 129, 100, 10); + + // Filter parameters + const float lambda = 3; + const float tolerance = 0.0001; // as defined in get_smooth_bspline_3D + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU bspline"); + ComputeGradient().get_smooth_bspline_3D(mCpu, lambda); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU bspline"); + cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_ALL_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + + + // ======================================================================== + // INV. 
BSPLINE tests + // ======================================================================== + + TEST(ComputeInverseBspline, CALC_INV_BSPLINE_X_RND_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(127, 61, 66, 100, 10); + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU inv bspline"); + ComputeGradient().calc_inv_bspline_x(mCpu); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU inv bspline"); + cudaInverseBspline(mGpu, INV_BSPLINE_X_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + + TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Z_RND_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(128, 61, 66, 100, 10); + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU inv bspline"); + ComputeGradient().calc_inv_bspline_z(mCpu); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU inv bspline"); + cudaInverseBspline(mGpu, INV_BSPLINE_Z_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + + TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_RND_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(127, 61, 71, 100, 10); + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU inv bspline"); + ComputeGradient().calc_inv_bspline_y(mCpu); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU inv bspline"); + cudaInverseBspline(mGpu, INV_BSPLINE_Y_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + + TEST(ComputeInverseBspline, 
CALC_INV_BSPLINE_FULL_XYZ_DIR_RND_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(32,32,32,100, 10); + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU inv bspline"); + ComputeGradient().calc_inv_bspline_y(mCpu); + ComputeGradient().calc_inv_bspline_x(mCpu); + ComputeGradient().calc_inv_bspline_z(mCpu); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU inv bspline"); + cudaInverseBspline(mGpu, INV_BSPLINE_ALL_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + + // ======================================================================== + // Downsampled gradient + // ======================================================================== + + TEST(ComputeGradientTest, GPU_VS_CPU_DOWNSAMPLE_GRADIENT_ON_RANDOM_VALUES) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(31, 32, 33, 100); + + // Calculate gradient on CPU + PixelData grad; + grad.initDownsampled(m, 0); + timer.start_timer("CPU gradient"); + ComputeGradient().calc_bspline_fd_ds_mag(m, grad, 1, 1, 1); + timer.stop_timer(); + + // Calculate gradient on GPU + PixelData gradCuda; + gradCuda.initDownsampled(m, 0); + timer.start_timer("GPU gradient"); + cudaDownsampledGradient(m, gradCuda, 1, 1, 1); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(grad, gradCuda, 0), 0); + } + + + // ======================================================================== + // Full pipeline/gradient tests + // ======================================================================== + + TEST(ComputeThreshold, FULL_GRADIENT_TEST) { + APRTimer timer(false); + + // Generate random mesh + using ImageType = uint16_t; + PixelData input_image = getRandInitializedMesh(33, 35, 37, 15, 20); + PixelData &image_temp = input_image; + 
+ PixelData grad_temp; // should be a down-sampled image + grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); + PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + + PixelData grad_temp_GPU; // should be a down-sampled image + grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); + PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, true); + PixelData local_scale_temp2_GPU; + local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate bspline on CPU + PixelData mCpuImage(image_temp, true); + + ComputeGradient computeGradient; + + timer.start_timer(">>>>>>>>>>>>>>>>> CPU gradient"); + computeGradient.get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpuImage(image_temp, true); + timer.start_timer(">>>>>>>>>>>>>>>>> GPU gradient"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpuImage, mGpuImage, 0), 0); + EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0), 0); + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + } + +#endif // APR_USE_CUDA + +} + +int main(int argc, char **argv) { + 
testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index 0b2fc17e..0d822357 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -2,99 +2,13 @@ * Created by Krzysztof Gonciarz 2018 */ #include -#include #include #include "data_structures/Mesh/PixelData.hpp" #include "algorithm/ComputeGradient.hpp" -#include "algorithm/ComputeGradientCuda.hpp" #include -#include "algorithm/APRConverter.hpp" +#include "TestTools.hpp" namespace { - /** - * Compares mesh with provided data - * @param mesh - * @param data - data with [Z][Y][X] structure - * @return true if same - */ - template - bool compare(PixelData &mesh, const float *data, const float epsilon) { - size_t dataIdx = 0; - for (int z = 0; z < mesh.z_num; ++z) { - for (int y = 0; y < mesh.y_num; ++y) { - for (int x = 0; x < mesh.x_num; ++x) { - bool v = std::abs(mesh(y, x, z) - data[dataIdx]) < epsilon; - if (v == false) { - std::cerr << "Mesh and expected data differ. 
First place at (Y, X, Z) = " << y << ", " << x - << ", " << z << ") " << mesh(y, x, z) << " vs " << data[dataIdx] << std::endl; - return false; - } - ++dataIdx; - } - } - } - return true; - } - - /** - * Compares two meshes - * @param expected - * @param tested - * @param maxNumOfErrPrinted - how many error values should be printed (-1 for all) - * @return number of errors detected - */ - template - int compareMeshes(const PixelData &expected, const PixelData &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { - int cnt = 0; - for (size_t i = 0; i < expected.mesh.size(); ++i) { - if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError || std::isnan(expected.mesh[i]) || - std::isnan(tested.mesh[i])) { - if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "ERROR expected vs tested mesh: " << expected.mesh[i] << " vs " << tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; - } - cnt++; - } - } - std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl; - return cnt; - } - - /** - * Generates mesh with provided dims with random values in range [0, 1] * multiplier - * @param y - * @param x - * @param z - * @param multiplier - * @return - */ - template - PixelData getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, bool useIdxNumbers = false) { - PixelData m(y, x, z); - std::cout << "Mesh info: " << m << std::endl; - std::random_device rd; - std::mt19937 mt(rd()); - std::uniform_real_distribution dist(0.0, 1.0); - for (size_t i = 0; i < m.mesh.size(); ++i) { - m.mesh[i] = useIdxNumbers ? 
i : dist(mt) * multiplier; - } - return m; - } - - template - bool initFromZYXarray(PixelData &mesh, const float *data) { - size_t dataIdx = 0; - for (int z = 0; z < mesh.z_num; ++z) { - for (int y = 0; y < mesh.y_num; ++y) { - for (int x = 0; x < mesh.x_num; ++x) { - mesh(y, x, z) = data[dataIdx]; - ++dataIdx; - } - } - } - return true; - } - - TEST(ComputeGradientTest, 2D_XY) { { // Corner points @@ -455,7 +369,7 @@ namespace { 0.0000000000, 0.2193282992, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.2930246294, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000 }; // put values in corners - m(1, 1, 4) = 1; + m(0, 1, 2) = 1; // Calculate bspline on CPU PixelData mCpu(m, true); @@ -724,496 +638,6 @@ namespace { ASSERT_TRUE(compare(m, expect, 0.01)); } - - // ======================= CUDA ======================================= - // ======================= CUDA ======================================= - // ======================= CUDA ======================================= - -#ifdef APR_USE_CUDA - - TEST(ComputeGradientTest, 2D_XY_CUDA) { - // Corner points - PixelData m(6, 6, 1, 0); - // expect gradient is 3x3 X/Y plane - float expect[] = {1.41, 0, 4.24, - 0, 0, 0, - 2.82, 0, 5.65}; - // put values in corners - m(0, 0, 0) = 2; - m(5, 0, 0) = 4; - m(0, 5, 0) = 6; - m(5, 5, 0) = 8; - PixelData grad; - grad.initDownsampled(m, 0); - cudaDownsampledGradient(m, grad, 1, 1, 1); - ASSERT_TRUE(compare(grad, expect, 0.01)); - } - - TEST(ComputeGradientTest, Corners3D_CUDA) { - PixelData m(6, 6, 4, 0); - // expect gradient is 3x3x2 X/Y/Z plane - float expect[] = {1.73, 0, 5.19, - 0, 0, 0, - 3.46, 0, 6.92, - - 8.66, 0, 12.12, - 0, 0, 0, - 10.39, 0, 13.85}; - // put values in corners - m(0, 0, 0) = 2; - m(5, 0, 0) = 4; - m(0, 5, 0) = 6; - m(5, 5, 0) = 8; - m(0, 0, 3) = 10; - m(5, 0, 3) = 12; - m(0, 5, 3) = 14; - m(5, 5, 3) = 16; - - PixelData grad; - 
grad.initDownsampled(m, 0); - cudaDownsampledGradient(m, grad, 1, 1, 1); - ASSERT_TRUE(compare(grad, expect, 0.01)); - } - - TEST(ComputeGradientTest, GPU_VS_CPU_ON_RANDOM_VALUES) { - // Generate random mesh - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(33, 31, 3); - - APRTimer timer(true); - - // Calculate gradient on CPU - PixelData grad; - grad.initDownsampled(m, 0); - timer.start_timer("CPU gradient"); - ComputeGradient().calc_bspline_fd_ds_mag(m, grad, 1, 1, 1); - timer.stop_timer(); - - // Calculate gradient on GPU - PixelData gradCuda; - gradCuda.initDownsampled(m, 0); - timer.start_timer("GPU gradient"); - cudaDownsampledGradient(m, gradCuda, 1, 1, 1); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(grad, gradCuda), 0); - } - - TEST(ComputeBspineTest, BSPLINE_Y_DIR_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(129,127,128); - - // Filter parameters - const float lambda = 3; - const float tolerance = 0.0001; - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU bspline"); - ComputeGradient().bspline_filt_rec_y(mCpu, lambda, tolerance); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU bspline"); - cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Y_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeBspineTest, BSPLINE_X_DIR_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(129,127,128); - - // Filter parameters - const float lambda = 3; - const float tolerance = 0.0001; - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU bspline"); - ComputeGradient().bspline_filt_rec_x(mCpu, lambda, tolerance); - timer.stop_timer(); - - // Calculate bspline on GPU - 
PixelData mGpu(m, true); - timer.start_timer("GPU bspline"); - cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_X_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeBspineTest, BSPLINE_Z_DIR_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(129,127,128); - - // Filter parameters - const float lambda = 3; - const float tolerance = 0.0001; - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU bspline"); - ComputeGradient().bspline_filt_rec_z(mCpu, lambda, tolerance); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU bspline"); - cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Z_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeBspineTest, BSPLINE_FULL_XYZ_DIR_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(127, 128, 129); - - // Filter parameters - const float lambda = 3; - const float tolerance = 0.0001; // as defined in get_smooth_bspline_3D - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU bspline"); - ComputeGradient().get_smooth_bspline_3D(mCpu, lambda); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU bspline"); - cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_ALL_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_CUDA) { - using ImgType = float; - - ImgType init[] = {1.00, 0.00, 0.00, - 1.00, 0.00, 6.00, - 0.00, 6.00, 0.00, - 6.00, 0.00, 0.00}; - - ImgType expect[] = {1.00, 0.00, 2.00, - 0.83, 1.00, 4.00, - 1.17, 4.00, 1.00, - 4.00, 2.00, 0.00}; - - PixelData m(4, 3, 1); - 
initFromZYXarray(m, init); - - // Calculate and compare - m.printMesh(4,2); - cudaInverseBspline(m, INV_BSPLINE_Y_DIR); - m.printMesh(4,2); - ASSERT_TRUE(compare(m, expect, 0.01)); - } - - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(127, 33, 31); - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU inv bspline"); - ComputeGradient().calc_inv_bspline_y(mCpu); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU inv bspline"); - cudaInverseBspline(mGpu, INV_BSPLINE_Y_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_X_CUDA) { - using ImgType = float; - - ImgType init[] = {0.00, 6.00, 0.00, - 1.00, 0.00, 0.00, - 0.00, 0.00, 1.00}; - - ImgType expect[] = {2.00, 4.00, 2.00, - 0.67, 0.16, 0.00, - 0.00, 0.16, 0.67}; - - PixelData m(3, 3, 1); - initFromZYXarray(m, init); - - // Calculate and compare - m.printMesh(4,2); - cudaInverseBspline(m, INV_BSPLINE_X_DIR); - m.printMesh(4,2); - ASSERT_TRUE(compare(m, expect, 0.01)); - } - - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_X_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(127, 61, 66); - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU inv bspline"); - ComputeGradient().calc_inv_bspline_x(mCpu); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU inv bspline"); - cudaInverseBspline(mGpu, INV_BSPLINE_X_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Z_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData 
m = getRandInitializedMesh(127, 61, 66); - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU inv bspline"); - ComputeGradient().calc_inv_bspline_z(mCpu); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU inv bspline"); - cudaInverseBspline(mGpu, INV_BSPLINE_Z_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_FULL_XYZ_DIR_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(3,3,3,100); - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU inv bspline"); - ComputeGradient().calc_inv_bspline_y(mCpu); - ComputeGradient().calc_inv_bspline_x(mCpu); - ComputeGradient().calc_inv_bspline_z(mCpu); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU inv bspline"); - cudaInverseBspline(mGpu, INV_BSPLINE_ALL_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeThreshold, CALC_THRESHOLD_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(31, 33, 13); - PixelData g = getRandInitializedMesh(31, 33, 13); - float thresholdLevel = 1; - - // Calculate bspline on CPU - PixelData mCpu(g, true); - timer.start_timer("CPU threshold"); - ComputeGradient().threshold_gradient(mCpu, m, thresholdLevel); - - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(g, true); - timer.start_timer("GPU threshold"); - thresholdGradient(mGpu, m, thresholdLevel); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeThreshold, CALC_THRESHOLD_IMG_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData g = 
getRandInitializedMesh(31, 33, 13, 1, true); - - float thresholdLevel = 10; - - // Calculate bspline on CPU - PixelData mCpu(g, true); - timer.start_timer("CPU threshold"); - for (size_t i = 0; i < mCpu.mesh.size(); ++i) { - if (mCpu.mesh[i] <= (thresholdLevel)) { mCpu.mesh[i] = thresholdLevel; } - } - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(g, true); - timer.start_timer("GPU threshold"); - thresholdImg(mGpu, thresholdLevel); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - // TODO: These two test will be fixed as soon as CUDA pipeline is updated. - // Currently turning them off to have testable rest of CUDA impl. -// TEST(ComputeThreshold, FULL_GRADIENT_TEST) { -// APRTimer timer(true); -// -// // Generate random mesh -// using ImageType = float; -// PixelData input_image = getRandInitializedMesh(310, 330, 13, 25); -// PixelData &image_temp = input_image; -// -// PixelData grad_temp; // should be a down-sampled image -// grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); -// PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors -// local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// PixelData local_scale_temp2; -// local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// -// PixelData grad_temp_GPU; // should be a down-sampled image -// grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); -// PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors -// local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, true); -// PixelData local_scale_temp2_GPU; -// 
local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// -// APRParameters par; -// par.lambda = 3; -// par.Ip_th = 10; -// par.dx = 1; -// par.dy = 1; -// par.dz = 1; -// -// // Calculate bspline on CPU -// PixelData mCpuImage(image_temp, true); -// -// ComputeGradient computeGradient; -// -// timer.start_timer(">>>>>>>>>>>>>>>>> CPU gradient"); -// computeGradient.get_gradient(mCpuImage, grad_temp, local_scale_temp, par); -// timer.stop_timer(); -// -// // Calculate bspline on GPU -// PixelData mGpuImage(image_temp, true); -// timer.start_timer(">>>>>>>>>>>>>>>>> GPU gradient"); -// getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); -// timer.stop_timer(); -// -// // Compare GPU vs CPU -// EXPECT_EQ(compareMeshes(mCpuImage, mGpuImage), 0); -// EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0.1), 0); -// EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU), 0); -// } -// -// TEST(ComputeThreshold, FULL_PIPELINE_TEST) { -// APRTimer timer(true); -// -// // Generate random mesh -// using ImageType = float; -// PixelData input_image = getRandInitializedMesh(310, 330, 32, 25); -// int maxLevel = ceil(std::log2(330)); -// -// PixelData &image_temp = input_image; -// -// PixelData grad_temp; // should be a down-sampled image -// grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); -// PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors -// local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// PixelData local_scale_temp2; -// local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// -// PixelData grad_temp_GPU; // should be a down-sampled image -// grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, 
false); -// PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors -// local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// PixelData local_scale_temp2_GPU; -// local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// -// -// APRParameters par; -// par.lambda = 3; -// par.Ip_th = 10; -// par.sigma_th = 0; -// par.sigma_th_max = 0; -// par.dx = 1; -// par.dy = 1; -// par.dz = 1; -// -// ComputeGradient computeGradient; -// LocalIntensityScale localIntensityScale; -// LocalParticleCellSet localParticleSet; -// -// // Calculate bspline on CPU -// PixelData mCpuImage(image_temp, true); -// timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); -// computeGradient.get_gradient(mCpuImage, grad_temp, local_scale_temp, par); -// localIntensityScale.get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); -// localParticleSet.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); -// timer.stop_timer(); -// -// // Calculate bspline on GPU -// PixelData mGpuImage(image_temp, true); -// timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); -// GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); -// gpt.doAll(); -// timer.stop_timer(); -// -// // Compare GPU vs CPU -// // allow some differences since float point diffs -// // TODO: It would be much better to count number of diffs with delta==1 and allow some of these -// EXPECT_TRUE(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0.01) < 29); -// } - - -#endif // APR_USE_CUDA - } int main(int argc, char **argv) { diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp new file mode 100644 index 00000000..913b7e09 --- /dev/null +++ b/test/FullPipelineCudaTest.cpp @@ -0,0 +1,367 @@ + +#include + +#include 
"algorithm/LocalIntensityScaleCuda.h" +#include "algorithm/LocalIntensityScale.hpp" +#include "algorithm/ComputeGradient.hpp" +#include "algorithm/ComputeGradientCuda.hpp" +#include "algorithm/PullingSchemeCuda.hpp" +#include "data_structures/APR/access/LinearAccessCuda.hpp" +#include "TestTools.hpp" +#include "data_structures/Mesh/PixelDataCuda.h" +#include "algorithm/APRConverter.hpp" +#include "misc/CudaTools.cuh" + + +namespace { +#ifdef APR_USE_CUDA + + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS) { + APRTimer timer(true); + + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors + using ImageType = float; + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{163, 123, 555}; + for (int d = 0; d <= 3; d++) { + auto &dim = (d % 2 == 0) ? dim1 : dim2; + PixelData input_image = (d/2 == 0) ? getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true, true); + PixelData grad_temp_GPU(grad_temp, true, true); + PixelData local_scale_temp_GPU(local_scale_temp, true, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true, true); + + // Prepare parameters + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + timer.stop_timer(); + + // 
Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0), 0); + } + } + + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS) { + APRTimer timer(true); + + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors + using ImageType = float; + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{163, 123, 555}; + for (int d = 0; d <= 3; d++) { + auto &dim = (d%2 == 0) ? dim1 : dim2; + PixelData input_image = (d/2 == 0) ? getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true, false); + PixelData grad_temp_GPU(grad_temp, true, false); + PixelData local_scale_temp_GPU(local_scale_temp, true, false); + PixelData local_scale_temp2_GPU(local_scale_temp2, true, false); + + // Prepare parameters + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + 
LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0), 0); + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + } + } + + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_PS) { + APRTimer timer(true); + + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors + using ImageType = float; + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{163, 123, 555}; + for (int d = 0; d <= 3; d++) { + auto &dim = (d % 2 == 0) ? dim1 : dim2; + PixelData input_image = (d / 2 == 0) ? 
getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData grad_temp_GPU(grad_temp, true); + PixelData local_scale_temp_GPU(local_scale_temp, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true); + + // Prepare parameters and APR info structures + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + GenInfo aprInfo; + aprInfo.init(input_image.getDimension()); + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet lpcs = LocalParticleCellSet(); + lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + PullingScheme ps; + ps.initialize_particle_cell_tree(aprInfo); + lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); + ps.pulling_scheme_main(); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + auto pct = computeOvpcCuda(local_scale_temp_GPU, 
aprInfo); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), pct), 0); + } + } + + + + + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_PS_LINEARACCESS) { + APRTimer timer(true); + + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors + using ImageType = float; + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{163, 123, 555}; + for (int d = 0; d <= 3; d++) { + auto &dim = (d % 2 == 0) ? dim1 : dim2; + PixelData input_image = (d / 2 == 0) ? getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData grad_temp_GPU(grad_temp, true); + PixelData local_scale_temp_GPU(local_scale_temp, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true); + + // Prepare parameters and APR info structures + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + par.neighborhood_optimization = true; + + GenInfo aprInfo(input_image.getDimension()); + GenInfo giGpu(input_image.getDimension()); + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet lpcs = LocalParticleCellSet(); +
lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + PullingScheme ps; + ps.initialize_particle_cell_tree(aprInfo); + lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); + ps.pulling_scheme_main(); + LinearAccess linearAccess; + linearAccess.genInfo = &aprInfo; + + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + auto pct = computeOvpcCuda(local_scale_temp_GPU, giGpu); + auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pct); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + // Test if returned structures have same data (y_vec, level_xz_vec and xz_end_vec) + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.xz_end_vec, linearAccess.xz_end_vec), 0); + + EXPECT_EQ(aprInfo.total_number_particles, giGpu.total_number_particles); + EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); + } + } + + TEST(ComputeThreshold, FULL_PIPELINE_TEST_CPU_vs_GpuProcessingTask) { + APRTimer timer(true); + + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors + using ImageType = float; + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{1024,512,512}; + for (int d = 0; d <= 3; d++) { + auto &dim = (d % 2 == 0) ? dim1 : dim2; + PixelData input_image = (d / 2 == 0) ? 
getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + + int maxLevel = ceil(std::log2(dim.maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData local_scale_temp_GPU(local_scale_temp, false); + + // Prepare parameters + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + par.neighborhood_optimization = true; + + float bspline_offset = 0; + + GenInfo aprInfo(input_image.getDimension()); + GenInfo giGpu(input_image.getDimension()); + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet lpcs = LocalParticleCellSet(); + ComputeGradient().applyParameters(grad_temp, local_scale_temp, local_scale_temp2, par, bspline_offset); + lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + PullingScheme ps; + ps.initialize_particle_cell_tree(aprInfo); + lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); + ps.pulling_scheme_main(); + LinearAccess linearAccess; + linearAccess.genInfo = &aprInfo; + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + timer.stop_timer(); + + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, bspline_offset, maxLevel); + gpt.sendDataToGpu(); + gpt.processOnGpu(); 
+ auto linearAccessGpu = gpt.getDataFromGpu(); + giGpu.total_number_particles = linearAccessGpu.y_vec.size(); + cudaDeviceSynchronize(); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.xz_end_vec, linearAccess.xz_end_vec), 0); + + EXPECT_EQ(aprInfo.total_number_particles, giGpu.total_number_particles); + EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); + + } + } + +#endif // APR_USE_CUDA +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp new file mode 100644 index 00000000..eb91e7bd --- /dev/null +++ b/test/LinearAccessCudaTest.cpp @@ -0,0 +1,388 @@ +#include + +#include "algorithm/LocalParticleCellSet.hpp" +#include "algorithm/PullingScheme.hpp" +#include "algorithm/APRConverter.hpp" +#include "data_structures/APR/access/LinearAccessCuda.hpp" + +#include "TestTools.hpp" + +namespace { + template + void fillPS(PullingScheme &aPS, PixelData &levels) { + PixelData levelsDS(ceil(levels.y_num / 2.0), ceil(levels.x_num / 2.0), ceil(levels.z_num / 2.0)); + LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); + } + +/** + * Prints PCT + * @param particleCellTree + */ + template + void printParticleCellTree(const std::vector> &particleCellTree) { + for (uint64_t l = 0; l < particleCellTree.size(); ++l) { + auto &tree = particleCellTree[l]; + tree.printMeshT(3, 0); + } + } + + /** + * Create PCT with provided data + * @param aprInfo + * @param levels complete list of values from level min to level max in form { {level, min, values}, ..., {level, max, values} } + * if levels are not provided PCT with EMPTY values is returned + * @return Particle Cell 
Tree with values (or with EMPTY if levels are not provided) + */ + auto makePCT(const GenInfo &aprInfo, std::initializer_list> levels) { + auto pct = PullingScheme::generateParticleCellTree(aprInfo); + + // Fill particle cell tree only if levels provided - otherwise return tree with EMPTY values + if (levels.size() != 0) { + + int l = aprInfo.l_min; + // PS levels range is [l_max - 1, l_min] + if (((aprInfo.l_max - 1) - aprInfo.l_min + 1) != (int) levels.size()) { + throw std::runtime_error("Wrong number of level data provided!"); + } + for (auto &level: levels) { + if (pct[l].getDimension().size() != level.size()) { + std::cerr << "Provided data for level=" << l << " differs from level size " + << pct[l].getDimension().size() << " vs. " << level.size() << std::endl; + std::cerr << aprInfo << std::endl; + throw std::runtime_error("Not this time..."); + } + std::copy(level.begin(), level.end(), pct[l].mesh.begin()); + l++; + } + } + return pct; + } + + // Copy PCT - copies only existing levels of it. + auto copyPCT(const std::vector> &pct) { + std::vector> copy; + copy.resize(pct.size()); + + for (size_t l = 0; l < pct.size(); ++l) { + copy[l].initWithResize(pct[l].y_num, pct[l].x_num, pct[l].z_num); + // Copy only existing levels + if (pct[l].z_num > 0) copy[l].copyFromMesh(pct[l]); + } + + return copy; + } + + // Create random Particle Cell Tree with dimensions specified in 'gi' with given number of particles. 
+ auto makeRandomPCT(const GenInfo &gi, int numOfParticles = 3) { + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + + // Generate random levels for PS and OVPC + PixelData levels(std::ceil(gi.org_dims[0]/2.0), + std::ceil(gi.org_dims[1]/2.0), + std::ceil(gi.org_dims[2]/2.0), + 0); + int seed = std::time(nullptr); + std::srand(seed); + for (int i = 0; i < numOfParticles; ++i) { + int modulo = (gi.l_max - gi.l_min); + if (modulo == 0) modulo = 1; + levels(std::rand() % levels.y_num, std::rand() % levels.x_num, std::rand() % levels.z_num) = std::rand() % modulo + gi.l_min; + } + fillPS(ps, levels); + ps.pulling_scheme_main(); + + return copyPCT(ps.getParticleCellTree()); + } + +} + +// TODO: There are still problems with computing of small (like 1D images in pipeline) +// belows test can be used to trigger those errors - should be fixed + +//TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_fullAprPipeline) { +// // TODO: delete me after development +// // Full 'get apr' pipeline to test imp. 
on different stages +// // Useful during debugging and can be removed once finished +// +// // Prepare input data (image) +// int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; +// // PS input values = 5 0 0 0 0 0 0 0 +// +//// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; +//// PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0}; +// +// int len = sizeof(values)/sizeof(int); +// PixelData data(len, 1, 1); +// initFromZYXarray(data, values); +// std::cout << "----- Input image:\n"; +// data.printMeshT(3, 1); +// +// // Produce APR +// APR apr; +// APRConverter aprConverter; +// aprConverter.par.rel_error = 0.1; +// aprConverter.par.lambda = 0.1; +// aprConverter.par.sigma_th = 0.0001; +// aprConverter.par.neighborhood_optimization = true; +// aprConverter.get_apr(apr, data); +// +// // Print information about APR and all particles +// std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; +// for (int l = apr.level_min(); l <= apr.level_max(); ++l) { +// std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; +// } +// std::cout << "APR particles z x y level:\n"; +// auto it = apr.iterator(); +// for (int level = it.level_min(); level <= it.level_max(); ++level) { +// for (int z = 0; z < it.z_num(level); z++) { +// for (int x = 0; x < it.x_num(level); ++x) { +// for (it.begin(level, z, x); it < it.end(); it++) { +// std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; +// } +// } +// } +// } +// std::cout << std::endl; +// +// // Sample input +// ParticleData particleIntensities; +// particleIntensities.sample_image(apr, data); +// +// // Reconstruct image from particles +// PixelData reconstructImg; +// APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); +// std::cout << "----- Reconstructed image:"< levelImg; +// 
APRReconstruction::reconstruct_level(apr, levelImg); +// std::cout << "----- Image levels:" << std::endl; +// levelImg.printMeshT(3, 1); +// +// // Show intensities and levels of each particle +// std::cout << "----- Particle intensities:\n"; +// for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; +// std::cout << std::endl; +// +// particleIntensities.fill_with_levels(apr); +// +// std::cout << "----- Particle levels:\n"; +// for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; +// std::cout << std::endl; +// +// // Show some general information about generated APR +// double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); +// std::cout << std::endl; +// std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; +// std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; +//} + + +//TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_PS) { +// // TODO: delete me after development +// // Runs PS to test imp. on different stages +// // Useful during debugging and can be removed once finished +//// int values[] = {0,0,0,5, 0,0,0,0}; +//// int len = sizeof(values)/sizeof(int); +// +// PixelData levels(8, 1, 1, 0); +// levels(5,0,0) = 1; +// +//// initFromZYXarray(levels, values); +// std::cout << "---------------\n"; +// levels.printMeshT(3, 1); +// std::cout << "---------------\n"; +// +// GenInfo gi; +// const PixelDataDim dim = levels.getDimension(); +// std::cout << "Levels dim: " << dim << std::endl; +// gi.init(dim.y * 2, dim.x * 1, dim.z * 1); // time two in y-direction since PS container is downsized. 
+// std::cout << gi << std::endl; +// +// APRTimer t(false); +// +// t.start_timer("PS1"); +// PullingScheme ps; +// ps.initialize_particle_cell_tree(gi); +// int l_max = gi.l_max - 1; +// int l_min = gi.l_min; +// std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; +// +// fillPS(ps, levels); +// +// std::cout << "---------- Filled PS tree\n"; +// printParticleCellTree(ps.getParticleCellTree()); +// std::cout << "---------------\n"; +// +// ps.pulling_scheme_main(); +// t.stop_timer(); +// +// // Useful during debugging and can be removed once finished +// std::cout << "----------PS:\n"; +// printParticleCellTree(ps.getParticleCellTree()); +// std::cout << "-------------\n"; +// +// LinearAccess linearAccess; +// linearAccess.genInfo = &gi; +// APRParameters par; +// par.neighborhood_optimization = true; +// linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); +// +// std::cout << gi << std::endl; +// auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; }; +// prt(linearAccess.y_vec); +// prt(linearAccess.xz_end_vec); +// prt(linearAccess.level_xz_vec); +// +// LinearIterator it(linearAccess, gi); +// for (int l = 0; l <= 3; l++) { +// std::cout << it.particles_level_begin(l) << " " << it.particles_level_end(l) << std::endl; +// } +// std::cout << "NumOfParticles: " << gi.total_number_particles << std::endl; +// +// std::cout << "===========================\n"; +// for (int level = it.level_min(); level <= it.level_max(); ++level) { +// for (int z = 0; z < it.z_num(level); z++) { +// for (int x = 0; x < it.x_num(level); ++x) { +// for (it.begin(level, z, x); it < it.end(); it++) { +// std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; +// } +// } +// } +// } +// std::cout << std::endl; +//} + +// 
********************************************************************************************************************* +// Tests of CUDA implementation of LinearAccess +// ********************************************************************************************************************* + + +TEST(LinearAccessCudaTest, optimizationForSmallLevels) { + // Tests optimized part of LinearAccess returning full-resolution for levels <= 2 + + // --- Create input data structures and objects + GenInfo gi; + gi.init(4, 3, 2); + auto pct = makePCT(gi, {}); // In that case values of PCT are not important (all dense particle data will be generated anyway) + + APRParameters par; + par.neighborhood_optimization = true; + + // --- Method under test + auto linearAccess = initializeLinearStructureCuda(gi, par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; // all 'y' particles for each xz + std::vector expected_xz_end_vec = {0, 0, 0, 4, 8, 12, 16, 20, 24}; + std::vector expected_level_xz_vec = {1, 1, 3, 9}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + // Also verify level_xz_vec against the expected values + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); + EXPECT_EQ(gi.total_number_particles, 4 * 3 * 2); +} + +TEST(LinearAccessCudaTest, optimizationForSmallLevelsVScpu) { + // Tests optimized part of LinearAccess returning full-resolution for levels <= 2 for all combinations of x, y, z in [1, 4] + // For bigger xyz 'optimized' part of code is not used + + for (int x = 1; x <= 4; ++x) { + for (int y = 1; y <= 4; ++y) { + for (int z = 1; z <= 4; ++z) { +// std::cout << "< ============================================= " << x << " " << y << " "<< z << std::endl; + // --- Create input data 
structures and objects + GenInfo gi; + gi.init(y, x, z); + + auto pct = makePCT(gi, {}); // In that case values of PCT are not important (all dense particle data will be generated anyway) + GenInfo giGpu; + giGpu.init(y, x, z); + auto pctGpu = makePCT(giGpu, {}); // In that case values of PCT are not important (all dense particle data will be generated anyway) + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = true; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pctGpu); + + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(giGpu.total_number_particles, gi.total_number_particles); + EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); + } + } + } + +} + +TEST(LinearAccessCudaTest, testGPUvsCPUforDifferentSizes) { + + for (int x : {1, 2, 4, 100, 255}) { + for (int y : {1, 2, 4, 100, 256}) { + for (int z : {1, 2, 4, 100, 257}) { +// std::cout << "< ============================================= " << y << " " << x << " "<< z << std::endl; + + // ----------- Create input data structures and objects + GenInfo gi; + gi.init(y, x, z); + + auto pct = makeRandomPCT(gi, 133); + + auto pctCpu = copyPCT(pct); + auto pctGpu = copyPCT(pct); + + GenInfo giGpu; + giGpu.init(y, x, z); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = true; + + + // --------- methods under test + APRTimer t(false); + t.start_timer("__________________________ CPU"); + // --- Method under test + linearAccess.initialize_linear_structure(par, pctCpu); + t.stop_timer(); + + t.start_timer("_________________________ GPU"); + auto linearAccessGpu = 
initializeLinearStructureCuda(giGpu, par, pctGpu); + t.stop_timer(); + + + // ----------- verify results + + // LinearAccess changes PCT - compare if changes in CPU and GPU side are same + EXPECT_EQ(compareParticleCellTrees(pctCpu, pctGpu), 0); + + // Test if returned structures have same data (all three vectors: y_vec, level_xz_vec, xz_end_vec) + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.xz_end_vec, linearAccess.xz_end_vec), 0); + + EXPECT_EQ(giGpu.total_number_particles, gi.total_number_particles); + EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); + } + } + } + +} + + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/LinearAccessTest.cpp b/test/LinearAccessTest.cpp new file mode 100644 index 00000000..b6c67db8 --- /dev/null +++ b/test/LinearAccessTest.cpp @@ -0,0 +1,248 @@ +#include + +#include "algorithm/PullingScheme.hpp" +#include "algorithm/LocalParticleCellSet.hpp" + +#include "TestTools.hpp" + + +/** + * Create PCT with provided data + * @param aprInfo + * @param levels complete list of values from level min to level max in form { {level, min, values}, ..., {level, max, values} } + * @return Particle Cell Tree with values + */ +auto makePCT(const GenInfo &aprInfo, std::initializer_list> levels) { + auto pct = PullingScheme::generateParticleCellTree(aprInfo); + + + int l = aprInfo.l_min; + + // PS levels range is [l_max - 1, l_min] + if (((aprInfo.l_max - 1) - aprInfo.l_min + 1) != (int) levels.size()) { + throw std::runtime_error("Wrong number of level data provided!"); + } + for (auto &level : levels) { + if (pct[l].getDimension().size() != level.size()) { + std::cerr << "Provided data for level=" << l << " differs from level size " << pct[l].getDimension().size() << " vs. 
" << level.size() << std::endl; + std::cerr << aprInfo << std::endl; + throw std::runtime_error("Not this time..."); + } + std::copy(level.begin(), level.end(), pct[l].mesh.begin()); + l++; + } + + return pct; +} + +TEST(LinearAccessTest, optimizationForSmallLevels) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(4, 3, 2); + auto pct = makePCT(gi, {{1, 2, 3, 4}}); // In that case values of PCT are not important (all dense particle data will be generated anyway) + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = true; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; // all 'y' particles for each xz + std::vector expected_xz_end_vec = {0, 0, 0, 4, 8, 12, 16, 20, 24}; + std::vector expected_level_xz_vec = {1, 1, 3, 9}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); +} + +TEST(LinearAccessTest, yDirNeighbourhoodOptTrue) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(16, 1, 1); + + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = true; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector expected_y_vec = {2, 3, 1, 2, 3, 0, 1}; + std::vector expected_xz_end_vec = {0, 0, 2, 5, 7}; + std::vector expected_level_xz_vec = {1, 1, 2, 3, 4, 5}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + 
EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); +} + +TEST(LinearAccessTest, yDirNeighbourhoodOptFalse) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(16, 1, 1); + + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = false; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector expected_y_vec = {2, 3, 2, 3, 0, 1, 2, 3}; + std::vector expected_xz_end_vec = {0, 0, 2, 4, 8}; + std::vector expected_level_xz_vec = {1, 1, 2, 3, 4, 5}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); +} + +TEST(LinearAccessTest, xDirNeighbourhoodOptTrue) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(1, 16, 1); + + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = true; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 0, 0, 0, 0, 0, 0}; + std::vector expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; + std::vector expected_level_xz_vec = {1, 1, 3, 7, 15, 31}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, 
linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); +} + +TEST(LinearAccessTest, xDirNeighbourhoodOptFalse) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(1, 16, 1); + + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = false; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 0, 0, 0, 0, 0, 0, 0}; + std::vector expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + std::vector expected_level_xz_vec = {1, 1, 3, 7, 15, 31}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); +} + +TEST(LinearAccessTest, zDirNeighbourhoodOptTrue) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(1, 1, 16); + + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = true; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 0, 0, 0, 0, 0, 0}; + std::vector expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; + std::vector expected_level_xz_vec = {1, 1, 3, 7, 15, 31}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + 
EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); +} + +TEST(LinearAccessTest, zDirNeighbourhoodOptFalse) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(1, 1, 16); + + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = false; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 0, 0, 0, 0, 0, 0, 0}; + std::vector expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + std::vector expected_level_xz_vec = {1, 1, 3, 7, 15, 31}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp new file mode 100644 index 00000000..39f8ff22 --- /dev/null +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -0,0 +1,643 @@ + +#include + +#include "algorithm/LocalIntensityScaleCuda.h" +#include "algorithm/LocalIntensityScale.hpp" +#include "TestTools.hpp" +#include "data_structures/Mesh/PixelDataCuda.h" + +namespace { + +#ifdef APR_USE_CUDA + + TEST(LocalIntensityScaleCudaTest, CPU_AND_GPU_TEST_X_DIR_VS_MANUALLY_CALCULATED_VALUES) { + // Belows data is precomputed for x-len = 5 (and maximum offset = 4) so do not change these numbers! 
+ constexpr PixelDataDim const dim{1, 5, 1}; + float expectedData[2][5][dim.x] = { + { // with no boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, // offset = 0 + {1.50, 2.00, 3.00, 4.00, 4.50}, // offset = 1 + {2.00, 2.50, 3.00, 3.50, 4.00}, // offset = 2 + {2.50, 3.00, 3.00, 3.00, 3.50}, // offset = 3 + {3.00, 3.00, 3.00, 3.00, 3.00} // offset = 4 + }, + { // with boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, + {1.66, 2.00, 3.00, 4.00, 4.33}, + {2.20, 2.40, 3.00, 3.60, 3.80}, + {2.71, 2.85, 3.00, 3.14, 3.28}, + {3.22, 3.11, 3.00, 2.88, 2.77} + } + }; + + APRTimer timer(false); // set to true to see timings + + PixelData m(dim); + float dataIn[] = {1, 2, 3, 4, 5}; + initFromZYXarray(m, dataIn); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 4; ++offset) { +// std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean X-DIR"); + lis.calc_sat_mean_x(mCpu, offset, (boundary > 0)); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + timer.stop_timer(); + + // Compare results + PixelData expected(dim); + initFromZYXarray(expected, expectedData[boundary][offset]); + EXPECT_EQ(compareMeshes(expected, mGpu, 0.01), 0); + EXPECT_EQ(compareMeshes(expected, mCpu, 0.01), 0); + + // Also GPU and CPU should give exactly same output + EXPECT_EQ(compareMeshes(mGpu, mCpu, 0), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_X_DIR_RANDOM_VALUES) { + APRTimer timer(false); + + constexpr PixelDataDim const dim{49, 53, 51}; + PixelData m = getRandInitializedMesh(dim, 50, 10); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary 
= 0 there is no reflected boundary + // boundary = 1 there is boundary reflect + for (int offset = 0; offset <= 6; ++offset) { +// std::cout << "------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + + PixelData mCpu; + mCpu.init(m); + mCpu.copyFromMesh(m); + timer.start_timer("CPU mean X-DIR"); + lis.calc_sat_mean_x(mCpu, offset, (boundary > 0)); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); // Expect exactly same results + } + } + } + + /** + * Generate input and expected output using a simple brute-force approach: each expected value is the mean of the + * input samples at indices [i - offset, i + offset]. + * With boundary enabled, out-of-range indices are mirrored about the first/last element; indices that are still + * out of range (and all out-of-range indices when boundary is disabled) are skipped. + * When comparing against CPU or GPU outputs a small error is expected because of the different order of + * floating-point operations. + * @tparam T - type of generated data + * @param len - length + * @param offset - offset for which expected output should be calculated + * @param boundary - use boundary? + * @param useRandomNumbers - use random numbers (uniform distribution constructed with (0.0, 10.0)) or, if false, index numbers [1..len] + * @return tuple of [input, expectedOutput] + */ + template + auto generateInputAndExpected(int len, int offset, bool boundary, bool useRandomNumbers) { + std::vector input(len); + std::vector expected(len); + + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_real_distribution dist(0.0, 10.0); + + // Fill input and calculate expected data + for (int i = 0; i < len; ++i) input[i] = useRandomNumbers ? 
dist(mt) : i + 1; + + for (int i = 0; i < len; ++i) { + int count = 0; + T sum = 0; + for (int x = i - offset; x <= i + offset; ++x) { + int currIdx = x; + if (boundary) { + currIdx = abs(x); + if (currIdx > len - 1) currIdx = (len - 1) - (currIdx - (len - 1)); + } + + if (currIdx < 0 || currIdx >= len) continue; + + sum += input[currIdx]; + count++; + } + expected[i] = sum / count; + } + return std::make_tuple(input, expected); + } + + TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_PRECOMPUTED_VALUES_X_DIR) { + // Input params + using T = float; + + for (int b = 0; b <= 1; b++) { + for (int len = 5; len <= 45; len += 20) { + for (int offset = 0; offset <= 6 && offset < len; offset++) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = b > 0; + bool useRandomNumbers = r > 0; +// std::cout << "========================> len=" << len << " offset=" << offset << " hasBoundary=" << hasBoundary << " useRandomNumbers=" << useRandomNumbers << std::endl; + + auto t = generateInputAndExpected(len, offset, hasBoundary, useRandomNumbers); + auto input = std::get<0>(t); + auto expected = std::get<1>(t); + PixelData m(1, len, 1, 0); + initFromZYXarray(m, input.data()); + PixelData expectedMesh(1, len, 1, 0); + initFromZYXarray(expectedMesh, expected.data()); + + APRTimer timer(false); + LocalIntensityScale lis; + + // Run on CPU old-impl + timer.start_timer("CPU X-DIR"); + PixelData mCpu(m, true); + lis.calc_sat_mean_x(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, hasBoundary); + timer.stop_timer(); + + // expectedMesh because of different order of calculation will have small floating-point differences + // comparing to CPU or GPU fast implementation, anyway GPU and CPU should have exactly same values! 
+ EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match"; + EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match"; + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; + } + } + } + } + } + + TEST(LocalIntensityScaleCudaTest, CPU_AND_GPU_TEST_Z_DIR_VS_MANUALLY_CALCULATED_VALUES) { + // The data below is precomputed for z-len = 5 (and maximum offset = 4) so do not change these numbers! + constexpr PixelDataDim const dim{1, 1, 5}; + float expectedData[2][5][dim.z] = { + { // with no boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, // offset = 0 + {1.50, 2.00, 3.00, 4.00, 4.50}, // offset = 1 + {2.00, 2.50, 3.00, 3.50, 4.00}, // offset = 2 + {2.50, 3.00, 3.00, 3.00, 3.50}, // offset = 3 + {3.00, 3.00, 3.00, 3.00, 3.00} // offset = 4 + }, + { // with boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, + {1.66, 2.00, 3.00, 4.00, 4.33}, + {2.20, 2.40, 3.00, 3.60, 3.80}, + {2.71, 2.85, 3.00, 3.14, 3.28}, + {3.22, 3.11, 3.00, 2.88, 2.77} + } + }; + + APRTimer timer(false); // set to true to see timings + + PixelData m(dim); + float dataIn[] = {1, 2, 3, 4, 5}; + initFromZYXarray(m, dataIn); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boundary = 1 there is boundary reflect + for (int offset = 0; offset <= 4; ++offset) { +// std::cout << "------------------ OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + + // Run on CPU (mean along Z) + PixelData mCpu(m, true); + timer.start_timer("CPU mean Z-DIR"); + lis.calc_sat_mean_z(mCpu, offset, (boundary > 0)); + timer.stop_timer(); + + // Run on GPU (mean along Z) + PixelData mGpu(m, true); + timer.start_timer("GPU mean Z-DIR"); + calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0)); + timer.stop_timer(); + + // Compare results + PixelData expected(dim); + initFromZYXarray(expected, expectedData[boundary][offset]); +
EXPECT_EQ(compareMeshes(expected, mGpu, 0.01), 0); + EXPECT_EQ(compareMeshes(expected, mCpu, 0.01), 0); + + // Also GPU and CPU should give exactly same output + EXPECT_EQ(compareMeshes(mGpu, mCpu, 0), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_Z_DIR_RANDOM_VALUES) { + APRTimer timer(false); + + constexpr PixelDataDim const dim{49, 51, 53}; + PixelData m = getRandInitializedMesh(dim, 50, 10); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 6; ++offset) { +// std::cout << "---------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + + PixelData mCpu; + mCpu.init(m); + mCpu.copyFromMesh(m); + timer.start_timer("CPU mean Z-DIR"); + lis.calc_sat_mean_z(mCpu, offset, (boundary > 0)); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Z-DIR"); + calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0)); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_PRECOMPUTED_VALUES_Z_DIR) { + // Input params + using T = float; + + for (int b = 0; b <= 1; b++) { + for (int len = 5; len <= 45; len += 20) { + for (int offset = 0; offset <= 6 && offset < len; offset++) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = b > 0; + bool useRandomNumbers = r > 0; +// std::cout << "========================> len=" << len << " offset=" << offset << " hasBoundary=" << hasBoundary << " useRandomNumbers=" << useRandomNumbers << std::endl; + + auto t = generateInputAndExpected(len, offset, hasBoundary, useRandomNumbers); + auto input = std::get<0>(t); + auto expected = std::get<1>(t); + PixelData m(1, 1, len, 0); + initFromZYXarray(m, input.data()); + PixelData expectedMesh(1, 1, len, 0); + initFromZYXarray(expectedMesh, 
expected.data()); + + APRTimer timer(false); + LocalIntensityScale lis; + + // Run on CPU old-impl + timer.start_timer("CPU Z-DIR"); + PixelData mCpu(m, true); + lis.calc_sat_mean_z(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU Z-DIR"); + calcMean(mGpu, offset, MEAN_Z_DIR, hasBoundary); + timer.stop_timer(); + + // expectedMesh because of different order of calculation will have small floating-point differences + // comparing to CPU or GPU fast implementation, anyway GPU and CPU should have exactly same values! + EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match"; + EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match"; + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; + } + } + } + } + } + + TEST(LocalIntensityScaleCudaTest, CPU_AND_GPU_TEST_Y_DIR_VS_MANUALLY_CALCULATED_VALUES) { + // Belows data is precomputed for y_len = 5 (and maximum offset = 4) so do not change these numbers! 
+ constexpr PixelDataDim const dim{5, 1, 1}; + float expectedData[2][5][dim.y] = { + { // with no boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, // offset = 0 + {1.50, 2.00, 3.00, 4.00, 4.50}, // offset = 1 + {2.00, 2.50, 3.00, 3.50, 4.00}, // offset = 2 + {2.50, 3.00, 3.00, 3.00, 3.50}, // offset = 3 + {3.00, 3.00, 3.00, 3.00, 3.00} // offset = 4 + }, + { // with boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, + {1.66, 2.00, 3.00, 4.00, 4.33}, + {2.20, 2.40, 3.00, 3.60, 3.80}, + {2.71, 2.85, 3.00, 3.14, 3.28}, + {3.22, 3.11, 3.00, 2.88, 2.77} + } + }; + + APRTimer timer(false); // set to true to see timings + + PixelData m(dim); + float dataIn[] = {1, 2, 3, 4, 5}; + initFromZYXarray(m, dataIn); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 4; ++offset) { + // std::cout << "------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean Y-DIR"); + lis.calc_sat_mean_y(mCpu, offset, (boundary > 0)); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Y-DIR"); + calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0)); + timer.stop_timer(); + + // Compare results + PixelData expected(dim); + initFromZYXarray(expected, expectedData[boundary][offset]); + EXPECT_EQ(compareMeshes(expected, mGpu, 0.01), 0); + EXPECT_EQ(compareMeshes(expected, mCpu, 0.01), 0); + + // Also GPU and CPU should give exactly same output + EXPECT_EQ(compareMeshes(mGpu, mCpu, 0), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_Y_DIR_RANDOM_VALUES) { + APRTimer timer(false); + + constexpr PixelDataDim const dim{49, 51, 53}; + PixelData m = getRandInitializedMesh(dim, 2, 0,false); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ 
boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 6; ++offset) { +// std::cout << "---------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + + PixelData mCpu(m, true); + timer.start_timer("CPU mean Y-DIR"); + lis.calc_sat_mean_y(mCpu, offset, (boundary > 0)); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Y-DIR"); + calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0)); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_PRECOMPUTED_VALUES_Y_DIR) { + // Input params + using T = float; + + for (int b = 0; b <= 1; b++) { + for (int len = 5; len <= 45; len += 20) { + for (int offset = 0; offset <= 6 && offset < len; offset++) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = b > 0; + bool useRandomNumbers = r > 0; +// std::cout << "========================> len=" << len << " offset=" << offset << " hasBoundary=" << hasBoundary << " useRandomNumbers=" << useRandomNumbers << std::endl; + + auto t = generateInputAndExpected(len, offset, hasBoundary, useRandomNumbers); + auto input = std::get<0>(t); + auto expected = std::get<1>(t); + PixelData m(len, 1, 1, 0); + initFromZYXarray(m, input.data()); + PixelData expectedMesh(len, 1, 1, 0); + initFromZYXarray(expectedMesh, expected.data()); + + APRTimer timer(false); + LocalIntensityScale lis; + + // Run on CPU old-impl + timer.start_timer("CPU Y-DIR"); + PixelData mCpu(m, true); + lis.calc_sat_mean_y(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU Y-DIR"); + calcMean(mGpu, offset, MEAN_Y_DIR, hasBoundary); + timer.stop_timer(); + + // expectedMesh because of different order of calculation will have small floating-point differences + // comparing to CPU or GPU fast implementation, anyway GPU 
and CPU should have exactly same values! + EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match"; + EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match"; + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; + } + } + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) { + APRTimer timer(false); + PixelData m = getRandInitializedMesh(33, 32, 31); + + LocalIntensityScale lis; + for (int boundary = 0; boundary <= 1; boundary++) { + for (int offset = 0; offset <= 6; ++offset) { + bool hasBoundary = (boundary > 0); +// std::cout << "========================> " << " offset=" << offset << " hasBoundary=" << hasBoundary << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean ALL-DIR"); + lis.calc_sat_mean_y(mCpu, offset, hasBoundary); + lis.calc_sat_mean_x(mCpu, offset, hasBoundary); + lis.calc_sat_mean_z(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean ALL-DIR"); + calcMean(mGpu, offset, MEAN_ALL_DIR, hasBoundary); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS_UINT16) { + APRTimer timer(false); + PixelData m = getRandInitializedMesh(33, 31, 13); + + LocalIntensityScale lis; + for (int boundary = 0; boundary <= 1; boundary++) { + for (int offset = 0; offset <= 6; ++offset) { + bool hasBoundary = (boundary > 0); +// std::cout << "========================> " << " offset=" << offset << " hasBoundary=" << hasBoundary << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean ALL-DIR"); + lis.calc_sat_mean_y(mCpu, offset, hasBoundary); + lis.calc_sat_mean_x(mCpu, offset, hasBoundary); + lis.calc_sat_mean_z(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + 
PixelData mGpu(m, true); + timer.start_timer("GPU mean ALL-DIR"); + calcMean(mGpu, offset, MEAN_ALL_DIR, hasBoundary); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { + APRTimer timer(false); + + for (int boundary = 0; boundary <= 1; ++boundary) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = (boundary > 0); + bool useRandomNumbers = (r > 0); + + PixelData m = getRandInitializedMesh(31, 33, 32, 25, 10, !useRandomNumbers); + + APRParameters params; + params.sigma_th = 1; + params.sigma_th_max = 2; + params.reflect_bc_lis = hasBoundary; + + // Run on CPU + PixelData mCpu(m, true); + PixelData mCpuTemp(m, false); + timer.start_timer("CPU LIS FULL"); + LocalIntensityScale().get_local_intensity_scale(mCpu, mCpuTemp, params); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + PixelData mGpuTemp(m, false); + timer.start_timer("GPU LIS FULL"); + getLocalIntensityScale(mGpu, mGpuTemp, params); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE_SUPER_SMALL) { + // In case of very small input image like 2x2x2 constant scale is being used + APRTimer timer(false); + + for (int boundary = 0; boundary <= 1; ++boundary) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = (boundary > 0); + bool useRandomNumbers = (r > 0); + + PixelData m = getRandInitializedMesh(2,2,2, 25, 10, !useRandomNumbers); + + APRParameters params; + params.sigma_th = 1; + params.sigma_th_max = 2; + params.reflect_bc_lis = hasBoundary; + + // Run on CPU + PixelData mCpu(m, true); + PixelData mCpuTemp(m, false); + timer.start_timer("CPU LIS FULL"); + LocalIntensityScale().get_local_intensity_scale(mCpu, mCpuTemp, params); + mCpu.printMesh(3,2); + timer.stop_timer(); + + // Run on GPU + PixelData 
mGpu(m, true); + PixelData mGpuTemp(m, false); + timer.start_timer("GPU LIS FULL"); + getLocalIntensityScale(mGpu, mGpuTemp, params); + timer.stop_timer(); + + // Compare results - only mGPU matters since mGpuTemp in case of constant scale is not modified + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE_CONSTANT_SCALE) { + APRTimer timer(false); + + for (int boundary = 0; boundary <= 1; ++boundary) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = (boundary > 0); + bool useRandomNumbers = (r > 0); + + PixelData m = getRandInitializedMesh(31, 33, 32, 25, 10, !useRandomNumbers); + + APRParameters params; + params.sigma_th = 1; + params.sigma_th_max = 2; + params.reflect_bc_lis = hasBoundary; + params.constant_intensity_scale = true; + + // Run on CPU + PixelData mCpu(m, true); + PixelData mCpuTemp(m, false); + timer.start_timer("CPU LIS FULL"); + LocalIntensityScale().get_local_intensity_scale(mCpu, mCpuTemp, params); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + PixelData mGpuTemp(m, false); + timer.start_timer("GPU LIS FULL"); + getLocalIntensityScale(mGpu, mGpuTemp, params); + timer.stop_timer(); + + // Compare results + // NOTE: mCpuTemp and mGpuTemp are not checked since in case of + // constant_intensity_scale they are not set to any value + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + +#endif // APR_USE_CUDA +} + + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/LocalIntensityScaleTest.cpp b/test/LocalIntensityScaleTest.cpp index a9f1558b..09a6466b 100644 --- a/test/LocalIntensityScaleTest.cpp +++ b/test/LocalIntensityScaleTest.cpp @@ -5,9 +5,6 @@ #include #include "data_structures/Mesh/PixelData.hpp" #include "algorithm/LocalIntensityScale.hpp" -#include "algorithm/LocalIntensityScaleCuda.h" -#include "data_structures/APR/APR.hpp" -#include "algorithm/APRConverter.hpp"
#include "TestTools.hpp" @@ -24,7 +21,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_y(m, 0); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=1 @@ -37,7 +34,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_y(m, 1); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=2 (+symmetricity check) @@ -50,7 +47,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_y(m, 2); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); // check if data in opposite order gives same result float dataIn2[] = {24,21,18,15,12,9,6,3}; @@ -60,7 +57,7 @@ namespace { lis.calc_sat_mean_y(m, 2); - ASSERT_TRUE(compare(m, expect2, 0.05)); + ASSERT_TRUE(compare(m, expect2, 0.000001)); } } @@ -76,7 +73,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_x(m, 0); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=1 @@ -89,7 +86,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_x(m, 1); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=2 (+symmetricity check) @@ -102,7 +99,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_x(m, 2); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); // check if data in opposite order gives same result float dataIn2[] = {24,21,18,15,12,9,6,3}; @@ -112,7 +109,7 @@ namespace { lis.calc_sat_mean_x(m, 2); - ASSERT_TRUE(compare(m, expect2, 0.05)); + ASSERT_TRUE(compare(m, expect2, 0.000001)); } } @@ -128,7 +125,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_z(m, 0); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=1 @@ -141,7 +138,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_z(m, 1); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // 
OFFSET=2 (+symmetricity check) @@ -154,7 +151,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_z(m, 2); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); // check if data in opposite order gives same result float dataIn2[] = {24,21,18,15,12,9,6,3}; @@ -164,221 +161,10 @@ namespace { lis.calc_sat_mean_z(m, 2); - ASSERT_TRUE(compare(m, expect2, 0.05)); + ASSERT_TRUE(compare(m, expect2, 0.000001)); } } - -// ============================================================================ -// ==================== CUDA IMPL TESTS ============================= -// ============================================================================ - -#ifdef APR_USE_CUDA - - TEST(LocalIntensityScaleCudaTest, 1D_Y_DIR) { - { // OFFSET=0 - - PixelData m(8, 1, 1, 0); - float dataIn[] = {3,6,9,12,15,18,21,24}; - float expect[] = {3,6,9,12,15,18,21,24}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 0, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - } - { // OFFSET=1 - - PixelData m(8, 1, 1, 0); - float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8}; - float expect[] = {1.5, 2, 3, 4, 5, 6, 7, 7.5}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 1, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - } - { // OFFSET=2 (+symmetricity check) - - PixelData m(8, 1, 1, 0); - float dataIn[] = {3,6,9,12,15,18,21,24}; - float expect[] = {6, 7.5, 9, 12, 15, 18, 19.5, 21}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 2, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - - // check if data in opposite order gives same result - float dataIn2[] = {24,21,18,15,12,9,6,3}; - float expect2[] = {21, 19.5, 18, 15,12, 9, 7.5, 6}; - - initFromZYXarray(m, dataIn2); - - calcMean(m, 2, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect2, 0.05)); - } - } - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Y_DIR) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(33, 31, 13); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; 
++offset) { - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean Y-DIR"); - lis.calc_sat_mean_y(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean Y-DIR"); - calcMean(mGpu, offset, MEAN_Y_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } - - TEST(LocalIntensityScaleCudaTest, 1GPU_VS_CPU_X_DIR) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(33, 31, 13); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean X-DIR"); - lis.calc_sat_mean_x(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean X-DIR"); - calcMean(mGpu, offset, MEAN_X_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Z_DIR) { - APRTimer timer(true); - using ImgType = float; - PixelData m = getRandInitializedMesh(310, 330, 13, 255); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean Z-DIR"); - lis.calc_sat_mean_z(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean Z-DIR"); - calcMean(mGpu, offset, MEAN_Z_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(33, 31, 13); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean ALL-DIR"); - lis.calc_sat_mean_y(mCpu, offset); - lis.calc_sat_mean_x(mCpu, offset); - lis.calc_sat_mean_z(mCpu, offset); - timer.stop_timer(); - - // 
Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean ALL-DIR"); - calcMean(mGpu, offset); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } - - //@KG: The CPU code doesn't work for uint16 --> overflow will likely result. - -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS_UINT16) { -// APRTimer timer(true); -// PixelData m = getRandInitializedMesh(33, 31, 13); -// -// LocalIntensityScale lis; -// for (int offset = 0; offset < 6; ++offset) { -// // Run on CPU -// PixelData mCpu(m, true); -// timer.start_timer("CPU mean ALL-DIR"); -// lis.calc_sat_mean_y(mCpu, offset); -// lis.calc_sat_mean_x(mCpu, offset); -// lis.calc_sat_mean_z(mCpu, offset); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean ALL-DIR"); -// calcMean(mGpu, offset); -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 1), 0); -// } -// } - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(310, 330, 13, 25); - - APRParameters params; - params.sigma_th = 1; - params.sigma_th_max = 2; - params.reflect_bc_lis = false; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. - - // Run on CPU - PixelData mCpu(m, true); - PixelData mCpuTemp(m, false); - timer.start_timer("CPU LIS FULL"); - - LocalIntensityScale localIntensityScale; - - localIntensityScale.get_local_intensity_scale(mCpu, mCpuTemp, params); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - PixelData mGpuTemp(m, false); - timer.start_timer("GPU LIS ALL-DIR"); - getLocalIntensityScale(mGpu, mGpuTemp, params); - timer.stop_timer(); - - // Compare results - //EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0.01), 0); //this is not needed these values are not required. 
- EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - -#endif // APR_USE_CUDA - } int main(int argc, char **argv) { diff --git a/test/MeshDataTest.cpp b/test/MeshDataTest.cpp index 869229e3..20b1bbe3 100644 --- a/test/MeshDataTest.cpp +++ b/test/MeshDataTest.cpp @@ -5,6 +5,7 @@ #include "data_structures/Mesh/PixelData.hpp" #include "data_structures/Mesh/PixelDataCuda.h" #include +#include "TestTools.hpp" namespace { @@ -34,6 +35,7 @@ namespace { ASSERT_EQ(d.x, 20); ASSERT_EQ(d.z, 30); ASSERT_EQ(d.size(), 10*20*30); + ASSERT_EQ(d.maxDimSize(), 30); } { // adding int to all dims @@ -80,6 +82,16 @@ namespace { ASSERT_FALSE(x == z); ASSERT_TRUE(x != z); } + { // number of dimensions + const PixelDataDim x = {2, 3, 5}; + const PixelDataDim y = {2, 1, 5}; + const PixelDataDim z = {1, 4, 1}; + const PixelDataDim w = {1, 1, 1}; + ASSERT_EQ(x.numOfDimensions(), 3); + ASSERT_EQ(y.numOfDimensions(), 2); + ASSERT_EQ(z.numOfDimensions(), 1); + ASSERT_EQ(w.numOfDimensions(), 0); + } } TEST_F(VectorDataTest, InitTest) { @@ -337,6 +349,16 @@ namespace { ASSERT_EQ(md.mesh.size(), 100*200*300); } + // size provided + { + PixelDataDim dim(100, 200, 300); + PixelData md(dim); + ASSERT_EQ(md.x_num, 200); + ASSERT_EQ(md.y_num, 100); + ASSERT_EQ(md.z_num, 300); + ASSERT_EQ(md.mesh.size(), 100*200*300); + } + // mesh provided { // generate some data @@ -675,51 +697,7 @@ namespace { } #ifdef APR_USE_CUDA -namespace { - /** - * Compares two meshes - * @param expected - * @param tested - * @param maxNumOfErrPrinted - how many error values should be printed (-1 for all) - * @return number of errors detected - */ - template - int compareMeshes(const PixelData &expected, const PixelData &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { - int cnt = 0; - for (size_t i = 0; i < expected.mesh.size(); ++i) { - if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError || std::isnan(expected.mesh[i]) || - std::isnan(tested.mesh[i])) { - if (cnt < maxNumOfErrPrinted || 
maxNumOfErrPrinted == -1) { - std::cout << "ERROR expected vs tested mesh: " << expected.mesh[i] << " vs " << tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; - } - cnt++; - } - } - std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl; - return cnt; - } - /** - * Generates mesh with provided dims with random values in range [0, 1] * multiplier - * @param y - * @param x - * @param z - * @param multiplier - * @return - */ - template - PixelData getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, bool useIdxNumbers = false) { - PixelData m(y, x, z); - std::cout << "Mesh info: " << m << std::endl; - std::random_device rd; - std::mt19937 mt(rd()); - std::uniform_real_distribution dist(0.0, 1.0); - for (size_t i = 0; i < m.mesh.size(); ++i) { - m.mesh[i] = useIdxNumbers ? i : dist(mt) * multiplier; - } - return m; - } -} TEST(MeshDataSimpleTest, DownSampleCuda) { { // reduce/constant_operator calculate maximum value when downsampling PixelData m(5, 6, 4); @@ -773,10 +751,10 @@ TEST(MeshDataSimpleTest, DownSampleCuda) { EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); } { - APRTimer timer(true); + APRTimer timer(false); // reduce/constant_operator calculate average value of pixels when downsampling - PixelData m = getRandInitializedMesh(33, 22, 21); + PixelData m = getRandInitializedMesh(33, 22, 21, 100, 5); for (size_t i = 0; i < m.mesh.size(); ++i) m.mesh[i] = 27 - i; PixelData mCpu; mCpu.initDownsampled(m); @@ -792,7 +770,7 @@ TEST(MeshDataSimpleTest, DownSampleCuda) { downsampleMeanCuda(m, mGpu); timer.stop_timer(); - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); } } #endif diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp new file mode 100644 index 00000000..bd24156e --- /dev/null +++ b/test/PullingSchemeCudaTest.cpp @@ -0,0 +1,254 @@ +#include + +#include "algorithm/PullingScheme.hpp" +#include "algorithm/OVPC.h" + 
+#include "algorithm/PullingSchemeCuda.hpp" +#include "algorithm/ComputeGradientCuda.hpp" +#include "algorithm/LocalParticleCellSet.hpp" + +#include "TestTools.hpp" + +// Class for storing expected values for one element of Particle Cell Tree (output of Pulling Scheme) +class LevelData { +public: + int level; + int y; + int x; + int z; + uint8_t expectedType; // seed, boundary, filler... +}; + +/** + * Verify computed Particle Cell Tree (PCT) vs expected values + * Expected values should list all data for types=1,2,3 (seed, boundary, filler) which are used to generate particles: + * {levels, y,x,z(position), type} + * All other values are ignored (and used by Pulling Scheme (PS) only for intermediate calculations) + * @param aPCT - PCT produced by PS (note: values in PCT will be changed during verification!) + * @param expectedValues expected values + * @return true if correct, false otherwise + */ +template +bool verifyParticleCellTree(std::vector> &aPCT, const std::vector &expectedValues) { + + const uint8_t AlreadyCheckedMark = 255; + const uint8_t MaxValueOfImportantType = FILLER_TYPE; // All types above are used by PS during computation phase only + + for (const auto &r : expectedValues) { + // std::cout << r.level << " " << r.y << "," << r.x << "," << r.z << " " << (int)r.expectedType << std::endl; + + auto &v = aPCT[r.level](r.y, r.x, r.z); + // Add dim. checks for accessing pct + if (v == r.expectedType) { + v = AlreadyCheckedMark; + } + else { + std::cout << "Error!
Data on level=" << r.level << " at (" << r.y << "," << r.x << "," << r.z << ") expected=" << (int)r.expectedType << " got=" << (int)v << std::endl; + return false; + } + } + + for (size_t level = 0; level < aPCT.size(); level++) { + auto &d = aPCT[level]; + auto y_num = d.y_num; + auto x_num = d.x_num; + auto z_num = d.z_num; + + for (int j = 0; j < z_num; j++) { + for (int i = 0; i < x_num; i++) { + for (int k = 0; k < y_num; k++) { + const auto &v = d(k, i, j); + if (v != AlreadyCheckedMark && v <= MaxValueOfImportantType && v > 0) { + std::cout << "Error! Data on level = " << level << " at (" << k << "," << i << "," << j << ") with value = " << (int)v << " not verified or bad!" << std::endl; + return false; + } + } + } + } + } + + return true; +} + +/** + * Prints PCT + * @param particleCellTree + */ +template +void printParticleCellTree(const std::vector> &particleCellTree) { + for (uint64_t l = 0; l < particleCellTree.size(); ++l) { + auto &tree = particleCellTree[l]; +// std::cout << "-- level = " << l << ", " << tree << std::endl; + tree.printMeshT(3,0); + } +} + +template +void fillPS(PullingScheme &aPS, PixelData &levels) { + PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); + LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); +} + + +// ------------------------------------------------------------------------------------------------------------------------------------------- + +TEST(PullingSchemeTest, PSvsOVPCCUDA) { + // Generates random levels in a 3D cube and then compares generated output levels in PS and OVPC + GenInfo gi; + gi.init(255, 157, 257); + + // Generate random levels for PS and OVPC + PixelData levels(std::ceil(gi.org_dims[0]/2.0), + std::ceil(gi.org_dims[1]/2.0), + std::ceil(gi.org_dims[2]/2.0), + 0); + // Add a few particles only - it will end up with Pulling Scheme generate particles on (almost) all + // levels - good case to compare with OVPC + const int 
numOfParticles = 3; + std::srand(std::time(nullptr)); + for (int i = 0; i < numOfParticles; ++i) { + levels(std::rand() % levels.y_num, std::rand() % levels.x_num, std::rand() % levels.z_num) = gi.l_max; + } + PixelData levelsOVPC(levels, true); // just copy 'levels' + PixelData levelsPS(levels, true); + + // Initialize all needed objects + APRTimer t(true); + + t.start_timer("PS - init"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + fillPS(ps, levelsPS); + t.stop_timer(); + t.start_timer("PS - compute"); + ps.pulling_scheme_main(); + t.stop_timer(); + + // Run test methods and compare results + t.start_timer("OVPCCUDA - compute"); + auto pct = computeOvpcCuda(levelsOVPC, gi); + t.stop_timer(); + + // -------------- Verify result + ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), pct), 0); +} + + +TEST(PullingSchemeTest, OVPCCUDA_Ydir) { + // Prepare input data for PS + float values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(len, 1, 1); // <-- Y-dir + initFromZYXarray(levels, values); // <-- Y-dir + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, dim.x, dim.z); // <-- Y-dir + + int levelMax = gi.l_max - 1; + int levelMin = gi.l_min; + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPCCUDA - compute"); + auto pct = computeOvpcCuda(levels, gi); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 1,0,0, 2}, + {3, 2,0,0, 3}, + {3, 3,0,0, 3}, + + {2, 2,0,0, 3}, + {2, 3,0,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(pct, ev)); +} + +TEST(PullingSchemeTest, OVPCCUDA_Xdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, len, 1); 
// <-- X-dir + initFromZYXarray(levels, values); + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, 2 * dim.x, dim.z); // <-- X-dir + + int levelMax = gi.l_max - 1; + int levelMin = gi.l_min; + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPCCUDA - compute"); + auto pct = computeOvpcCuda(levels, gi); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,1,0, 2}, + {3, 0,2,0, 3}, + {3, 0,3,0, 3} , + + {2, 0,2,0, 3}, + {2, 0,3,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(pct, ev)); +} + +TEST(PullingSchemeTest, OVPCCUDA_Zdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, 1, len); // <-- Z-dir + initFromZYXarray(levels, values); + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, dim.x, 2 * dim.z); // <-- Z-dir + + int levelMax = gi.l_max - 1; + int levelMin = gi.l_min; + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPCCUDA - compute"); + auto pct = computeOvpcCuda(levels, gi); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,0,1, 2}, + {3, 0,0,2, 3}, + {3, 0,0,3, 3} , + + {2, 0,0,2, 3}, + {2, 0,0,3, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(pct, ev)); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index f72897cd..eeee9718 100644 --- a/test/PullingSchemeTest.cpp +++ 
b/test/PullingSchemeTest.cpp @@ -4,109 +4,509 @@ #include #include "data_structures/Mesh/PixelData.hpp" -//TODO: only APRAccess.hpp should be included here but currently because of dependencies it does not work :( -#include "data_structures/APR/APR.hpp" -#include "algorithm/APRConverter.hpp" -//#include "data_structures/APR/APRAccess.hpp" +#include "data_structures/APR/access/APRAccessStructures.hpp" #include "algorithm/PullingScheme.hpp" +#include "algorithm/OVPC.h" #include "TestTools.hpp" -#ifdef APR_USE_CUDA -#include "algorithm/ComputeGradientCuda.hpp" -#endif +#include "algorithm/LocalParticleCellSet.hpp" + namespace { + + // ================================================================================================================= + // ======== Some test helpers + // ================================================================================================================= + + /** + * Prints PCT + * @param particleCellTree + */ template - PixelData generateLevels(const PixelData &dimsMesh, int maxLevel) { - PixelData levels(dimsMesh, false); - for (size_t i = 0; i < levels.mesh.size(); ++i) { - levels.mesh[i] = ( i/2 ) % (maxLevel + 2); + void printParticleCellTree(const std::vector> &particleCellTree) { + for (uint64_t l = 0; l < particleCellTree.size(); ++l) { + auto &tree = particleCellTree[l]; +// std::cout << "-- level = " << l << ", " << tree << std::endl; + tree.printMeshT(3,0); } -// std::cout << "LEVELS: " << std::endl; - levels.printMesh(3, 0); - return levels; } -// void printParticleCellTree(const std::vector> &particleCellTree) { -// for (int l = 0; l < particleCellTree.size(); ++l) { -// auto &tree = particleCellTree[l]; -// std::cout << "------ 1level=" << l << " " << tree << std::endl; -// tree.printMesh(3,0); -// } -// } + // Class for storing expected values for one element of Particle Cell Tree (output of Pulling Scheme) + class LevelData { + public: + int level; + int y; + int x; + int z; + uint8_t expectedType; // seed, 
boundary, filler... + }; - TEST(PullingSchemeTest, Init) { + /** + * Verify computed Particle Cell Tree (PCT) vs expected values + * Expected values should list all data for types=1,2,3 (seed, boundary, filler) which are used to generate particles: + * {levels, y,x,z(position), type} + * All other values are ignored (and used by Pulling Scheme (PS) only for intermediate calculations) + * @param aPCT - PCT produced by PS (note: values in PCT will be changed during verification!) + * @param expectedValues expected values + * @return true if correct, false otherwise + */ + template + bool verifyParticleCellTree(std::vector> &aPCT, const std::vector &expectedValues) { - GenInfo aprInfo; + const uint8_t AlreadyCheckedMark = 255; + const uint8_t MaxValueOfImportantType = FILLER_TYPE; // All types above are used by PS during computation phase only - aprInfo.l_max = 4; - aprInfo.l_min = 2; - aprInfo.org_dims[0] = 8; - aprInfo.org_dims[1] = 16; - aprInfo.org_dims[2] = 1; + for (const auto &r : expectedValues) { + // std::cout << r.level << " " << r.y << "," << r.x << "," << r.z << " " << (int)r.expectedType << std::endl; - PullingScheme ps; - ps.initialize_particle_cell_tree(aprInfo); - std::vector> &pctree = ps.getParticleCellTree(); - - // TEST: check if zeroed and correct number of levels - ASSERT_EQ(aprInfo.l_max, pctree.size()); // all levels [0, access.level_max - 1] - for (size_t l = 0; l < pctree.size(); ++l) { - auto &tree = pctree[l]; - for (auto &e : tree.mesh) { - ASSERT_EQ(0, e); + auto &v = aPCT[r.level](r.y, r.x, r.z); + // Add dim. checks for accessing pct + if (v == r.expectedType) { + v = AlreadyCheckedMark; + } + else { + std::cout << "Error!
Data on level=" << r.level << " at (" << r.y << "," << r.x << "," << r.z << ") expected=" << (int)r.expectedType << " got=" << (int)v << std::endl; + return false; } } - // Generate mesh with test levels - PixelData levels = generateLevels(pctree[aprInfo.l_max - 1], aprInfo.l_max); - - // Fill particle cell tree with levels - int l_max = aprInfo.l_max - 1; - int l_min = aprInfo.l_min; - ps.fill(l_max, levels); - - PixelData levelsDS; - for(int l_ = l_max - 1; l_ >= l_min; l_--){ - //down sample the resolution level k, using a max reduction - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, true); - ps.fill(l_,levelsDS); - levels.swap(levelsDS); + for (size_t level = 0; level < aPCT.size(); level++) { + auto &d = aPCT[level]; + auto y_num = d.y_num; + auto x_num = d.x_num; + auto z_num = d.z_num; + + for (int j = 0; j < z_num; j++) { + for (int i = 0; i < x_num; i++) { + for (int k = 0; k < y_num; k++) { + const auto &v = d(k, i, j); + if (v != AlreadyCheckedMark && v <= MaxValueOfImportantType && v > 0) { + std::cout << "Error! Data on level = " << level << " at (" << k << "," << i << "," << j << ") with value = " << (int)v << " not verified or bad!" 
<< std::endl; + return false; + } + } + } + } } -// -// printParticleCellTree(pctree); -// ps.fill_neighbours(l_max); -// pctree[l_max].printMesh(3, 0); -// ps.pulling_scheme_main(); -// printParticleCellTree(pctree); + + return true; + } + + template + void fillPS(PullingScheme &aPS, PixelData &levels) { + PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); + LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); + } + + // ================================================================================================================= + // ======== Pulling Scheme algorithm tests + // ================================================================================================================= + TEST(PullingSchemeTest, PullingScheme1D_Ydir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(len, 1, 1); // <-- Y-dir + initFromZYXarray(levels, values); // <-- Y-dir + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, dim.x, dim.z); // <-- Y-dir + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("PS - initialize with data"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + fillPS(ps, levels); + t.stop_timer(); + + t.start_timer("PS - compute"); + ps.pulling_scheme_main(); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 1,0,0, 2}, + {3, 2,0,0, 3}, + {3, 3,0,0, 3}, + + {2, 2,0,0, 3}, + {2, 3,0,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } + + TEST(PullingSchemeTest, PullingScheme1D_Xdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = 
sizeof(values)/sizeof(int); + PixelData levels(1, len, 1); // <-- X-dir + initFromZYXarray(levels, values); + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, 2 * dim.x, dim.z); // <-- X-dir + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("PS - initialize with data"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + fillPS(ps, levels); + t.stop_timer(); + + t.start_timer("PS - compute"); + ps.pulling_scheme_main(); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,1,0, 2}, + {3, 0,2,0, 3}, + {3, 0,3,0, 3} , + + {2, 0,2,0, 3}, + {2, 0,3,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } + + TEST(PullingSchemeTest, PullingScheme1D_Zdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, 1, len); // <-- Z-dir + initFromZYXarray(levels, values); + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, dim.x, 2 * dim.z); // <-- Z-dir + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("PS - initialize with data"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + fillPS(ps, levels); + t.stop_timer(); + + t.start_timer("PS - compute"); + ps.pulling_scheme_main(); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,0,1, 2}, + {3, 0,0,2, 3}, + {3, 0,0,3, 3} , + + {2, 0,0,2, 3}, + {2, 0,0,3, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } + 
+ TEST(PullingSchemeTest, PullingScheme3D_smallCube) { + // Prepare input data for PS + PixelData levels(3, 3, 3, 0); + levels(2, 2, 2) = 3; + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, 2 * dim.x, 2 * dim.z); + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("PS - initialize with data"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + fillPS(ps, levels); + t.stop_timer(); + + t.start_timer("PS - compute"); + ps.pulling_scheme_main(); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {2, 0,0,0, 3}, + {2, 0,1,0, 3}, + {2, 0,2,0, 3}, + {2, 1,0,0, 3}, + {2, 1,1,0, 3}, + {2, 1,2,0, 3}, + {2, 2,0,0, 3}, + {2, 2,1,0, 3}, + {2, 2,2,0, 3}, + + {2, 0,0,1, 3}, + {2, 0,1,1, 3}, + {2, 0,2,1, 3}, + {2, 1,0,1, 3}, + {2, 1,1,1, 2}, + {2, 1,2,1, 2}, + {2, 2,0,1, 3}, + {2, 2,1,1, 2}, + {2, 2,2,1, 2}, + + {2, 0,0,2, 3}, + {2, 0,1,2, 3}, + {2, 0,2,2, 3}, + {2, 1,0,2, 3}, + {2, 1,1,2, 2}, + {2, 1,2,2, 2}, + {2, 2,0,2, 3}, + {2, 2,1,2, 2}, + {2, 2,2,2, 1}, + + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } + + // ================================================================================================================= + // ======== OVPC - Optimal Valid Particle Cell - alternative version of original Pulling Scheme algorithm + // ================================================================================================================= + TEST(PullingSchemeTest, OVPC_Ydir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(len, 1, 1); // <-- Y-dir + initFromZYXarray(levels, values); // <-- Y-dir + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is 
representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, dim.x, dim.z); // <-- Y-dir + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPC - initialize"); + OVPC ps(gi, levels); + t.stop_timer(); + t.start_timer("OVPC - compute"); + ps.generateTree(); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 1,0,0, 2}, + {3, 2,0,0, 3}, + {3, 3,0,0, 3}, + + {2, 2,0,0, 3}, + {2, 3,0,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } + + TEST(PullingSchemeTest, OVPC_Xdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, len, 1); // <-- X-dir + initFromZYXarray(levels, values); + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, 2 * dim.x, dim.z); // <-- X-dir + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPC - initialize"); + OVPC ps(gi, levels); + t.stop_timer(); + t.start_timer("OVPC - compute"); + ps.generateTree(); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,1,0, 2}, + {3, 0,2,0, 3}, + {3, 0,3,0, 3} , + + {2, 0,2,0, 3}, + {2, 0,3,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); } -#ifdef APR_USE_CUDA - TEST(PullingSchemeTest, computeLevels) { - using ImgType = float; - const int maxLevel = 3; - const float relError = 0.1; - PixelData grad = getRandInitializedMesh(10, 20, 33); - PixelData localIntensityScaleCpu = getRandInitializedMesh(10, 20, 33); + TEST(PullingSchemeTest, OVPC_Zdir) { + // Prepare input data for PS + int 
values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, 1, len); // <-- Z-dir + initFromZYXarray(levels, values); - PixelData localIntensityScaleGpu(localIntensityScaleCpu, true); - PixelData elo(localIntensityScaleCpu, true); - APRTimer timer(true); + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, dim.x, 2 * dim.z); // <-- Z-dir - LocalParticleCellSet localParticleCellSet; + // Initialize all needed objects + APRTimer t(false); - timer.start_timer("CPU PS FULL"); - localParticleCellSet.computeLevels(grad, localIntensityScaleCpu, maxLevel, relError,1,1,1); - timer.stop_timer(); + t.start_timer("OVPC - initialize"); + OVPC ps(gi, levels); + t.stop_timer(); + t.start_timer("OVPC - compute"); + ps.generateTree(); + t.stop_timer(); - timer.start_timer("GPU PS FULL"); - computeLevelsCuda(grad, localIntensityScaleGpu, maxLevel, relError); - timer.stop_timer(); + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,0,1, 2}, + {3, 0,0,2, 3}, + {3, 0,0,3, 3} , - EXPECT_EQ(compareMeshes(localIntensityScaleCpu, localIntensityScaleGpu), 0); + {2, 0,0,2, 3}, + {2, 0,0,3, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } + + TEST(PullingSchemeTest, OVPC_smallCube) { + // Prepare input data for PS + PixelData levels(3, 3, 3, 0); + levels(2, 2, 2) = 3; + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, 2 * dim.x, 2 * dim.z); + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPC - initialize"); + OVPC ps(gi, levels); + t.stop_timer(); + t.start_timer("OVPC - 
compute"); + ps.generateTree(); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {2, 0,0,0, 3}, + {2, 0,1,0, 3}, + {2, 0,2,0, 3}, + {2, 1,0,0, 3}, + {2, 1,1,0, 3}, + {2, 1,2,0, 3}, + {2, 2,0,0, 3}, + {2, 2,1,0, 3}, + {2, 2,2,0, 3}, + + {2, 0,0,1, 3}, + {2, 0,1,1, 3}, + {2, 0,2,1, 3}, + {2, 1,0,1, 3}, + {2, 1,1,1, 2}, + {2, 1,2,1, 2}, + {2, 2,0,1, 3}, + {2, 2,1,1, 2}, + {2, 2,2,1, 2}, + + {2, 0,0,2, 3}, + {2, 0,1,2, 3}, + {2, 0,2,2, 3}, + {2, 1,0,2, 3}, + {2, 1,1,2, 2}, + {2, 1,2,2, 2}, + {2, 2,0,2, 3}, + {2, 2,1,2, 2}, + {2, 2,2,2, 1}, + + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); } -#endif + + + // ================================================================================================================= + // ======== PS vs OVPC + // ================================================================================================================= + + TEST(PullingSchemeTest, PSvsOVPC) { + // Generates random levels in a 3D cube and then compares generated output levels in PS and OVPC + GenInfo gi; + gi.init(255, 257, 199); + + // Generate random levels for PS and OVPC + PixelData levels(std::ceil(gi.org_dims[0]/2.0), + std::ceil(gi.org_dims[1]/2.0), + std::ceil(gi.org_dims[2]/2.0), + 0); + // Add a few particles only - it will end up with Pulling Scheme generate particles on (almost) all + // levels - good case to compare with OVPC + const int numOfParticles = 3; + std::srand(std::time(nullptr)); + for (int i = 0; i < numOfParticles; ++i) { + levels(std::rand() % levels.y_num, std::rand() % levels.x_num, std::rand() % levels.z_num) = gi.l_max; + } + PixelData levelsOVPC(levels, true); // just copy 'levels' + APRTimer t(false); + + // Run test methods and compare results + t.start_timer("OVPC - init"); + OVPC nps(gi, levelsOVPC); + t.stop_timer(); + t.start_timer("OVPC compute"); + nps.generateTree(); + t.stop_timer(); + + + t.start_timer("PS - init"); + PullingScheme ps; + 
ps.initialize_particle_cell_tree(gi); + fillPS(ps, levels); + t.stop_timer(); + t.start_timer("PS - compute"); + ps.pulling_scheme_main(); + t.stop_timer(); + + ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), nps.getParticleCellTree()), 0); + } + } int main(int argc, char **argv) { diff --git a/test/TestTools.hpp b/test/TestTools.hpp index f323d2bb..158bf2ea 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -8,6 +8,8 @@ #include "data_structures/Mesh/PixelData.hpp" #include +#include "data_structures/APR/particles/ParticleData.hpp" + std::string get_source_directory_apr(){ // returns path to the directory where utils.cpp is stored @@ -44,7 +46,7 @@ inline bool compare(PixelData &mesh, const float *data, const float epsilon) } template -inline bool initFromZYXarray(PixelData &mesh, const float *data) { +inline bool initFromZYXarray(PixelData &mesh, const T *data) { size_t dataIdx = 0; for (int z = 0; z < mesh.z_num; ++z) { for (int y = 0; y < mesh.y_num; ++y) { @@ -65,17 +67,42 @@ inline bool initFromZYXarray(PixelData &mesh, const float *data) { * @return number of errors detected */ template -inline int compareMeshes(const PixelData &expected, const PixelData &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { +inline int compareMeshes(const PixelData &expected, const PixelData &tested, double maxError = 0, int maxNumOfErrPrinted = 3) { + if (expected.getDimension() != tested.getDimension()) { + std::stringstream errMsg; + errMsg << "Dimensions of expected and tested meshes differ! 
" << expected.getDimension() << " vs " << tested.getDimension(); + throw std::runtime_error(errMsg.str()); + } + int cnt = 0; + double maxErrorFound = 0; + T maxErrorExpectedValue = 0; + T maxErrorTestedValue = 0; + std::string maxErrorIdx = ""; + for (size_t i = 0; i < expected.mesh.size(); ++i) { - if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError) { + auto diff = std::abs(expected.mesh[i] - tested.mesh[i]); + if (diff > maxError) { if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; + std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested mesh: " + << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] + << " error = " << (float)expected.mesh[i] - (float)tested.mesh[i] << " IDX:" << i << "=" << tested.getStrIndex(i) << std::endl; } cnt++; } + if (diff > maxErrorFound) { + maxErrorFound = diff; + maxErrorExpectedValue = expected.mesh[i]; + maxErrorTestedValue = tested.mesh[i]; + maxErrorIdx = tested.getStrIndex(i); + } + } + if (cnt != 0) { + std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() + << ", maxErrorFound = " << maxErrorFound << " at IDX: " << maxErrorIdx << " " + << maxErrorExpectedValue << " vs " << maxErrorTestedValue + << "(" << (100*(long double)maxErrorFound/(long double)maxErrorExpectedValue) << "%)"< &expected, const PixelData &teste template inline int64_t compareParticles(const ParticleTypeA &expected, const ParticleTypeB &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 10) { int64_t cnt = 0; - if(expected.size() != tested.size()) { - std::cerr << "ERROR compareParticles: sizes differ!" << std::endl; - cnt++; + if (expected.size() != tested.size()) { + std::cerr << "ERROR compareParticles: sizes differs! " << expected.size() << " vs. 
" << tested.size() << std::endl; + return 1; // Return any number > 0 to indicate an error } for (size_t i = 0; i < expected.size(); ++i) { - if (std::abs(expected[i] - tested[i]) > maxError) { + if (std::abs((double)(expected[i] - tested[i])) > maxError) { if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "ERROR expected vs tested particle: " << (float)expected[i] << " vs " << (float)tested[i] << " IDX:" << i << std::endl; + std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested particle: " << (float)expected[i] << " vs " << (float)tested[i] << " IDX:" << i << std::endl; } cnt++; } } - std::cout << "Number of errors / all points: " << cnt << " / " << expected.size() << std::endl; + if (cnt != 0) { + std::cout << "Number of errors / all points: " << cnt << " / " << expected.size() << std::endl; + } return cnt; } +/** + * Compares two Particle Cell Trees + * @param expected - expected levels + * @param tested - levels to verify + * @param maxError + * @param maxNumOfErrPrinted - how many error outputs should be printed + * @param maxTypeCompared - maximum type to be compared + * @return + */ +template +int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, bool printErrors = true, int maxNumOfErrPrinted = 3, uint8_t maxTypeCompared = FILLER_TYPE) { + int cntGlobal = 0; + for (size_t level = 0; level < expected.size(); level++) { + int cnt = 0; + int numOfParticles = 0; + for (size_t i = 0; i < expected[level].mesh.size(); ++i) { + if (expected[level].mesh[i] < 8 && tested[level].mesh[i] <= maxTypeCompared) { + if (std::abs(expected[level].mesh[i] - tested[level].mesh[i]) > 0 || std::isnan(expected[level].mesh[i]) || + std::isnan(tested[level].mesh[i])) { + if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { + std::cout << "Level: " << level <<" ERROR expected vs tested mesh: " << (float) expected[level].mesh[i] << " vs " + << (float) tested[level].mesh[i] << " IDX:" << 
tested[level].getStrIndex(i) << std::endl; + } + cnt++; + } + if (expected[level].mesh[i] > 0) numOfParticles++; + } + } + cntGlobal += cnt; + if (cnt > 0 && printErrors) std::cout << "Level: " << level << ", Number of errors / all points: " << cnt << " / " << expected[level].mesh.size() << " Particles:" << numOfParticles << std::endl; + } + return cntGlobal; +} /** - * Generates mesh with provided dims with random values in range [0, 1] * multiplier + * Generates mesh with provided dims with random values in range [0, 1] * multiplier + offset * @param y * @param x * @param z * @param multiplier + * @param offset + * @param useIdxNumbers - instead of random values put values from 0..sizeof(mesh)-1 * @return */ template -inline PixelData getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, bool useIdxNumbers = false) { +inline PixelData getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, float offset=0.0, bool useIdxNumbers = false) { PixelData m(y, x, z); - std::cout << "Mesh info: " << m << std::endl; +// std::cout << "Mesh info: " << m << std::endl; std::random_device rd; std::mt19937 mt(rd()); std::uniform_real_distribution dist(0.0, 1.0); + #ifdef HAVE_OPENMP #pragma omp parallel for default(shared) #endif for (size_t i = 0; i < m.mesh.size(); ++i) { - m.mesh[i] = useIdxNumbers ? i : dist(mt) * multiplier; + m.mesh[i] = useIdxNumbers ? 
i + 1 : dist(mt) * multiplier + offset; + } + return m; +} + +/** + * Generates mesh with provided dims with random values in range [0, 1] * multiplier + offset + * @param dim - dimension of generated mesh + * @param multiplier + * @param offset + * @param useIdxNumbers - instead of random values put values from 0..sizeof(mesh)-1 + * @return + */ +template +inline PixelData getRandInitializedMesh(PixelDataDim dim, float multiplier = 2.0f, float offset=0.0, bool useIdxNumbers = false) { + return getRandInitializedMesh(dim.y, dim.x, dim.z, multiplier, offset, useIdxNumbers); +} + +/** + * Generate mesh with square blob in the center of it with values randomly chosen from [20,40] range. Zero values outside. + * @tparam T + * @param y + * @param x + * @param z + * @return + */ +template +inline PixelData getMeshWithBlobInMiddle(int y, int x, int z) { + PixelData m(y, x, z, 0); + + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_real_distribution dist(0.0, 1.0); + + for (int yi = (1.0/3 * y); yi < (2.0/3 * y); yi++) { + for (int xi = (1.0/3 * x); xi < (2.0/3 * x); xi++) { + for (int zi = (1.0/3 * z); zi < (2.0/3 * z); zi++) { + m(yi, xi, zi) = 30 + dist(mt) * 10; + } + } } + return m; } +/** + * Generate mesh with square blob in the center of it with values randomly chosen from [20,40] range. Zero values outside. + * @tparam T + * @param dim + * @return + */ +template +inline PixelData getMeshWithBlobInMiddle(const PixelDataDim &dim) { + return getMeshWithBlobInMiddle(dim.y, dim.x, dim.z); +} + struct TestBenchStats{ double inf_norm=0;