From e6aa9c94dc774692760d102e3ec36ea572e43c94 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 1 Aug 2022 14:39:27 +0200 Subject: [PATCH 01/59] Bspline filters fixed for CUDA pipeline --- src/algorithm/APRConverter.hpp | 10 +- src/algorithm/ComputeGradient.hpp | 392 +++++++++------------------ src/algorithm/ComputeGradientCuda.cu | 133 +++++---- src/algorithm/bsplineXdir.cuh | 56 ++-- src/algorithm/bsplineYdir.cuh | 110 +++++--- src/algorithm/bsplineZdir.cuh | 55 ++-- src/algorithm/cudaMisc.cuh | 66 +++++ test/CMakeLists.txt | 2 +- test/ComputeGradientCudaTest.cpp | 155 +++++++++++ test/ComputeGradientTest.cpp | 166 +----------- test/TestTools.hpp | 10 +- 11 files changed, 589 insertions(+), 566 deletions(-) create mode 100644 src/algorithm/cudaMisc.cuh create mode 100644 test/ComputeGradientCudaTest.cpp diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp index d6728f5f..988bf337 100644 --- a/src/algorithm/APRConverter.hpp +++ b/src/algorithm/APRConverter.hpp @@ -233,8 +233,9 @@ void APRConverter::computeL(APR& aAPR,PixelData& input_image){ //////////////////////// fine_grained_timer.start_timer("offset image"); - //offset image by factor (this is required if there are zero areas in the background with uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) - // Warning both of these could result in over-flow (if your image is non zero, with a 'buffer' and has intensities up to uint16_t maximum value then set image_type = "", i.e. uncomment the following line) + // offset image by factor (this is required if there are zero areas in the background with + // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) + // Warning both of these could result in over-flow! 
if (std::is_same::value) { bspline_offset = 100; @@ -461,8 +462,9 @@ inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input ///////////////////////////////// /// Pipeline //////////////////////// - //offset image by factor (this is required if there are zero areas in the background with uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) - // Warning both of these could result in over-flow (if your image is non zero, with a 'buffer' and has intensities up to uint16_t maximum value then set image_type = "", i.e. uncomment the following line) + // offset image by factor (this is required if there are zero areas in the background with + // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) + // Warning both of these could result in over-flow! if (std::is_same::value) { bspline_offset = 100; diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp index 53c3d7cd..911013b1 100644 --- a/src/algorithm/ComputeGradient.hpp +++ b/src/algorithm/ComputeGradient.hpp @@ -65,6 +65,20 @@ class ComputeGradient { inline float impulse_resp_back(float k, float rho, float omg, float gamma, float c0); + typedef struct { + std::vector bc1_vec; + std::vector bc2_vec; + std::vector bc3_vec; + std::vector bc4_vec; + size_t k0; + float b1; + float b2; + float norm_factor; + size_t minLen; + } BsplineParams; + + BsplineParams prepareBSplineParams(size_t dimLen, float lambda, float tol, int maxFilterLen = -1); + }; template @@ -208,81 +222,45 @@ void ComputeGradient::get_smooth_bspline_3D(PixelData& input, float lambda) { inline float ComputeGradient::impulse_resp(float k,float rho,float omg){ // Impulse Response Function - return (pow(rho,(std::abs(k)))*sin((std::abs(k) + 1)*omg)) / sin(omg); + return (powf(rho,(std::abs(k)))*sinf((std::abs(k) + 1)*omg)) / sinf(omg); } inline float ComputeGradient::impulse_resp_back(float k,float rho,float omg,float gamma,float c0){ // Impulse Response 
Function (nominator eq. 4.8, denominator from eq. 4.7) - return c0*pow(rho,std::abs(k))*(cos(omg*std::abs(k)) + gamma*sin(omg*std::abs(k)))*(1.0/(pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2))); -} - - -/** - * floating point output -> no rounding or under-/overflow check - */ -template -std::enable_if_t::value, T> -round(float val, size_t &errCount) { - return val; -} - -/** - * integer output -> check for under-/overflow and round - */ -template -std::enable_if_t::value, T> -round(float val, size_t &errCount) { - - val = std::round(val); - - if(val < std::numeric_limits::min() || val > std::numeric_limits::max()) { - errCount++; - } - return val; + return c0*powf(rho,std::abs(k))*(cosf(omg*std::abs(k)) + gamma*sinf(omg*std::abs(k)))*(1.0/(powf((1 - 2.0*rho*cosf(omg) + pow(rho,2)),2))); } - - -template -void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float tol, int k0Len) { - // - // Bevan Cheeseman 2016 - // - // Recursive Filter Implimentation for Smoothing BSplines +ComputeGradient::BsplineParams ComputeGradient::prepareBSplineParams(size_t dimLen, float lambda, float tol, int maxFilterLen) { + // Recursive Filter Implementation for Smoothing BSplines // B-Spline Signal Processing: Part 11-Efficient Design and Applications, Unser 1993 - float xi = 1 - 96*lambda + 24*lambda*sqrt(3 + 144*lambda); // eq 4.6 - float rho = (24*lambda - 1 - sqrt(xi))/(24*lambda)*sqrt((1/xi)*(48*lambda + 24*lambda*sqrt(3 + 144*lambda))); // eq 4.5 - float omg = atan(sqrt((1/xi)*(144*lambda - 1))); // eq 4.6 + float xi = 1 - 96*lambda + 24*lambda * sqrtf(3 + 144*lambda); + float rho = (24*lambda - 1 - sqrtf(xi)) / (24*lambda) * sqrtf((1/xi) * (48*lambda + 24*lambda * sqrtf(3 + 144*lambda))); + float omg = atan(sqrtf((1/xi) * (144*lambda - 1))); + float c0 = (1 + powf(rho,2)) / (1-powf(rho,2)) * (1 - 2*rho * cosf(omg) + powf(rho,2)) / (1 + 2*rho*cosf(omg) + powf(rho,2)); + float gamma = (1 - powf(rho,2)) / (1+powf(rho,2)) * (1 / tan(omg)); - float c0 = (1+ 
pow(rho,2))/(1-pow(rho,2)) * (1 - 2*rho*cos(omg) + pow(rho,2))/(1 + 2*rho*cos(omg) + pow(rho,2)); // eq 4.8 - float gamma = (1-pow(rho,2))/(1+pow(rho,2)) * (1/tan(omg)); // eq 4.8 + const float b1 = 2*rho*cosf(omg); + const float b2 = -powf(rho,2.0); - const float b1 = 2*rho*cos(omg); - const float b2 = -pow(rho,2.0); + const size_t idealK0Len = ceil(std::abs(logf(tol) / logf(rho))); + const size_t k0 = maxFilterLen > 0 ? maxFilterLen : idealK0Len; + const size_t minLen = maxFilterLen > 0 ? maxFilterLen : std::min(idealK0Len, dimLen); - const size_t z_num = image.z_num; - const size_t x_num = image.x_num; - const size_t y_num = image.y_num; -// const size_t minLen = y_num; - const size_t minLen = k0Len > 0 ? k0Len : std::min((size_t)(ceil(std::abs(log(tol)/log(rho)))),y_num); - - const size_t k0 = k0Len > 0 ? k0Len : (size_t)(ceil(std::abs(log(tol)/log(rho)))); + const float norm_factor = powf((1 - 2.0*rho*cosf(omg) + powf(rho,2)),2); + std::cout << std::fixed << std::setprecision(9) << "CPU xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << std::endl; - const float norm_factor = pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2); -// std::cout << "CPUy xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl; // for boundaries - std::vector impulse_resp_vec_f(k0+3); //forward - for (size_t k = 0; k < (k0+3); ++k) { - impulse_resp_vec_f[k] = impulse_resp(k,rho,omg); + std::vector impulse_resp_vec_f(k0+1); //forward + for (size_t k = 0; k < (k0+1); ++k) { + impulse_resp_vec_f[k] = impulse_resp(k, rho, omg); } - - std::vector impulse_resp_vec_b(k0+3); //backward - for (size_t k = 0; k < (k0+3); ++k) { - impulse_resp_vec_b[k] = impulse_resp_back(k,rho,omg,gamma,c0); + std::vector impulse_resp_vec_b(k0+1); //backward + for (size_t k = 0; k < 
(k0+1); ++k) { + impulse_resp_vec_b[k] = impulse_resp_back(k, rho, omg, gamma, c0); } std::vector bc1_vec(k0, 0); //forward @@ -291,9 +269,8 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (size_t k = 0; k < k0; ++k) { bc1_vec[k] += impulse_resp_vec_f[k+1]; } - //assumes a constant value at the end of the filter when the required ghost is bigger then the image - for(size_t k = (minLen); k < k0;k++){ + for (size_t k = minLen; k < k0; k++) { bc1_vec[minLen-1] += bc1_vec[k]; } @@ -302,8 +279,7 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (size_t k = 0; k < k0; ++k) { bc2_vec[k] = impulse_resp_vec_f[k]; } - - for(size_t k = (minLen); k < k0;k++){ + for (size_t k = minLen; k < k0; k++) { bc2_vec[minLen-1] += bc2_vec[k]; } @@ -313,8 +289,7 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (size_t k = 0; k < (k0-1); ++k) { bc3_vec[k+1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k+2]; } - - for(size_t k = (minLen); k < k0;k++){ + for (size_t k = minLen; k < k0;k++) { bc3_vec[minLen-1] += bc3_vec[k]; } @@ -324,11 +299,64 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (size_t k = 1; k < k0; ++k) { bc4_vec[k] += 2*impulse_resp_vec_b[k]; } - - for(size_t k = (minLen); k < k0;k++){ + for (size_t k = minLen; k < k0; k++) { bc4_vec[minLen-1] += bc4_vec[k]; } + return BsplineParams { + std::move(bc1_vec), + std::move(bc2_vec), + std::move(bc3_vec), + std::move(bc4_vec), + k0, + b1, + b2, + norm_factor, + minLen + }; +} + +/** + * floating point output -> no rounding or under-/overflow check + */ +template +std::enable_if_t::value, T> +round(float val, size_t &errCount) { + return val; +} + +/** + * integer output -> check for under-/overflow and round + */ +template +std::enable_if_t::value, T> +round(float val, size_t &errCount) { + + val = std::round(val); + + if(val < std::numeric_limits::min() || val > std::numeric_limits::max()) 
{ + errCount++; + std::cout << val << " " << (float)std::numeric_limits::min() << " " << (float)std::numeric_limits::max() << std::endl; + } + return val; +} + + + +template +void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float tol, int k0Len) { + // + // Bevan Cheeseman 2016 + // + // Recursive Filter Implementation for Smoothing BSplines + // B-Spline Signal Processing: Part 11-Efficient Design and Applications, Unser 1993 + + const size_t z_num = image.z_num; + const size_t x_num = image.x_num; + const size_t y_num = image.y_num; + + auto p = prepareBSplineParams(y_num, lambda, tol, k0Len); + APRTimer btime; btime.verbose_flag = false; @@ -350,37 +378,35 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float const size_t iynum = x * y_num; //boundary conditions - for (size_t k = 0; k < minLen; ++k) { - temp1 += bc1_vec[k]*image.mesh[jxnumynum + iynum + k]; - temp2 += bc2_vec[k]*image.mesh[jxnumynum + iynum + k]; + for (size_t k = 0; k < p.minLen; ++k) { + temp1 += p.bc1_vec[k]*image.mesh[jxnumynum + iynum + k]; + temp2 += p.bc2_vec[k]*image.mesh[jxnumynum + iynum + k]; } //boundary conditions - for (size_t k = 0; k < minLen; ++k) { - temp3 += bc3_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k]; - temp4 += bc4_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k]; + for (size_t k = 0; k < p.minLen; ++k) { + temp3 += p.bc3_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k]; + temp4 += p.bc4_vec[k]*image.mesh[jxnumynum + iynum + y_num - 1 - k]; } //initialize the sequence - image.mesh[jxnumynum + iynum + 0] = temp2; - image.mesh[jxnumynum + iynum + 1] = temp1; + image.mesh[jxnumynum + iynum + 0] = round(temp2, error_count); + image.mesh[jxnumynum + iynum + 1] = round(temp1, error_count); for (auto it = (image.mesh.begin()+jxnumynum + iynum + 2); it != (image.mesh.begin()+jxnumynum + iynum + y_num); ++it) { - float temp = temp1*b1 + temp2*b2 + *it; + + float temp = temp1*p.b1 + temp2*p.b2 + *it; *it = 
round(temp, error_count); temp2 = temp1; temp1 = temp; } - image.mesh[jxnumynum + iynum + y_num - 2] = round(temp3*norm_factor, error_count); - image.mesh[jxnumynum + iynum + y_num - 1] = round(temp4*norm_factor, error_count); - - + image.mesh[jxnumynum + iynum + y_num - 2] = round(temp3*p.norm_factor, error_count); + image.mesh[jxnumynum + iynum + y_num - 1] = round(temp4*p.norm_factor, error_count); } } btime.stop_timer(); - btime.start_timer("backward_loop_y"); #ifdef HAVE_OPENMP #pragma omp parallel for default(shared) reduction(+: error_count) @@ -391,13 +417,12 @@ void ComputeGradient::bspline_filt_rec_y(PixelData& image,float lambda,float for (int64_t i = x_num - 1; i >= 0; --i) { const size_t iynum = i * y_num; - float temp2 = image.mesh[jxnumynum + iynum + y_num - 1]/norm_factor; - float temp1 = image.mesh[jxnumynum + iynum + y_num - 2]/norm_factor; + float temp2 = image.mesh[jxnumynum + iynum + y_num - 1]/p.norm_factor; + float temp1 = image.mesh[jxnumynum + iynum + y_num - 2]/p.norm_factor; for (auto it = (image.mesh.begin()+jxnumynum + iynum + y_num-3); it != (image.mesh.begin()+jxnumynum + iynum-1); --it) { - float temp = temp1*b1 + temp2*b2 + *it; - - *it = round(temp*norm_factor, error_count); + float temp = temp1*p.b1 + temp2*p.b2 + *it; + *it = round(temp*p.norm_factor, error_count); temp2 = temp1; temp1 = temp; @@ -417,90 +442,13 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float // // Bevan Cheeseman 2016 // - // Recursive Filter Implimentation for Smoothing BSplines - - float xi = 1 - 96*lambda + 24*lambda*sqrt(3 + 144*lambda); - float rho = (24*lambda - 1 - sqrt(xi))/(24*lambda)*sqrt((1/xi)*(48*lambda + 24*lambda*sqrt(3 + 144*lambda))); - float omg = atan(sqrt((1/xi)*(144*lambda - 1))); - float c0 = (1+ pow(rho,2))/(1-pow(rho,2)) * (1 - 2*rho*cos(omg) + pow(rho,2))/(1 + 2*rho*cos(omg) + pow(rho,2)); - float gamma = (1-pow(rho,2))/(1+pow(rho,2)) * (1/tan(omg)); - - const float b1 = 2*rho*cos(omg); - const float b2 = 
-pow(rho,2.0); + // Recursive Filter Implementation for Smoothing BSplines const size_t z_num = image.z_num; const size_t x_num = image.x_num; const size_t y_num = image.y_num; - //const size_t minLen = std::min(z_num, std::min(x_num, y_num)); - //const size_t minLen = z_num; - - const size_t minLen = k0Len > 0 ? k0Len : std::min((size_t)(ceil(std::abs(log(tol)/log(rho)))), z_num); - const size_t k0 = k0Len > 0 ? k0Len :(size_t)(ceil(std::abs(log(tol)/log(rho)))); - - const float norm_factor = pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2); -// std::cout << "CPUz xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl; - - ////////////////////////////////////////////////////////////// - // - // Setting up boundary conditions - // - ////////////////////////////////////////////////////////////// - - std::vector impulse_resp_vec_f(k0+3); //forward - for (size_t k = 0; k < (k0+3);k++){ - impulse_resp_vec_f[k] = impulse_resp(k,rho,omg); - } - - std::vector impulse_resp_vec_b(k0+3); //backward - for (size_t k = 0; k < (k0+3);k++){ - impulse_resp_vec_b[k] = impulse_resp_back(k,rho,omg,gamma,c0); - } - - std::vector bc1_vec(k0, 0); //forward - //y(1) init - bc1_vec[1] = impulse_resp_vec_f[0]; - for(size_t k = 0; k < k0; k++){ - bc1_vec[k] += impulse_resp_vec_f[k+1]; - } - - //assumes a constant value at the end of the filter when the required ghost is bigger then the image - for(size_t k = (minLen); k < k0;k++){ - bc1_vec[minLen-1] += bc1_vec[k]; - } - - - std::vector bc2_vec(k0, 0); //backward - //y(0) init - for(size_t k = 0; k < k0; k++){ - bc2_vec[k] = impulse_resp_vec_f[k]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc2_vec[minLen-1] += bc2_vec[k]; - } - - std::vector bc3_vec(k0, 0); //forward - //y(N-1) init - bc3_vec[0] = impulse_resp_vec_b[1]; - for(size_t k = 0; k < (k0-1); k++){ - bc3_vec[k+1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k+2]; - } - - 
for(size_t k = (minLen); k < k0;k++){ - bc3_vec[minLen-1] += bc3_vec[k]; - } - - std::vector bc4_vec(k0, 0); //backward - //y(N) init - bc4_vec[0] = impulse_resp_vec_b[0]; - for(size_t k = 1; k < k0; k++){ - bc4_vec[k] += 2*impulse_resp_vec_b[k]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc4_vec[minLen-1] += bc4_vec[k]; - } + auto p = prepareBSplineParams(z_num, lambda, tol, k0Len); //forwards direction std::vector temp_vec1(y_num,0); @@ -523,18 +471,18 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float size_t iynum = i * y_num; - for (size_t j = 0; j < minLen; ++j) { + for (size_t j = 0; j < p.minLen; ++j) { size_t index = j * x_num * y_num + iynum; #ifdef HAVE_OPENMP #pragma omp simd #endif for (int64_t k = y_num - 1; k >= 0; k--) { //forwards boundary condition - temp_vec1[k] += bc1_vec[j] * image.mesh[index + k]; - temp_vec2[k] += bc2_vec[j] * image.mesh[index + k]; + temp_vec1[k] += p.bc1_vec[j] * image.mesh[index + k]; + temp_vec2[k] += p.bc2_vec[j] * image.mesh[index + k]; //backwards boundary condition - temp_vec3[k] += bc3_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k]; - temp_vec4[k] += bc4_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k]; + temp_vec3[k] += p.bc3_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k]; + temp_vec4[k] += p.bc4_vec[j] * image.mesh[(z_num - 1 - j)*x_num*y_num + iynum + k]; } } @@ -557,7 +505,7 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float #pragma omp simd #endif for (size_t k = 0; k < y_num; ++k) { - temp_vec2[k] = round(1.0f*image.mesh[index + k] + b1*temp_vec1[k] + b2*temp_vec2[k], error_count); + temp_vec2[k] = round(image.mesh[index + k] + p.b1*temp_vec1[k] + p.b2*temp_vec2[k], error_count); } std::swap(temp_vec1, temp_vec2); @@ -568,12 +516,12 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float //initialization for (int64_t k = y_num - 1; k >= 0; --k) { //y(N) - image.mesh[(z_num - 
1)*x_num*y_num + iynum + k] = round(temp_vec4[k]*norm_factor, error_count); + image.mesh[(z_num - 1)*x_num*y_num + iynum + k] = round(temp_vec4[k]*p.norm_factor, error_count); } for (int64_t k = y_num - 1; k >= 0; --k) { //y(N-1) - image.mesh[(z_num - 2)*x_num*y_num + iynum + k] = round(temp_vec3[k]*norm_factor, error_count); + image.mesh[(z_num - 2)*x_num*y_num + iynum + k] = round(temp_vec3[k]*p.norm_factor, error_count); } //main loop @@ -584,8 +532,8 @@ void ComputeGradient::bspline_filt_rec_z(PixelData& image,float lambda,float #pragma omp simd #endif for (int64_t k = y_num - 1; k >= 0; --k) { - float temp = (image.mesh[index + k] + b1*temp_vec3[k] + b2*temp_vec4[k]); - image.mesh[index + k] = round(temp*norm_factor, error_count); + float temp = (image.mesh[index + k] + p.b1*temp_vec3[k] + p.b2*temp_vec4[k]); + image.mesh[index + k] = round(temp*p.norm_factor, error_count); temp_vec4[k] = temp_vec3[k]; temp_vec3[k] = temp; } @@ -605,85 +553,11 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& image,float lambda,float // // Recursive Filter Implimentation for Smoothing BSplines - float xi = 1 - 96*lambda + 24*lambda*sqrt(3 + 144*lambda); - float rho = (24*lambda - 1 - sqrt(xi))/(24*lambda)*sqrt((1/xi)*(48*lambda + 24*lambda*sqrt(3 + 144*lambda))); - float omg = atan(sqrt((1/xi)*(144*lambda - 1))); - float c0 = (1+ pow(rho,2))/(1-pow(rho,2)) * (1 - 2*rho*cos(omg) + pow(rho,2))/(1 + 2*rho*cos(omg) + pow(rho,2)); - float gamma = (1-pow(rho,2))/(1+pow(rho,2)) * (1/tan(omg)); - - const float b1 = 2*rho*cos(omg); - const float b2 = -pow(rho,2.0); - const size_t z_num = image.z_num; const size_t x_num = image.x_num; const size_t y_num = image.y_num; -// const size_t minLen = x_num; - const size_t minLen = k0Len > 0 ? k0Len : std::min((size_t)(ceil(std::abs(log(tol)/log(rho)))), x_num); - const size_t k0 = k0Len > 0 ? 
k0Len : ((size_t)(ceil(std::abs(log(tol)/log(rho))))); - const float norm_factor = pow((1 - 2.0*rho*cos(omg) + pow(rho,2)),2); - -// std::cout << "CPUx xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl; - - ////////////////////////////////////////////////////////////// - // - // Setting up boundary conditions - // - ////////////////////////////////////////////////////////////// - - std::vector impulse_resp_vec_f(k0+3); //forward - for (size_t k = 0; k < (k0+3);k++){ - impulse_resp_vec_f[k] = impulse_resp(k,rho,omg); - } - - std::vector impulse_resp_vec_b(k0+3); //backward - for (size_t k = 0; k < (k0+3);k++){ - impulse_resp_vec_b[k] = impulse_resp_back(k,rho,omg,gamma,c0); - } - - std::vector bc1_vec(k0, 0); //forward - //y(1) init - bc1_vec[1] = impulse_resp_vec_f[0]; - for(size_t k = 0; k < k0;k++){ - bc1_vec[k] += impulse_resp_vec_f[k+1]; - } - - //assumes a constant value at the end of the filter when the required ghost is bigger then the image - for(size_t k = (minLen); k < k0;k++){ - bc1_vec[minLen-1] += bc1_vec[k]; - } - - std::vector bc2_vec(k0, 0); //backward - //y(0) init - for(size_t k = 0; k < k0;k++){ - bc2_vec[k] = impulse_resp_vec_f[k]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc2_vec[minLen-1] += bc2_vec[k]; - } - - std::vector bc3_vec(k0, 0); //forward - //y(N-1) init - bc3_vec[0] = impulse_resp_vec_b[1]; - for(size_t k = 0; k < (k0-1);k++){ - bc3_vec[k+1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k+2]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc3_vec[minLen-1] += bc3_vec[k]; - } - - std::vector bc4_vec(k0, 0); //backward - //y(N) init - bc4_vec[0] = impulse_resp_vec_b[0]; - for(size_t k = 1; k < k0;k++){ - bc4_vec[k] += 2*impulse_resp_vec_b[k]; - } - - for(size_t k = (minLen); k < k0;k++){ - bc4_vec[minLen-1] += bc4_vec[k]; - } + auto p = prepareBSplineParams(x_num, lambda, tol, k0Len); //forwards direction @@ -705,15 
+579,15 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& image,float lambda,float size_t jxnumynum = j * y_num * x_num; - for (size_t i = 0; i < minLen; ++i) { + for (size_t i = 0; i < p.minLen; ++i) { for (size_t k = 0; k < y_num; ++k) { //forwards boundary condition - temp_vec1[k] += bc1_vec[i]*image.mesh[jxnumynum + i*y_num + k]; - temp_vec2[k] += bc2_vec[i]*image.mesh[jxnumynum + i*y_num + k]; + temp_vec1[k] += p.bc1_vec[i]*image.mesh[jxnumynum + i*y_num + k]; + temp_vec2[k] += p.bc2_vec[i]*image.mesh[jxnumynum + i*y_num + k]; //backwards boundary condition - temp_vec3[k] += bc3_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k]; - temp_vec4[k] += bc4_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k]; + temp_vec3[k] += p.bc3_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k]; + temp_vec4[k] += p.bc4_vec[i]*image.mesh[jxnumynum + (x_num - 1 - i)*y_num + k]; } } @@ -735,7 +609,7 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& image,float lambda,float #pragma omp simd #endif for (int64_t k = y_num - 1; k >= 0; k--) { - temp_vec2[k] = round(image.mesh[index + k] + b1*temp_vec1[k] + b2*temp_vec2[k], error_count); + temp_vec2[k] = round(image.mesh[index + k] + p.b1*temp_vec1[k] + p.b2*temp_vec2[k], error_count); } std::swap(temp_vec1, temp_vec2); @@ -748,12 +622,12 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& image,float lambda,float //initialization for (int64_t k = y_num - 1; k >= 0; --k) { //y(N) - image.mesh[jxnumynum + (x_num - 1)*y_num + k] = round(temp_vec4[k]*norm_factor, error_count); + image.mesh[jxnumynum + (x_num - 1)*y_num + k] = round(temp_vec4[k]*p.norm_factor, error_count); } for (int64_t k = y_num - 1; k >= 0; --k) { //y(N-1) - image.mesh[jxnumynum + (x_num - 2)*y_num + k] = round(temp_vec3[k]*norm_factor, error_count); + image.mesh[jxnumynum + (x_num - 2)*y_num + k] = round(temp_vec3[k]*p.norm_factor, error_count); } //main loop @@ -764,8 +638,8 @@ void ComputeGradient::bspline_filt_rec_x(PixelData& 
image,float lambda,float #pragma omp simd #endif for (int64_t k = y_num - 1; k >= 0; k--){ - float temp = (image.mesh[index + k] + b1*temp_vec3[ k]+ b2*temp_vec4[ k]); - image.mesh[index + k] = round(temp*norm_factor, error_count); + float temp = (image.mesh[index + k] + p.b1*temp_vec3[ k]+ p.b2*temp_vec4[ k]); + image.mesh[index + k] = round(temp*p.norm_factor, error_count); temp_vec4[k] = temp_vec3[k]; temp_vec3[k] = temp; } diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index cf636d5f..982e649c 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -1,28 +1,26 @@ -#include "ComputeGradientCuda.hpp" -#include "APRParameters.hpp" #include -#include +#include +#include +#include #include -#include +#include "ComputeGradientCuda.hpp" +#include "APRParameters.hpp" #include "data_structures/Mesh/PixelData.hpp" -#include "dsGradient.cuh" - -#include "invBspline.cuh" -#include -#include -#include "bsplineXdir.cuh" -#include "bsplineYdir.cuh" -#include "bsplineZdir.cuh" #include "data_structures/Mesh/downsample.cuh" #include "algorithm/ComputePullingScheme.cuh" -#include "algorithm/LocalIntensityScaleCuda.h" #include "algorithm/LocalIntensityScale.cuh" #include "misc/CudaTools.cuh" #include "misc/CudaMemory.cuh" -#include -#include + +#include "dsGradient.cuh" +#include "invBspline.cuh" +#include "bsplineXdir.cuh" +#include "bsplineYdir.cuh" +#include "bsplineZdir.cuh" + + namespace { typedef struct { @@ -38,45 +36,42 @@ namespace { float impulse_resp(float k, float rho, float omg) { // Impulse Response Function - return (pow(rho, (std::abs(k))) * sin((std::abs(k) + 1) * omg)) / sin(omg); + return (powf(rho, (std::abs(k))) * sinf((std::abs(k) + 1) * omg)) / sinf(omg); } float impulse_resp_back(float k, float rho, float omg, float gamma, float c0) { // Impulse Response Function (nominator eq. 4.8, denominator from eq. 
4.7) - return c0 * pow(rho, std::abs(k)) * (cos(omg * std::abs(k)) + gamma * sin(omg * std::abs(k))) * - (1.0 / (pow((1 - 2.0 * rho * cos(omg) + pow(rho, 2)), 2))); + return c0 * powf(rho, std::abs(k)) * (cosf(omg * std::abs(k)) + gamma * sinf(omg * std::abs(k))) * + (1.0 / (powf((1 - 2.0 * rho * cosf(omg) + powf(rho, 2)), 2))); } - template - BsplineParams prepareBsplineStuff(const PixelData &image, float lambda, float tol, int maxFilterLen = -1) { + BsplineParams prepareBsplineStuff(size_t dimLen, float lambda, float tol, int maxFilterLen = -1) { // Recursive Filter Implimentation for Smoothing BSplines // B-Spline Signal Processing: Part II - Efficient Design and Applications, Unser 1993 - float xi = 1 - 96 * lambda + 24 * lambda * sqrt(3 + 144 * lambda); // eq 4.6 - float rho = (24 * lambda - 1 - sqrt(xi)) / (24 * lambda) * - sqrt((1 / xi) * (48 * lambda + 24 * lambda * sqrt(3 + 144 * lambda))); // eq 4.5 - float omg = atan(sqrt((1 / xi) * (144 * lambda - 1))); // eq 4.6 + float xi = 1 - 96 * lambda + 24 * lambda * sqrtf(3 + 144 * lambda); // eq 4.6 + float rho = (24 * lambda - 1 - sqrtf(xi)) / (24 * lambda) * + sqrtf((1 / xi) * (48 * lambda + 24 * lambda * sqrtf(3 + 144 * lambda))); // eq 4.5 - float c0 = (1 + pow(rho, 2)) / (1 - pow(rho, 2)) * (1 - 2 * rho * cos(omg) + pow(rho, 2)) / - (1 + 2 * rho * cos(omg) + pow(rho, 2)); // eq 4.8 - float gamma = (1 - pow(rho, 2)) / (1 + pow(rho, 2)) * (1 / tan(omg)); // eq 4.8 + float omg = atan(sqrtf((1 / xi) * (144 * lambda - 1))); // eq 4.6 - const float b1 = 2 * rho * cos(omg); - const float b2 = -pow(rho, 2.0); + float c0 = (1 + powf(rho, 2)) / (1 - powf(rho, 2)) * (1 - 2 * rho * cosf(omg) + powf(rho, 2)) / + (1 + 2 * rho * cosf(omg) + powf(rho, 2)); // eq 4.8 + float gamma = (1 - powf(rho, 2)) / (1 + powf(rho, 2)) * (1 / tan(omg)); // eq 4.8 - const size_t idealK0Len = ceil(std::abs(log(tol) / log(rho))); - const size_t minDimension = std::min(image.z_num, std::min(image.x_num, image.y_num)); - const size_t k0 = 
maxFilterLen > 0 ? maxFilterLen : std::min(idealK0Len, minDimension); + const float b1 = 2 * rho * cosf(omg); + const float b2 = -powf(rho, 2.0); - const float norm_factor = pow((1 - 2.0 * rho * cos(omg) + pow(rho, 2)), 2); - std::cout << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 - << " b2=" << b2 << " k0=" << k0 << " norm_factor=" << norm_factor << std::endl; + const size_t idealK0Len = ceil(std::abs(logf(tol) / logf(rho))); + const size_t k0 = maxFilterLen > 0 ? maxFilterLen : idealK0Len; + const size_t minLen = maxFilterLen > 0 ? maxFilterLen : std::min(idealK0Len, dimLen); - // ------- Calculating boundary conditions + const float norm_factor = powf((1 - 2.0 * rho * cosf(omg) + powf(rho, 2)), 2); + + //std::cout << std::fixed << std::setprecision(9) << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 + // << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << std::endl; - // forward boundaries - std::vector impulse_resp_vec_f(k0 + 1); - for (size_t k = 0; k < impulse_resp_vec_f.size(); ++k) impulse_resp_vec_f[k] = impulse_resp(k, rho, omg); + // ------- Calculating boundary conditions size_t boundaryLen = sizeof(float) * k0; PinnedMemoryUniquePtr bc1{(float*)getPinnedMemory(boundaryLen)}; @@ -84,11 +79,19 @@ namespace { PinnedMemoryUniquePtr bc3{(float*)getPinnedMemory(boundaryLen)}; PinnedMemoryUniquePtr bc4{(float*)getPinnedMemory(boundaryLen)}; + // forward boundaries + std::vector impulse_resp_vec_f(k0 + 1); + for (size_t k = 0; k < impulse_resp_vec_f.size(); ++k) impulse_resp_vec_f[k] = impulse_resp(k, rho, omg); + //y(0) init for (size_t k = 0; k < k0; ++k) bc1[k] = impulse_resp_vec_f[k]; + for (size_t k = minLen; k < k0; ++k) bc1[minLen - 1] += bc1[k]; + //y(1) init + for (size_t k = 0; k < k0; ++k) bc2[k] = 0; bc2[1] = impulse_resp_vec_f[0]; for (size_t k = 0; k < k0; ++k) bc2[k] += impulse_resp_vec_f[k + 1]; + for (size_t k = 
minLen; k < k0; ++k) bc2[minLen - 1] += bc2[k]; // backward boundaries std::vector impulse_resp_vec_b(k0 + 1); @@ -96,11 +99,16 @@ namespace { impulse_resp_vec_b[k] = impulse_resp_back(k, rho, omg, gamma, c0); //y(N-1) init + for (size_t k = 0; k < k0; ++k) bc3[k] = 0; bc3[0] = impulse_resp_vec_b[1]; for (size_t k = 0; k < (k0 - 1); ++k) bc3[k + 1] += impulse_resp_vec_b[k] + impulse_resp_vec_b[k + 2]; + for (size_t k = minLen; k < k0; ++k) bc3[minLen - 1] += bc3[k]; + //y(N) init + for (size_t k = 0; k < k0; ++k) bc4[k] = 0; bc4[0] = impulse_resp_vec_b[0]; for (size_t k = 1; k < k0; ++k) bc4[k] += 2 * impulse_resp_vec_b[k]; + for (size_t k = minLen; k < k0; ++k) bc4[minLen - 1] += bc4[k]; return BsplineParams{ std::move(bc1), @@ -166,9 +174,9 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc runThresholdImg(cudaImage, image.x_num, image.y_num, image.z_num, par.Ip_th + bspline_offset, aStream); - runBsplineYdir(cudaImage, image.x_num, image.y_num, image.z_num, bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, boundary, aStream); - runBsplineXdir(cudaImage, image.x_num, image.y_num, image.z_num, bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); - runBsplineZdir(cudaImage, image.x_num, image.y_num, image.z_num, bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); + runBsplineYdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, boundary, aStream); + runBsplineXdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); + runBsplineZdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); runKernelGradient(cudaImage, cudaGrad, image.x_num, image.y_num, image.z_num, local_scale_temp.x_num, local_scale_temp.y_num, par.dx, par.dy, par.dz, aStream); @@ -249,7 +257,9 @@ public: iParameters(parameters), iBsplineOffset(bspline_offset), iMaxLevel(maxLevel), - params(prepareBsplineStuff(image, parameters.lambda, 
tolerance)), + // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. + // Should be fixed when other parts of pipeline are ready. + params(prepareBsplineStuff((size_t)image.x_num, parameters.lambda, tolerance)), bc1(params.bc1.get(), params.k0, iStream), bc2(params.bc2.get(), params.k0, iStream), bc3(params.bc3.get(), params.k0, iStream), @@ -336,29 +346,44 @@ template class GpuProcessingTask; // explicit instantiation of handled types template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); +template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); +template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); +template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); + template void cudaFilterBsplineFull(PixelData &input, float lambda, float tolerance, TypeOfRecBsplineFlags flags, int maxFilterLen) { cudaStream_t aStream = 0; - BsplineParams p = prepareBsplineStuff(input, lambda, tolerance, maxFilterLen); - ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); - ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); - ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); - ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); + ScopedCudaMemHandler, D2H | H2D> cudaInput(input); - APRTimer timer(true); + APRTimer timer(false); timer.start_timer("GpuDeviceTimeFull"); if (flags & BSPLINE_Y_DIR) { + BsplineParams p = prepareBsplineStuff((size_t)input.y_num, lambda, tolerance, maxFilterLen); + ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); + ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); + ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); + ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * input.x_num * input.z_num; ScopedCudaMemHandler boundary(nullptr, boundaryLen); // allocate memory on device - runBsplineYdir(cudaInput.get(), input.x_num, input.y_num, 
input.z_num, bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, boundary.get(), aStream); + runBsplineYdir(cudaInput.get(), input.getDimension(), bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, boundary.get(), aStream); } if (flags & BSPLINE_X_DIR) { - runBsplineXdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream); + BsplineParams p = prepareBsplineStuff((size_t)input.x_num, lambda, tolerance, maxFilterLen); + ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); + ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); + ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); + ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); + runBsplineXdir(cudaInput.get(), input.getDimension(), bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream); } if (flags & BSPLINE_Z_DIR) { - runBsplineZdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream); + BsplineParams p = prepareBsplineStuff((size_t)input.z_num, lambda, tolerance, maxFilterLen); + ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); + ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); + ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); + ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); + runBsplineZdir(cudaInput.get(), input.getDimension(), bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream); } timer.stop_timer(); } @@ -404,7 +429,9 @@ void getGradient(PixelData &image, PixelData &grad_temp, Pixel ScopedCudaMemHandler, D2H> cudalocal_scale_temp2(local_scale_temp2); float tolerance = 0.0001; - BsplineParams p = prepareBsplineStuff(image, par.lambda, tolerance); + // TODO: This is wrong and done only for compile. BsplineParams has to be computed separately for each dimension. + // Should be fixed when other parts of pipeline are ready.
+ BsplineParams p = prepareBsplineStuff(image.x_num, par.lambda, tolerance); ScopedCudaMemHandler bc1 (p.bc1.get(), p.k0); ScopedCudaMemHandler bc2 (p.bc2.get(), p.k0); diff --git a/src/algorithm/bsplineXdir.cuh b/src/algorithm/bsplineXdir.cuh index be0a5f78..6ee3c755 100644 --- a/src/algorithm/bsplineXdir.cuh +++ b/src/algorithm/bsplineXdir.cuh @@ -5,9 +5,10 @@ #include #include #include +#include "cudaMisc.cuh" /** - * Runs bspline recursive filter in X direction. Each processed 2D patch consist of number of workes + * Runs bspline recursive filter in X direction. Each processed 2D patch consist of number of workers * (distributed in Y direction) and each of them is handling the whole row in X-dir. * Next patches are build on a top of first (like patch1 in example below) and they cover * whole y-dimension. Such a setup should be run for every plane in z-direction. @@ -59,22 +60,24 @@ * @param norm_factor - filter norm factor */ template -__global__ void bsplineXdir(T *image, size_t x_num, size_t y_num, +__global__ void bsplineXdir(T *image, PixelDataDim dim, const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, - float b1, float b2, float norm_factor) { + float b1, float b2, float norm_factor, bool *error) { const int yDirOffset = blockIdx.y * blockDim.y + threadIdx.y; - const size_t zDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * x_num * y_num; - const size_t nextElementXdirOffset = y_num; - const size_t dirLen = x_num; + const size_t zDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * dim.x * dim.y; + const size_t nextElementXdirOffset = dim.y; + const size_t dirLen = dim.x; + const size_t minLen = min(dirLen, k0); - if (yDirOffset < y_num) { + if (yDirOffset < dim.y) { float temp1 = 0; float temp2 = 0; float temp3 = 0; float temp4 = 0; + // calculate boundary values - for (int k = 0; k < k0; ++k) { + for (int k = 0; k < minLen; ++k) { T val = image[zDirOffset + k * nextElementXdirOffset + yDirOffset]; temp1 += bc1[k] * 
val; temp2 += bc2[k] * val; @@ -83,18 +86,20 @@ __global__ void bsplineXdir(T *image, size_t x_num, size_t y_num, temp4 += bc4[k] * val; } + size_t errorCnt = 0; + // set boundary values in two first and two last points processed direction - image[zDirOffset + 0 * nextElementXdirOffset + yDirOffset] = temp1; - image[zDirOffset + 1 * nextElementXdirOffset + yDirOffset] = temp2; - image[zDirOffset + (dirLen - 2) * nextElementXdirOffset + yDirOffset] = temp3 * norm_factor; - image[zDirOffset + (dirLen - 1) * nextElementXdirOffset + yDirOffset] = temp4 * norm_factor; + image[zDirOffset + 0 * nextElementXdirOffset + yDirOffset] = round(temp1, errorCnt); + image[zDirOffset + 1 * nextElementXdirOffset + yDirOffset] = round(temp2, errorCnt); + image[zDirOffset + (dirLen - 2) * nextElementXdirOffset + yDirOffset] = round(temp3 * norm_factor, errorCnt); + image[zDirOffset + (dirLen - 1) * nextElementXdirOffset + yDirOffset] = round(temp4 * norm_factor, errorCnt); // Causal Filter loop int64_t offset = zDirOffset + 2 * nextElementXdirOffset + yDirOffset; int64_t offsetLimit = zDirOffset + (dirLen - 2) * nextElementXdirOffset; while (offset < offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = temp1 * b2 + temp2 * b1 + image[offset]; + const float temp = round(image[offset] + b1 * temp2 + b2 * temp1, errorCnt); image[offset] = temp; temp1 = temp2; temp2 = temp; @@ -107,13 +112,15 @@ __global__ void bsplineXdir(T *image, size_t x_num, size_t y_num, offsetLimit = zDirOffset; while (offset >= offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = temp3 * b1 + temp4 * b2 + image[offset]; - image[offset] = temp * norm_factor; + const float temp = image[offset] + b1 * temp3 + b2 * temp4; + image[offset] = round(temp * norm_factor, errorCnt); temp4 = temp3; temp3 = temp; offset -= nextElementXdirOffset; } + + if (errorCnt > 0) *error = true; } } @@ -121,15 +128,26 @@ 
__global__ void bsplineXdir(T *image, size_t x_num, size_t y_num, * Function for launching a kernel */ template -void runBsplineXdir(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, +void runBsplineXdir(T *cudaImage, PixelDataDim dim, const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, float b1, float b2, float norm_factor, cudaStream_t aStream) { constexpr int numOfWorkersYdir = 128; dim3 threadsPerBlockX(1, numOfWorkersYdir, 1); dim3 numBlocksX(1, - (y_num + threadsPerBlockX.y - 1) / threadsPerBlockX.y, - (z_num + threadsPerBlockX.z - 1) / threadsPerBlockX.z); - bsplineXdir <<>> (cudaImage, x_num, y_num, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor); + (dim.y + threadsPerBlockX.y - 1) / threadsPerBlockX.y, + (dim.z + threadsPerBlockX.z - 1) / threadsPerBlockX.z); + // In case of error this will be set to true by one of the kernels (CUDA does not guarantee which kernel will set global variable if more than one kernel + // accesses it), but this is enough for us to know that an overflow was detected in one or more kernels. + bool isErrorDetected = false; + { + ScopedCudaMemHandler error(&isErrorDetected, 1); + bsplineXdir <<>>(cudaImage, dim, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor, error.get()); + } + + if (isErrorDetected) { + throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineXdir - " + "try squashing the input image to a narrower range or use APRConverter"); + } } #endif diff --git a/src/algorithm/bsplineYdir.cuh b/src/algorithm/bsplineYdir.cuh index b9dc2f25..a1026704 100644 --- a/src/algorithm/bsplineYdir.cuh +++ b/src/algorithm/bsplineYdir.cuh @@ -5,12 +5,14 @@ #include #include #include +#include "cudaMisc.cuh" + /** * Runs bspline recursive filter in Y direction - divided into two phases: * 1. calculate boundary conditions * 2.
run recursive filter as a set of 2D patches: - * Each processed 2D patch consist of number of workes + * Each processed 2D patch consist of number of workers * (distributed in Y direction) and each of them is handling the whole row in Y-dir. * Next patches are build on next to it in the x-dir to cover whole x * z domain. * @@ -57,22 +59,25 @@ template -__global__ void bsplineYdirBoundary(T *image, size_t x_num, size_t y_num, size_t z_num, +__global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, const float *bc1_vec, const float *bc2_vec, const float *bc3_vec, const float *bc4_vec, - size_t k0, float *boundary) { + size_t k0, float norm_factor, float *boundary, bool *error) { const int xzIndexOfWorker = (blockIdx.x * blockDim.x) + threadIdx.x; const int xzIndexOfBlock = (blockIdx.x * blockDim.x); const int numOfWorkers = blockDim.x; const int currentWorkerId = threadIdx.x; - const size_t workersOffset = xzIndexOfBlock * y_num; // per each (x,z) coordinate we have y-row + const size_t workersOffset = xzIndexOfBlock * dim.y; // per each (x,z) coordinate we have y-row + + const int64_t maxXZoffset = dim.x * dim.z; - const int64_t maxXZoffset = x_num * z_num; + const size_t dirLen = dim.y; + const size_t minLen = min(dirLen, k0); extern __shared__ float sharedMem[]; float *bc1_vec2 = &sharedMem[0]; float *bc2_vec2 = &bc1_vec2[k0]; - T *cache = (T*)&bc2_vec2[k0]; + float *cache = (float*)&bc2_vec2[k0]; // Read from global mem to cache for (int i = currentWorkerId; i < k0 * numOfWorkers; i += numOfWorkers) { @@ -83,18 +88,18 @@ __global__ void bsplineYdirBoundary(T *image, size_t x_num, size_t y_num, size_t int offs = i % k0; int work = i / k0; if (work + xzIndexOfBlock < maxXZoffset) { - cache[work * k0 + offs] = image[workersOffset + y_num * work + offs]; + cache[work * k0 + offs] = image[workersOffset + dim.y * work + offs]; } } __syncthreads(); //forwards direction - if (xzIndexOfWorker < x_num * z_num) { + if (xzIndexOfWorker < dim.x * dim.z) { float temp1 = 
0; float temp2 = 0; - for (size_t k = 0; k < k0; ++k) { - temp1 += bc1_vec2[k] * cache[currentWorkerId * k0 + k]; - temp2 += bc2_vec2[k] * cache[currentWorkerId * k0 + k]; + for (size_t k = 0; k < minLen; ++k) { + temp1 += bc1_vec2[k] * (T)cache[currentWorkerId * k0 + k]; + temp2 += bc2_vec2[k] * (T)cache[currentWorkerId * k0 + k]; } boundary[xzIndexOfWorker*4 + 0] = temp1; boundary[xzIndexOfWorker*4 + 1] = temp2; @@ -111,49 +116,54 @@ __global__ void bsplineYdirBoundary(T *image, size_t x_num, size_t y_num, size_t int offs = i % k0; int work = i / k0; if (work + xzIndexOfBlock < maxXZoffset) { - cache[work * k0 + offs] = image[workersOffset + y_num * work + y_num - 1 - offs]; + cache[work * k0 + offs] = image[workersOffset + dim.y * work + dim.y - 1 - offs]; } } __syncthreads(); + size_t errorCnt = 0; + //forwards direction - if (xzIndexOfWorker < x_num * z_num) { + if (xzIndexOfWorker < dim.x * dim.z) { float temp3 = 0; float temp4 = 0; - for (size_t k = 0; k < k0; ++k) { - temp3 += bc1_vec2[k] * cache[currentWorkerId * k0 + k]; - temp4 += bc2_vec2[k] * cache[currentWorkerId * k0 + k]; + for (size_t k = 0; k < minLen; ++k) { + temp3 += bc1_vec2[k] * (T)cache[currentWorkerId * k0 + k]; + temp4 += bc2_vec2[k] * (T)cache[currentWorkerId * k0 + k]; } - boundary[xzIndexOfWorker*4 + 2] = temp3; - boundary[xzIndexOfWorker*4 + 3] = temp4; + boundary[xzIndexOfWorker*4 + 2] = round(temp3 * norm_factor, errorCnt); + boundary[xzIndexOfWorker*4 + 3] = round(temp4 * norm_factor, errorCnt); } + + if (errorCnt > 0) *error = true; } constexpr int blockWidth = 32; constexpr int numOfThreads = 32; extern __shared__ char sharedMemProcess[]; template -__global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_num, const size_t z_num, size_t k0, - const float b1, const float b2, const float norm_factor, float *boundary) { +__global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, size_t k0, + const float b1, const float b2, const float norm_factor, 
float *boundary, bool *error) { const int numOfWorkers = blockDim.x; const int currentWorkerId = threadIdx.x; const int xzOffset = blockIdx.x * blockDim.x; - const int64_t maxXZoffset = x_num * z_num; - const int64_t workersOffset = xzOffset * y_num; + const int64_t maxXZoffset = dim.x * dim.z; + const int64_t workersOffset = xzOffset * dim.y; - T (*cache)[blockWidth + 0] = (T (*)[blockWidth + 0]) &sharedMemProcess[0]; + float (*cache)[blockWidth + 0] = (float (*)[blockWidth + 0]) &sharedMemProcess[0]; float temp1, temp2; + size_t errorCnt = 0; // ---------------- forward direction ------------------------------------------- - for (int yBlockBegin = 0; yBlockBegin < y_num - 2; yBlockBegin += blockWidth) { + for (int yBlockBegin = 0; yBlockBegin < dim.y - 2; yBlockBegin += blockWidth) { // Read from global mem to cache for (int i = currentWorkerId; i < blockWidth * numOfWorkers; i += numOfWorkers) { int offs = i % blockWidth; int work = i / blockWidth; - if (offs + yBlockBegin < (y_num - 2) && work + xzOffset < maxXZoffset) { - cache[work][(offs + work)%blockWidth] = image[workersOffset + y_num * work + offs + yBlockBegin]; + if (offs + yBlockBegin < (dim.y - 2) && work + xzOffset < maxXZoffset) { + cache[work][(offs + work)%blockWidth] = image[workersOffset + dim.y * work + offs + yBlockBegin]; } } __syncthreads(); @@ -166,8 +176,8 @@ __global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_ cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = temp1; cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = temp2; } - for (size_t k = yBlockBegin == 0 ? 2 : 0; k < blockWidth && k + yBlockBegin < y_num - 2; ++k) { - float temp = temp1*b2 + temp2*b1 + cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; + for (size_t k = yBlockBegin == 0 ? 
2 : 0; k < blockWidth && k + yBlockBegin < dim.y - 2; ++k) { + float temp = temp2*b1 + temp1*b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp; temp1 = temp2; temp2 = temp; @@ -179,36 +189,37 @@ __global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_ for (int i = currentWorkerId; i < blockWidth * numOfWorkers; i += numOfWorkers) { int offs = i % blockWidth; int work = i / blockWidth; - if (offs + yBlockBegin < (y_num - 2) && work + xzOffset < maxXZoffset) { - image[workersOffset + y_num * work + offs + yBlockBegin] = cache[work][(offs + work)%blockWidth]; + if (offs + yBlockBegin < (dim.y - 2) && work + xzOffset < maxXZoffset) { + image[workersOffset + dim.y * work + offs + yBlockBegin] = round(cache[work][(offs + work)%blockWidth], errorCnt); } } __syncthreads(); } // ---------------- backward direction ------------------------------------------- - for (int yBlockBegin = y_num - 1; yBlockBegin >= 0; yBlockBegin -= blockWidth) { + for (int yBlockBegin = dim.y - 1; yBlockBegin >= 0; yBlockBegin -= blockWidth) { // Read from global mem to cache for (int i = currentWorkerId; i < blockWidth * numOfWorkers; i += numOfWorkers) { int offs = i % blockWidth; int work = i / blockWidth; if (yBlockBegin - offs >= 0 && work + xzOffset < maxXZoffset) { - cache[work][(offs + work)%blockWidth] = image[workersOffset + y_num * work - offs + yBlockBegin]; + cache[work][(offs + work)%blockWidth] = image[workersOffset + dim.y * work - offs + yBlockBegin]; } } __syncthreads(); // Do operations if (xzOffset + currentWorkerId < maxXZoffset) { - if (yBlockBegin == y_num - 1) { - temp1 = boundary[(xzOffset + currentWorkerId) * 4 + 3]; - temp2 = boundary[(xzOffset + currentWorkerId) * 4 + 2]; + if (yBlockBegin == dim.y - 1) { + temp1 = boundary[(xzOffset + currentWorkerId) * 4 + 3] / norm_factor; + temp2 = boundary[(xzOffset + currentWorkerId) * 4 + 2] / norm_factor; 
cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = norm_factor * temp1; cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = norm_factor * temp2; } - for (int64_t k = yBlockBegin == y_num - 1 ? 2 : 0; k < blockWidth && yBlockBegin - k >= 0; ++k) { - float temp = temp2*b1 + temp1*b2 + cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; + int64_t k2 = yBlockBegin == dim.y - 1 ? 2 : 0; + for (int64_t k = yBlockBegin == dim.y - 1 ? 2 : 0; k < blockWidth && yBlockBegin - k >= 0; ++k) { + float temp = temp2*b1 + temp1*b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp * norm_factor; temp1 = temp2; temp2 = temp; @@ -221,25 +232,36 @@ __global__ void bsplineYdirProcess(T *image, const size_t x_num, const size_t y_ int offs = i % blockWidth; int work = i / blockWidth; if (yBlockBegin - offs >= 0 && work + xzOffset < maxXZoffset) { - image[workersOffset + y_num * work - offs + yBlockBegin] = cache[work][(offs + work)%blockWidth]; + image[workersOffset + dim.y * work - offs + yBlockBegin] = round(cache[work][(offs + work)%blockWidth], errorCnt); } } __syncthreads(); } + + if (errorCnt > 0) *error = true; } /** * Function for launching a kernel */ template -void runBsplineYdir(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, +void runBsplineYdir(T *cudaImage, PixelDataDim dim, const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, float b1, float b2, float norm_factor, float *boundary, cudaStream_t aStream) { dim3 threadsPerBlock(numOfThreads); - dim3 numBlocks((x_num * z_num + threadsPerBlock.x - 1) / threadsPerBlock.x); - size_t sharedMemSize = (2 /*bc vectors*/) * (k0) * sizeof(float) + numOfThreads * (k0) * sizeof(T); - bsplineYdirBoundary <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>> (cudaImage, x_num, y_num, z_num, bc1, bc2, bc3, bc4, k0, boundary); - sharedMemSize = numOfThreads * blockWidth * sizeof(T); - bsplineYdirProcess 
<<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>> (cudaImage, x_num, y_num, z_num, k0, b1, b2, norm_factor, boundary); + dim3 numBlocks((dim.x * dim.z + threadsPerBlock.x - 1) / threadsPerBlock.x); + size_t sharedMemSize = (2 /*bc vectors*/) * (k0) * sizeof(float) + numOfThreads * (k0) * sizeof(float); + bool isErrorDetected = false; + { + ScopedCudaMemHandler error(&isErrorDetected, 1); + bsplineYdirBoundary <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, bc1, bc2, bc3,bc4, k0, norm_factor, boundary, error.get()); + sharedMemSize = numOfThreads * blockWidth * sizeof(float); + bsplineYdirProcess <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, k0, b1, b2, norm_factor, boundary, error.get()); + } + + if (isErrorDetected) { + throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineYdir - " + "try squashing the input image to a narrower range or use APRConverter"); + } } #endif diff --git a/src/algorithm/bsplineZdir.cuh b/src/algorithm/bsplineZdir.cuh index 33a5b420..cd59f0fb 100644 --- a/src/algorithm/bsplineZdir.cuh +++ b/src/algorithm/bsplineZdir.cuh @@ -2,10 +2,12 @@ #define BSPLINE_Z_DIR_H +#include "cudaMisc.cuh" #include #include #include + /** * Runs bspline recursive filter in Z direction. Each processed 2D patch consist of number of workes * (distributed in Y direction) and each of them is handling the whole row in Z-dir. 
@@ -60,22 +62,24 @@ * @param norm_factor - filter norm factor */ template -__global__ void bsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_num, +__global__ void bsplineZdir(T *image, PixelDataDim dim, const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, - float b1, float b2, float norm_factor) { + float b1, float b2, float norm_factor, bool *error) { const int yDirOffset = blockIdx.y * blockDim.y + threadIdx.y; - const size_t xDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * y_num; // x is in 'z' to have good memory coalescing - const size_t nextElementZdirOffset = x_num * y_num; - const size_t dirLen = z_num; + const size_t xDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * dim.y; // x is in 'z' to have good memory coalescing + const size_t nextElementZdirOffset = dim.x * dim.y; + const size_t dirLen = dim.z; + const size_t minLen = min(dirLen, k0); - if (yDirOffset < y_num) { + if (yDirOffset < dim.y) { float temp1 = 0; float temp2 = 0; float temp3 = 0; float temp4 = 0; + // calculate boundary values - for (int k = 0; k < k0; ++k) { + for (int k = 0; k < minLen; ++k) { T val = image[xDirOffset + k * nextElementZdirOffset + yDirOffset]; temp1 += bc1[k] * val; temp2 += bc2[k] * val; @@ -84,18 +88,20 @@ __global__ void bsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_num, temp4 += bc4[k] * val; } + size_t errorCnt = 0; + // set boundary values in two first and two last points processed direction - image[xDirOffset + 0 * nextElementZdirOffset + yDirOffset] = temp1; - image[xDirOffset + 1 * nextElementZdirOffset + yDirOffset] = temp2; - image[xDirOffset + (dirLen - 2) * nextElementZdirOffset + yDirOffset] = temp3 * norm_factor; - image[xDirOffset + (dirLen - 1) * nextElementZdirOffset + yDirOffset] = temp4 * norm_factor; + image[xDirOffset + 0 * nextElementZdirOffset + yDirOffset] = round(temp1, errorCnt); + image[xDirOffset + 1 * nextElementZdirOffset + yDirOffset] = round(temp2, errorCnt); + 
image[xDirOffset + (dirLen - 2) * nextElementZdirOffset + yDirOffset] = round(temp3 * norm_factor, errorCnt); + image[xDirOffset + (dirLen - 1) * nextElementZdirOffset + yDirOffset] = round(temp4 * norm_factor, errorCnt); // Causal Filter loop int64_t offset = xDirOffset + 2 * nextElementZdirOffset + yDirOffset; int64_t offsetLimit = xDirOffset + (dirLen - 2) * nextElementZdirOffset; while (offset < offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = temp1 * b2 + temp2 * b1 + image[offset]; + const float temp = round(image[offset] + b1 * temp2 + b2 * temp1, errorCnt); image[offset] = temp; temp1 = temp2; temp2 = temp; @@ -108,13 +114,15 @@ __global__ void bsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_num, offsetLimit = xDirOffset; while (offset >= offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = temp3 * b1 + temp4 * b2 + image[offset]; - image[offset] = temp * norm_factor; + const float temp = image[offset] + b1 * temp3 + b2 * temp4; + image[offset] = round(temp * norm_factor, errorCnt); temp4 = temp3; temp3 = temp; offset -= nextElementZdirOffset; } + + if (errorCnt > 0) *error = true; } } @@ -122,15 +130,26 @@ __global__ void bsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_num, * Function for launching a kernel */ template -void runBsplineZdir(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, +void runBsplineZdir(T *cudaImage, PixelDataDim dim, const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, float b1, float b2, float norm_factor, cudaStream_t aStream) { constexpr int numOfWorkersYdir = 128; dim3 threadsPerBlockZ(1, numOfWorkersYdir, 1); dim3 numBlocksZ(1, - (y_num + threadsPerBlockZ.y - 1) / threadsPerBlockZ.y, - (x_num + threadsPerBlockZ.x - 1) / threadsPerBlockZ.x); - bsplineZdir <<>> (cudaImage, x_num, y_num, z_num, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor); + (dim.y + 
threadsPerBlockZ.y - 1) / threadsPerBlockZ.y, + (dim.x + threadsPerBlockZ.x - 1) / threadsPerBlockZ.x); + // In case of error this will be set to true by one of the kernels (CUDA does not guarantee which kernel will set global variable if more than one kernel + // accesses it), but this is enough for us to know that an overflow was detected in one or more kernels. + bool isErrorDetected = false; + { + ScopedCudaMemHandler error(&isErrorDetected, 1); + bsplineZdir <<>> (cudaImage, dim, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor, error.get()); + } + + if (isErrorDetected) { + throw std::invalid_argument("integer under-/overflow encountered in CUDA bsplineZdir - " + "try squashing the input image to a narrower range or use APRConverter"); + } } #endif diff --git a/src/algorithm/cudaMisc.cuh b/src/algorithm/cudaMisc.cuh new file mode 100644 index 00000000..7442c60b --- /dev/null +++ b/src/algorithm/cudaMisc.cuh @@ -0,0 +1,66 @@ +#ifndef CUDAMISC_CUH +#define CUDAMISC_CUH + + +#include + + +/** + * floating point output -> no rounding or under-/overflow check + */ +template +__device__ std::enable_if_t::value, T> round(float val, size_t &errCount) { + return val; +} + +/** + * integer output -> check for under-/overflow and round + * + * CUDA does not support std::numeric_limits, so the range of each data type is checked manually in the + * definitions below. In theory we could use the --expt-relaxed-constexpr flag, but since it is experimental + * and without guarantee of long existence, for now it is better to stick to the definitions below.
+ */ +template +__device__ std::enable_if_t::value, uint8_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < 0 || val > 255) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, int8_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < -128 || val > 127) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, uint16_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < 0 || val > 65535) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, int16_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < -32768 || val > 32767) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, uint32_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < 0 || val > 4294967295) { errCount++; } + return val; +} + +template +__device__ std::enable_if_t::value, int32_t> round(float val, size_t &errCount) { + val = std::round(val); + if (val < -2147483648 || val > 2147483647) { errCount++; } + return val; +} + + +#endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 7bc8f6cc..ba468743 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -14,9 +14,9 @@ buildTarget(testPullingScheme PullingSchemeTest.cpp) #APR GPU Tests if(APR_USE_CUDA) buildTarget(testAPRCuda APRTestCuda.cpp) + buildTarget(testComputeGradientCuda ComputeGradientCudaTest.cpp) endif() - if(APR_BUILD_EXAMPLES) buildTarget(testExamples ExamplesTest.cpp) endif() diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp new file mode 100644 index 00000000..e20678fd --- /dev/null +++ b/test/ComputeGradientCudaTest.cpp @@ -0,0 +1,155 @@ + +#include + +#include "data_structures/Mesh/PixelData.hpp" +#include "algorithm/ComputeGradient.hpp" +#include "algorithm/ComputeGradientCuda.hpp" +#include "TestTools.hpp" + +namespace { + +#ifdef APR_USE_CUDA + + template + 
class BsplineTest : public testing::Test {}; + TYPED_TEST_SUITE_P(BsplineTest); + + TYPED_TEST_P(BsplineTest, testBsplineInXdirCUDA) { + APRTimer timer(true); + + std::vector> yzSizes = {{1, 1}, + {32, 32}, + {33, 33}, + {44, 35}, + {35, 44}, + {255, 129}}; + + for (auto &p: yzSizes) { + int yLen = p.first; + int zLen = p.second; + // Run test with dimension in range much shorter than filter length to longer than filter length + // (for lambda=3 and tolerance=0.00001 expected filter length k0=18) + for (int xLen = 2; xLen < 22; ++xLen) { + // Generate random mesh + using ImgType = TypeParam; + PixelData m = getRandInitializedMesh(yLen, xLen, zLen, 30, 10); + + // Filter parameters + const float lambda = 3; + const float tolerance = 0.0001; + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU bspline"); + ComputeGradient().bspline_filt_rec_x(mCpu, lambda, tolerance); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU bspline"); + cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_X_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + } + } + } + + TYPED_TEST_P(BsplineTest, testBsplineInZdirCUDA) { + APRTimer timer(true); + + std::vector> xySizes = {{1, 1}, + {32, 32}, + {33, 33}, + {44, 35}, + {35, 44}, + {255, 129}}; + + for (auto &p : xySizes) { + int xLen = p.first; + int yLen = p.second; + // Run test with dimension in range much shorter than filter length to longer than filter length + // (for lambda=3 and tolerance=0.00001 expected filter length k0=18) + for (int zLen = 2; zLen < 22; ++zLen) { + // Generate random mesh + using ImgType = TypeParam; + PixelData m = getRandInitializedMesh(yLen, xLen, zLen, 30, 10); + + // Filter parameters + const float lambda = 3; + const float tolerance = 0.0001; + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU bspline"); + 
ComputeGradient().bspline_filt_rec_z(mCpu, lambda, tolerance); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU bspline"); + cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Z_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + } + } + } + + TYPED_TEST_P(BsplineTest, testBsplineInYdirCUDA) { + APRTimer timer(false); + + std::vector> xzSizes = {{1, 1}, + {32, 32}, + {33, 33}, + {44, 35}, + {35, 44}, + {255, 129}}; + + for (auto &p : xzSizes) { + int xLen = p.first; + int zLen = p.second; + // Run test with dimension in range much shorter than filter length to longer than filter length + // (for lambda=3 and tolerance=0.00001 expected filter length k0=18) + for (int yLen = 2; yLen < 22; ++yLen) { + // Generate random mesh + using ImgType = TypeParam; + PixelData m = getRandInitializedMesh(yLen, xLen, zLen, 30, 10); + + // Filter parameters + const float lambda = 3; + const float tolerance = 0.0001; + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU bspline"); + ComputeGradient().bspline_filt_rec_y(mCpu, lambda, tolerance); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU bspline"); + cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Y_DIR); + timer.stop_timer(); + + //Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0001, 2), 0); + } + } + } + + REGISTER_TYPED_TEST_SUITE_P(BsplineTest, testBsplineInXdirCUDA, testBsplineInZdirCUDA, testBsplineInYdirCUDA); + using ImgTypes = ::testing::Types< float, uint16_t, int16_t, uint8_t>; + INSTANTIATE_TYPED_TEST_SUITE_P(Testing, BsplineTest, ImgTypes); + + +#endif // APR_USE_CUDA + +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index 0b2fc17e..c2f14805 100644 --- 
a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -9,92 +9,9 @@ #include "algorithm/ComputeGradientCuda.hpp" #include #include "algorithm/APRConverter.hpp" +#include "TestTools.hpp" namespace { - /** - * Compares mesh with provided data - * @param mesh - * @param data - data with [Z][Y][X] structure - * @return true if same - */ - template - bool compare(PixelData &mesh, const float *data, const float epsilon) { - size_t dataIdx = 0; - for (int z = 0; z < mesh.z_num; ++z) { - for (int y = 0; y < mesh.y_num; ++y) { - for (int x = 0; x < mesh.x_num; ++x) { - bool v = std::abs(mesh(y, x, z) - data[dataIdx]) < epsilon; - if (v == false) { - std::cerr << "Mesh and expected data differ. First place at (Y, X, Z) = " << y << ", " << x - << ", " << z << ") " << mesh(y, x, z) << " vs " << data[dataIdx] << std::endl; - return false; - } - ++dataIdx; - } - } - } - return true; - } - - /** - * Compares two meshes - * @param expected - * @param tested - * @param maxNumOfErrPrinted - how many error values should be printed (-1 for all) - * @return number of errors detected - */ - template - int compareMeshes(const PixelData &expected, const PixelData &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { - int cnt = 0; - for (size_t i = 0; i < expected.mesh.size(); ++i) { - if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError || std::isnan(expected.mesh[i]) || - std::isnan(tested.mesh[i])) { - if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "ERROR expected vs tested mesh: " << expected.mesh[i] << " vs " << tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; - } - cnt++; - } - } - std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl; - return cnt; - } - - /** - * Generates mesh with provided dims with random values in range [0, 1] * multiplier - * @param y - * @param x - * @param z - * @param multiplier - * @return - */ - template - PixelData 
getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, bool useIdxNumbers = false) { - PixelData m(y, x, z); - std::cout << "Mesh info: " << m << std::endl; - std::random_device rd; - std::mt19937 mt(rd()); - std::uniform_real_distribution dist(0.0, 1.0); - for (size_t i = 0; i < m.mesh.size(); ++i) { - m.mesh[i] = useIdxNumbers ? i : dist(mt) * multiplier; - } - return m; - } - - template - bool initFromZYXarray(PixelData &mesh, const float *data) { - size_t dataIdx = 0; - for (int z = 0; z < mesh.z_num; ++z) { - for (int y = 0; y < mesh.y_num; ++y) { - for (int x = 0; x < mesh.x_num; ++x) { - mesh(y, x, z) = data[dataIdx]; - ++dataIdx; - } - } - } - return true; - } - - TEST(ComputeGradientTest, 2D_XY) { { // Corner points @@ -801,87 +718,6 @@ namespace { EXPECT_EQ(compareMeshes(grad, gradCuda), 0); } - TEST(ComputeBspineTest, BSPLINE_Y_DIR_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(129,127,128); - - // Filter parameters - const float lambda = 3; - const float tolerance = 0.0001; - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU bspline"); - ComputeGradient().bspline_filt_rec_y(mCpu, lambda, tolerance); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU bspline"); - cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Y_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeBspineTest, BSPLINE_X_DIR_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(129,127,128); - - // Filter parameters - const float lambda = 3; - const float tolerance = 0.0001; - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU bspline"); - ComputeGradient().bspline_filt_rec_x(mCpu, lambda, tolerance); - timer.stop_timer(); - - // Calculate 
bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU bspline"); - cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_X_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeBspineTest, BSPLINE_Z_DIR_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(129,127,128); - - // Filter parameters - const float lambda = 3; - const float tolerance = 0.0001; - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU bspline"); - ComputeGradient().bspline_filt_rec_z(mCpu, lambda, tolerance); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU bspline"); - cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_Z_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - TEST(ComputeBspineTest, BSPLINE_FULL_XYZ_DIR_CUDA) { APRTimer timer(true); diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 14a71814..5a1d1ca8 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -8,6 +8,8 @@ #include "data_structures/Mesh/PixelData.hpp" #include +#include "data_structures/APR/particles/ParticleData.hpp" + std::string get_source_directory_apr(){ // returns path to the directory where utils.cpp is stored @@ -102,25 +104,27 @@ inline int64_t compareParticles(const ParticleData &expected, const ParticleD /** - * Generates mesh with provided dims with random values in range [0, 1] * multiplier + * Generates mesh with provided dims with random values in range [0, 1] * multiplier + offset * @param y * @param x * @param z * @param multiplier + * @param offset * @return */ template -inline PixelData getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, bool useIdxNumbers = false) { +inline PixelData getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, float offset=0.0, bool 
useIdxNumbers = false) { PixelData m(y, x, z); std::cout << "Mesh info: " << m << std::endl; std::random_device rd; std::mt19937 mt(rd()); std::uniform_real_distribution dist(0.0, 1.0); + #ifdef HAVE_OPENMP #pragma omp parallel for default(shared) #endif for (size_t i = 0; i < m.mesh.size(); ++i) { - m.mesh[i] = useIdxNumbers ? i : dist(mt) * multiplier; + m.mesh[i] = useIdxNumbers ? i : dist(mt) * multiplier + offset; } return m; } From b563da41f7ee1a9cbad96fbe6819b1cf15aa2da7 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 1 Aug 2022 15:02:35 +0200 Subject: [PATCH 02/59] Debug messages turned off --- src/algorithm/ComputeGradient.hpp | 2 +- test/ComputeGradientCudaTest.cpp | 29 +++++++++++++++++++++++++++-- test/ComputeGradientTest.cpp | 27 --------------------------- test/TestTools.hpp | 4 ++-- 4 files changed, 30 insertions(+), 32 deletions(-) diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp index 911013b1..d7876248 100644 --- a/src/algorithm/ComputeGradient.hpp +++ b/src/algorithm/ComputeGradient.hpp @@ -251,7 +251,7 @@ ComputeGradient::BsplineParams ComputeGradient::prepareBSplineParams(size_t dimL const float norm_factor = powf((1 - 2.0*rho*cosf(omg) + powf(rho,2)),2); - std::cout << std::fixed << std::setprecision(9) << "CPU xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << std::endl; + // std::cout << std::fixed << std::setprecision(9) << "CPU xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << std::endl; // for boundaries std::vector impulse_resp_vec_f(k0+1); //forward diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index e20678fd..690fd4d6 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ 
-15,7 +15,7 @@ namespace { TYPED_TEST_SUITE_P(BsplineTest); TYPED_TEST_P(BsplineTest, testBsplineInXdirCUDA) { - APRTimer timer(true); + APRTimer timer(false); std::vector> yzSizes = {{1, 1}, {32, 32}, @@ -57,7 +57,7 @@ namespace { } TYPED_TEST_P(BsplineTest, testBsplineInZdirCUDA) { - APRTimer timer(true); + APRTimer timer(false); std::vector> xySizes = {{1, 1}, {32, 32}, @@ -144,7 +144,32 @@ namespace { using ImgTypes = ::testing::Types< float, uint16_t, int16_t, uint8_t>; INSTANTIATE_TYPED_TEST_SUITE_P(Testing, BsplineTest, ImgTypes); + TEST(ComputeBspineTest, BSPLINE_FULL_XYZ_DIR_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(127, 128, 129, 100, 10); + // Filter parameters + const float lambda = 3; + const float tolerance = 0.0001; // as defined in get_smooth_bspline_3D + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU bspline"); + ComputeGradient().get_smooth_bspline_3D(mCpu, lambda); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU bspline"); + cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_ALL_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + } #endif // APR_USE_CUDA } diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index c2f14805..4de049fa 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -718,33 +718,6 @@ namespace { EXPECT_EQ(compareMeshes(grad, gradCuda), 0); } - TEST(ComputeBspineTest, BSPLINE_FULL_XYZ_DIR_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(127, 128, 129); - - // Filter parameters - const float lambda = 3; - const float tolerance = 0.0001; // as defined in get_smooth_bspline_3D - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU bspline"); - 
ComputeGradient().get_smooth_bspline_3D(mCpu, lambda); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU bspline"); - cudaFilterBsplineFull(mGpu, lambda, tolerance, BSPLINE_ALL_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_CUDA) { using ImgType = float; diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 5a1d1ca8..6d6cd440 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -77,7 +77,7 @@ inline int compareMeshes(const PixelData &expected, const PixelData &teste cnt++; } } - std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl; + if (cnt != 0) std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl; return cnt; } @@ -115,7 +115,7 @@ inline int64_t compareParticles(const ParticleData &expected, const ParticleD template inline PixelData getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, float offset=0.0, bool useIdxNumbers = false) { PixelData m(y, x, z); - std::cout << "Mesh info: " << m << std::endl; +// std::cout << "Mesh info: " << m << std::endl; std::random_device rd; std::mt19937 mt(rd()); std::uniform_real_distribution dist(0.0, 1.0); From 3db510fba42d80aeed1434620e8dc63e05590d54 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 1 Aug 2022 17:13:54 +0200 Subject: [PATCH 03/59] Fixed Inv Bspline in X direction (CUDA pipeline) --- src/algorithm/invBspline.cuh | 10 +++++++--- test/ComputeGradientCudaTest.cpp | 24 ++++++++++++++++++++++++ test/ComputeGradientTest.cpp | 23 ----------------------- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/src/algorithm/invBspline.cuh b/src/algorithm/invBspline.cuh index d422abf1..c912b054 100644 --- a/src/algorithm/invBspline.cuh +++ b/src/algorithm/invBspline.cuh @@ -49,21 +49,25 @@ __global__ void invBsplineXdir(T 
*image, size_t x_num, size_t y_num, size_t z_nu const int workerIdx = blockIdx.y * blockDim.y + threadIdx.y ; const int nextElementOffset = y_num; + const float a1 = 1.0/6.0; + const float a2 = 4.0/6.0; + const float a3 = 1.0/6.0; + if (workerIdx < y_num) { int currElementOffset = 0; T v1 = image[workerOffset + currElementOffset]; T v2 = image[workerOffset + currElementOffset + nextElementOffset]; - image[workerOffset + currElementOffset] = (2 * v2 + 4 * v1) / 6.0; + image[workerOffset + currElementOffset] = (a1 * v2 + a2 * v1 + a3 * v2); for (int x = 2; x < x_num; ++x) { T v3 = image[workerOffset + currElementOffset + 2 * nextElementOffset]; - image[workerOffset + currElementOffset + nextElementOffset] = (v1 + 4 * v2 + v3) / 6.0; + image[workerOffset + currElementOffset + nextElementOffset] = (a1 * v1 + a2 * v2 + a3 * v3); v1 = v2; v2 = v3; currElementOffset += nextElementOffset; } - image[workerOffset + currElementOffset + nextElementOffset] = (2 * v1 + 4 * v2) / 6.0; + image[workerOffset + currElementOffset + nextElementOffset] = (a1 + a3) * v1 + a2 * v2; } } diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index 690fd4d6..c63900cd 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ -170,6 +170,30 @@ namespace { // Compare GPU vs CPU EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); } + + TEST(ComputeInverseBspline, CALC_INV_BSPLINE_X_RND_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(127, 61, 66, 100, 10); + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU inv bspline"); + ComputeGradient().calc_inv_bspline_x(mCpu); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU inv bspline"); + cudaInverseBspline(mGpu, INV_BSPLINE_X_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); + } + #endif // 
APR_USE_CUDA } diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index 4de049fa..2a59d1cd 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -785,29 +785,6 @@ namespace { ASSERT_TRUE(compare(m, expect, 0.01)); } - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_X_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(127, 61, 66); - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU inv bspline"); - ComputeGradient().calc_inv_bspline_x(mCpu); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU inv bspline"); - cudaInverseBspline(mGpu, INV_BSPLINE_X_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Z_RND_CUDA) { APRTimer timer(true); From 18fce44baf89cebf242e7d0b3db3a194181337af Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 2 Aug 2022 16:29:11 +0200 Subject: [PATCH 04/59] Inverse Bspline pipeline for CUDA fixed --- src/algorithm/ComputeGradient.hpp | 6 +- src/algorithm/invBspline.cuh | 26 ++++--- test/ComputeGradientCudaTest.cpp | 82 +++++++++++++++++++++- test/ComputeGradientTest.cpp | 111 ------------------------------ test/TestTools.hpp | 4 +- 5 files changed, 102 insertions(+), 127 deletions(-) diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp index d7876248..529af089 100644 --- a/src/algorithm/ComputeGradient.hpp +++ b/src/algorithm/ComputeGradient.hpp @@ -687,8 +687,7 @@ void ComputeGradient::calc_inv_bspline_y(PixelData& input){ } //LHS boundary condition - input.mesh[j*x_num*y_num + i*y_num] = a2*temp_vec[0]; - input.mesh[j*x_num*y_num + i*y_num] += (a1+a3)*temp_vec[1]; + input.mesh[j*x_num*y_num + i*y_num] = a1*temp_vec[1] + a2*temp_vec[0] + a3 * temp_vec[1]; for (int64_t k = 1; k < (y_num-1);k++){ const 
int64_t idx = j * x_num * y_num + i * y_num + k; @@ -696,8 +695,7 @@ void ComputeGradient::calc_inv_bspline_y(PixelData& input){ } //RHS boundary condition - input.mesh[j*x_num*y_num + i*y_num + y_num - 1] = (a1+a3)*temp_vec[y_num - 2]; - input.mesh[j*x_num*y_num + i*y_num + y_num - 1] += a2*temp_vec[y_num - 1]; + input.mesh[j*x_num*y_num + i*y_num + y_num - 1] = a1*temp_vec[y_num - 2] + a2*temp_vec[y_num - 1] + a3*temp_vec[y_num - 2]; } } } diff --git a/src/algorithm/invBspline.cuh b/src/algorithm/invBspline.cuh index c912b054..7c27d853 100644 --- a/src/algorithm/invBspline.cuh +++ b/src/algorithm/invBspline.cuh @@ -9,14 +9,18 @@ __global__ void invBsplineYdir(T *image, size_t x_num, size_t y_num, size_t z_nu int workerOffset = workerIdx; int loopNum = 0; - T p = 0; - T v = 0; + const float a1 = 1.0/6.0; + const float a2 = 4.0/6.0; + const float a3 = 1.0/6.0; + + float p = 0; + float v = 0; bool notLastInRow = true; while (workerOffset < y_num) { if (notLastInRow) v = image[workersOffset + workerOffset]; - T temp = __shfl_sync(active, v, workerIdx + blockDim.y - 1, blockDim.y); + float temp = __shfl_sync(active, v, workerIdx + blockDim.y - 1, blockDim.y); p = notLastInRow ? 
temp : p; - T n = __shfl_sync(active, v, workerIdx + 1, blockDim.y); + float n = __shfl_sync(active, v, workerIdx + 1, blockDim.y); // handle boundary (reflective mode) if (workerOffset == 0) p = n; @@ -24,7 +28,7 @@ __global__ void invBsplineYdir(T *image, size_t x_num, size_t y_num, size_t z_nu notLastInRow = (workerIdx + 1 + loopNum) % blockDim.y != 0; if (notLastInRow) { - v = (p + v * 4 + n) / 6.0; + v = a1 * p + a2 * v + a3 * n; image[workersOffset + workerOffset] = v; workerOffset += blockDim.y; } @@ -58,7 +62,7 @@ __global__ void invBsplineXdir(T *image, size_t x_num, size_t y_num, size_t z_nu T v1 = image[workerOffset + currElementOffset]; T v2 = image[workerOffset + currElementOffset + nextElementOffset]; - image[workerOffset + currElementOffset] = (a1 * v2 + a2 * v1 + a3 * v2); + image[workerOffset + currElementOffset] = a1 * v2 + a2 * v1 + a3 * v2; for (int x = 2; x < x_num; ++x) { T v3 = image[workerOffset + currElementOffset + 2 * nextElementOffset]; @@ -87,21 +91,25 @@ __global__ void invBsplineZdir(T *image, size_t x_num, size_t y_num, size_t z_nu const int workerIdx = blockIdx.y * blockDim.y + threadIdx.y ; const int nextElementOffset = x_num * y_num; + const float a1 = 1.0/6.0; + const float a2 = 4.0/6.0; + const float a3 = 1.0/6.0; + if (workerIdx < y_num) { int currElementOffset = 0; T v1 = image[workerOffset + currElementOffset]; T v2 = image[workerOffset + currElementOffset + nextElementOffset]; - image[workerOffset + currElementOffset] = (2 * v2 + 4 * v1) / 6.0; + image[workerOffset + currElementOffset] = a1 * v2 + a2 * v1 + a1 * v2; for (int x = 2; x < z_num; ++x) { T v3 = image[workerOffset + currElementOffset + 2 * nextElementOffset]; - image[workerOffset + currElementOffset + nextElementOffset] = (v1 + 4 * v2 + v3) / 6.0; + image[workerOffset + currElementOffset + nextElementOffset] = a1 * v1 + a2 * v2 + a3 * v3; v1 = v2; v2 = v3; currElementOffset += nextElementOffset; } - image[workerOffset + currElementOffset + nextElementOffset] = (2 
* v1 + 4 * v2) / 6.0; + image[workerOffset + currElementOffset + nextElementOffset] = (a1 + a3) * v1 + a2 * v2; } } diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index c63900cd..81320e80 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ -10,6 +10,11 @@ namespace { #ifdef APR_USE_CUDA + + // ======================================================================== + // BSPLINE tests + // ======================================================================== + template class BsplineTest : public testing::Test {}; TYPED_TEST_SUITE_P(BsplineTest); @@ -168,9 +173,14 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); } + + // ======================================================================== + // INV. BSPLINE tests + // ======================================================================== + TEST(ComputeInverseBspline, CALC_INV_BSPLINE_X_RND_CUDA) { APRTimer timer(false); @@ -194,6 +204,76 @@ namespace { EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); } + TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Z_RND_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(128, 61, 66, 100, 10); + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU inv bspline"); + ComputeGradient().calc_inv_bspline_z(mCpu); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU inv bspline"); + cudaInverseBspline(mGpu, INV_BSPLINE_Z_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); + } + + TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_RND_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(127, 61, 71, 100, 10); + + // Calculate bspline on 
CPU + PixelData mCpu(m, true); + timer.start_timer("CPU inv bspline"); + ComputeGradient().calc_inv_bspline_y(mCpu); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU inv bspline"); + cudaInverseBspline(mGpu, INV_BSPLINE_Y_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); + } + + TEST(ComputeInverseBspline, CALC_INV_BSPLINE_FULL_XYZ_DIR_RND_CUDA) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(32,32,32,100, 10); + + // Calculate bspline on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU inv bspline"); + ComputeGradient().calc_inv_bspline_y(mCpu); + ComputeGradient().calc_inv_bspline_x(mCpu); + ComputeGradient().calc_inv_bspline_z(mCpu); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU inv bspline"); + cudaInverseBspline(mGpu, INV_BSPLINE_ALL_DIR); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); + } #endif // APR_USE_CUDA } diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index 2a59d1cd..a03d5746 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -718,120 +718,9 @@ namespace { EXPECT_EQ(compareMeshes(grad, gradCuda), 0); } - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_CUDA) { - using ImgType = float; - - ImgType init[] = {1.00, 0.00, 0.00, - 1.00, 0.00, 6.00, - 0.00, 6.00, 0.00, - 6.00, 0.00, 0.00}; - - ImgType expect[] = {1.00, 0.00, 2.00, - 0.83, 1.00, 4.00, - 1.17, 4.00, 1.00, - 4.00, 2.00, 0.00}; - PixelData m(4, 3, 1); - initFromZYXarray(m, init); - - // Calculate and compare - m.printMesh(4,2); - cudaInverseBspline(m, INV_BSPLINE_Y_DIR); - m.printMesh(4,2); - ASSERT_TRUE(compare(m, expect, 0.01)); - } - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_RND_CUDA) { - APRTimer timer(true); - // Generate random mesh 
- using ImgType = float; - PixelData m = getRandInitializedMesh(127, 33, 31); - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU inv bspline"); - ComputeGradient().calc_inv_bspline_y(mCpu); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU inv bspline"); - cudaInverseBspline(mGpu, INV_BSPLINE_Y_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_X_CUDA) { - using ImgType = float; - - ImgType init[] = {0.00, 6.00, 0.00, - 1.00, 0.00, 0.00, - 0.00, 0.00, 1.00}; - - ImgType expect[] = {2.00, 4.00, 2.00, - 0.67, 0.16, 0.00, - 0.00, 0.16, 0.67}; - - PixelData m(3, 3, 1); - initFromZYXarray(m, init); - - // Calculate and compare - m.printMesh(4,2); - cudaInverseBspline(m, INV_BSPLINE_X_DIR); - m.printMesh(4,2); - ASSERT_TRUE(compare(m, expect, 0.01)); - } - - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Z_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(127, 61, 66); - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU inv bspline"); - ComputeGradient().calc_inv_bspline_z(mCpu); - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU inv bspline"); - cudaInverseBspline(mGpu, INV_BSPLINE_Z_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeInverseBspline, CALC_INV_BSPLINE_FULL_XYZ_DIR_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(3,3,3,100); - - // Calculate bspline on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU inv bspline"); - ComputeGradient().calc_inv_bspline_y(mCpu); - ComputeGradient().calc_inv_bspline_x(mCpu); - ComputeGradient().calc_inv_bspline_z(mCpu); - 
timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU inv bspline"); - cudaInverseBspline(mGpu, INV_BSPLINE_ALL_DIR); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } TEST(ComputeThreshold, CALC_THRESHOLD_RND_CUDA) { APRTimer timer(true); diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 6d6cd440..d8b99fc7 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -72,7 +72,7 @@ inline int compareMeshes(const PixelData &expected, const PixelData &teste for (size_t i = 0; i < expected.mesh.size(); ++i) { if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError) { if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; + std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] << " error = " << (float)expected.mesh[i] - (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; } cnt++; } @@ -93,7 +93,7 @@ inline int64_t compareParticles(const ParticleData &expected, const ParticleD for (size_t i = 0; i < expected.size(); ++i) { if (std::abs(expected[i] - tested[i]) > maxError) { if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "ERROR expected vs tested particle: " << (float)expected[i] << " vs " << (float)tested[i] << " IDX:" << i << std::endl; + std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested particle: " << (float)expected[i] << " vs " << (float)tested[i] << " IDX:" << i << std::endl; } cnt++; } From ad5f194006661569bd62a0078b4b0d274ccc47a2 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 3 Aug 2022 16:39:23 +0200 Subject: [PATCH 05/59] Downsample and downsample gradient corrected to match GPU --- src/algorithm/ComputeGradient.hpp 
| 20 +++++--- src/algorithm/ComputeGradientCuda.cu | 8 +-- src/algorithm/bsplineYdir.cuh | 1 - src/algorithm/dsGradient.cuh | 29 ++++++----- test/ComputeGradientCudaTest.cpp | 31 ++++++++++++ test/ComputeGradientTest.cpp | 74 ---------------------------- test/MeshDataTest.cpp | 51 ++----------------- 7 files changed, 68 insertions(+), 146 deletions(-) diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp index 529af089..80b72a36 100644 --- a/src/algorithm/ComputeGradient.hpp +++ b/src/algorithm/ComputeGradient.hpp @@ -887,11 +887,15 @@ void ComputeGradient::calc_bspline_fd_ds_mag(const PixelData &input, PixelDat //compute the boundary values if (y_num >= 2) { - temp[0] = sqrt(pow((right[0] - left[0]) / (2 * hx), 2.0) + pow((down[0] - up[0]) / (2 * hz), 2.0) + - pow((center[1] - center[0 /* boundary */]) / (2 * hy), 2.0)); - temp[y_num - 1] = sqrt(pow((right[y_num - 1] - left[y_num - 1]) / (2 * hx), 2.0) + - pow((down[y_num - 1] - up[y_num - 1]) / (2 * hz), 2.0) + - pow((center[y_num - 1 /* boundary */] - center[y_num - 2]) / (2 * hy), 2.0)); + float dx = (right[0] - left[0]) / (2 * hx); + float dz = (down[0] - up[0]) / (2 * hz); + float dy = (center[1] - center[0 /* boundary */]) / (2 * hy); + temp[0] = sqrtf(dx*dx + dz*dz + dy*dy); + + dx = (right[y_num - 1] - left[y_num - 1]) / (2 * hx); + dz = (down[y_num - 1] - up[y_num - 1]) / (2 * hz); + dy = (center[y_num - 1 /* boundary */] - center[y_num - 2]) / (2 * hy); + temp[y_num - 1] = sqrtf(dx*dx + dz*dz + dy*dy); } else { temp[0] = 0; // same values minus same values in x/y/z } @@ -901,8 +905,10 @@ void ComputeGradient::calc_bspline_fd_ds_mag(const PixelData &input, PixelDat #pragma omp simd #endif for (size_t y = 1; y < y_num - 1; ++y) { - temp[y] = sqrt(pow((right[y] - left[y]) / (2 * hx), 2.0) + pow((down[y] - up[y]) / (2 * hz), 2.0) + - pow((center[y + 1] - center[y - 1]) / (2 * hy), 2.0)); + float dx = (right[y] - left[y]) / (2 * hx); + float dz = (down[y] - up[y]) / (2 * hz); + 
float dy = (center[y + 1] - center[y - 1]) / (2 * hy); + temp[y] = sqrtf(dx*dx + dz*dz + dy*dy); } // Set as a downsampled gradient maximum from 2x2x2 gradient cubes diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 982e649c..97dcd5b0 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -172,13 +172,13 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc BsplineParams &p, float *bc1, float *bc2, float *bc3, float *bc4, float *boundary, float bspline_offset, const APRParameters &par, cudaStream_t aStream) { - runThresholdImg(cudaImage, image.x_num, image.y_num, image.z_num, par.Ip_th + bspline_offset, aStream); + //runThresholdImg(cudaImage, image.x_num, image.y_num, image.z_num, par.Ip_th + bspline_offset, aStream); runBsplineYdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, boundary, aStream); runBsplineXdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); runBsplineZdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); - runKernelGradient(cudaImage, cudaGrad, image.x_num, image.y_num, image.z_num, local_scale_temp.x_num, local_scale_temp.y_num, par.dx, par.dy, par.dz, aStream); + runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream); @@ -186,7 +186,7 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - runThreshold(cudalocal_scale_temp, cudaGrad, local_scale_temp.x_num, local_scale_temp.y_num, 
local_scale_temp.z_num, par.Ip_th, aStream); + //runThreshold(cudalocal_scale_temp, cudaGrad, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, par.Ip_th, aStream); } class CurrentTime { @@ -468,5 +468,5 @@ void cudaDownsampledGradient(PixelData &input, PixelData &grad, co ScopedCudaMemHandler, H2D | D2H> cudaInput(input); ScopedCudaMemHandler, D2H> cudaGrad(grad); - runKernelGradient(cudaInput.get(), cudaGrad.get(), input.x_num, input.y_num, input.z_num, grad.x_num, grad.y_num, hx, hy, hz, 0); + runKernelGradient(cudaInput.get(), cudaGrad.get(), input.getDimension(), grad.getDimension(), hx, hy, hz, 0); } diff --git a/src/algorithm/bsplineYdir.cuh b/src/algorithm/bsplineYdir.cuh index a1026704..1a0986d1 100644 --- a/src/algorithm/bsplineYdir.cuh +++ b/src/algorithm/bsplineYdir.cuh @@ -217,7 +217,6 @@ __global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, size_t k0, cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = norm_factor * temp1; cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = norm_factor * temp2; } - int64_t k2 = yBlockBegin == dim.y - 1 ? 2 : 0; for (int64_t k = yBlockBegin == dim.y - 1 ? 
2 : 0; k < blockWidth && yBlockBegin - k >= 0; ++k) { float temp = temp2*b1 + temp1*b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp * norm_factor; diff --git a/src/algorithm/dsGradient.cuh b/src/algorithm/dsGradient.cuh index de4a2c77..8e2efc84 100644 --- a/src/algorithm/dsGradient.cuh +++ b/src/algorithm/dsGradient.cuh @@ -5,11 +5,14 @@ template __global__ void -gradient(const T *input, size_t x_num, size_t y_num, size_t z_num, T *grad, size_t x_num_ds, size_t y_num_ds, - float hx, float hy, float hz) { +gradient(const T *input, PixelDataDim inputDim, T *grad, PixelDataDim gradDim, float hx, float hy, float hz) { const int xi = ((blockIdx.x * blockDim.x) + threadIdx.x) * 2; const int yi = ((blockIdx.y * blockDim.y) + threadIdx.y) * 2; const int zi = ((blockIdx.z * blockDim.z) + threadIdx.z) * 2; + const auto x_num = inputDim.x; + const auto y_num = inputDim.y; + const auto z_num = inputDim.z; + if (xi >= x_num || yi >= y_num || zi >= z_num) return; const size_t xnumynum = x_num * y_num; @@ -33,28 +36,28 @@ gradient(const T *input, size_t x_num, size_t y_num, size_t z_num, T *grad, size for (int y = 1; y <= 2; ++y) { float xd = (temp[z][x - 1][y] - temp[z][x + 1][y]) / (2 * hx); xd = xd * xd; - float yd = (temp[z - 1][x][y] - temp[z + 1][x][y]) / (2 * hy); - yd = yd * yd; - float zd = (temp[z][x][y - 1] - temp[z][x][y + 1]) / (2 * hz); + float zd = (temp[z - 1][x][y] - temp[z + 1][x][y]) / (2 * hz); zd = zd * zd; - float gm = __fsqrt_rn(xd + yd + zd); + float yd = (temp[z][x][y - 1] - temp[z][x][y + 1]) / (2 * hy); + yd = yd * yd; + float gm = sqrtf(xd + zd + yd); if (gm > maxGrad) maxGrad = gm; } - const size_t idx = zi / 2 * x_num_ds * y_num_ds + xi / 2 * y_num_ds + yi / 2; + const size_t idx = zi / 2 * gradDim.x * gradDim.y + xi / 2 * gradDim.y + yi / 2; grad[idx] = maxGrad; } template void runKernelGradient(const T *cudaInput, T *cudaGrad, - size_t xLenInput, size_t yLenInput, 
size_t zLenInput, - size_t xLenGradient, size_t yLenGradient, + PixelDataDim inputDim, + PixelDataDim gradDim, float hx, float hy, float hz, cudaStream_t aStream) { dim3 threadsPerBlock(1, 64, 1); - dim3 numBlocks((xLenInput + threadsPerBlock.x - 1) / threadsPerBlock.x, - (yLenInput + threadsPerBlock.y - 1) / threadsPerBlock.y, - (zLenInput + threadsPerBlock.z - 1) / threadsPerBlock.z); - gradient <<>> (cudaInput, xLenInput, yLenInput, zLenInput, cudaGrad, xLenGradient, yLenGradient, hx, hy, hz); + dim3 numBlocks((inputDim.x + threadsPerBlock.x - 1) / threadsPerBlock.x, + (inputDim.y + threadsPerBlock.y - 1) / threadsPerBlock.y, + (inputDim.z + threadsPerBlock.z - 1) / threadsPerBlock.z); + gradient <<>> (cudaInput, inputDim, cudaGrad, gradDim, hx, hy, hz); } diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index 81320e80..d7fc6e62 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ -274,6 +274,37 @@ namespace { // Compare GPU vs CPU EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); } + + // ======================================================================== + // Downsampled gradient + // ======================================================================== + + TEST(ComputeGradientTest, GPU_VS_CPU_ON_RANDOM_VALUES) { + APRTimer timer(false); + + // Generate random mesh + using ImgType = float; + PixelData m = getRandInitializedMesh(31, 32, 33, 100); + + // Calculate gradient on CPU + PixelData grad; + grad.initDownsampled(m, 0); + timer.start_timer("CPU gradient"); + ComputeGradient().calc_bspline_fd_ds_mag(m, grad, 1, 1, 1); + timer.stop_timer(); + + // Calculate gradient on GPU + PixelData gradCuda; + gradCuda.initDownsampled(m, 0); + timer.start_timer("GPU gradient"); + cudaDownsampledGradient(m, gradCuda, 1, 1, 1); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(grad, gradCuda, 0.0000001), 0); + } + + #endif // APR_USE_CUDA } diff --git 
a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index a03d5746..d94f74c0 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -648,80 +648,6 @@ namespace { #ifdef APR_USE_CUDA - TEST(ComputeGradientTest, 2D_XY_CUDA) { - // Corner points - PixelData m(6, 6, 1, 0); - // expect gradient is 3x3 X/Y plane - float expect[] = {1.41, 0, 4.24, - 0, 0, 0, - 2.82, 0, 5.65}; - // put values in corners - m(0, 0, 0) = 2; - m(5, 0, 0) = 4; - m(0, 5, 0) = 6; - m(5, 5, 0) = 8; - PixelData grad; - grad.initDownsampled(m, 0); - cudaDownsampledGradient(m, grad, 1, 1, 1); - ASSERT_TRUE(compare(grad, expect, 0.01)); - } - - TEST(ComputeGradientTest, Corners3D_CUDA) { - PixelData m(6, 6, 4, 0); - // expect gradient is 3x3x2 X/Y/Z plane - float expect[] = {1.73, 0, 5.19, - 0, 0, 0, - 3.46, 0, 6.92, - - 8.66, 0, 12.12, - 0, 0, 0, - 10.39, 0, 13.85}; - // put values in corners - m(0, 0, 0) = 2; - m(5, 0, 0) = 4; - m(0, 5, 0) = 6; - m(5, 5, 0) = 8; - m(0, 0, 3) = 10; - m(5, 0, 3) = 12; - m(0, 5, 3) = 14; - m(5, 5, 3) = 16; - - PixelData grad; - grad.initDownsampled(m, 0); - cudaDownsampledGradient(m, grad, 1, 1, 1); - ASSERT_TRUE(compare(grad, expect, 0.01)); - } - - TEST(ComputeGradientTest, GPU_VS_CPU_ON_RANDOM_VALUES) { - // Generate random mesh - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(33, 31, 3); - - APRTimer timer(true); - - // Calculate gradient on CPU - PixelData grad; - grad.initDownsampled(m, 0); - timer.start_timer("CPU gradient"); - ComputeGradient().calc_bspline_fd_ds_mag(m, grad, 1, 1, 1); - timer.stop_timer(); - - // Calculate gradient on GPU - PixelData gradCuda; - gradCuda.initDownsampled(m, 0); - timer.start_timer("GPU gradient"); - cudaDownsampledGradient(m, gradCuda, 1, 1, 1); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(grad, gradCuda), 0); - } - - - - - TEST(ComputeThreshold, CALC_THRESHOLD_RND_CUDA) { APRTimer timer(true); diff --git 
a/test/MeshDataTest.cpp b/test/MeshDataTest.cpp index 869229e3..f9c9bf4b 100644 --- a/test/MeshDataTest.cpp +++ b/test/MeshDataTest.cpp @@ -5,6 +5,7 @@ #include "data_structures/Mesh/PixelData.hpp" #include "data_structures/Mesh/PixelDataCuda.h" #include +#include "TestTools.hpp" namespace { @@ -675,51 +676,7 @@ namespace { } #ifdef APR_USE_CUDA -namespace { - /** - * Compares two meshes - * @param expected - * @param tested - * @param maxNumOfErrPrinted - how many error values should be printed (-1 for all) - * @return number of errors detected - */ - template - int compareMeshes(const PixelData &expected, const PixelData &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { - int cnt = 0; - for (size_t i = 0; i < expected.mesh.size(); ++i) { - if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError || std::isnan(expected.mesh[i]) || - std::isnan(tested.mesh[i])) { - if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "ERROR expected vs tested mesh: " << expected.mesh[i] << " vs " << tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; - } - cnt++; - } - } - std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl; - return cnt; - } - /** - * Generates mesh with provided dims with random values in range [0, 1] * multiplier - * @param y - * @param x - * @param z - * @param multiplier - * @return - */ - template - PixelData getRandInitializedMesh(int y, int x, int z, float multiplier = 2.0f, bool useIdxNumbers = false) { - PixelData m(y, x, z); - std::cout << "Mesh info: " << m << std::endl; - std::random_device rd; - std::mt19937 mt(rd()); - std::uniform_real_distribution dist(0.0, 1.0); - for (size_t i = 0; i < m.mesh.size(); ++i) { - m.mesh[i] = useIdxNumbers ? 
i : dist(mt) * multiplier; - } - return m; - } -} TEST(MeshDataSimpleTest, DownSampleCuda) { { // reduce/constant_operator calculate maximum value when downsampling PixelData m(5, 6, 4); @@ -773,10 +730,10 @@ TEST(MeshDataSimpleTest, DownSampleCuda) { EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); } { - APRTimer timer(true); + APRTimer timer(false); // reduce/constant_operator calculate average value of pixels when downsampling - PixelData m = getRandInitializedMesh(33, 22, 21); + PixelData m = getRandInitializedMesh(33, 22, 21, 100, 5); for (size_t i = 0; i < m.mesh.size(); ++i) m.mesh[i] = 27 - i; PixelData mCpu; mCpu.initDownsampled(m); @@ -792,7 +749,7 @@ TEST(MeshDataSimpleTest, DownSampleCuda) { downsampleMeanCuda(m, mGpu); timer.stop_timer(); - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); } } #endif From 557eff32759a4c5b5013607082e9ffe01ef8bcb5 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 9 Aug 2022 17:25:58 +0200 Subject: [PATCH 06/59] GPU pipeline fixes - Full Gradient test is working now --- src/algorithm/ComputeGradient.hpp | 1 - src/algorithm/ComputeGradientCuda.cu | 109 ++++++++++++++++-------- src/algorithm/bsplineParams.h | 19 +++++ src/algorithm/bsplineXdir.cuh | 31 +++---- src/algorithm/bsplineYdir.cuh | 79 +++++++++-------- src/algorithm/bsplineZdir.cuh | 33 ++++--- src/data_structures/Mesh/PixelData.hpp | 11 +-- src/data_structures/Mesh/downsample.cuh | 4 +- src/misc/CudaTools.cuh | 42 +++++++-- test/ComputeGradientCudaTest.cpp | 56 +++++++++++- test/ComputeGradientTest.cpp | 55 +----------- 11 files changed, 260 insertions(+), 180 deletions(-) create mode 100644 src/algorithm/bsplineParams.h diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp index 80b72a36..ee5aeec8 100644 --- a/src/algorithm/ComputeGradient.hpp +++ b/src/algorithm/ComputeGradient.hpp @@ -129,7 +129,6 @@ inline void ComputeGradient::get_gradient(PixelData &image_temp, Pixe 
timer.stop_timer(); } } - } diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 97dcd5b0..99f28558 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -16,6 +16,7 @@ #include "dsGradient.cuh" #include "invBspline.cuh" +#include "bsplineParams.h" #include "bsplineXdir.cuh" #include "bsplineYdir.cuh" #include "bsplineZdir.cuh" @@ -34,6 +35,13 @@ namespace { float norm_factor; } BsplineParams; + struct BsplineParamsCudaMemoryHandlers { + ScopedCudaMemHandler bc1; + ScopedCudaMemHandler bc2; + ScopedCudaMemHandler bc3; + ScopedCudaMemHandler bc4; + }; + float impulse_resp(float k, float rho, float omg) { // Impulse Response Function return (powf(rho, (std::abs(k))) * sinf((std::abs(k) + 1) * omg)) / sinf(omg); @@ -169,14 +177,12 @@ void runThresholdImg(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, flo template void getGradientCuda(const PixelData &image, PixelData &local_scale_temp, ImgType *cudaImage, ImgType *cudaGrad, float *cudalocal_scale_temp, - BsplineParams &p, float *bc1, float *bc2, float *bc3, float *bc4, float *boundary, + BsplineParamsCuda &px, BsplineParamsCuda &py, BsplineParamsCuda &pz, float *boundary, float bspline_offset, const APRParameters &par, cudaStream_t aStream) { - //runThresholdImg(cudaImage, image.x_num, image.y_num, image.z_num, par.Ip_th + bspline_offset, aStream); - - runBsplineYdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, boundary, aStream); - runBsplineXdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); - runBsplineZdir(cudaImage, image.getDimension(), bc1, bc2, bc3, bc4, p.k0, p.b1, p.b2, p.norm_factor, aStream); + runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream); + runBsplineXdir(cudaImage, image.getDimension(), px, aStream); + runBsplineZdir(cudaImage, image.getDimension(), pz, aStream); runKernelGradient(cudaImage, 
cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); @@ -185,8 +191,6 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - - //runThreshold(cudalocal_scale_temp, cudaGrad, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, par.Ip_th, aStream); } class CurrentTime { @@ -292,9 +296,12 @@ public: void processOnGpu() { CurrentTime ct; uint64_t start = ct.microseconds(); - getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), - params, bc1.get(), bc2.get(), bc3.get(), bc4.get(), boundary.get(), - iBsplineOffset, iParameters, iStream); + + // TODO: Need to be fixed !!!!!!!!!!1 + +// getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), +// params, bc1.get(), bc2.get(), bc3.get(), bc4.get(), boundary.get(), +// iBsplineOffset, iParameters, iStream); std::cout << "1: " << ct.microseconds() - start << std::endl; runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream); std::cout << "2: " << ct.microseconds() - start << std::endl; @@ -350,6 +357,33 @@ template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfR template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); +auto transferSpline(BsplineParams &aParams) { + ScopedCudaMemHandler bc1(aParams.bc1.get(), aParams.k0); + ScopedCudaMemHandler bc2(aParams.bc2.get(), aParams.k0); + ScopedCudaMemHandler 
bc3(aParams.bc3.get(), aParams.k0); + ScopedCudaMemHandler bc4(aParams.bc4.get(), aParams.k0); + + return std::pair { + BsplineParamsCuda { + bc1.get(), + bc2.get(), + bc3.get(), + bc4.get(), + aParams.k0, + aParams.b1, + aParams.b2, + aParams.norm_factor + }, + + BsplineParamsCudaMemoryHandlers { + std::move(bc1), + std::move(bc2), + std::move(bc3), + std::move(bc4) + } + }; +} + template void cudaFilterBsplineFull(PixelData &input, float lambda, float tolerance, TypeOfRecBsplineFlags flags, int maxFilterLen) { cudaStream_t aStream = 0; @@ -361,29 +395,23 @@ void cudaFilterBsplineFull(PixelData &input, float lambda, float tolera timer.start_timer("GpuDeviceTimeFull"); if (flags & BSPLINE_Y_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.y_num, lambda, tolerance, maxFilterLen); - ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); - ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); - ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); - ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); + auto cuda = transferSpline(p); + auto splineCuda = cuda.first; int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * input.x_num * input.z_num; ScopedCudaMemHandler boundary(nullptr, boundaryLen); // allocate memory on device - runBsplineYdir(cudaInput.get(), input.getDimension(), bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, boundary.get(), aStream); + runBsplineYdir(cudaInput.get(), input.getDimension(), splineCuda, boundary.get(), aStream); } if (flags & BSPLINE_X_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.x_num, lambda, tolerance, maxFilterLen); - ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); - ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); - ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); - ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); - runBsplineXdir(cudaInput.get(), input.getDimension(), bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream); + auto cuda = transferSpline(p); + auto splineCuda = 
cuda.first; + runBsplineXdir(cudaInput.get(), input.getDimension(), splineCuda, aStream); } if (flags & BSPLINE_Z_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.z_num, lambda, tolerance, maxFilterLen); - ScopedCudaMemHandler bc1(p.bc1.get(), p.k0); - ScopedCudaMemHandler bc2(p.bc2.get(), p.k0); - ScopedCudaMemHandler bc3(p.bc3.get(), p.k0); - ScopedCudaMemHandler bc4(p.bc4.get(), p.k0); - runBsplineZdir(cudaInput.get(), input.getDimension(), bc1.get(), bc2.get(), bc3.get(), bc4.get(), p.k0, p.b1, p.b2, p.norm_factor, aStream); + auto cuda = transferSpline(p); + auto splineCuda = cuda.first; + runBsplineZdir(cudaInput.get(), input.getDimension(), splineCuda, aStream); } timer.stop_timer(); } @@ -421,6 +449,8 @@ void computeLevelsCuda(const PixelData &grad_temp, PixelData & // explicit instantiation of handled types template void getGradient(PixelData &, PixelData &, PixelData &, PixelData &, float, const APRParameters &); +template void getGradient(PixelData &, PixelData &, PixelData &, PixelData &, float, const APRParameters &); + template void getGradient(PixelData &image, PixelData &grad_temp, PixelData &local_scale_temp, PixelData &local_scale_temp2, float bspline_offset, const APRParameters &par) { ScopedCudaMemHandler, D2H | H2D> cudaImage(image); @@ -428,21 +458,30 @@ void getGradient(PixelData &image, PixelData &grad_temp, Pixel ScopedCudaMemHandler, D2H> cudalocal_scale_temp(local_scale_temp); ScopedCudaMemHandler, D2H> cudalocal_scale_temp2(local_scale_temp2); + int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * image.x_num * image.z_num; + ScopedCudaMemHandler boundary(nullptr, boundaryLen); + float tolerance = 0.0001; + + // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. // Should be fixed when other parts of pipeline are ready. 
- BsplineParams p = prepareBsplineStuff(image.x_num, par.lambda, tolerance); - ScopedCudaMemHandler bc1 (p.bc1.get(), p.k0); - ScopedCudaMemHandler bc2 (p.bc2.get(), p.k0); - ScopedCudaMemHandler bc3 (p.bc3.get(), p.k0); - ScopedCudaMemHandler bc4 (p.bc4.get(), p.k0); - int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * image.x_num * image.z_num; - ScopedCudaMemHandler boundary(nullptr, boundaryLen); + // FIX BSPLINE PARAMS !!!!!!!! to get full gradient pipeline test working !!!!!!!!!!!!!!!!!!!!!!!!!1 + + + BsplineParams px = prepareBsplineStuff(image.x_num, par.lambda, tolerance); + auto cudax = transferSpline(px); + auto splineCudaX = cudax.first; + BsplineParams py = prepareBsplineStuff(image.y_num, par.lambda, tolerance); + auto cuday = transferSpline(py); + auto splineCudaY = cuday.first; + BsplineParams pz = prepareBsplineStuff(image.z_num, par.lambda, tolerance); + auto cudaz = transferSpline(pz); + auto splineCudaZ = cudaz.first; getGradientCuda(image, local_scale_temp, cudaImage.get(), cudaGrad.get(), cudalocal_scale_temp.get(), - p, bc1.get(), bc2.get(), bc3.get(), bc4.get(), boundary.get(), - bspline_offset, par, 0); + splineCudaX, splineCudaY, splineCudaZ, boundary.get(), bspline_offset, par, 0); } // explicit instantiation of handled types diff --git a/src/algorithm/bsplineParams.h b/src/algorithm/bsplineParams.h new file mode 100644 index 00000000..44dbd1c1 --- /dev/null +++ b/src/algorithm/bsplineParams.h @@ -0,0 +1,19 @@ +#ifndef APR_BSPLINEPARAMS_H +#define APR_BSPLINEPARAMS_H + + +#include + + +struct BsplineParamsCuda { + float *bc1; + float *bc2; + float *bc3; + float *bc4; + size_t k0; + float b1; + float b2; + float norm_factor; +}; + +#endif //APR_BSPLINEPARAMS_H diff --git a/src/algorithm/bsplineXdir.cuh b/src/algorithm/bsplineXdir.cuh index 6ee3c755..89fd3fc6 100644 --- a/src/algorithm/bsplineXdir.cuh +++ b/src/algorithm/bsplineXdir.cuh @@ -6,6 +6,7 @@ #include #include #include "cudaMisc.cuh" +#include 
"bsplineParams.h" /** * Runs bspline recursive filter in X direction. Each processed 2D patch consist of number of workers @@ -60,15 +61,13 @@ * @param norm_factor - filter norm factor */ template -__global__ void bsplineXdir(T *image, PixelDataDim dim, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, - float b1, float b2, float norm_factor, bool *error) { +__global__ void bsplineXdir(T *image, PixelDataDim dim, BsplineParamsCuda p, bool *error) { const int yDirOffset = blockIdx.y * blockDim.y + threadIdx.y; const size_t zDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * dim.x * dim.y; const size_t nextElementXdirOffset = dim.y; const size_t dirLen = dim.x; - const size_t minLen = min(dirLen, k0); + const size_t minLen = min(dirLen, p.k0); if (yDirOffset < dim.y) { float temp1 = 0; @@ -79,11 +78,11 @@ __global__ void bsplineXdir(T *image, PixelDataDim dim, // calculate boundary values for (int k = 0; k < minLen; ++k) { T val = image[zDirOffset + k * nextElementXdirOffset + yDirOffset]; - temp1 += bc1[k] * val; - temp2 += bc2[k] * val; + temp1 += p.bc1[k] * val; + temp2 += p.bc2[k] * val; val = image[zDirOffset + (dirLen - 1 - k) * nextElementXdirOffset + yDirOffset]; - temp3 += bc3[k] * val; - temp4 += bc4[k] * val; + temp3 += p.bc3[k] * val; + temp4 += p.bc4[k] * val; } size_t errorCnt = 0; @@ -91,15 +90,15 @@ __global__ void bsplineXdir(T *image, PixelDataDim dim, // set boundary values in two first and two last points processed direction image[zDirOffset + 0 * nextElementXdirOffset + yDirOffset] = round(temp1, errorCnt); image[zDirOffset + 1 * nextElementXdirOffset + yDirOffset] = round(temp2, errorCnt); - image[zDirOffset + (dirLen - 2) * nextElementXdirOffset + yDirOffset] = round(temp3 * norm_factor, errorCnt); - image[zDirOffset + (dirLen - 1) * nextElementXdirOffset + yDirOffset] = round(temp4 * norm_factor, errorCnt); + image[zDirOffset + (dirLen - 2) * nextElementXdirOffset + yDirOffset] = round(temp3 * 
p.norm_factor, errorCnt); + image[zDirOffset + (dirLen - 1) * nextElementXdirOffset + yDirOffset] = round(temp4 * p.norm_factor, errorCnt); // Causal Filter loop int64_t offset = zDirOffset + 2 * nextElementXdirOffset + yDirOffset; int64_t offsetLimit = zDirOffset + (dirLen - 2) * nextElementXdirOffset; while (offset < offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = round(image[offset] + b1 * temp2 + b2 * temp1, errorCnt); + const float temp = round(image[offset] + p.b1 * temp2 + p.b2 * temp1, errorCnt); image[offset] = temp; temp1 = temp2; temp2 = temp; @@ -112,8 +111,8 @@ __global__ void bsplineXdir(T *image, PixelDataDim dim, offsetLimit = zDirOffset; while (offset >= offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = image[offset] + b1 * temp3 + b2 * temp4; - image[offset] = round(temp * norm_factor, errorCnt); + const float temp = image[offset] + p.b1 * temp3 + p.b2 * temp4; + image[offset] = round(temp * p.norm_factor, errorCnt); temp4 = temp3; temp3 = temp; @@ -128,9 +127,7 @@ __global__ void bsplineXdir(T *image, PixelDataDim dim, * Function for launching a kernel */ template -void runBsplineXdir(T *cudaImage, PixelDataDim dim, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, - size_t k0, float b1, float b2, float norm_factor, cudaStream_t aStream) { +void runBsplineXdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaStream_t aStream) { constexpr int numOfWorkersYdir = 128; dim3 threadsPerBlockX(1, numOfWorkersYdir, 1); dim3 numBlocksX(1, @@ -141,7 +138,7 @@ void runBsplineXdir(T *cudaImage, PixelDataDim dim, bool isErrorDetected = false; { ScopedCudaMemHandler error(&isErrorDetected, 1); - bsplineXdir <<>>(cudaImage, dim, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor, error.get()); + bsplineXdir <<>>(cudaImage, dim, p, error.get()); } if (isErrorDetected) { diff --git a/src/algorithm/bsplineYdir.cuh 
b/src/algorithm/bsplineYdir.cuh index 1a0986d1..e8aa5bdf 100644 --- a/src/algorithm/bsplineYdir.cuh +++ b/src/algorithm/bsplineYdir.cuh @@ -6,6 +6,7 @@ #include #include #include "cudaMisc.cuh" +#include "bsplineParams.h" /** @@ -59,9 +60,7 @@ template -__global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, - const float *bc1_vec, const float *bc2_vec, const float *bc3_vec, const float *bc4_vec, - size_t k0, float norm_factor, float *boundary, bool *error) { +__global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, BsplineParamsCuda p, float *boundary, bool *error) { const int xzIndexOfWorker = (blockIdx.x * blockDim.x) + threadIdx.x; const int xzIndexOfBlock = (blockIdx.x * blockDim.x); @@ -72,23 +71,23 @@ __global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, const int64_t maxXZoffset = dim.x * dim.z; const size_t dirLen = dim.y; - const size_t minLen = min(dirLen, k0); + const size_t minLen = min(dirLen, p.k0); extern __shared__ float sharedMem[]; float *bc1_vec2 = &sharedMem[0]; - float *bc2_vec2 = &bc1_vec2[k0]; - float *cache = (float*)&bc2_vec2[k0]; + float *bc2_vec2 = &bc1_vec2[p.k0]; + float *cache = (float*)&bc2_vec2[p.k0]; // Read from global mem to cache - for (int i = currentWorkerId; i < k0 * numOfWorkers; i += numOfWorkers) { - if (i < k0) { - bc1_vec2[i] = bc1_vec[i]; - bc2_vec2[i] = bc2_vec[i]; + for (int i = currentWorkerId; i < p.k0 * numOfWorkers; i += numOfWorkers) { + if (i < p.k0) { + bc1_vec2[i] = p.bc1[i]; + bc2_vec2[i] = p.bc2[i]; } - int offs = i % k0; - int work = i / k0; + int offs = i % p.k0; + int work = i / p.k0; if (work + xzIndexOfBlock < maxXZoffset) { - cache[work * k0 + offs] = image[workersOffset + dim.y * work + offs]; + cache[work * p.k0 + offs] = image[workersOffset + dim.y * work + offs]; } } __syncthreads(); @@ -98,8 +97,8 @@ __global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, float temp1 = 0; float temp2 = 0; for (size_t k = 0; k < minLen; ++k) { - temp1 += bc1_vec2[k] * 
(T)cache[currentWorkerId * k0 + k]; - temp2 += bc2_vec2[k] * (T)cache[currentWorkerId * k0 + k]; + temp1 += bc1_vec2[k] * (T)cache[currentWorkerId * p.k0 + k]; + temp2 += bc2_vec2[k] * (T)cache[currentWorkerId * p.k0 + k]; } boundary[xzIndexOfWorker*4 + 0] = temp1; boundary[xzIndexOfWorker*4 + 1] = temp2; @@ -108,15 +107,15 @@ __global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, // ----------------- second end __syncthreads(); - for (int i = currentWorkerId; i < k0 * numOfWorkers; i += numOfWorkers) { - if (i < k0) { - bc1_vec2[i] = bc3_vec[i]; - bc2_vec2[i] = bc4_vec[i]; + for (int i = currentWorkerId; i < p.k0 * numOfWorkers; i += numOfWorkers) { + if (i < p.k0) { + bc1_vec2[i] = p.bc3[i]; + bc2_vec2[i] = p.bc4[i]; } - int offs = i % k0; - int work = i / k0; + int offs = i % p.k0; + int work = i / p.k0; if (work + xzIndexOfBlock < maxXZoffset) { - cache[work * k0 + offs] = image[workersOffset + dim.y * work + dim.y - 1 - offs]; + cache[work * p.k0 + offs] = image[workersOffset + dim.y * work + dim.y - 1 - offs]; } } __syncthreads(); @@ -128,11 +127,11 @@ __global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, float temp3 = 0; float temp4 = 0; for (size_t k = 0; k < minLen; ++k) { - temp3 += bc1_vec2[k] * (T)cache[currentWorkerId * k0 + k]; - temp4 += bc2_vec2[k] * (T)cache[currentWorkerId * k0 + k]; + temp3 += bc1_vec2[k] * (T)cache[currentWorkerId * p.k0 + k]; + temp4 += bc2_vec2[k] * (T)cache[currentWorkerId * p.k0 + k]; } - boundary[xzIndexOfWorker*4 + 2] = round(temp3 * norm_factor, errorCnt); - boundary[xzIndexOfWorker*4 + 3] = round(temp4 * norm_factor, errorCnt); + boundary[xzIndexOfWorker*4 + 2] = round(temp3 * p.norm_factor, errorCnt); + boundary[xzIndexOfWorker*4 + 3] = round(temp4 * p.norm_factor, errorCnt); } if (errorCnt > 0) *error = true; @@ -142,8 +141,7 @@ constexpr int blockWidth = 32; constexpr int numOfThreads = 32; extern __shared__ char sharedMemProcess[]; template -__global__ void bsplineYdirProcess(T *image, const 
PixelDataDim dim, size_t k0, - const float b1, const float b2, const float norm_factor, float *boundary, bool *error) { +__global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, BsplineParamsCuda p, float *boundary, bool *error) { const int numOfWorkers = blockDim.x; const int currentWorkerId = threadIdx.x; const int xzOffset = blockIdx.x * blockDim.x; @@ -177,7 +175,7 @@ __global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, size_t k0, cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = temp2; } for (size_t k = yBlockBegin == 0 ? 2 : 0; k < blockWidth && k + yBlockBegin < dim.y - 2; ++k) { - float temp = temp2*b1 + temp1*b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; + float temp = temp2*p.b1 + temp1*p.b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp; temp1 = temp2; temp2 = temp; @@ -212,14 +210,14 @@ __global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, size_t k0, // Do operations if (xzOffset + currentWorkerId < maxXZoffset) { if (yBlockBegin == dim.y - 1) { - temp1 = boundary[(xzOffset + currentWorkerId) * 4 + 3] / norm_factor; - temp2 = boundary[(xzOffset + currentWorkerId) * 4 + 2] / norm_factor; - cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = norm_factor * temp1; - cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = norm_factor * temp2; + temp1 = boundary[(xzOffset + currentWorkerId) * 4 + 3] / p.norm_factor; + temp2 = boundary[(xzOffset + currentWorkerId) * 4 + 2] / p.norm_factor; + cache[currentWorkerId][(0 + currentWorkerId)%blockWidth] = p.norm_factor * temp1; + cache[currentWorkerId][(1 + currentWorkerId)%blockWidth] = p.norm_factor * temp2; } for (int64_t k = yBlockBegin == dim.y - 1 ? 
2 : 0; k < blockWidth && yBlockBegin - k >= 0; ++k) { - float temp = temp2*b1 + temp1*b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; - cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp * norm_factor; + float temp = temp2*p.b1 + temp1*p.b2 + (T)cache[currentWorkerId][(k + currentWorkerId)%blockWidth]; + cache[currentWorkerId][(k + currentWorkerId)%blockWidth] = temp * p.norm_factor; temp1 = temp2; temp2 = temp; } @@ -244,18 +242,17 @@ __global__ void bsplineYdirProcess(T *image, const PixelDataDim dim, size_t k0, * Function for launching a kernel */ template -void runBsplineYdir(T *cudaImage, PixelDataDim dim, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, - size_t k0, float b1, float b2, float norm_factor, float *boundary, cudaStream_t aStream) { +void runBsplineYdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, float *boundary, cudaStream_t aStream) { + dim3 threadsPerBlock(numOfThreads); dim3 numBlocks((dim.x * dim.z + threadsPerBlock.x - 1) / threadsPerBlock.x); - size_t sharedMemSize = (2 /*bc vectors*/) * (k0) * sizeof(float) + numOfThreads * (k0) * sizeof(float); + size_t sharedMemSize = (2 /*bc vectors*/) * (p.k0) * sizeof(float) + numOfThreads * (p.k0) * sizeof(float); bool isErrorDetected = false; { ScopedCudaMemHandler error(&isErrorDetected, 1); - bsplineYdirBoundary <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, bc1, bc2, bc3,bc4, k0, norm_factor, boundary, error.get()); + bsplineYdirBoundary <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get()); sharedMemSize = numOfThreads * blockWidth * sizeof(float); - bsplineYdirProcess <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, k0, b1, b2, norm_factor, boundary, error.get()); + bsplineYdirProcess <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get()); } if (isErrorDetected) { diff --git 
a/src/algorithm/bsplineZdir.cuh b/src/algorithm/bsplineZdir.cuh index cd59f0fb..c8ba6688 100644 --- a/src/algorithm/bsplineZdir.cuh +++ b/src/algorithm/bsplineZdir.cuh @@ -2,10 +2,11 @@ #define BSPLINE_Z_DIR_H -#include "cudaMisc.cuh" #include #include #include +#include "cudaMisc.cuh" +#include "bsplineParams.h" /** @@ -62,15 +63,13 @@ * @param norm_factor - filter norm factor */ template -__global__ void bsplineZdir(T *image, PixelDataDim dim, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, size_t k0, - float b1, float b2, float norm_factor, bool *error) { +__global__ void bsplineZdir(T *image, PixelDataDim dim, BsplineParamsCuda p, bool *error) { const int yDirOffset = blockIdx.y * blockDim.y + threadIdx.y; const size_t xDirOffset = (blockIdx.z * blockDim.z + threadIdx.z) * dim.y; // x is in 'z' to have good memory coalescing const size_t nextElementZdirOffset = dim.x * dim.y; const size_t dirLen = dim.z; - const size_t minLen = min(dirLen, k0); + const size_t minLen = min(dirLen, p.k0); if (yDirOffset < dim.y) { float temp1 = 0; @@ -81,11 +80,11 @@ __global__ void bsplineZdir(T *image, PixelDataDim dim, // calculate boundary values for (int k = 0; k < minLen; ++k) { T val = image[xDirOffset + k * nextElementZdirOffset + yDirOffset]; - temp1 += bc1[k] * val; - temp2 += bc2[k] * val; + temp1 += p.bc1[k] * val; + temp2 += p.bc2[k] * val; val = image[xDirOffset + (dirLen - 1 - k) * nextElementZdirOffset + yDirOffset]; - temp3 += bc3[k] * val; - temp4 += bc4[k] * val; + temp3 += p.bc3[k] * val; + temp4 += p.bc4[k] * val; } size_t errorCnt = 0; @@ -93,15 +92,15 @@ __global__ void bsplineZdir(T *image, PixelDataDim dim, // set boundary values in two first and two last points processed direction image[xDirOffset + 0 * nextElementZdirOffset + yDirOffset] = round(temp1, errorCnt); image[xDirOffset + 1 * nextElementZdirOffset + yDirOffset] = round(temp2, errorCnt); - image[xDirOffset + (dirLen - 2) * nextElementZdirOffset + yDirOffset] = 
round(temp3 * norm_factor, errorCnt); - image[xDirOffset + (dirLen - 1) * nextElementZdirOffset + yDirOffset] = round(temp4 * norm_factor, errorCnt); + image[xDirOffset + (dirLen - 2) * nextElementZdirOffset + yDirOffset] = round(temp3 * p.norm_factor, errorCnt); + image[xDirOffset + (dirLen - 1) * nextElementZdirOffset + yDirOffset] = round(temp4 * p.norm_factor, errorCnt); // Causal Filter loop int64_t offset = xDirOffset + 2 * nextElementZdirOffset + yDirOffset; int64_t offsetLimit = xDirOffset + (dirLen - 2) * nextElementZdirOffset; while (offset < offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = round(image[offset] + b1 * temp2 + b2 * temp1, errorCnt); + const float temp = round(image[offset] + p.b1 * temp2 + p.b2 * temp1, errorCnt); image[offset] = temp; temp1 = temp2; temp2 = temp; @@ -114,8 +113,8 @@ __global__ void bsplineZdir(T *image, PixelDataDim dim, offsetLimit = xDirOffset; while (offset >= offsetLimit) { __syncthreads(); // only needed for speed imporovement (memory coalescing) - const float temp = image[offset] + b1 * temp3 + b2 * temp4; - image[offset] = round(temp * norm_factor, errorCnt); + const float temp = image[offset] + p.b1 * temp3 + p.b2 * temp4; + image[offset] = round(temp * p.norm_factor, errorCnt); temp4 = temp3; temp3 = temp; @@ -130,9 +129,7 @@ __global__ void bsplineZdir(T *image, PixelDataDim dim, * Function for launching a kernel */ template -void runBsplineZdir(T *cudaImage, PixelDataDim dim, - const float *bc1, const float *bc2, const float *bc3, const float *bc4, - size_t k0, float b1, float b2, float norm_factor, cudaStream_t aStream) { +void runBsplineZdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaStream_t aStream) { constexpr int numOfWorkersYdir = 128; dim3 threadsPerBlockZ(1, numOfWorkersYdir, 1); dim3 numBlocksZ(1, @@ -143,7 +140,7 @@ void runBsplineZdir(T *cudaImage, PixelDataDim dim, bool isErrorDetected = false; { ScopedCudaMemHandler 
error(&isErrorDetected, 1); - bsplineZdir <<>> (cudaImage, dim, bc1, bc2, bc3, bc4, k0, b1, b2, norm_factor, error.get()); + bsplineZdir <<>> (cudaImage, dim, p, error.get()); } if (isErrorDetected) { diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index 931b95a3..13264ec4 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -950,15 +950,16 @@ void downsample(const PixelData &aInput, PixelData &aOutput, R reduce, C c const size_t shy = std::min(2*y + 1, y_num - 1); const size_t idx = z * x_num_ds * y_num_ds + x * y_num_ds + y; outMesh[idx] = constant_operator( - reduce(reduce(reduce(reduce(reduce(reduce(reduce( // inMesh coordinates + reduce(reduce(reduce(reduce( // inMesh coordinates inMesh[2*z * x_num * y_num + 2*x * y_num + 2*y], // z, x, y - inMesh[2*z * x_num * y_num + 2*x * y_num + shy]), // z, x, y+1 inMesh[2*z * x_num * y_num + shx * y_num + 2*y]), // z, x+1, y - inMesh[2*z * x_num * y_num + shx * y_num + shy]), // z, x+1, y+1 inMesh[shz * x_num * y_num + 2*x * y_num + 2*y]), // z+1, x, y - inMesh[shz * x_num * y_num + 2*x * y_num + shy]), // z+1, x, y+1 inMesh[shz * x_num * y_num + shx * y_num + 2*y]), // z+1, x+1, y - inMesh[shz * x_num * y_num + shx * y_num + shy]) // z+1, x+1, y+1 + reduce(reduce(reduce( + inMesh[2*z * x_num * y_num + 2*x * y_num + shy], // z, x, y+1 + inMesh[2*z * x_num * y_num + shx * y_num + shy]), // z, x+1, y+1 + inMesh[shz * x_num * y_num + 2*x * y_num + shy]), // z+1, x, y+1 + inMesh[shz * x_num * y_num + shx * y_num + shy])) // z+1, x+1, y+1 ); } } diff --git a/src/data_structures/Mesh/downsample.cuh b/src/data_structures/Mesh/downsample.cuh index 947db945..a6548a52 100644 --- a/src/data_structures/Mesh/downsample.cuh +++ b/src/data_structures/Mesh/downsample.cuh @@ -24,14 +24,14 @@ __global__ void downsampleMean(const T *input, S *output, size_t x_num, size_t y size_t idx = (zi * x_num + xi) * y_num + yi; // Go through all elements in 
2x2 - T v = input[idx]; + S v = input[idx]; v += input[idx + xs * y_num]; v += input[idx + zs * x_num * y_num]; v += input[idx + xs * y_num + zs * x_num * y_num]; // Get data from odd thread to even one const int workerIdx = threadIdx.y; - T a = __shfl_sync(__activemask(), v, workerIdx + 1); + S a = __shfl_sync(__activemask(), v, workerIdx + 1); // downsampled dimensions twice smaller (rounded up) diff --git a/src/misc/CudaTools.cuh b/src/misc/CudaTools.cuh index 3f9b5fca..558f730a 100644 --- a/src/misc/CudaTools.cuh +++ b/src/misc/CudaTools.cuh @@ -8,16 +8,25 @@ #include #include -//#include #include -//#include - - #include #include + #include "data_structures/Mesh/PixelData.hpp" +#define checkCuda(ans) { cudaAssert((ans), __FILE__, __LINE__); } +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ +#if defined(DEBUG) || defined(_DEBUG) + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: (%d) %s %s %d\n", code, cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +#endif +} + inline void waitForCuda() { cudaDeviceSynchronize(); cudaError_t err = cudaGetLastError(); @@ -211,6 +220,17 @@ public: initialize(); } + ScopedCudaMemHandler (ScopedCudaMemHandler &&obj) { + iData = obj.iData; + obj.iData = nullptr; + iSize = obj.iSize; + obj.iSize = 0; + iBytes = obj.iBytes; + obj.iBytes = 0; + iStream = obj.iStream; + obj.iStream = nullptr; + iCudaMemory = std::move(obj.iCudaMemory); + } ~ScopedCudaMemHandler() { if (DIRECTION & D2H) { @@ -223,15 +243,21 @@ public: size_t getNumOfBytes() const {return iBytes; } void copyH2D() { - cudaMemcpyAsync(iCudaMemory.get(), iData, iBytes, cudaMemcpyHostToDevice, iStream); + if (iData != nullptr) { + checkCuda(cudaMemcpyAsync(iCudaMemory.get(), iData, iBytes, cudaMemcpyHostToDevice, iStream)); + } } void copyH2D(const size_t numElements) { - cudaMemcpyAsync(iCudaMemory.get(), iData, numElements*DataSize, cudaMemcpyHostToDevice, iStream); + if (iData != nullptr) { + 
checkCuda(cudaMemcpyAsync(iCudaMemory.get(), iData, numElements*DataSize, cudaMemcpyHostToDevice, iStream)); + } } void copyD2H() { - cudaMemcpyAsync((void*)iData, iCudaMemory.get(), iBytes, cudaMemcpyDeviceToHost, iStream); + if (iData != nullptr) { + checkCuda(cudaMemcpyAsync((void *) iData, iCudaMemory.get(), iBytes, cudaMemcpyDeviceToHost, iStream)); + } } private: @@ -240,7 +266,7 @@ private: void initialize() { ElementType *mem = nullptr; - cudaMalloc(&mem, iBytes); + checkCuda(cudaMalloc(&mem, iBytes)); iCudaMemory.reset(mem); if (DIRECTION & H2D) { copyH2D(); diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index d7fc6e62..8bb06106 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ -279,7 +279,7 @@ namespace { // Downsampled gradient // ======================================================================== - TEST(ComputeGradientTest, GPU_VS_CPU_ON_RANDOM_VALUES) { + TEST(ComputeGradientTest, GPU_VS_CPU_DOWNSAMPLE_GRADIENT_ON_RANDOM_VALUES) { APRTimer timer(false); // Generate random mesh @@ -305,6 +305,60 @@ namespace { } + // ======================================================================== + // Full pipeline/gradient tests + // ======================================================================== + + TEST(ComputeThreshold, FULL_GRADIENT_TEST) { + APRTimer timer(false); + + // Generate random mesh + using ImageType = uint16_t; + PixelData input_image = getRandInitializedMesh(11, 13, 15, 15, 20); + PixelData &image_temp = input_image; + + PixelData grad_temp; // should be a down-sampled image + grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); + PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + PixelData local_scale_temp2; + 
local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + + PixelData grad_temp_GPU; // should be a down-sampled image + grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); + PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, true); + PixelData local_scale_temp2_GPU; + local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate bspline on CPU + PixelData mCpuImage(image_temp, true); + + ComputeGradient computeGradient; + + timer.start_timer(">>>>>>>>>>>>>>>>> CPU gradient"); + computeGradient.get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpuImage(image_temp, true); + timer.start_timer(">>>>>>>>>>>>>>>>> GPU gradient"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + timer.stop_timer(); + + // Compare GPU vs CPU + EXPECT_EQ(compareMeshes(mCpuImage, mGpuImage, 0.0000001), 0); + EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0.0000001), 0); + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0.0000001), 0); + } + #endif // APR_USE_CUDA } diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index d94f74c0..527815f0 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -701,58 +701,9 @@ namespace { EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); } - // TODO: These two test will be fixed as soon as CUDA pipeline is updated. - // Currently turning them off to have testable rest of CUDA impl. 
-// TEST(ComputeThreshold, FULL_GRADIENT_TEST) { -// APRTimer timer(true); -// -// // Generate random mesh -// using ImageType = float; -// PixelData input_image = getRandInitializedMesh(310, 330, 13, 25); -// PixelData &image_temp = input_image; -// -// PixelData grad_temp; // should be a down-sampled image -// grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); -// PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors -// local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// PixelData local_scale_temp2; -// local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// -// PixelData grad_temp_GPU; // should be a down-sampled image -// grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); -// PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors -// local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, true); -// PixelData local_scale_temp2_GPU; -// local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// -// APRParameters par; -// par.lambda = 3; -// par.Ip_th = 10; -// par.dx = 1; -// par.dy = 1; -// par.dz = 1; -// -// // Calculate bspline on CPU -// PixelData mCpuImage(image_temp, true); -// -// ComputeGradient computeGradient; -// -// timer.start_timer(">>>>>>>>>>>>>>>>> CPU gradient"); -// computeGradient.get_gradient(mCpuImage, grad_temp, local_scale_temp, par); -// timer.stop_timer(); -// -// // Calculate bspline on GPU -// PixelData mGpuImage(image_temp, true); -// timer.start_timer(">>>>>>>>>>>>>>>>> GPU gradient"); -// getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 
0, par); -// timer.stop_timer(); -// -// // Compare GPU vs CPU -// EXPECT_EQ(compareMeshes(mCpuImage, mGpuImage), 0); -// EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0.1), 0); -// EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU), 0); -// } -// + + // TODO: This test will be fixed as soon as CUDA pipeline is updated. + // Currently turning it off to have testable rest of CUDA impl. // TEST(ComputeThreshold, FULL_PIPELINE_TEST) { // APRTimer timer(true); // From d958161cf3f31289f1f83fab86195bd0aa2ae2ec Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 10 Aug 2022 10:53:44 +0200 Subject: [PATCH 07/59] GPU and CPU give same resutls in Release mode - turned off unsafe optimizations --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c4912458..4513e07f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,14 +170,14 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 ") if(CMAKE_COMPILER_IS_GNUCC) - set(CMAKE_CXX_FLAGS_RELEASE "-O4 -ffast-math") + set(CMAKE_CXX_FLAGS_RELEASE "-O4 -ffast-math -fno-unsafe-math-optimizations") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -pedantic") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Bdynamic") if(NOT WIN32) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ldl -lz") endif() elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -ffast-math") + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -ffast-math -fno-unsafe-math-optimizations") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -pedantic") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lz") endif() @@ -207,8 +207,8 @@ if(APR_USE_CUDA) message(STATUS "APR: Building CUDA for APR") set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_RUNTIME_LIBRARY "Static") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --default-stream per-thread -Xptxas -v -DAPR_USE_CUDA") - set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math") # -lineinfo for profiling + 
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=false --default-stream per-thread -Xptxas -v -DAPR_USE_CUDA") + set(CMAKE_CUDA_FLAGS_RELEASE "-O3") # -lineinfo for profiling set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g -G") if(APR_BENCHMARK) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DAPR_BENCHMARK") From 4ace2385c4d3be68c6cab93da3d597cf3d3e618b Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 10 Aug 2022 16:10:57 +0200 Subject: [PATCH 08/59] Quick fix of processOnGpu() - now it gets correct bspline data for each direction --- src/algorithm/ComputeGradientCuda.cu | 75 ++++++++++++++++------------ 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 99f28558..0a6e5507 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -129,6 +129,33 @@ namespace { norm_factor }; } + + auto transferSpline(BsplineParams &aParams) { + ScopedCudaMemHandler bc1(aParams.bc1.get(), aParams.k0); + ScopedCudaMemHandler bc2(aParams.bc2.get(), aParams.k0); + ScopedCudaMemHandler bc3(aParams.bc3.get(), aParams.k0); + ScopedCudaMemHandler bc4(aParams.bc4.get(), aParams.k0); + + return std::pair { + BsplineParamsCuda { + bc1.get(), + bc2.get(), + bc3.get(), + bc4.get(), + aParams.k0, + aParams.b1, + aParams.b2, + aParams.norm_factor + }, + + BsplineParamsCudaMemoryHandlers { + std::move(bc1), + std::move(bc2), + std::move(bc3), + std::move(bc4) + } + }; + } } /** @@ -297,11 +324,22 @@ public: CurrentTime ct; uint64_t start = ct.microseconds(); - // TODO: Need to be fixed !!!!!!!!!!1 - -// getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), -// params, bc1.get(), bc2.get(), bc3.get(), bc4.get(), boundary.get(), -// iBsplineOffset, iParameters, iStream); + // TODO: temporarily bspline params are generated here + // In principle this is OK and correct but would be faster (for processing series of same size images) if + //
they would be calculated in constructor of GpuProcessingTaskImpl class (once). + BsplineParams px = prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance); + auto cudax = transferSpline(px); + auto splineCudaX = cudax.first; + BsplineParams py = prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance); + auto cuday = transferSpline(py); + auto splineCudaY = cuday.first; + BsplineParams pz = prepareBsplineStuff(iCpuImage.z_num, iParameters.lambda, tolerance); + auto cudaz = transferSpline(pz); + auto splineCudaZ = cudaz.first; + + getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), + splineCudaX, splineCudaY, splineCudaZ, boundary.get(), + iBsplineOffset, iParameters, iStream); std::cout << "1: " << ct.microseconds() - start << std::endl; runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream); std::cout << "2: " << ct.microseconds() - start << std::endl; @@ -357,32 +395,7 @@ template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfR template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); template void cudaFilterBsplineFull(PixelData &, float, float, TypeOfRecBsplineFlags, int); -auto transferSpline(BsplineParams &aParams) { - ScopedCudaMemHandler bc1(aParams.bc1.get(), aParams.k0); - ScopedCudaMemHandler bc2(aParams.bc2.get(), aParams.k0); - ScopedCudaMemHandler bc3(aParams.bc3.get(), aParams.k0); - ScopedCudaMemHandler bc4(aParams.bc4.get(), aParams.k0); - - return std::pair { - BsplineParamsCuda { - bc1.get(), - bc2.get(), - bc3.get(), - bc4.get(), - aParams.k0, - aParams.b1, - aParams.b2, - aParams.norm_factor - }, - - BsplineParamsCudaMemoryHandlers { - std::move(bc1), - std::move(bc2), - std::move(bc3), - std::move(bc4) - } - }; -} + template void cudaFilterBsplineFull(PixelData &input, float lambda, float tolerance, TypeOfRecBsplineFlags flags, int maxFilterLen) { From 
b050e07d706f73f84b0452307693f42ebb1a39cb Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 14 Nov 2022 13:34:50 +0100 Subject: [PATCH 09/59] Added new test file for LIS CUDA, GPU now handles boundary (without padding), still float number differences between CPU and GPU --- src/algorithm/LocalIntensityScale.cu | 97 +++++-- src/algorithm/LocalIntensityScaleCuda.h | 3 +- test/CMakeLists.txt | 1 + test/LocalIntensityScaleCudaTest.cpp | 370 ++++++++++++++++++++++++ test/LocalIntensityScaleTest.cpp | 214 -------------- 5 files changed, 445 insertions(+), 240 deletions(-) create mode 100644 test/LocalIntensityScaleCudaTest.cpp diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 5539baef..a0a05c0e 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -37,7 +37,7 @@ * @param z_num */ template -__global__ void meanYdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num) { +__global__ void meanYdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num, bool boundaryReflect) { // NOTE: Block size in x/z direction must be 1 const size_t workersOffset = (blockIdx.z * x_num + blockIdx.x) * y_num; const int numOfWorkers = blockDim.y; @@ -53,20 +53,39 @@ __global__ void meanYdir(T *image, int offset, size_t x_num, size_t y_num, size_ while(workerOffset < y_num) { if (!waitForNextLoop) v = image[workersOffset + workerOffset]; bool waitForNextValues = (workerIdx + offsetInTheLoop) % numOfWorkers >= (numOfWorkers - offset); + + // Check if current value is one of the mirrored elements (boundary condition) + int numberOfMirrorLeft = offset - workerOffset; + int numberOfMirrorRight = workerOffset + offset - (y_num - 1); + if (boundaryReflect) { + if (numberOfMirrorLeft > 0 && workerOffset >= 1 && workerOffset <= numberOfMirrorLeft) {sum += v; ++countNumOfSumElements;} + if (numberOfMirrorRight > 0 && workerOffset < (y_num - 1) && workerOffset >= (y_num - 1 - 
numberOfMirrorRight)) {sum += v; ++countNumOfSumElements;} + } for (int off = 1; off <= offset; ++off) { T prevElement = __shfl_sync(active, v, workerIdx + blockDim.y - off, blockDim.y); T nextElement = __shfl_sync(active, v, workerIdx + off, blockDim.y); // LHS boundary check + don't add previous values if they were added in a previous loop execution if (workerOffset >= off && !waitForNextLoop) {sum += prevElement; ++countNumOfSumElements;} + // RHS boundary check + don't read next values since they are not read yet - if (!waitForNextValues && workerOffset + off < y_num) {sum += nextElement; ++countNumOfSumElements;} + if (!waitForNextValues && (workerOffset + off) < y_num) {sum += nextElement; ++countNumOfSumElements;} + + // boundary condition (mirroring) + if (boundaryReflect) { + int element = workerOffset + off; + if (numberOfMirrorLeft > 0 && element >= 1 && element <= numberOfMirrorLeft) {sum += nextElement; ++countNumOfSumElements;} + if (numberOfMirrorRight > 0 && element < (y_num - 1) && element >= (y_num - 1 - numberOfMirrorRight)) {sum += nextElement; ++countNumOfSumElements;} + element = workerOffset - off; + if (numberOfMirrorLeft > 0 && element >= 1 && element <= numberOfMirrorLeft) {sum += prevElement; ++countNumOfSumElements;} + if (numberOfMirrorRight > 0 && element < (y_num - 1) && element >= (y_num - 1 - numberOfMirrorRight)) {sum += prevElement; ++countNumOfSumElements;} + } } waitForNextLoop = waitForNextValues; if (!waitForNextLoop) { sum += v; image[workersOffset + workerOffset] = sum / countNumOfSumElements; - // workere is done with current element - move to next one + // worker is done with current element - move to next one sum = 0; countNumOfSumElements = 1; workerOffset += numOfWorkers; @@ -93,7 +112,7 @@ constexpr int NumberOfWorkers = 32; // Cannot be greater than 32 since there is * read/write operations for given element. 
*/ template -__global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num) { +__global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num, bool boundaryReflect = false) { const size_t workerOffset = blockIdx.y * blockDim.y + threadIdx.y + (blockIdx.z * blockDim.z + threadIdx.z) * y_num * x_num; const int workerYoffset = blockIdx.y * blockDim.y + threadIdx.y ; const int workerIdx = threadIdx.y; @@ -113,13 +132,19 @@ __global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_ // saturate cache with #offset elements since it will allow to calculate first element value on LHS float sum = 0; int count = 0; - while (count < offset) { + while (count <= offset) { T v = image[workerOffset + currElementOffset]; sum += v; data[count][workerIdx] = v; + if (boundaryReflect && count > 0) {data[2 * offset - count + 1][workerIdx] = v; sum += v;} currElementOffset += nextElementOffset; ++count; } + currElementOffset -= nextElementOffset; + --count; + if (boundaryReflect) { + count = divisor; + } // Pointer in circular buffer int beginPtr = offset; @@ -147,9 +172,17 @@ __global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_ } // Handle last #offset elements on RHS + int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; + while (saveElementOffset < currElementOffset) { - count = count - 1; + if (!boundaryReflect) count = count - 1; sum -= data[beginPtr][workerIdx]; + + if (boundaryReflect) { + sum += data[boundaryPtr][workerIdx]; + boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor; + } + image[workerOffset + saveElementOffset] = sum / count; beginPtr = (beginPtr + 1) % divisor; saveElementOffset += nextElementOffset; @@ -173,7 +206,7 @@ __global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_ * read/write operations for given element. 
*/ template -__global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num) { +__global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_t z_num, bool boundaryReflect = false) { const size_t workerOffset = blockIdx.y * blockDim.y + threadIdx.y + (blockIdx.z * blockDim.z + threadIdx.z) * y_num; // *.z is 'x' const int workerYoffset = blockIdx.y * blockDim.y + threadIdx.y ; const int workerIdx = threadIdx.y; @@ -193,13 +226,19 @@ __global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_ // saturate cache with #offset elements since it will allow to calculate first element value on LHS float sum = 0; int count = 0; - while (count < offset) { + while (count <= offset) { T v = image[workerOffset + currElementOffset]; sum += v; data[count][workerIdx] = v; + if (boundaryReflect && count > 0) {data[2 * offset - count + 1][workerIdx] = v; sum += v;} currElementOffset += nextElementOffset; ++count; } + currElementOffset -= nextElementOffset; + --count; + if (boundaryReflect) { + count = divisor; + } // Pointer in circular buffer int beginPtr = offset; @@ -227,9 +266,17 @@ __global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_ } // Handle last #offset elements on RHS + int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; + while (saveElementOffset < currElementOffset) { - count = count - 1; + if (!boundaryReflect) count = count - 1; sum -= data[beginPtr][workerIdx]; + + if (boundaryReflect) { + sum += data[boundaryPtr][workerIdx]; + boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor; + } + image[workerOffset + saveElementOffset] = sum / count; beginPtr = (beginPtr + 1) % divisor; saveElementOffset += nextElementOffset; @@ -238,48 +285,48 @@ __global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_ } template -void runMeanYdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) { +void runMeanYdir(T* 
cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream, bool boundaryReflect) { dim3 threadsPerBlock(1, NumberOfWorkers, 1); dim3 numBlocks((x_num + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (z_num + threadsPerBlock.z - 1)/threadsPerBlock.z); - meanYdir<<>>(cudaImage, offset, x_num, y_num, z_num); + meanYdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); } template -void runMeanXdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) { +void runMeanXdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream, bool boundaryReflect) { dim3 threadsPerBlock(1, NumberOfWorkers, 1); dim3 numBlocks(1, (y_num + threadsPerBlock.y - 1) / threadsPerBlock.y, (z_num + threadsPerBlock.z - 1) / threadsPerBlock.z); // Shared memory size - it is able to keep filter len elements for each worker. const int sharedMemorySize = (offset * 2 + 1) * sizeof(float) * NumberOfWorkers; - meanXdir<<>>(cudaImage, offset, x_num, y_num, z_num); + meanXdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); } template -void runMeanZdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream) { +void runMeanZdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_num, cudaStream_t aStream, bool boundaryReflect) { dim3 threadsPerBlock(1, NumberOfWorkers, 1); dim3 numBlocks(1, (y_num + threadsPerBlock.y - 1) / threadsPerBlock.y, (x_num + threadsPerBlock.x - 1) / threadsPerBlock.x); // intentionally here for better memory readings // Shared memory size - it is able to keep filter len elements for each worker. 
const int sharedMemorySize = (offset * 2 + 1) * sizeof(float) * NumberOfWorkers; - meanZdir<<>>(cudaImage, offset, x_num, y_num, z_num); + meanZdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); } template -void runMean(T *cudaImage, const PixelData &image, int offsetX, int offsetY, int offsetZ, TypeOfMeanFlags flags, cudaStream_t aStream) { +void runMean(T *cudaImage, const PixelData &image, int offsetX, int offsetY, int offsetZ, TypeOfMeanFlags flags, cudaStream_t aStream, bool boundaryReflect = false) { if (flags & MEAN_Y_DIR) { - runMeanYdir(cudaImage, offsetY, image.x_num, image.y_num, image.z_num, aStream); + runMeanYdir(cudaImage, offsetY, image.x_num, image.y_num, image.z_num, aStream, boundaryReflect); } if (flags & MEAN_X_DIR) { - runMeanXdir(cudaImage, offsetX, image.x_num, image.y_num, image.z_num, aStream); + runMeanXdir(cudaImage, offsetX, image.x_num, image.y_num, image.z_num, aStream, boundaryReflect); } if (flags & MEAN_Z_DIR) { - runMeanZdir(cudaImage, offsetZ, image.x_num, image.y_num, image.z_num, aStream); + runMeanZdir(cudaImage, offsetZ, image.x_num, image.y_num, image.z_num, aStream, boundaryReflect); } } @@ -347,9 +394,9 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete // --------- CUDA ---------------- runCopy1D(cudaImage, cudaTemp, image.mesh.size(), aStream); - runMean(cudaImage, image, win_x, win_y, win_z, MEAN_ALL_DIR, aStream); + runMean(cudaImage, image, win_x, win_y, win_z, MEAN_ALL_DIR, aStream, par.reflect_bc_lis); runAbsDiff1D(cudaImage, cudaTemp, image.mesh.size(), aStream); - runMean(cudaImage, image, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream); + runMean(cudaImage, image, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream, par.reflect_bc_lis); runRescaleAndThreshold(cudaImage, image.mesh.size(), var_rescale, par.sigma_th, par.sigma_th_max, aStream); } @@ -360,17 +407,17 @@ template void runLocalIntensityScalePipeline(const PixelData // 
=================================================== TEST helpers // TODO: should be moved somewhere template -void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags) { +void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags, bool boundaryReflect) { ScopedCudaMemHandler, H2D | D2H> cudaImage(image); APRTimer timer(true); timer.start_timer("GpuDeviceTimeFull"); - runMean(cudaImage.get(), image, offset, offset, offset, flags, 0); + runMean(cudaImage.get(), image, offset, offset, offset, flags, 0, boundaryReflect); timer.stop_timer(); } // explicit instantiation of handled types -template void calcMean(PixelData&, int, TypeOfMeanFlags); -template void calcMean(PixelData&, int, TypeOfMeanFlags); +template void calcMean(PixelData&, int, TypeOfMeanFlags, bool); +template void calcMean(PixelData&, int, TypeOfMeanFlags, bool); template diff --git a/src/algorithm/LocalIntensityScaleCuda.h b/src/algorithm/LocalIntensityScaleCuda.h index a635a156..135e5927 100644 --- a/src/algorithm/LocalIntensityScaleCuda.h +++ b/src/algorithm/LocalIntensityScaleCuda.h @@ -15,8 +15,9 @@ constexpr TypeOfMeanFlags MEAN_X_DIR = 0x02; constexpr TypeOfMeanFlags MEAN_Z_DIR = 0x04; constexpr TypeOfMeanFlags MEAN_ALL_DIR = MEAN_Y_DIR | MEAN_X_DIR | MEAN_Z_DIR; +// TODO: remember to revert by default boundaryReflect=true (or check with CPU code what is current 'default'). 
template -void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags = MEAN_ALL_DIR); +void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags = MEAN_ALL_DIR, bool boundaryReflect = false); template void getLocalIntensityScale(PixelData &image, PixelData &temp, const APRParameters &par); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ba468743..2918f2c5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -15,6 +15,7 @@ buildTarget(testPullingScheme PullingSchemeTest.cpp) if(APR_USE_CUDA) buildTarget(testAPRCuda APRTestCuda.cpp) buildTarget(testComputeGradientCuda ComputeGradientCudaTest.cpp) + buildTarget(testLocalIntensityScaleCuda LocalIntensityScaleCudaTest.cpp) endif() if(APR_BUILD_EXAMPLES) diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp new file mode 100644 index 00000000..6e2b722b --- /dev/null +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -0,0 +1,370 @@ + +#include + +#include "algorithm/LocalIntensityScaleCuda.h" +#include "algorithm/LocalIntensityScale.hpp" +#include "TestTools.hpp" + + +namespace { + +#ifdef APR_USE_CUDA + + // ------------------------------------------------------------------------ + // TODO: REMOVE IT after dev. 
+ // ------------------------------------------------------------------------ + TEST(LocalIntensityScaleCudaTest, REMOVE_ME_AFTER_DEVELOPMENT) { + int y_num = 2; + int x_num = 3; + int z_num = 2; + PixelData m(y_num, x_num, z_num, 0); + PixelData m2(y_num, x_num, z_num, 0); + PixelData m3(y_num, x_num, z_num,0); + float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9 ,10, 11, 12}; + + initFromZYXarray(m, dataIn); + initFromZYXarray(m2, dataIn); + initFromZYXarray(m3, dataIn); + LocalIntensityScale lis; + int off = 0; + lis.calc_sat_mean_x(m, off); + m.printMesh(1); + calcMean(m3, off, MEAN_X_DIR); + m3.printMesh(1); +// lis.calc_sat_mean_y(m2, off); +// m2.printMesh(1); + + + compareMeshes(m3, m, 0.00000001); + } + // ------------------------------------------------------------------------ + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Y_DIR) { + APRTimer timer(true); + PixelData m = getRandInitializedMesh(22, 33, 22, 100, 3); + + LocalIntensityScale lis; + for (int offset = 0; offset < 6; ++offset) { + + std::cout << " ============================== " << offset << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean Y-DIR"); + lis.calc_sat_mean_y(mCpu, offset); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Y-DIR"); + calcMean(mGpu, offset, MEAN_Y_DIR); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_X_DIR) { + APRTimer timer(true); + PixelData m = getRandInitializedMesh(22, 33, 22, 255); + + LocalIntensityScale lis; + for (int offset = 0; offset < 6; ++offset) { + + std::cout << " ============================== " << offset << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean X-DIR"); + lis.calc_sat_mean_x(mCpu, offset); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean X-DIR"); + calcMean(mGpu, offset, 
MEAN_X_DIR); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.001), 0); + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Z_DIR) { + APRTimer timer(true); + using ImgType = float; + PixelData m = getRandInitializedMesh(22, 33, 22, 255); + + LocalIntensityScale lis; + for (int offset = 0; offset < 6; ++offset) { + + std::cout << " ============================== " << offset << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean Z-DIR"); + lis.calc_sat_mean_z(mCpu, offset); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Z-DIR"); + calcMean(mGpu, offset, MEAN_Z_DIR); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); + } + } + + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Y_DIR) { + APRTimer timer(true); + PixelData m(4, 4, 1, 0); + float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; + initFromZYXarray(m, dataIn); + + LocalIntensityScale lis; + + for (int boundary = 1; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + std::cout << "\n\n"; + for (int offset = 1; offset < 2; ++offset) { + // Run on CPU + PixelData mCpuPadded; + paddPixels(m, mCpuPadded, offset * boundary, offset * boundary, 0); + timer.start_timer("CPU mean Y-DIR"); + lis.calc_sat_mean_y(mCpuPadded, offset); + PixelData mCpu; + unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Y-DIR"); + calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0)); + + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01, 4), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_X_DIR) { + APRTimer timer(true); + //PixelData m(1, 13, 1, 0); + //float dataIn[] = 
{1,2,3,4,5,6,7,8,9,10,11,12,13}; + //initFromZYXarray(m, dataIn); + PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset < 6; ++offset) { + // Run on CPU + PixelData mCpuPadded; + paddPixels(m, mCpuPadded, 0, offset * boundary, 0); + timer.start_timer("CPU mean X-DIR"); + lis.calc_sat_mean_x(mCpuPadded, offset); + PixelData mCpu; + unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0000001), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Z_DIR) { + APRTimer timer(true); + PixelData m(1, 1, 13, 0); + float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; + initFromZYXarray(m, dataIn); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset < 6; ++offset) { + // Run on CPU + PixelData mCpuPadded; + paddPixels(m, mCpuPadded, 0, 0, offset * boundary); + timer.start_timer("CPU mean Z-DIR"); + lis.calc_sat_mean_z(mCpuPadded, offset); + PixelData mCpu; + unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Z-DIR"); + calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0)); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); + } + } + } + + + // !!!!!!!!!!!!!!!!!!!!!!! NOT YET CHECKED !!!!!!!!!!!!!!!!!!!!!!!!!!!!! + // TODO: See what these tests are doing and fix/change/remove them! 
+ + TEST(LocalIntensityScaleCudaTest, 1D_Y_DIR) { + { // OFFSET=0 + + PixelData m(8, 1, 1, 0); + float dataIn[] = {3,6,9,12,15,18,21,24}; + float expect[] = {3,6,9,12,15,18,21,24}; + + initFromZYXarray(m, dataIn); + + calcMean(m, 0, MEAN_Y_DIR); + + ASSERT_TRUE(compare(m, expect, 0.05)); + } + { // OFFSET=1 + + PixelData m(8, 1, 1, 0); + float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8}; + float expect[] = {1.5, 2, 3, 4, 5, 6, 7, 7.5}; + + initFromZYXarray(m, dataIn); + + calcMean(m, 1, MEAN_Y_DIR); + + ASSERT_TRUE(compare(m, expect, 0.05)); + } + { // OFFSET=2 (+symmetricity check) + + PixelData m(8, 1, 1, 0); + float dataIn[] = {3,6,9,12,15,18,21,24}; + float expect[] = {6, 7.5, 9, 12, 15, 18, 19.5, 21}; + + initFromZYXarray(m, dataIn); + + calcMean(m, 2, MEAN_Y_DIR); + + ASSERT_TRUE(compare(m, expect, 0.05)); + + // check if data in opposite order gives same result + float dataIn2[] = {24,21,18,15,12,9,6,3}; + float expect2[] = {21, 19.5, 18, 15,12, 9, 7.5, 6}; + + initFromZYXarray(m, dataIn2); + + calcMean(m, 2, MEAN_Y_DIR); + + ASSERT_TRUE(compare(m, expect2, 0.05)); + } + } + + + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) { + APRTimer timer(true); + PixelData m = getRandInitializedMesh(33, 31, 13); + + LocalIntensityScale lis; + for (int offset = 0; offset < 6; ++offset) { + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean ALL-DIR"); + lis.calc_sat_mean_y(mCpu, offset); + lis.calc_sat_mean_x(mCpu, offset); + lis.calc_sat_mean_z(mCpu, offset); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean ALL-DIR"); + calcMean(mGpu, offset); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); + } + } + + //@KG: The CPU code doesn't work for uint16 --> overflow will likely result. 
+ +// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS_UINT16) { +// APRTimer timer(true); +// PixelData m = getRandInitializedMesh(33, 31, 13); +// +// LocalIntensityScale lis; +// for (int offset = 0; offset < 6; ++offset) { +// // Run on CPU +// PixelData mCpu(m, true); +// timer.start_timer("CPU mean ALL-DIR"); +// lis.calc_sat_mean_y(mCpu, offset); +// lis.calc_sat_mean_x(mCpu, offset); +// lis.calc_sat_mean_z(mCpu, offset); +// timer.stop_timer(); +// +// // Run on GPU +// PixelData mGpu(m, true); +// timer.start_timer("GPU mean ALL-DIR"); +// calcMean(mGpu, offset); +// timer.stop_timer(); +// +// // Compare results +// EXPECT_EQ(compareMeshes(mCpu, mGpu, 1), 0); +// } +// } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { + APRTimer timer(true); + PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); + + APRParameters params; + params.sigma_th = 1; + params.sigma_th_max = 2; + params.reflect_bc_lis = false; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. + + // Run on CPU + PixelData mCpu(m, true); + PixelData mCpuTemp(m, false); + timer.start_timer("CPU LIS FULL"); + + LocalIntensityScale localIntensityScale; + + localIntensityScale.get_local_intensity_scale(mCpu, mCpuTemp, params); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + PixelData mGpuTemp(m, false); + timer.start_timer("GPU LIS ALL-DIR"); + getLocalIntensityScale(mGpu, mGpuTemp, params); + timer.stop_timer(); + + // Compare results + //EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0.01), 0); //this is not needed these values are not required. 
+ EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); + } + + +#endif // APR_USE_CUDA +} + + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/LocalIntensityScaleTest.cpp b/test/LocalIntensityScaleTest.cpp index a9f1558b..e8b194d3 100644 --- a/test/LocalIntensityScaleTest.cpp +++ b/test/LocalIntensityScaleTest.cpp @@ -5,9 +5,6 @@ #include #include "data_structures/Mesh/PixelData.hpp" #include "algorithm/LocalIntensityScale.hpp" -#include "algorithm/LocalIntensityScaleCuda.h" -#include "data_structures/APR/APR.hpp" -#include "algorithm/APRConverter.hpp" #include "TestTools.hpp" @@ -168,217 +165,6 @@ namespace { } } - -// ============================================================================ -// ==================== CUDA IMPL TESTS ============================= -// ============================================================================ - -#ifdef APR_USE_CUDA - - TEST(LocalIntensityScaleCudaTest, 1D_Y_DIR) { - { // OFFSET=0 - - PixelData m(8, 1, 1, 0); - float dataIn[] = {3,6,9,12,15,18,21,24}; - float expect[] = {3,6,9,12,15,18,21,24}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 0, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - } - { // OFFSET=1 - - PixelData m(8, 1, 1, 0); - float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8}; - float expect[] = {1.5, 2, 3, 4, 5, 6, 7, 7.5}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 1, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - } - { // OFFSET=2 (+symmetricity check) - - PixelData m(8, 1, 1, 0); - float dataIn[] = {3,6,9,12,15,18,21,24}; - float expect[] = {6, 7.5, 9, 12, 15, 18, 19.5, 21}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 2, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - - // check if data in opposite order gives same result - float dataIn2[] = {24,21,18,15,12,9,6,3}; - float expect2[] = {21, 19.5, 18, 15,12, 9, 7.5, 6}; - - initFromZYXarray(m, dataIn2); - - calcMean(m, 2, MEAN_Y_DIR); - - 
ASSERT_TRUE(compare(m, expect2, 0.05)); - } - } - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Y_DIR) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(33, 31, 13); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean Y-DIR"); - lis.calc_sat_mean_y(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean Y-DIR"); - calcMean(mGpu, offset, MEAN_Y_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } - - TEST(LocalIntensityScaleCudaTest, 1GPU_VS_CPU_X_DIR) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(33, 31, 13); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean X-DIR"); - lis.calc_sat_mean_x(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean X-DIR"); - calcMean(mGpu, offset, MEAN_X_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Z_DIR) { - APRTimer timer(true); - using ImgType = float; - PixelData m = getRandInitializedMesh(310, 330, 13, 255); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean Z-DIR"); - lis.calc_sat_mean_z(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean Z-DIR"); - calcMean(mGpu, offset, MEAN_Z_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(33, 31, 13); - - LocalIntensityScale lis; - for (int offset = 0; 
offset < 6; ++offset) { - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean ALL-DIR"); - lis.calc_sat_mean_y(mCpu, offset); - lis.calc_sat_mean_x(mCpu, offset); - lis.calc_sat_mean_z(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean ALL-DIR"); - calcMean(mGpu, offset); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } - - //@KG: The CPU code doesn't work for uint16 --> overflow will likely result. - -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS_UINT16) { -// APRTimer timer(true); -// PixelData m = getRandInitializedMesh(33, 31, 13); -// -// LocalIntensityScale lis; -// for (int offset = 0; offset < 6; ++offset) { -// // Run on CPU -// PixelData mCpu(m, true); -// timer.start_timer("CPU mean ALL-DIR"); -// lis.calc_sat_mean_y(mCpu, offset); -// lis.calc_sat_mean_x(mCpu, offset); -// lis.calc_sat_mean_z(mCpu, offset); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean ALL-DIR"); -// calcMean(mGpu, offset); -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 1), 0); -// } -// } - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(310, 330, 13, 25); - - APRParameters params; - params.sigma_th = 1; - params.sigma_th_max = 2; - params.reflect_bc_lis = false; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. 
- - // Run on CPU - PixelData mCpu(m, true); - PixelData mCpuTemp(m, false); - timer.start_timer("CPU LIS FULL"); - - LocalIntensityScale localIntensityScale; - - localIntensityScale.get_local_intensity_scale(mCpu, mCpuTemp, params); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - PixelData mGpuTemp(m, false); - timer.start_timer("GPU LIS ALL-DIR"); - getLocalIntensityScale(mGpu, mGpuTemp, params); - timer.stop_timer(); - - // Compare results - //EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0.01), 0); //this is not needed these values are not required. - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - -#endif // APR_USE_CUDA - } int main(int argc, char **argv) { From 570ab20ecc4fc30cf4cab0611350da5d9fc379d2 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 31 Jan 2023 15:36:14 +0100 Subject: [PATCH 10/59] Local Intensity Scale (LIS) not works in X-dir as expected. GPU and CPU gives same results. --- src/algorithm/LocalIntensityScale.cu | 16 +- src/algorithm/LocalIntensityScale.hpp | 130 +++++++--- src/data_structures/Mesh/PixelData.hpp | 15 +- test/LocalIntensityScaleCudaTest.cpp | 329 +++++++++++++++++++------ test/LocalIntensityScaleTest.cpp | 24 +- test/MeshDataTest.cpp | 10 + test/TestTools.hpp | 29 ++- 7 files changed, 418 insertions(+), 135 deletions(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index a0a05c0e..3673b406 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -175,8 +175,16 @@ __global__ void meanXdir(T *image, int offset, size_t x_num, size_t y_num, size_ int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; while (saveElementOffset < currElementOffset) { - if (!boundaryReflect) count = count - 1; - sum -= data[beginPtr][workerIdx]; + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' and do not remove first element from moving filter + // since 'sum' of filter 
elements contains all elements from processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + // In such a case first 'o' element should not be removed when filter moves right. + if (x_num - (currElementOffset - saveElementOffset)/nextElementOffset > offset || boundaryReflect) { + if (!boundaryReflect) count = count - 1; + sum -= data[beginPtr][workerIdx]; + } if (boundaryReflect) { sum += data[boundaryPtr][workerIdx]; @@ -410,9 +418,9 @@ template void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags, bool boundaryReflect) { ScopedCudaMemHandler, H2D | D2H> cudaImage(image); APRTimer timer(true); - timer.start_timer("GpuDeviceTimeFull"); +// timer.start_timer("GpuDeviceTimeFull"); runMean(cudaImage.get(), image, offset, offset, offset, flags, 0, boundaryReflect); - timer.stop_timer(); +// timer.stop_timer(); } // explicit instantiation of handled types diff --git a/src/algorithm/LocalIntensityScale.hpp b/src/algorithm/LocalIntensityScale.hpp index 3d5942c2..bee7f303 100644 --- a/src/algorithm/LocalIntensityScale.hpp +++ b/src/algorithm/LocalIntensityScale.hpp @@ -156,7 +156,7 @@ void get_local_intensity_scale(PixelData &local_scale_temp, PixelData &input, const size_t offset); template - void calc_sat_mean_x(PixelData &input, const size_t offset); + void calc_sat_mean_x(PixelData &input, const size_t offset, bool boundaryReflect = false); template void calc_sat_mean_y(PixelData &input, const size_t offset); @@ -367,63 +367,119 @@ inline void LocalIntensityScale::calc_sat_mean_y(PixelData& input, const size } template -inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size_t offset) { +inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size_t offset, bool boundaryReflect) { + const size_t z_num = input.z_num; const size_t x_num = input.x_num; const size_t y_num = input.y_num; - std::vector temp_vec(y_num*(2*offset + 1),0); + const size_t 
divisor = offset + 1 + offset; + std::vector circularBuffer(y_num * divisor, 0); + std::vector sum(y_num, 0); - #ifdef HAVE_OPENMP - #pragma omp parallel for default(shared) firstprivate(temp_vec) - #endif + auto &mesh = input.mesh; + size_t dimLen = x_num; + + if (dimLen < offset) { + throw std::runtime_error("offset cannot be bigger than processed dimension length!"); + } + +#ifdef HAVE_OPENMP +#pragma omp parallel for default(shared) firstprivate(circularBuffer, sum) +#endif for(size_t j = 0; j < z_num; j++) { size_t jxnumynum = j * x_num * y_num; - for(size_t k = 0; k < y_num ; k++){ - temp_vec[k] = input.mesh[jxnumynum + k]; - } + size_t count = 0; // counts number of active elements in filter + size_t currElementOffset = 0; // offset of element in processed dimension + size_t nextElementOffset = 1; + size_t saveElementOffset = 0; // offset used to finish RHS boundary - for(size_t i = 1; i < 2 * offset + 1; i++) { - for(size_t k = 0; k < y_num; k++) { - temp_vec[i*y_num + k] = input.mesh[jxnumynum + i*y_num + k] + temp_vec[(i-1)*y_num + k]; + // saturate circular buffer with #offset elements since it will allow to calculate first element value on LHS + while(count <= offset) { + for (size_t k = 0; k < y_num; ++k) { + auto v = mesh[jxnumynum + currElementOffset * y_num + k]; + sum[k] += v; + circularBuffer[count * y_num + k] = v; + if (boundaryReflect && count > 0) { circularBuffer[(2 * offset - count + 1) * y_num + k] = v; sum[k] += v;} } + + currElementOffset += nextElementOffset; + ++count; } - // LHS boundary - for(size_t i = 0; i < offset + 1; i++){ - for(size_t k = 0; k < y_num; k++) { - input.mesh[jxnumynum + i * y_num + k] = (temp_vec[(i + offset) * y_num + k]) / (i + offset + 1); - } + currElementOffset -= nextElementOffset; + --count; + + if (boundaryReflect) { + count = divisor; } - // middle - size_t current_index = offset + 1; - size_t index_modulo = 0; - for(size_t i = offset + 1; i < x_num - offset; i++){ - // the current cumsum - index_modulo = 
(current_index + offset) % (2*offset + 1); // current_index - offset - 1 - size_t previous_modulo = (current_index + offset - 1) % (2*offset + 1); // the index of previous cumsum + // Pointer in circular buffer + int beginPtr = offset; - for(size_t k = 0; k < y_num; k++) { - float temp = input.mesh[jxnumynum + (i + offset)*y_num + k] + temp_vec[previous_modulo*y_num + k]; - input.mesh[jxnumynum + i*y_num + k] = (temp - temp_vec[index_modulo*y_num + k]) / - (2*offset + 1); - temp_vec[index_modulo*y_num + k] = temp; + // main loop going through all elements in range [0, x_num-offset) + for (size_t x = 0; x < dimLen - offset; ++x) { + for (size_t k = 0; k < y_num; ++k) { + // Read new element + T v = mesh[jxnumynum + currElementOffset * y_num + k]; + + // Update sum to cover [-offset, offset] of currently processed element + sum[k] += v; + if (count >= divisor || x == 0) sum[k] -= circularBuffer[beginPtr * y_num + k]; + + // Save new element + circularBuffer[beginPtr * y_num + k] = v; } - current_index = (current_index + 1) % (2*offset + 1); + // move pointer in circular buffer and number of active elements hold there + beginPtr = (beginPtr + 1) % divisor; + count = std::min(count + 1, divisor); + + for (size_t k = 0; k < y_num; ++k) { + // save currently processed element + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; + } + + // Move to next elements + currElementOffset += nextElementOffset; + saveElementOffset += nextElementOffset; } - // RHS boundary - current_index = (current_index + offset) % (2*offset + 1); - for(size_t i = x_num - offset; i < x_num; i++){ - for(size_t k = 0; k < y_num; k++){ - input.mesh[jxnumynum + i*y_num + k] = (temp_vec[index_modulo*y_num + k] - - temp_vec[current_index*y_num + k]) / (x_num - i + offset); + // boundaryPtr is used only in boundaryReflect mode, adding (2*offset+1) makes it always non-negative value + int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; + + // Handle last #offset elements on RHS 
+ while(saveElementOffset < currElementOffset) { + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' since 'sum' of filter elements contains all elements from + // processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset) > offset; + + if (removeElementFromFilter) { + if (!boundaryReflect) count = count - 1; } - current_index = (current_index + 1) % (2*offset + 1); + + for (size_t k = 0; k < y_num; ++k) { + if (removeElementFromFilter || boundaryReflect) { + sum[k] -= circularBuffer[beginPtr * y_num + k]; + } + + if (boundaryReflect) { + sum[k] += circularBuffer[boundaryPtr * y_num + k]; + } + + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; + } + + boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor; + beginPtr = (beginPtr + 1) % divisor; + saveElementOffset += nextElementOffset; } + + std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop } } diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index 13264ec4..68de3b00 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -34,7 +34,7 @@ struct PixelDataDim { size_t x; size_t z; - PixelDataDim(size_t y, size_t x, size_t z) : y(y), x(x), z(z) {} + constexpr PixelDataDim(size_t y, size_t x, size_t z) : y(y), x(x), z(z) {} size_t size() const { return y * x * z; } @@ -436,6 +436,19 @@ public : */ PixelData(int aSizeOfY, int aSizeOfX, int aSizeOfZ, T aInitVal) { initWithValue(aSizeOfY, aSizeOfX, aSizeOfZ, aInitVal); } + /** + * Constructor - initialize initial size of mesh to provided values + * @param aDims - PixelDataDim with length of each dimension + */ + PixelData(PixelDataDim aDims) { init(aDims.y, aDims.x, aDims.z); } + + /** + * Constructor - creates mesh with provided dimentions 
initialized to aInitVal + * @param aDims - PixelDataDim with length of each dimension + * @param aInitVal - initial value of all elements + */ + PixelData(PixelDataDim aDims, T aInitVal) { initWithValue(aDims.y, aDims.x, aDims.z, aInitVal); } + /** * Move constructor * @param aObj mesh to be moved diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index 6e2b722b..b0e084ca 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -10,33 +10,260 @@ namespace { #ifdef APR_USE_CUDA + TEST(LocalIntensityScaleCudaTest, CPU_AND_GPU_TEST_X_DIR_VS_MANUALLY_CALCULATED_VALUES) { + // Belows data is precomputed for x-len = 5 (and maximum offset = 4) so do not change these numbers! + constexpr PixelDataDim const dim{1, 5, 1}; + float expectedData[2][5][dim.x] = { + { // with no boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, // offset = 0 + {1.50, 2.00, 3.00, 4.00, 4.50}, // offset = 1 + {2.00, 2.50, 3.00, 3.50, 4.00}, // offset = 2 + {2.50, 3.00, 3.00, 3.00, 3.5}, // offset = 3 + {3.00, 3.00, 3.00, 3.00, 3.00} // offset = 4 + }, + { // with boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, + {1.66, 2.00, 3.00, 4.00, 4.33}, + {2.20, 2.40, 3.00, 3.60, 3.80}, + {2.71, 2.85, 3.00, 3.14, 3.28}, + {3.22, 3.11, 3.00, 2.88, 2.77} + } + }; + + + APRTimer timer(false); // set to true to see timings + + PixelData m(dim); + float dataIn[] = {1, 2, 3, 4, 5}; + initFromZYXarray(m, dataIn); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 4; ++offset) { +// std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean X-DIR"); + lis.calc_sat_mean_x(mCpu, offset, (boundary > 0)); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + 
timer.start_timer("GPU mean X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + timer.stop_timer(); + + // Compare results + PixelData expected(dim); + initFromZYXarray(expected, expectedData[boundary][offset]); + EXPECT_EQ(compareMeshes(expected, mGpu, 0.01), 0); + EXPECT_EQ(compareMeshes(expected, mCpu, 0.01), 0); + + // Also GPU and CPU should give exactly same output + EXPECT_EQ(compareMeshes(mGpu, mCpu, 0), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_X_DIR_RANDOM_VALUES) { + APRTimer timer(false); + + constexpr PixelDataDim const dim{63, 65, 96}; + PixelData m = getRandInitializedMesh(dim, 50, 10); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 6; ++offset) { + //std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + + PixelData mCpu; + mCpu.init(m); + mCpu.copyFromMesh(m); + timer.start_timer("CPU mean X-DIR"); + lis.calc_sat_mean_x(mCpu, offset, (boundary > 0)); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + + // ------------------------------------------------------------------------ - // TODO: REMOVE IT after dev. + // Below tests are not yet fixed. 
// ------------------------------------------------------------------------ - TEST(LocalIntensityScaleCudaTest, REMOVE_ME_AFTER_DEVELOPMENT) { - int y_num = 2; - int x_num = 3; - int z_num = 2; - PixelData m(y_num, x_num, z_num, 0); - PixelData m2(y_num, x_num, z_num, 0); - PixelData m3(y_num, x_num, z_num,0); - float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8, 9 ,10, 11, 12}; + TEST(LocalIntensityScaleCudaTest, CPU_AND_GPU_TEST_Z_DIR_VS_MANUALLY_CALCULATED_VALUES) { + // Belows data is precomputed for x-len = 5 (and maximum offset = 4) so do not change these numbers! + constexpr PixelDataDim const dim{1, 5, 1}; + float expectedData[2][5][dim.x] = { + { // with no boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, // offset = 0 + {1.50, 2.00, 3.00, 4.00, 4.50}, // offset = 1 + {2.00, 2.50, 3.00, 3.50, 4.00}, // offset = 2 + {2.50, 3.00, 3.00, 3.00, 3.5}, // offset = 3 + {3.00, 3.00, 3.00, 3.00, 3.00} // offset = 4 + }, + { // with boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, + {1.66, 2.00, 3.00, 4.00, 4.33}, + {2.20, 2.40, 3.00, 3.60, 3.80}, + {2.71, 2.85, 3.00, 3.14, 3.28}, + {3.22, 3.11, 3.00, 2.88, 2.77} + } + }; + + + APRTimer timer(false); // set to true to see timings + + PixelData m(dim); + float dataIn[] = {1, 2, 3, 4, 5}; initFromZYXarray(m, dataIn); - initFromZYXarray(m2, dataIn); - initFromZYXarray(m3, dataIn); + LocalIntensityScale lis; - int off = 0; - lis.calc_sat_mean_x(m, off); - m.printMesh(1); - calcMean(m3, off, MEAN_X_DIR); - m3.printMesh(1); -// lis.calc_sat_mean_y(m2, off); -// m2.printMesh(1); + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 4; ++offset) { +// std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean X-DIR"); + lis.calc_sat_mean_x(mCpu, offset, (boundary > 0)); + timer.stop_timer(); + + // Run on GPU + 
PixelData mGpu(m, true); + timer.start_timer("GPU mean X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + timer.stop_timer(); - compareMeshes(m3, m, 0.00000001); + // Compare results + PixelData expected(dim); + initFromZYXarray(expected, expectedData[boundary][offset]); + EXPECT_EQ(compareMeshes(expected, mGpu, 0.01), 0); + EXPECT_EQ(compareMeshes(expected, mCpu, 0.01), 0); + + // Also GPU and CPU should give exactly same output + EXPECT_EQ(compareMeshes(mGpu, mCpu, 0), 0); + } + } } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_Z_DIR_RANDOM_VALUES) { + APRTimer timer(false); + + constexpr PixelDataDim const dim{63, 65, 96}; + PixelData m = getRandInitializedMesh(dim, 50, 10); + + LocalIntensityScale lis; + + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 6; ++offset) { + //std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + + // Run on CPU +// PixelData mCpu2; +// PixelData mCpuPadded; +// timer.start_timer("CPU old mean X-DIR"); +// paddPixels(m, mCpuPadded, 0, offset * boundary, 0); +// lis.calc_sat_mean_x_orig(mCpuPadded, offset); +// unpaddPixels(mCpuPadded, mCpu2, dim.y, dim.x, dim.z); +// timer.stop_timer(); + + PixelData mCpu; + mCpu.init(m); + mCpu.copyFromMesh(m); + timer.start_timer("CPU mean X-DIR"); + lis.calc_sat_mean_x(mCpu, offset, (boundary > 0)); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + timer.stop_timer(); + + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + +// TEST(LocalIntensityScaleCudaTest, REMOVE_ME_AFTER_DEVELOPMENT) { +// int y_num = 1; +// int x_num = 5; +// int z_num = 1; +//#if 1 +// PixelData m(y_num, x_num, z_num, 0); +// PixelData m2(y_num, x_num, z_num, 0); +// PixelData 
m3(y_num, x_num, z_num, 0); +// PixelData m4(y_num, x_num, z_num, 0); +// float dataIn[] = {1, 2, 3, 4, 5}; +//// float dataIn[] = {75.4539260864, 42.5445404053, 0.00003, 4, 0.00005, 6, 0.00007, 8, 0.00009, 10, 0.000011, 12}; +// +// initFromZYXarray(m, dataIn); +// initFromZYXarray(m2, dataIn); +// initFromZYXarray(m3, dataIn); +// initFromZYXarray(m4, dataIn); +//#else +// PixelData m = getRandInitializedMesh(y_num, x_num, z_num, 200, 0); +// PixelData m2(m, true); +// PixelData m3(m, true); +// PixelData m4(m, true); +//#endif +// +// LocalIntensityScale lis; +// +// int off = 4; +// +// std::cout << "INP:"; m.printMesh(1); +// +// bool boundary = true; +// +// APRTimer timer(true); +// calcMean(m3, off, MEAN_X_DIR, boundary); +// timer.start_timer("new"); +// lis.calc_sat_mean_x(m2, off, boundary); +// timer.stop_timer(); +// +// timer.start_timer("old"); +// PixelData mCpuPadded; +// paddPixels(m, mCpuPadded, 0, off, 0); +// lis.calc_sat_mean_x_orig(mCpuPadded, off); +// unpaddPixels(mCpuPadded, m4, m.y_num, m.x_num, m.z_num); +// timer.stop_timer(); +// +// std::cout << "CPU: "; m2.printMesh(1); +// std::cout << "GPU: "; m3.printMesh(1); +// std::cout << "CPU old: "; m4.printMesh(1); +// +// std::cout << "GPU vs NEW\n"; +// compareMeshes(m3, m2, 0.00000001, 3); +// std::cout << "OLD vs GPU\n"; +// compareMeshes(m4, m3, 0.00000001, 3); +// std::cout << "OLD vs NEW\n"; +// EXPECT_EQ(compareMeshes(m4, m2, 0.00000001, 3), 0); +// } // ------------------------------------------------------------------------ TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Y_DIR) { @@ -65,31 +292,7 @@ namespace { } } - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_X_DIR) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(22, 33, 22, 255); - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - - std::cout << " ============================== " << offset << std::endl; - - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean X-DIR"); - 
lis.calc_sat_mean_x(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean X-DIR"); - calcMean(mGpu, offset, MEAN_X_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.001), 0); - } - } TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Z_DIR) { APRTimer timer(true); @@ -154,45 +357,15 @@ namespace { } } - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_X_DIR) { - APRTimer timer(true); - //PixelData m(1, 13, 1, 0); - //float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; - //initFromZYXarray(m, dataIn); - PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); - LocalIntensityScale lis; - - for (int boundary = 0; boundary <= 1; ++ boundary) { - // boundary = 0 there is no reflected boundary - // boudnary = 1 there is boundary reflect - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpuPadded; - paddPixels(m, mCpuPadded, 0, offset * boundary, 0); - timer.start_timer("CPU mean X-DIR"); - lis.calc_sat_mean_x(mCpuPadded, offset); - PixelData mCpu; - unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean X-DIR"); - calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0000001), 0); - } - } - } TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Z_DIR) { APRTimer timer(true); - PixelData m(1, 1, 13, 0); - float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; - initFromZYXarray(m, dataIn); +// PixelData m(1, 1, 13, 0); +// float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; +// initFromZYXarray(m, dataIn); + PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); + LocalIntensityScale lis; diff --git a/test/LocalIntensityScaleTest.cpp b/test/LocalIntensityScaleTest.cpp index e8b194d3..09a6466b 100644 --- a/test/LocalIntensityScaleTest.cpp 
+++ b/test/LocalIntensityScaleTest.cpp @@ -21,7 +21,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_y(m, 0); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=1 @@ -34,7 +34,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_y(m, 1); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=2 (+symmetricity check) @@ -47,7 +47,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_y(m, 2); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); // check if data in opposite order gives same result float dataIn2[] = {24,21,18,15,12,9,6,3}; @@ -57,7 +57,7 @@ namespace { lis.calc_sat_mean_y(m, 2); - ASSERT_TRUE(compare(m, expect2, 0.05)); + ASSERT_TRUE(compare(m, expect2, 0.000001)); } } @@ -73,7 +73,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_x(m, 0); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=1 @@ -86,7 +86,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_x(m, 1); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=2 (+symmetricity check) @@ -99,7 +99,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_x(m, 2); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); // check if data in opposite order gives same result float dataIn2[] = {24,21,18,15,12,9,6,3}; @@ -109,7 +109,7 @@ namespace { lis.calc_sat_mean_x(m, 2); - ASSERT_TRUE(compare(m, expect2, 0.05)); + ASSERT_TRUE(compare(m, expect2, 0.000001)); } } @@ -125,7 +125,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_z(m, 0); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); } { // OFFSET=1 @@ -138,7 +138,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_z(m, 1); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 
0.000001)); } { // OFFSET=2 (+symmetricity check) @@ -151,7 +151,7 @@ namespace { LocalIntensityScale lis; lis.calc_sat_mean_z(m, 2); - ASSERT_TRUE(compare(m, expect, 0.05)); + ASSERT_TRUE(compare(m, expect, 0.000001)); // check if data in opposite order gives same result float dataIn2[] = {24,21,18,15,12,9,6,3}; @@ -161,7 +161,7 @@ namespace { lis.calc_sat_mean_z(m, 2); - ASSERT_TRUE(compare(m, expect2, 0.05)); + ASSERT_TRUE(compare(m, expect2, 0.000001)); } } diff --git a/test/MeshDataTest.cpp b/test/MeshDataTest.cpp index f9c9bf4b..a3c4bec6 100644 --- a/test/MeshDataTest.cpp +++ b/test/MeshDataTest.cpp @@ -338,6 +338,16 @@ namespace { ASSERT_EQ(md.mesh.size(), 100*200*300); } + // size provided + { + PixelDataDim dim(100, 200, 300); + PixelData md(dim); + ASSERT_EQ(md.x_num, 200); + ASSERT_EQ(md.y_num, 100); + ASSERT_EQ(md.z_num, 300); + ASSERT_EQ(md.mesh.size(), 100*200*300); + } + // mesh provided { // generate some data diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 0896eea5..d0211f6f 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -68,16 +68,26 @@ inline bool initFromZYXarray(PixelData &mesh, const float *data) { */ template inline int compareMeshes(const PixelData &expected, const PixelData &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { + if (expected.getDimension() != tested.getDimension()) { + std::stringstream errMsg; + errMsg << "Dimensions of expected and tested meshes differ! 
" << expected.getDimension() << " vs " << tested.getDimension(); + throw std::runtime_error(errMsg.str()); + } + int cnt = 0; + double maxErrorFound = 0; + for (size_t i = 0; i < expected.mesh.size(); ++i) { - if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError) { + auto diff = std::abs(expected.mesh[i] - tested.mesh[i]); + if (diff > maxError) { if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] << " error = " << (float)expected.mesh[i] - (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; } cnt++; } + if (diff > maxErrorFound) maxErrorFound = diff; } - if (cnt != 0) std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << std::endl; + if (cnt != 0) std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << " maxErrorFound = " << maxErrorFound << std::endl; return cnt; } @@ -112,7 +122,6 @@ inline int64_t compareParticles(const ParticleTypeA &expected, const ParticleTyp return cnt; } - /** * Generates mesh with provided dims with random values in range [0, 1] * multiplier + offset * @param y @@ -120,6 +129,7 @@ inline int64_t compareParticles(const ParticleTypeA &expected, const ParticleTyp * @param z * @param multiplier * @param offset + * @param useIdxNumbers - instead of random values put values from 0..sizeof(mesh)-1 * @return */ template @@ -139,6 +149,19 @@ inline PixelData getRandInitializedMesh(int y, int x, int z, float multiplier return m; } +/** + * Generates mesh with provided dims with random values in range [0, 1] * multiplier + offset + * @param dim - dimension of generated mesh + * @param multiplier + * @param offset + * @param useIdxNumbers - instead of random values put values from 0..sizeof(mesh)-1 + * @return + */ +template +inline PixelData getRandInitializedMesh(PixelDataDim dim, float multiplier = 
2.0f, float offset=0.0, bool useIdxNumbers = false) { + return getRandInitializedMesh(dim.y, dim.x, dim.z, multiplier, offset, useIdxNumbers); +} + struct TestBenchStats{ double inf_norm=0; From 17e5d8edace7ce253cfae82db13c82bff31b9f74 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 1 Feb 2023 15:21:57 +0100 Subject: [PATCH 11/59] Local Intensity Scale (LIS) now works in Z-dir as expected. GPU and CPU gives same results. --- src/algorithm/LocalIntensityScale.cu | 12 ++- src/algorithm/LocalIntensityScale.hpp | 137 ++++++++++++++++++-------- test/LocalIntensityScaleCudaTest.cpp | 50 ++++------ 3 files changed, 123 insertions(+), 76 deletions(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 3673b406..11e005fa 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -277,8 +277,16 @@ __global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_ int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; while (saveElementOffset < currElementOffset) { - if (!boundaryReflect) count = count - 1; - sum -= data[beginPtr][workerIdx]; + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' and do not remove first element from moving filter + // since 'sum' of filter elements contains all elements from processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + // In such a case first 'o' element should not be removed when filter moves right. 
+ if (z_num - (currElementOffset - saveElementOffset)/nextElementOffset > offset || boundaryReflect) { + if (!boundaryReflect) count = count - 1; + sum -= data[beginPtr][workerIdx]; + } if (boundaryReflect) { sum += data[boundaryPtr][workerIdx]; diff --git a/src/algorithm/LocalIntensityScale.hpp b/src/algorithm/LocalIntensityScale.hpp index bee7f303..4e3213f0 100644 --- a/src/algorithm/LocalIntensityScale.hpp +++ b/src/algorithm/LocalIntensityScale.hpp @@ -153,7 +153,7 @@ void get_local_intensity_scale(PixelData &local_scale_temp, PixelData &input_image, PixelData &var); template - void calc_sat_mean_z(PixelData &input, const size_t offset); + void calc_sat_mean_z(PixelData &input, const size_t offset, bool boundaryReflect = false); template void calc_sat_mean_x(PixelData &input, const size_t offset, bool boundaryReflect = false); @@ -456,7 +456,7 @@ inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size // processed dimension: // dim elements: xxxxxx // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) - bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset) > offset; + bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset)/nextElementOffset > offset; if (removeElementFromFilter) { if (!boundaryReflect) count = count - 1; @@ -484,69 +484,120 @@ inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size } template -inline void LocalIntensityScale::calc_sat_mean_z(PixelData& input,const size_t offset) { +inline void LocalIntensityScale::calc_sat_mean_z(PixelData& input, const size_t offset, bool boundaryReflect) { + const size_t z_num = input.z_num; const size_t x_num = input.x_num; const size_t y_num = input.y_num; - std::vector temp_vec(y_num*(2*offset + 1),0); - size_t xnumynum = x_num * y_num; + const size_t divisor = offset + 1 + offset; + std::vector circularBuffer(y_num * divisor, 0); + std::vector sum(y_num, 0); - #ifdef HAVE_OPENMP - 
#pragma omp parallel for default(shared) firstprivate(temp_vec) - #endif - for(size_t i = 0; i < x_num; i++) { + auto &mesh = input.mesh; + size_t dimLen = z_num; - size_t iynum = i * y_num; + if (dimLen < offset) { + throw std::runtime_error("offset cannot be bigger than processed dimension length!"); + } - //prefetching - for(size_t k = 0; k < y_num ; k++){ - temp_vec[k] = input.mesh[iynum + k]; - } +#ifdef HAVE_OPENMP +#pragma omp parallel for default(shared) firstprivate(circularBuffer, sum) +#endif + for (size_t j = 0; j < x_num; j++) { + size_t jxnumynum = j * y_num; - for(size_t j = 1; j < 2 * offset + 1; j++) { - for(size_t k = 0; k < y_num; k++) { - temp_vec[j*y_num + k] = input.mesh[j * xnumynum + iynum + k] + temp_vec[(j-1)*y_num + k]; + size_t count = 0; // counts number of active elements in filter + size_t currElementOffset = 0; // offset of element in processed dimension + size_t nextElementOffset = x_num; + size_t saveElementOffset = 0; // offset used to finish RHS boundary + + // saturate circular buffer with #offset elements since it will allow to calculate first element value on LHS + while(count <= offset) { + for (size_t k = 0; k < y_num; ++k) { + auto v = mesh[jxnumynum + currElementOffset * y_num + k]; + sum[k] += v; + circularBuffer[count * y_num + k] = v; + if (boundaryReflect && count > 0) { circularBuffer[(2 * offset - count + 1) * y_num + k] = v; sum[k] += v;} } + + currElementOffset += nextElementOffset; + ++count; } - // LHS boundary - for(size_t j = 0; j < offset + 1; j++){ - for(size_t k = 0; k < y_num; k++) { - input.mesh[j * xnumynum + iynum + k] = (temp_vec[(j + offset)*y_num + k]) / (j + offset + 1); - } + currElementOffset -= nextElementOffset; + --count; + + if (boundaryReflect) { + count = divisor; } - // middle - size_t current_index = offset + 1; - size_t index_modulo = 0; - for(size_t j = offset + 1; j < z_num - offset; j++){ + // Pointer in circular buffer + int beginPtr = offset; - index_modulo = (current_index + offset) 
% (2*offset + 1); // current_index - offset - 1 - size_t previous_modulo = (current_index + offset - 1) % (2*offset + 1); // the index of previous cumsum + // main loop going through all elements in range [0, x_num-offset) + for (size_t z = 0; z < dimLen - offset; ++z) { + for (size_t k = 0; k < y_num; ++k) { + // Read new element + T v = mesh[jxnumynum + currElementOffset * y_num + k]; - for(size_t k = 0; k < y_num; k++) { - // the current cumsum - float temp = input.mesh[(j + offset) * xnumynum + iynum + k] + temp_vec[previous_modulo*y_num + k]; - input.mesh[j * xnumynum + iynum + k] = (temp - temp_vec[index_modulo*y_num + k]) / - (2*offset + 1); - temp_vec[index_modulo*y_num + k] = temp; + // Update sum to cover [-offset, offset] of currently processed element + sum[k] += v; + if (count >= divisor || z == 0) sum[k] -= circularBuffer[beginPtr * y_num + k]; + + // Save new element + circularBuffer[beginPtr * y_num + k] = v; } - current_index = (current_index + 1) % (2*offset + 1); + // move pointer in circular buffer and number of active elements hold there + beginPtr = (beginPtr + 1) % divisor; + count = std::min(count + 1, divisor); + + for (size_t k = 0; k < y_num; ++k) { + // save currently processed element + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; + } + + // Move to next elements + currElementOffset += nextElementOffset; + saveElementOffset += nextElementOffset; } - // RHS boundary - current_index = (current_index + offset) % (2*offset + 1); - for(size_t j = z_num - offset; j < z_num; j++){ - for(size_t k = 0; k < y_num; k++){ - input.mesh[j * xnumynum + iynum + k] = (temp_vec[index_modulo*y_num + k] - - temp_vec[current_index*y_num + k]) / (z_num - j + offset); + // boundaryPtr is used only in boundaryReflect mode, adding (2*offset+1) makes it always non-negative value + int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; + + // Handle last #offset elements on RHS + while(saveElementOffset < currElementOffset) { + // If 
filter length is too big in comparison to processed dimension + // do not decrease 'count' since 'sum' of filter elements contains all elements from + // processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset)/nextElementOffset > offset; + + if (removeElementFromFilter) { + if (!boundaryReflect) count = count - 1; + } + + for (size_t k = 0; k < y_num; ++k) { + if (removeElementFromFilter || boundaryReflect) { + sum[k] -= circularBuffer[beginPtr * y_num + k]; + } + + if (boundaryReflect) { + sum[k] += circularBuffer[boundaryPtr * y_num + k]; + } + + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; } - current_index = (current_index + 1) % (2*offset + 1); + boundaryPtr = (boundaryPtr - 1 + (2*offset+1)) % divisor; + beginPtr = (beginPtr + 1) % divisor; + saveElementOffset += nextElementOffset; } + + std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop } } -#endif //PARTPLAY_LOCAL_INTENSITY_SCALE_HPP +#endif diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index b0e084ca..a5c52b63 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -18,7 +18,7 @@ namespace { {1.00, 2.00, 3.00, 4.00, 5.00}, // offset = 0 {1.50, 2.00, 3.00, 4.00, 4.50}, // offset = 1 {2.00, 2.50, 3.00, 3.50, 4.00}, // offset = 2 - {2.50, 3.00, 3.00, 3.00, 3.5}, // offset = 3 + {2.50, 3.00, 3.00, 3.00, 3.50}, // offset = 3 {3.00, 3.00, 3.00, 3.00, 3.00} // offset = 4 }, { // with boundary values @@ -30,7 +30,6 @@ namespace { } }; - APRTimer timer(false); // set to true to see timings PixelData m(dim); @@ -72,7 +71,7 @@ namespace { TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_X_DIR_RANDOM_VALUES) { APRTimer timer(false); - constexpr PixelDataDim const dim{63, 65, 96}; + constexpr PixelDataDim const 
dim{49, 53, 51}; PixelData m = getRandInitializedMesh(dim, 50, 10); LocalIntensityScale lis; @@ -102,20 +101,15 @@ namespace { } } - - // ------------------------------------------------------------------------ - // Below tests are not yet fixed. - // ------------------------------------------------------------------------ - TEST(LocalIntensityScaleCudaTest, CPU_AND_GPU_TEST_Z_DIR_VS_MANUALLY_CALCULATED_VALUES) { // Belows data is precomputed for x-len = 5 (and maximum offset = 4) so do not change these numbers! - constexpr PixelDataDim const dim{1, 5, 1}; - float expectedData[2][5][dim.x] = { + constexpr PixelDataDim const dim{1, 1, 5}; + float expectedData[2][5][dim.z] = { { // with no boundary values {1.00, 2.00, 3.00, 4.00, 5.00}, // offset = 0 {1.50, 2.00, 3.00, 4.00, 4.50}, // offset = 1 {2.00, 2.50, 3.00, 3.50, 4.00}, // offset = 2 - {2.50, 3.00, 3.00, 3.00, 3.5}, // offset = 3 + {2.50, 3.00, 3.00, 3.00, 3.50}, // offset = 3 {3.00, 3.00, 3.00, 3.00, 3.00} // offset = 4 }, { // with boundary values @@ -127,7 +121,6 @@ namespace { } }; - APRTimer timer(false); // set to true to see timings PixelData m(dim); @@ -140,18 +133,18 @@ namespace { // boundary = 0 there is no reflected boundary // boudnary = 1 there is boundary reflect for (int offset = 0; offset <= 4; ++offset) { -// std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; +// std::cout << "------------------ OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; // Run on CPU PixelData mCpu(m, true); timer.start_timer("CPU mean X-DIR"); - lis.calc_sat_mean_x(mCpu, offset, (boundary > 0)); + lis.calc_sat_mean_z(mCpu, offset, (boundary > 0)); timer.stop_timer(); // Run on GPU PixelData mGpu(m, true); timer.start_timer("GPU mean X-DIR"); - calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0)); timer.stop_timer(); // Compare results @@ -169,7 +162,7 @@ namespace { TEST(LocalIntensityScaleCudaTest, 
GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_Z_DIR_RANDOM_VALUES) { APRTimer timer(false); - constexpr PixelDataDim const dim{63, 65, 96}; + constexpr PixelDataDim const dim{49,51,53}; PixelData m = getRandInitializedMesh(dim, 50, 10); LocalIntensityScale lis; @@ -178,37 +171,32 @@ namespace { // boundary = 0 there is no reflected boundary // boudnary = 1 there is boundary reflect for (int offset = 0; offset <= 6; ++offset) { - //std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; - - // Run on CPU -// PixelData mCpu2; -// PixelData mCpuPadded; -// timer.start_timer("CPU old mean X-DIR"); -// paddPixels(m, mCpuPadded, 0, offset * boundary, 0); -// lis.calc_sat_mean_x_orig(mCpuPadded, offset); -// unpaddPixels(mCpuPadded, mCpu2, dim.y, dim.x, dim.z); -// timer.stop_timer(); +// std::cout << "---------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; PixelData mCpu; mCpu.init(m); mCpu.copyFromMesh(m); - timer.start_timer("CPU mean X-DIR"); - lis.calc_sat_mean_x(mCpu, offset, (boundary > 0)); + timer.start_timer("CPU mean Z-DIR"); + lis.calc_sat_mean_z(mCpu, offset, (boundary > 0)); timer.stop_timer(); // Run on GPU PixelData mGpu(m, true); - timer.start_timer("GPU mean X-DIR"); - calcMean(mGpu, offset, MEAN_X_DIR, (boundary > 0)); + timer.start_timer("GPU mean Z-DIR"); + calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0)); timer.stop_timer(); - // Compare results EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } } } + // ------------------------------------------------------------------------ + // Below tests are not yet fixed. 
+ // ------------------------------------------------------------------------ + + // TEST(LocalIntensityScaleCudaTest, REMOVE_ME_AFTER_DEVELOPMENT) { // int y_num = 1; // int x_num = 5; From 5ad9865239a3d924feb88d0b0cf7fafab88a15fa Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 17 Feb 2023 15:49:26 +0100 Subject: [PATCH 12/59] Updated compareMeshes to show maximum error found --- test/TestTools.hpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/test/TestTools.hpp b/test/TestTools.hpp index d0211f6f..4ec15afe 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -46,7 +46,7 @@ inline bool compare(PixelData &mesh, const float *data, const float epsilon) } template -inline bool initFromZYXarray(PixelData &mesh, const float *data) { +inline bool initFromZYXarray(PixelData &mesh, T *data) { size_t dataIdx = 0; for (int z = 0; z < mesh.z_num; ++z) { for (int y = 0; y < mesh.y_num; ++y) { @@ -76,18 +76,33 @@ inline int compareMeshes(const PixelData &expected, const PixelData &teste int cnt = 0; double maxErrorFound = 0; + T maxErrorExpectedValue = 0; + T maxErrorTestedValue = 0; + std::string maxErrorIdx = ""; for (size_t i = 0; i < expected.mesh.size(); ++i) { auto diff = std::abs(expected.mesh[i] - tested.mesh[i]); if (diff > maxError) { if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] << " error = " << (float)expected.mesh[i] - (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; + std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested mesh: " + << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] + << " error = " << (float)expected.mesh[i] - (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; } cnt++; } - if (diff > maxErrorFound) maxErrorFound = diff; + if (diff > maxErrorFound) 
{ + maxErrorFound = diff; + maxErrorExpectedValue = expected.mesh[i]; + maxErrorTestedValue = tested.mesh[i]; + maxErrorIdx = tested.getStrIndex(i); + } + } + if (cnt != 0) { + std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() + << ", maxErrorFound = " << maxErrorFound << " at IDX: " << maxErrorIdx << " " + << maxErrorExpectedValue << " vs " << maxErrorTestedValue + << "(" << (100*(long double)maxErrorFound/(long double)maxErrorExpectedValue) << "%)"<& input, const size #ifdef HAVE_OPENMP #pragma omp parallel for default(shared) firstprivate(circularBuffer, sum) #endif - for(size_t j = 0; j < z_num; j++) { + for (size_t j = 0; j < z_num; j++) { size_t jxnumynum = j * x_num * y_num; size_t count = 0; // counts number of active elements in filter @@ -395,8 +395,12 @@ inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size size_t nextElementOffset = 1; size_t saveElementOffset = 0; // offset used to finish RHS boundary + // Clear buffers so they can be reused in next 'z_num' loop + std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop + std::fill(circularBuffer.begin(), circularBuffer.end(), 0); + // saturate circular buffer with #offset elements since it will allow to calculate first element value on LHS - while(count <= offset) { + while (count <= offset) { for (size_t k = 0; k < y_num; ++k) { auto v = mesh[jxnumynum + currElementOffset * y_num + k]; sum[k] += v; @@ -408,42 +412,45 @@ inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size ++count; } - currElementOffset -= nextElementOffset; - --count; - if (boundaryReflect) { - count = divisor; + count += offset; // elements in above loop in range [1, offset] were summed twice } // Pointer in circular buffer - int beginPtr = offset; + int beginPtr = (offset + 1) % divisor; + + // main loop going through all elements in range [0, x_num - 1 - offset], so till last element that + // does not need handling RHS for 
offset '^' + // x x x x ... x x x x x x x + // o o ^ o o + // + const size_t lastElement = x_num - 1 - offset; + for (size_t x = 0; x <= lastElement; ++x) { + // Calculate and save currently processed element and move to the new one + for (size_t k = 0; k < y_num; ++k) { + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; + } + saveElementOffset += nextElementOffset; + + // There is no more elements to process in that loop, all stuff left to be processed is already in 'circularBuffer' buffer + if (x == lastElement) break; - // main loop going through all elements in range [0, x_num-offset) - for (size_t x = 0; x < dimLen - offset; ++x) { for (size_t k = 0; k < y_num; ++k) { // Read new element T v = mesh[jxnumynum + currElementOffset * y_num + k]; // Update sum to cover [-offset, offset] of currently processed element + sum[k] -= circularBuffer[beginPtr * y_num + k]; sum[k] += v; - if (count >= divisor || x == 0) sum[k] -= circularBuffer[beginPtr * y_num + k]; - // Save new element + // Store new element in circularBuffer circularBuffer[beginPtr * y_num + k] = v; } - // move pointer in circular buffer and number of active elements hold there - beginPtr = (beginPtr + 1) % divisor; + // Move to next elements to read and in circular buffer count = std::min(count + 1, divisor); - - for (size_t k = 0; k < y_num; ++k) { - // save currently processed element - mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; - } - - // Move to next elements + beginPtr = (beginPtr + 1) % divisor; currElementOffset += nextElementOffset; - saveElementOffset += nextElementOffset; } // boundaryPtr is used only in boundaryReflect mode, adding (2*offset+1) makes it always non-negative value @@ -478,8 +485,6 @@ inline void LocalIntensityScale::calc_sat_mean_x(PixelData& input, const size beginPtr = (beginPtr + 1) % divisor; saveElementOffset += nextElementOffset; } - - std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop } } diff --git 
a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index a5c52b63..d66810c9 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -80,7 +80,7 @@ namespace { // boundary = 0 there is no reflected boundary // boudnary = 1 there is boundary reflect for (int offset = 0; offset <= 6; ++offset) { - //std::cout << "OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; +// std::cout << "------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; PixelData mCpu; mCpu.init(m); @@ -96,7 +96,94 @@ namespace { timer.stop_timer(); // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); // Expect exactly same results + } + } + } + + /** + * Generate input and expected output using easy brute force approach. + * When comparing vs CPU or GPU outputs there is small error expected since little difference in order of float + * operations. + * @tparam T - type of generated data + * @param len - length + * @param offset - offset for which expected output should be calculated + * @param boundary - use boundary? + * @param useRandomNumbers - use random numbers or if false then index numbers in buffers [1..len] + * @return tuple of [input, expectedOutput] + */ + template + auto generateInputAndExpected(int len, int offset, bool boundary, bool useRandomNumbers) { + std::vector input(len); + std::vector expected(len); + + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_real_distribution dist(0.0, 10.0); + + // Feel input and calculate expected data + for (int i = 0; i < len; ++i) input[i] = useRandomNumbers ? 
dist(mt) : i + 1; + + for (int i = 0; i < len; ++i) { + int count = 0; + T sum = 0; + for (int x = i - offset; x <= i + offset; ++x) { + int currIdx = x; + if (boundary) { + currIdx = abs(x); + if (currIdx > len - 1) currIdx = (len - 1) - (currIdx - (len - 1)); + } + + if (currIdx < 0 || currIdx >= len) continue; + + sum += input[currIdx]; + count++; + } + expected[i] = sum / count; + } + return std::make_tuple(input, expected); + } + + TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_RANDOM_VALUES_X_DIR) { + // Input params + using T = uint16_t; + + for (int b = 0; b <= 1; b++) { + for (int len = 5; len <= 45; len += 20) { + for (int offset = 0; offset <= 6 && offset < len; offset++) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = b > 0; + bool useRandomNumbers = r > 0; +// std::cout << "========================> len=" << len << " offset=" << offset << " hasBoundary=" << hasBoundary << " useRandomNumbers=" << useRandomNumbers << std::endl; + + auto t = generateInputAndExpected(len, offset, hasBoundary, useRandomNumbers); + auto input = std::get<0>(t); + auto expected = std::get<1>(t); + PixelData m(1, len, 1, 0); + initFromZYXarray(m, input.data()); + PixelData expectedMesh(1, len, 1, 0); + initFromZYXarray(expectedMesh, expected.data()); + + APRTimer timer(false); + LocalIntensityScale lis; + + // Run on CPU old-impl + timer.start_timer("CPU X-DIR"); + PixelData mCpu(m, true); + lis.calc_sat_mean_x(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU X-DIR"); + calcMean(mGpu, offset, MEAN_X_DIR, (hasBoundary > 0)); + timer.stop_timer(); + + EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match"; + EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match"; + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; + } + } } } } @@ -254,213 +341,213 @@ namespace { // } 
// ------------------------------------------------------------------------ - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Y_DIR) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(22, 33, 22, 100, 3); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - - std::cout << " ============================== " << offset << std::endl; - - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean Y-DIR"); - lis.calc_sat_mean_y(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean Y-DIR"); - calcMean(mGpu, offset, MEAN_Y_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } - - - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Z_DIR) { - APRTimer timer(true); - using ImgType = float; - PixelData m = getRandInitializedMesh(22, 33, 22, 255); - - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - - std::cout << " ============================== " << offset << std::endl; - - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean Z-DIR"); - lis.calc_sat_mean_z(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean Z-DIR"); - calcMean(mGpu, offset, MEAN_Z_DIR); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); - } - } - - - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Y_DIR) { - APRTimer timer(true); - PixelData m(4, 4, 1, 0); - float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; - initFromZYXarray(m, dataIn); - - LocalIntensityScale lis; - - for (int boundary = 1; boundary <= 1; ++ boundary) { - // boundary = 0 there is no reflected boundary - // boudnary = 1 there is boundary reflect - std::cout << "\n\n"; - for (int offset = 1; offset < 2; ++offset) { - // Run on CPU - PixelData mCpuPadded; - paddPixels(m, mCpuPadded, offset * boundary, offset * 
boundary, 0); - timer.start_timer("CPU mean Y-DIR"); - lis.calc_sat_mean_y(mCpuPadded, offset); - PixelData mCpu; - unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean Y-DIR"); - calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0)); +// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Y_DIR) { +// APRTimer timer(true); +// PixelData m = getRandInitializedMesh(22, 33, 22, 100, 3); +// +// LocalIntensityScale lis; +// for (int offset = 0; offset < 6; ++offset) { +// +// std::cout << " ============================== " << offset << std::endl; +// +// // Run on CPU +// PixelData mCpu(m, true); +// timer.start_timer("CPU mean Y-DIR"); +// lis.calc_sat_mean_y(mCpu, offset); +// timer.stop_timer(); +// +// // Run on GPU +// PixelData mGpu(m, true); +// timer.start_timer("GPU mean Y-DIR"); +// calcMean(mGpu, offset, MEAN_Y_DIR); +// timer.stop_timer(); +// +// // Compare results +// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); +// } +// } - timer.stop_timer(); - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01, 4), 0); - } - } - } +// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Z_DIR) { +// APRTimer timer(true); +// using ImgType = float; +// PixelData m = getRandInitializedMesh(22, 33, 22, 255); +// +// LocalIntensityScale lis; +// for (int offset = 0; offset < 6; ++offset) { +// +// std::cout << " ============================== " << offset << std::endl; +// +// // Run on CPU +// PixelData mCpu(m, true); +// timer.start_timer("CPU mean Z-DIR"); +// lis.calc_sat_mean_z(mCpu, offset); +// timer.stop_timer(); +// +// // Run on GPU +// PixelData mGpu(m, true); +// timer.start_timer("GPU mean Z-DIR"); +// calcMean(mGpu, offset, MEAN_Z_DIR); +// timer.stop_timer(); +// +// // Compare results +// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); +// } +// } - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Z_DIR) { - APRTimer timer(true); -// 
PixelData m(1, 1, 13, 0); -// float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; +// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Y_DIR) { +// APRTimer timer(true); +// PixelData m(4, 4, 1, 0); +// float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; // initFromZYXarray(m, dataIn); - PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); - +// +// LocalIntensityScale lis; +// +// for (int boundary = 1; boundary <= 1; ++ boundary) { +// // boundary = 0 there is no reflected boundary +// // boudnary = 1 there is boundary reflect +// std::cout << "\n\n"; +// for (int offset = 1; offset < 2; ++offset) { +// // Run on CPU +// PixelData mCpuPadded; +// paddPixels(m, mCpuPadded, offset * boundary, offset * boundary, 0); +// timer.start_timer("CPU mean Y-DIR"); +// lis.calc_sat_mean_y(mCpuPadded, offset); +// PixelData mCpu; +// unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); +// timer.stop_timer(); +// +// // Run on GPU +// PixelData mGpu(m, true); +// timer.start_timer("GPU mean Y-DIR"); +// calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0)); +// +// timer.stop_timer(); +// +// // Compare results +// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01, 4), 0); +// } +// } +// } - LocalIntensityScale lis; - for (int boundary = 0; boundary <= 1; ++ boundary) { - // boundary = 0 there is no reflected boundary - // boudnary = 1 there is boundary reflect - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpuPadded; - paddPixels(m, mCpuPadded, 0, 0, offset * boundary); - timer.start_timer("CPU mean Z-DIR"); - lis.calc_sat_mean_z(mCpuPadded, offset); - PixelData mCpu; - unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); - timer.stop_timer(); - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean Z-DIR"); - calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0)); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); - } - } - } +// 
TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Z_DIR) { +// APRTimer timer(true); +//// PixelData m(1, 1, 13, 0); +//// float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; +//// initFromZYXarray(m, dataIn); +// PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); +// +// +// LocalIntensityScale lis; +// +// for (int boundary = 0; boundary <= 1; ++ boundary) { +// // boundary = 0 there is no reflected boundary +// // boudnary = 1 there is boundary reflect +// for (int offset = 0; offset < 6; ++offset) { +// // Run on CPU +// PixelData mCpuPadded; +// paddPixels(m, mCpuPadded, 0, 0, offset * boundary); +// timer.start_timer("CPU mean Z-DIR"); +// lis.calc_sat_mean_z(mCpuPadded, offset); +// PixelData mCpu; +// unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); +// timer.stop_timer(); +// +// // Run on GPU +// PixelData mGpu(m, true); +// timer.start_timer("GPU mean Z-DIR"); +// calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0)); +// timer.stop_timer(); +// +// // Compare results +// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); +// } +// } +// } // !!!!!!!!!!!!!!!!!!!!!!! NOT YET CHECKED !!!!!!!!!!!!!!!!!!!!!!!!!!!!! // TODO: See what these tests are doing and fix/change/remove them! 
- TEST(LocalIntensityScaleCudaTest, 1D_Y_DIR) { - { // OFFSET=0 - - PixelData m(8, 1, 1, 0); - float dataIn[] = {3,6,9,12,15,18,21,24}; - float expect[] = {3,6,9,12,15,18,21,24}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 0, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - } - { // OFFSET=1 - - PixelData m(8, 1, 1, 0); - float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8}; - float expect[] = {1.5, 2, 3, 4, 5, 6, 7, 7.5}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 1, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - } - { // OFFSET=2 (+symmetricity check) - - PixelData m(8, 1, 1, 0); - float dataIn[] = {3,6,9,12,15,18,21,24}; - float expect[] = {6, 7.5, 9, 12, 15, 18, 19.5, 21}; - - initFromZYXarray(m, dataIn); - - calcMean(m, 2, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect, 0.05)); - - // check if data in opposite order gives same result - float dataIn2[] = {24,21,18,15,12,9,6,3}; - float expect2[] = {21, 19.5, 18, 15,12, 9, 7.5, 6}; - - initFromZYXarray(m, dataIn2); - - calcMean(m, 2, MEAN_Y_DIR); - - ASSERT_TRUE(compare(m, expect2, 0.05)); - } - } - +// TEST(LocalIntensityScaleCudaTest, 1D_Y_DIR) { +// { // OFFSET=0 +// +// PixelData m(8, 1, 1, 0); +// float dataIn[] = {3,6,9,12,15,18,21,24}; +// float expect[] = {3,6,9,12,15,18,21,24}; +// +// initFromZYXarray(m, dataIn); +// +// calcMean(m, 0, MEAN_Y_DIR); +// +// ASSERT_TRUE(compare(m, expect, 0.05)); +// } +// { // OFFSET=1 +// +// PixelData m(8, 1, 1, 0); +// float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8}; +// float expect[] = {1.5, 2, 3, 4, 5, 6, 7, 7.5}; +// +// initFromZYXarray(m, dataIn); +// +// calcMean(m, 1, MEAN_Y_DIR); +// +// ASSERT_TRUE(compare(m, expect, 0.05)); +// } +// { // OFFSET=2 (+symmetricity check) +// +// PixelData m(8, 1, 1, 0); +// float dataIn[] = {3,6,9,12,15,18,21,24}; +// float expect[] = {6, 7.5, 9, 12, 15, 18, 19.5, 21}; +// +// initFromZYXarray(m, dataIn); +// +// calcMean(m, 2, MEAN_Y_DIR); +// +// ASSERT_TRUE(compare(m, expect, 0.05)); +// +// // check if data 
in opposite order gives same result +// float dataIn2[] = {24,21,18,15,12,9,6,3}; +// float expect2[] = {21, 19.5, 18, 15,12, 9, 7.5, 6}; +// +// initFromZYXarray(m, dataIn2); +// +// calcMean(m, 2, MEAN_Y_DIR); +// +// ASSERT_TRUE(compare(m, expect2, 0.05)); +// } +// } - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(33, 31, 13); - LocalIntensityScale lis; - for (int offset = 0; offset < 6; ++offset) { - // Run on CPU - PixelData mCpu(m, true); - timer.start_timer("CPU mean ALL-DIR"); - lis.calc_sat_mean_y(mCpu, offset); - lis.calc_sat_mean_x(mCpu, offset); - lis.calc_sat_mean_z(mCpu, offset); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - timer.start_timer("GPU mean ALL-DIR"); - calcMean(mGpu, offset); - timer.stop_timer(); - - // Compare results - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); - } - } +// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) { +// APRTimer timer(true); +// PixelData m = getRandInitializedMesh(33, 31, 13); +// +// LocalIntensityScale lis; +// for (int offset = 0; offset < 6; ++offset) { +// // Run on CPU +// PixelData mCpu(m, true); +// timer.start_timer("CPU mean ALL-DIR"); +// lis.calc_sat_mean_y(mCpu, offset); +// lis.calc_sat_mean_x(mCpu, offset); +// lis.calc_sat_mean_z(mCpu, offset); +// timer.stop_timer(); +// +// // Run on GPU +// PixelData mGpu(m, true); +// timer.start_timer("GPU mean ALL-DIR"); +// calcMean(mGpu, offset); +// timer.stop_timer(); +// +// // Compare results +// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); +// } +// } //@KG: The CPU code doesn't work for uint16 --> overflow will likely result. 
@@ -489,36 +576,36 @@ namespace { // } // } - TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { - APRTimer timer(true); - PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); - - APRParameters params; - params.sigma_th = 1; - params.sigma_th_max = 2; - params.reflect_bc_lis = false; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. - - // Run on CPU - PixelData mCpu(m, true); - PixelData mCpuTemp(m, false); - timer.start_timer("CPU LIS FULL"); - - LocalIntensityScale localIntensityScale; - - localIntensityScale.get_local_intensity_scale(mCpu, mCpuTemp, params); - timer.stop_timer(); - - // Run on GPU - PixelData mGpu(m, true); - PixelData mGpuTemp(m, false); - timer.start_timer("GPU LIS ALL-DIR"); - getLocalIntensityScale(mGpu, mGpuTemp, params); - timer.stop_timer(); - - // Compare results - //EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0.01), 0); //this is not needed these values are not required. - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); - } +// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { +// APRTimer timer(true); +// PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); +// +// APRParameters params; +// params.sigma_th = 1; +// params.sigma_th_max = 2; +// params.reflect_bc_lis = false; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. +// +// // Run on CPU +// PixelData mCpu(m, true); +// PixelData mCpuTemp(m, false); +// timer.start_timer("CPU LIS FULL"); +// +// LocalIntensityScale localIntensityScale; +// +// localIntensityScale.get_local_intensity_scale(mCpu, mCpuTemp, params); +// timer.stop_timer(); +// +// // Run on GPU +// PixelData mGpu(m, true); +// PixelData mGpuTemp(m, false); +// timer.start_timer("GPU LIS ALL-DIR"); +// getLocalIntensityScale(mGpu, mGpuTemp, params); +// timer.stop_timer(); +// +// // Compare results +// //EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0.01), 0); //this is not needed these values are not required. 
+// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); +// } #endif // APR_USE_CUDA From 521d8264634085b5f87d70da7bba86cae5693e18 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 24 Feb 2023 13:42:28 +0100 Subject: [PATCH 14/59] LIS in Z-dir redesigned so code is clearer and faster. Also new test added. --- src/algorithm/LocalIntensityScale.cu | 35 ++++++++++------- src/algorithm/LocalIntensityScale.hpp | 45 +++++++++++---------- test/LocalIntensityScaleCudaTest.cpp | 56 +++++++++++++++++++++++++-- 3 files changed, 99 insertions(+), 37 deletions(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 11de9de3..06e2996a 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -249,35 +249,42 @@ __global__ void meanZdir(T *image, int offset, size_t x_num, size_t y_num, size_ currElementOffset += nextElementOffset; ++count; } - currElementOffset -= nextElementOffset; - --count; + if (boundaryReflect) { - count = divisor; + count += offset; // elements in above loop in range [1, offset] were summed twice } // Pointer in circular buffer - int beginPtr = offset; + int beginPtr = (offset + 1) % divisor; + + // main loop going through all elements in range [0, z_num - 1 - offset], so till last element that + // does not need handling RHS for offset '^' + // x x x x ... 
x x x x x x x + // o o ^ o o + // + const int lastElement = z_num - 1 - offset; + for (int z = 0; z <= lastElement; ++z) { + // Calculate and save currently processed element and move to the new one + image[workerOffset + saveElementOffset] = sum / count; + saveElementOffset += nextElementOffset; + + // There is no more elements to process in that loop, all stuff left to be processed is already in 'data' buffer + if (z == lastElement) break; - // main loop going through all elements in range [0, z_num-offset) - for (int z = 0; z < z_num - offset; ++z) { // Read new element T v = image[workerOffset + currElementOffset]; // Update sum to cover [-offset, offset] of currently processed element - sum += v; sum -= data[beginPtr][workerIdx]; + sum += v; - // Save and move pointer + // Store new element in circularBuffer data[beginPtr][workerIdx] = v; - beginPtr = (beginPtr + 1) % divisor; - // Update count and save currently processed element + // Move to next elements to read and in circular buffer count = min(count + 1, divisor); - image[workerOffset + saveElementOffset] = sum / count; - - // Move to next elements + beginPtr = (beginPtr + 1) % divisor; currElementOffset += nextElementOffset; - saveElementOffset += nextElementOffset; } // Handle last #offset elements on RHS diff --git a/src/algorithm/LocalIntensityScale.hpp b/src/algorithm/LocalIntensityScale.hpp index 28376e84..30cc4be6 100644 --- a/src/algorithm/LocalIntensityScale.hpp +++ b/src/algorithm/LocalIntensityScale.hpp @@ -517,6 +517,10 @@ inline void LocalIntensityScale::calc_sat_mean_z(PixelData& input, const size size_t nextElementOffset = x_num; size_t saveElementOffset = 0; // offset used to finish RHS boundary + // Clear buffers so they can be reused in next 'x_num' loop + std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop + std::fill(circularBuffer.begin(), circularBuffer.end(), 0); + // saturate circular buffer with #offset elements since it will allow to calculate first 
element value on LHS while(count <= offset) { for (size_t k = 0; k < y_num; ++k) { @@ -530,42 +534,45 @@ inline void LocalIntensityScale::calc_sat_mean_z(PixelData& input, const size ++count; } - currElementOffset -= nextElementOffset; - --count; - if (boundaryReflect) { - count = divisor; + count += offset; // elements in above loop in range [1, offset] were summed twice } // Pointer in circular buffer - int beginPtr = offset; + int beginPtr = (offset + 1) % divisor; + + // main loop going through all elements in range [0, z_num - 1 - offset], so till last element that + // does not need handling RHS for offset '^' + // x x x x ... x x x x x x x + // o o ^ o o + // + const size_t lastElement = z_num - 1 - offset; + for (size_t z = 0; z <= lastElement; ++z) { + // Calculate and save currently processed element and move to the new one + for (size_t k = 0; k < y_num; ++k) { + mesh[jxnumynum + saveElementOffset * y_num + k] = sum[k] / count; + } + saveElementOffset += nextElementOffset; + + // There is no more elements to process in that loop, all stuff left to be processed is already in 'circularBuffer' buffer + if (z == lastElement) break; - // main loop going through all elements in range [0, x_num-offset) - for (size_t z = 0; z < dimLen - offset; ++z) { for (size_t k = 0; k < y_num; ++k) { // Read new element T v = mesh[jxnumynum + currElementOffset * y_num + k]; // Update sum to cover [-offset, offset] of currently processed element + sum[k] -= circularBuffer[beginPtr * y_num + k]; sum[k] += v; - if (count >= divisor || z == 0) sum[k] -= circularBuffer[beginPtr * y_num + k]; // Save new element circularBuffer[beginPtr * y_num + k] = v; } - // move pointer in circular buffer and number of active elements hold there - beginPtr = (beginPtr + 1) % divisor; + // Move to next elements to read and in circular buffer count = std::min(count + 1, divisor); - - for (size_t k = 0; k < y_num; ++k) { - // save currently processed element - mesh[jxnumynum + saveElementOffset * 
y_num + k] = sum[k] / count; - } - - // Move to next elements + beginPtr = (beginPtr + 1) % divisor; currElementOffset += nextElementOffset; - saveElementOffset += nextElementOffset; } // boundaryPtr is used only in boundaryReflect mode, adding (2*offset+1) makes it always non-negative value @@ -600,8 +607,6 @@ inline void LocalIntensityScale::calc_sat_mean_z(PixelData& input, const size beginPtr = (beginPtr + 1) % divisor; saveElementOffset += nextElementOffset; } - - std::fill(sum.begin(), sum.end(), 0); // Clear 'sum; vector before next loop } } diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index d66810c9..d15f6561 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -144,9 +144,9 @@ namespace { return std::make_tuple(input, expected); } - TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_RANDOM_VALUES_X_DIR) { + TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_PRECOMPUTED_VALUES_X_DIR) { // Input params - using T = uint16_t; + using T = float; for (int b = 0; b <= 1; b++) { for (int len = 5; len <= 45; len += 20) { @@ -179,6 +179,8 @@ namespace { calcMean(mGpu, offset, MEAN_X_DIR, (hasBoundary > 0)); timer.stop_timer(); + // expectedMesh because of different order of calculation will have small floating-point differences + // comparing to CPU or GPU fast implementation, anyway GPU and CPU should have exactly same values! 
EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match"; EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match"; EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; @@ -249,7 +251,7 @@ namespace { TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_Z_DIR_RANDOM_VALUES) { APRTimer timer(false); - constexpr PixelDataDim const dim{49,51,53}; + constexpr PixelDataDim const dim{49, 51, 53}; PixelData m = getRandInitializedMesh(dim, 50, 10); LocalIntensityScale lis; @@ -279,6 +281,54 @@ namespace { } } + TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_PRECOMPUTED_VALUES_Z_DIR) { + // Input params + using T = float; + + for (int b = 0; b <= 1; b++) { + for (int len = 5; len <= 45; len += 20) { + for (int offset = 0; offset <= 6 && offset < len; offset++) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = b > 0; + bool useRandomNumbers = r > 0; +// std::cout << "========================> len=" << len << " offset=" << offset << " hasBoundary=" << hasBoundary << " useRandomNumbers=" << useRandomNumbers << std::endl; + + auto t = generateInputAndExpected(len, offset, hasBoundary, useRandomNumbers); + auto input = std::get<0>(t); + auto expected = std::get<1>(t); + PixelData m(1, 1, len, 0); + initFromZYXarray(m, input.data()); + PixelData expectedMesh(1, 1, len, 0); + initFromZYXarray(expectedMesh, expected.data()); + + APRTimer timer(false); + LocalIntensityScale lis; + + // Run on CPU old-impl + timer.start_timer("CPU Z-DIR"); + PixelData mCpu(m, true); + lis.calc_sat_mean_z(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU Z-DIR"); + calcMean(mGpu, offset, MEAN_Z_DIR, (hasBoundary > 0)); + timer.stop_timer(); + + // expectedMesh because of different order of calculation will have small floating-point differences + // comparing to CPU or GPU fast 
implementation, anyway GPU and CPU should have exactly same values! + EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) + << "---!!!!!!--- GPU values does not match"; + EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) + << "---!!!!!!--- CPU values does not match"; + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; + } + } + } + } + } + // ------------------------------------------------------------------------ // Below tests are not yet fixed. // ------------------------------------------------------------------------ From b297adf6b982aedbba44cc54859154e78b1f970c Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 13 Mar 2023 15:30:34 +0100 Subject: [PATCH 15/59] Local Intensity Scale (LIS) now works in Y-dir as expected. GPU and CPU gives same results. --- src/algorithm/LocalIntensityScale.cu | 158 ++++++--- src/algorithm/LocalIntensityScale.hpp | 121 ++++--- test/LocalIntensityScaleCudaTest.cpp | 486 +++++++++++--------------- 3 files changed, 375 insertions(+), 390 deletions(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 06e2996a..057e4de2 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -14,21 +14,14 @@ /** + * Calculates mean in Y direction * - * How it works along y-dir (let's suppose offset = 2 and number of workers = 8 for simplicity): - * - * image idx: 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 - * - * loop #1 - * workersIdx 0 1 2 3 4 5 6 7 - * loop #2 - * workersIdx 6 7 0 1 2 3 4 5 - * loop #3 - * workersIdx 4 5 6 7 0 1 2 3 - * .............. - * - * so #offset workers must wait in each loop to have next elements to sum - * + * NOTE: This is not optimal implementation but.. correct and more or less fast as previous one. + * The reason for change was to have results exactly same as in CPU side. 
+ * Currently after reading whole y-dir line of data mean calculation is done only by one from all threads in block + * so here is some room for improvements. + * If needed may be optimized in future. The main limitation is size of shared memory needed which + * limits number of CUDA blocks that can run in parallel. * @tparam T * @param image * @param offset @@ -41,59 +34,113 @@ __global__ void meanYdir(T *image, int offset, size_t x_num, size_t y_num, size_ // NOTE: Block size in x/z direction must be 1 const size_t workersOffset = (blockIdx.z * x_num + blockIdx.x) * y_num; const int numOfWorkers = blockDim.y; - const unsigned int active = __activemask(); const int workerIdx = threadIdx.y; + + extern __shared__ char sharedMemChar[]; + T *buffer = (T*) sharedMemChar; + T *data = (T*) &buffer[y_num]; + + // Read whole line of data from y-direction int workerOffset = workerIdx; + while (workerOffset < y_num) { + buffer[workerOffset] = image[workersOffset + workerOffset]; + workerOffset += numOfWorkers; + } + + const int divisor = 2 * offset + 1; + size_t currElementOffset = 0; + size_t saveElementOffset = 0; + size_t nextElementOffset = 1; + + if (workerIdx == 0) { + // clear shared mem + for (int i = offset; i < divisor; ++i) data[i] = 0; + + // saturate cache with #offset elements since it will allow to calculate first element value on LHS + float sum = 0; + int count = 0; + while (count <= offset) { + T v = buffer[currElementOffset]; + sum += v; + data[count] = v; + if (boundaryReflect && count > 0) { + data[2 * offset - count + 1] = v; + sum += v; + } + currElementOffset += nextElementOffset; + ++count; + } - int offsetInTheLoop = 0; - T sum = 0; - T v = 0; - bool waitForNextLoop = false; - int countNumOfSumElements = 1; - while(workerOffset < y_num) { - if (!waitForNextLoop) v = image[workersOffset + workerOffset]; - bool waitForNextValues = (workerIdx + offsetInTheLoop) % numOfWorkers >= (numOfWorkers - offset); - - // Check if current value is one of the 
mirrored elements (boundary condition) - int numberOfMirrorLeft = offset - workerOffset; - int numberOfMirrorRight = workerOffset + offset - (y_num - 1); if (boundaryReflect) { - if (numberOfMirrorLeft > 0 && workerOffset >= 1 && workerOffset <= numberOfMirrorLeft) {sum += v; ++countNumOfSumElements;} - if (numberOfMirrorRight > 0 && workerOffset < (y_num - 1) && workerOffset >= (y_num - 1 - numberOfMirrorRight)) {sum += v; ++countNumOfSumElements;} + count += offset; // elements in above loop in range [1, offset] were summed twice } - for (int off = 1; off <= offset; ++off) { - T prevElement = __shfl_sync(active, v, workerIdx + blockDim.y - off, blockDim.y); - T nextElement = __shfl_sync(active, v, workerIdx + off, blockDim.y); - // LHS boundary check + don't add previous values if they were added in a previous loop execution - if (workerOffset >= off && !waitForNextLoop) {sum += prevElement; ++countNumOfSumElements;} - // RHS boundary check + don't read next values since they are not read yet - if (!waitForNextValues && (workerOffset + off) < y_num) {sum += nextElement; ++countNumOfSumElements;} + // Pointer in circular buffer + int beginPtr = (offset + 1) % divisor; + + // main loop going through all elements in range [0, y_num - 1 - offset], so till last element that + // does not need handling RHS for offset '^' + // x x x x ... 
x x x x x x x + // o o ^ o o + // + const int lastElement = y_num - 1 - offset; + for (int y = 0; y <= lastElement; ++y) { + // Calculate and save currently processed element and move to the new one + buffer[saveElementOffset] = sum / count; + saveElementOffset += nextElementOffset; + + // There is no more elements to process in that loop, all stuff left to be processed is already in 'data' buffer + if (y == lastElement) break; + + // Read new element + T v = buffer[currElementOffset]; + + // Update sum to cover [-offset, offset] of currently processed element + sum -= data[beginPtr]; + sum += v; + + // Store new element in circularBuffer + data[beginPtr] = v; + + // Move to next elements to read and in circular buffer + count = min(count + 1, divisor); + beginPtr = (beginPtr + 1) % divisor; + currElementOffset += nextElementOffset; + } + + // Handle last #offset elements on RHS + int boundaryPtr = (beginPtr - 1 - 1 + (2 * offset + 1)) % divisor; + + while (saveElementOffset < currElementOffset) { + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' and do not remove first element from moving filter + // since 'sum' of filter elements contains all elements from processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter) + // In such a case first 'o' element should not be removed when filter moves right. 
+ if (y_num - (currElementOffset - saveElementOffset) / nextElementOffset > offset || boundaryReflect) { + if (!boundaryReflect) count = count - 1; + sum -= data[beginPtr]; + } - // boundary condition (mirroring) if (boundaryReflect) { - int element = workerOffset + off; - if (numberOfMirrorLeft > 0 && element >= 1 && element <= numberOfMirrorLeft) {sum += nextElement; ++countNumOfSumElements;} - if (numberOfMirrorRight > 0 && element < (y_num - 1) && element >= (y_num - 1 - numberOfMirrorRight)) {sum += nextElement; ++countNumOfSumElements;} - element = workerOffset - off; - if (numberOfMirrorLeft > 0 && element >= 1 && element <= numberOfMirrorLeft) {sum += prevElement; ++countNumOfSumElements;} - if (numberOfMirrorRight > 0 && element < (y_num - 1) && element >= (y_num - 1 - numberOfMirrorRight)) {sum += prevElement; ++countNumOfSumElements;} + sum += data[boundaryPtr]; + boundaryPtr = (boundaryPtr - 1 + (2 * offset + 1)) % divisor; } - } - waitForNextLoop = waitForNextValues; - if (!waitForNextLoop) { - sum += v; - image[workersOffset + workerOffset] = sum / countNumOfSumElements; - // worker is done with current element - move to next one - sum = 0; - countNumOfSumElements = 1; - workerOffset += numOfWorkers; + buffer[saveElementOffset] = sum / count; + beginPtr = (beginPtr + 1) % divisor; + saveElementOffset += nextElementOffset; } - offsetInTheLoop += offset; } -} + // Save whole line of data + workerOffset = workerIdx; + while (workerOffset < y_num) { + image[workersOffset + workerOffset] = buffer[workerOffset]; + workerOffset += numOfWorkers; + } +} constexpr int NumberOfWorkers = 32; // Cannot be greater than 32 since there is no inter-warp communication implemented. 
/** @@ -320,7 +367,8 @@ void runMeanYdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_ dim3 numBlocks((x_num + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (z_num + threadsPerBlock.z - 1)/threadsPerBlock.z); - meanYdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); + const int sharedMemorySize = sizeof(T) * y_num + (offset * 2 + 1) * sizeof(float); + meanYdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); } template diff --git a/src/algorithm/LocalIntensityScale.hpp b/src/algorithm/LocalIntensityScale.hpp index 30cc4be6..3f7fffef 100644 --- a/src/algorithm/LocalIntensityScale.hpp +++ b/src/algorithm/LocalIntensityScale.hpp @@ -159,7 +159,7 @@ void get_local_intensity_scale(PixelData &local_scale_temp, PixelData &input, const size_t offset, bool boundaryReflect = false); template - void calc_sat_mean_y(PixelData &input, const size_t offset); + void calc_sat_mean_y(PixelData &input, const size_t offset, bool boundaryReflect = false); void get_window(float &var_rescale, std::vector &var_win, const APRParameters &par); @@ -302,66 +302,91 @@ inline void LocalIntensityScale::get_window_alt(float& var_rescale, std::vector< } } -/** - * Calculates a O(1) recursive mean using SAT. 
- * @tparam T - * @param input - * @param offset - */ template -inline void LocalIntensityScale::calc_sat_mean_y(PixelData& input, const size_t offset){ +inline void LocalIntensityScale::calc_sat_mean_y(PixelData& input, const size_t offset, bool boundaryReflect) { const size_t z_num = input.z_num; const size_t x_num = input.x_num; const size_t y_num = input.y_num; - std::vector temp_vec(y_num); - float divisor = 2 * offset + 1; + const size_t divisor = offset + 1 + offset; + + auto &mesh = input.mesh; + const size_t dimLen = y_num; #ifdef HAVE_OPENMP - #pragma omp parallel for default(shared) firstprivate(temp_vec) + #pragma omp parallel for default(shared) #endif - for(size_t j = 0; j < z_num; ++j) { - for(size_t i = 0; i < x_num; ++i){ - size_t index = j * x_num*y_num + i * y_num; - - //first pass over and calculate cumsum - float temp = 0; - for (size_t k = 0; k < y_num; ++k) { - temp += input.mesh[index + k]; - temp_vec[k] = temp; + for (size_t j = 0; j < z_num; ++j) { + for (size_t i = 0; i < x_num; ++i) { + size_t index = j * x_num * y_num + i * y_num; + + size_t count = 0; + size_t currElementOffset = 0; + size_t nextElementOffset = 1; + size_t saveElementOffset = 0; + + std::vector circularBuffer(divisor, 0); + T sum = 0; + + while (count <= offset) { + auto v = mesh[index + currElementOffset]; + sum += v; + circularBuffer[count] = v; + if (boundaryReflect && count > 0) { circularBuffer[2 * offset - count + 1] = v; sum += v;} + + currElementOffset += nextElementOffset; + count++; } - //handling boundary conditions (LHS) - for (size_t k = 0; k <= offset; ++k) { - input.mesh[index + k] = 0; - } + if (boundaryReflect) count += offset; - //second pass calculate mean - for (size_t k = offset + 1; k < y_num; ++k) { - input.mesh[index + k] = -temp_vec[k - offset - 1]/divisor; - } + int beginPtr = (offset + 1) % divisor; - //second pass calculate mean - for (size_t k = 0; k < (y_num-offset); ++k) { - input.mesh[index + k] += temp_vec[k + offset]/divisor; - } + 
const int lastElement = dimLen - 1 - offset; + for (int i = 0; i <= lastElement; ++i) { + mesh[index + saveElementOffset] = sum / count; + saveElementOffset += nextElementOffset; + + if (i == lastElement) break; - float counter = 0; - //handling boundary conditions (RHS) - for (size_t k = (y_num - offset); k < (y_num); ++k) { - counter++; - input.mesh[index + k]*= divisor; - input.mesh[index + k]+= temp_vec[y_num-1]; - input.mesh[index + k]*= 1.0/(divisor - counter); + auto v = mesh[index + currElementOffset]; + + sum -= circularBuffer[beginPtr]; + sum += v; + + circularBuffer[beginPtr] = v; + + count = std::min(count + 1, divisor); + beginPtr = (beginPtr + 1) % divisor; + currElementOffset += nextElementOffset; } - //handling boundary conditions (LHS), need to rehandle the boundary - for (size_t k = 1; k <= offset; ++k) { - input.mesh[index + k] *= divisor/(k + offset + 1.0); + int boundaryPtr = (beginPtr - 1 - 1 + divisor) % divisor; + while(saveElementOffset < currElementOffset) { + // If filter length is too big in comparison to processed dimension + // do not decrease 'count' since 'sum' of filter elements contains all elements from + // processed dimension: + // dim elements: xxxxxx + // filter elements: oooooo^ooooo (o - offset elements, ^ - middle of the filter + bool removeElementFromFilter = dimLen - (currElementOffset - saveElementOffset) / nextElementOffset > offset; + + if (removeElementFromFilter) { + if (!boundaryReflect) count = count - 1; + } + if (removeElementFromFilter || boundaryReflect) { + sum -= circularBuffer[beginPtr]; + } + if (boundaryReflect) { + sum += circularBuffer[boundaryPtr]; + } + + mesh[index + saveElementOffset] = sum / count; + + boundaryPtr = (boundaryPtr - 1 + divisor) % divisor; + beginPtr = (beginPtr + 1) % divisor; + saveElementOffset += nextElementOffset; } - //end point boundary condition - input.mesh[index] *= divisor/(offset + 1.0); } } } @@ -453,8 +478,8 @@ inline void LocalIntensityScale::calc_sat_mean_x(PixelData& 
input, const size currElementOffset += nextElementOffset; } - // boundaryPtr is used only in boundaryReflect mode, adding (2*offset+1) makes it always non-negative value - int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; + // boundaryPtr is used only in boundaryReflect mode, adding divisor makes it always non-negative value + int boundaryPtr = (beginPtr - 1 - 1 + divisor) % divisor; // Handle last #offset elements on RHS while(saveElementOffset < currElementOffset) { @@ -575,8 +600,8 @@ inline void LocalIntensityScale::calc_sat_mean_z(PixelData& input, const size currElementOffset += nextElementOffset; } - // boundaryPtr is used only in boundaryReflect mode, adding (2*offset+1) makes it always non-negative value - int boundaryPtr = (beginPtr - 1 - 1 + (2*offset+1)) % divisor; + // boundaryPtr is used only in boundaryReflect mode, adding divisor makes it always non-negative value + int boundaryPtr = (beginPtr - 1 - 1 + divisor) % divisor; // Handle last #offset elements on RHS while(saveElementOffset < currElementOffset) { diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index d15f6561..d2ca284b 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -176,14 +176,14 @@ namespace { // Run on GPU PixelData mGpu(m, true); timer.start_timer("GPU X-DIR"); - calcMean(mGpu, offset, MEAN_X_DIR, (hasBoundary > 0)); + calcMean(mGpu, offset, MEAN_X_DIR, hasBoundary); timer.stop_timer(); // expectedMesh because of different order of calculation will have small floating-point differences // comparing to CPU or GPU fast implementation, anyway GPU and CPU should have exactly same values! 
EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match"; EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match"; - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; } } } @@ -313,15 +313,13 @@ namespace { // Run on GPU PixelData mGpu(m, true); timer.start_timer("GPU Z-DIR"); - calcMean(mGpu, offset, MEAN_Z_DIR, (hasBoundary > 0)); + calcMean(mGpu, offset, MEAN_Z_DIR, hasBoundary); timer.stop_timer(); // expectedMesh because of different order of calculation will have small floating-point differences // comparing to CPU or GPU fast implementation, anyway GPU and CPU should have exactly same values! - EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) - << "---!!!!!!--- GPU values does not match"; - EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) - << "---!!!!!!--- CPU values does not match"; + EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match"; + EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match"; EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; } } @@ -329,311 +327,215 @@ namespace { } } - // ------------------------------------------------------------------------ - // Below tests are not yet fixed. - // ------------------------------------------------------------------------ + TEST(LocalIntensityScaleCudaTest, CPU_AND_GPU_TEST_Y_DIR_VS_MANUALLY_CALCULATED_VALUES) { + // Belows data is precomputed for y_len = 5 (and maximum offset = 4) so do not change these numbers! 
+ constexpr PixelDataDim const dim{5, 1, 1}; + float expectedData[2][5][dim.y] = { + { // with no boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, // offset = 0 + {1.50, 2.00, 3.00, 4.00, 4.50}, // offset = 1 + {2.00, 2.50, 3.00, 3.50, 4.00}, // offset = 2 + {2.50, 3.00, 3.00, 3.00, 3.50}, // offset = 3 + {3.00, 3.00, 3.00, 3.00, 3.00} // offset = 4 + }, + { // with boundary values + {1.00, 2.00, 3.00, 4.00, 5.00}, + {1.66, 2.00, 3.00, 4.00, 4.33}, + {2.20, 2.40, 3.00, 3.60, 3.80}, + {2.71, 2.85, 3.00, 3.14, 3.28}, + {3.22, 3.11, 3.00, 2.88, 2.77} + } + }; + APRTimer timer(false); // set to true to see timings -// TEST(LocalIntensityScaleCudaTest, REMOVE_ME_AFTER_DEVELOPMENT) { -// int y_num = 1; -// int x_num = 5; -// int z_num = 1; -//#if 1 -// PixelData m(y_num, x_num, z_num, 0); -// PixelData m2(y_num, x_num, z_num, 0); -// PixelData m3(y_num, x_num, z_num, 0); -// PixelData m4(y_num, x_num, z_num, 0); -// float dataIn[] = {1, 2, 3, 4, 5}; -//// float dataIn[] = {75.4539260864, 42.5445404053, 0.00003, 4, 0.00005, 6, 0.00007, 8, 0.00009, 10, 0.000011, 12}; -// -// initFromZYXarray(m, dataIn); -// initFromZYXarray(m2, dataIn); -// initFromZYXarray(m3, dataIn); -// initFromZYXarray(m4, dataIn); -//#else -// PixelData m = getRandInitializedMesh(y_num, x_num, z_num, 200, 0); -// PixelData m2(m, true); -// PixelData m3(m, true); -// PixelData m4(m, true); -//#endif -// -// LocalIntensityScale lis; -// -// int off = 4; -// -// std::cout << "INP:"; m.printMesh(1); -// -// bool boundary = true; -// -// APRTimer timer(true); -// calcMean(m3, off, MEAN_X_DIR, boundary); -// timer.start_timer("new"); -// lis.calc_sat_mean_x(m2, off, boundary); -// timer.stop_timer(); -// -// timer.start_timer("old"); -// PixelData mCpuPadded; -// paddPixels(m, mCpuPadded, 0, off, 0); -// lis.calc_sat_mean_x_orig(mCpuPadded, off); -// unpaddPixels(mCpuPadded, m4, m.y_num, m.x_num, m.z_num); -// timer.stop_timer(); -// -// std::cout << "CPU: "; m2.printMesh(1); -// std::cout << "GPU: "; 
m3.printMesh(1); -// std::cout << "CPU old: "; m4.printMesh(1); -// -// std::cout << "GPU vs NEW\n"; -// compareMeshes(m3, m2, 0.00000001, 3); -// std::cout << "OLD vs GPU\n"; -// compareMeshes(m4, m3, 0.00000001, 3); -// std::cout << "OLD vs NEW\n"; -// EXPECT_EQ(compareMeshes(m4, m2, 0.00000001, 3), 0); -// } - // ------------------------------------------------------------------------ + PixelData m(dim); + float dataIn[] = {1, 2, 3, 4, 5}; + initFromZYXarray(m, dataIn); -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Y_DIR) { -// APRTimer timer(true); -// PixelData m = getRandInitializedMesh(22, 33, 22, 100, 3); -// -// LocalIntensityScale lis; -// for (int offset = 0; offset < 6; ++offset) { -// -// std::cout << " ============================== " << offset << std::endl; -// -// // Run on CPU -// PixelData mCpu(m, true); -// timer.start_timer("CPU mean Y-DIR"); -// lis.calc_sat_mean_y(mCpu, offset); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean Y-DIR"); -// calcMean(mGpu, offset, MEAN_Y_DIR); -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); -// } -// } + LocalIntensityScale lis; + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 4; ++offset) { + // std::cout << "------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean Y-DIR"); + lis.calc_sat_mean_y(mCpu, offset, (boundary > 0)); + timer.stop_timer(); -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_Z_DIR) { -// APRTimer timer(true); -// using ImgType = float; -// PixelData m = getRandInitializedMesh(22, 33, 22, 255); -// -// LocalIntensityScale lis; -// for (int offset = 0; offset < 6; ++offset) { -// -// std::cout << " ============================== " << offset << std::endl; 
-// -// // Run on CPU -// PixelData mCpu(m, true); -// timer.start_timer("CPU mean Z-DIR"); -// lis.calc_sat_mean_z(mCpu, offset); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean Z-DIR"); -// calcMean(mGpu, offset, MEAN_Z_DIR); -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); -// } -// } + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Y-DIR"); + calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0)); + timer.stop_timer(); + // Compare results + PixelData expected(dim); + initFromZYXarray(expected, expectedData[boundary][offset]); + EXPECT_EQ(compareMeshes(expected, mGpu, 0.01), 0); + EXPECT_EQ(compareMeshes(expected, mCpu, 0.01), 0); -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Y_DIR) { -// APRTimer timer(true); -// PixelData m(4, 4, 1, 0); -// float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; -// initFromZYXarray(m, dataIn); -// -// LocalIntensityScale lis; -// -// for (int boundary = 1; boundary <= 1; ++ boundary) { -// // boundary = 0 there is no reflected boundary -// // boudnary = 1 there is boundary reflect -// std::cout << "\n\n"; -// for (int offset = 1; offset < 2; ++offset) { -// // Run on CPU -// PixelData mCpuPadded; -// paddPixels(m, mCpuPadded, offset * boundary, offset * boundary, 0); -// timer.start_timer("CPU mean Y-DIR"); -// lis.calc_sat_mean_y(mCpuPadded, offset); -// PixelData mCpu; -// unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean Y-DIR"); -// calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0)); -// -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01, 4), 0); -// } -// } -// } + // Also GPU and CPU should give exactly same output + EXPECT_EQ(compareMeshes(mGpu, mCpu, 0), 0); + } + } + } + 
TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WITH_AND_WITHOUT_BOUNDARY_Y_DIR_RANDOM_VALUES) { + APRTimer timer(false); + constexpr PixelDataDim const dim{49, 51, 53}; + PixelData m = getRandInitializedMesh(dim, 2, 0,false); -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_WIHT_AND_WITHOUT_BOUNDARY_Z_DIR) { -// APRTimer timer(true); -//// PixelData m(1, 1, 13, 0); -//// float dataIn[] = {1,2,3,4,5,6,7,8,9,10,11,12,13}; -//// initFromZYXarray(m, dataIn); -// PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); -// -// -// LocalIntensityScale lis; -// -// for (int boundary = 0; boundary <= 1; ++ boundary) { -// // boundary = 0 there is no reflected boundary -// // boudnary = 1 there is boundary reflect -// for (int offset = 0; offset < 6; ++offset) { -// // Run on CPU -// PixelData mCpuPadded; -// paddPixels(m, mCpuPadded, 0, 0, offset * boundary); -// timer.start_timer("CPU mean Z-DIR"); -// lis.calc_sat_mean_z(mCpuPadded, offset); -// PixelData mCpu; -// unpaddPixels(mCpuPadded, mCpu, m.y_num, m.x_num, m.z_num); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean Z-DIR"); -// calcMean(mGpu, offset, MEAN_Z_DIR, (boundary > 0)); -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); -// } -// } -// } + LocalIntensityScale lis; + for (int boundary = 0; boundary <= 1; ++ boundary) { + // boundary = 0 there is no reflected boundary + // boudnary = 1 there is boundary reflect + for (int offset = 0; offset <= 6; ++offset) { +// std::cout << "---------------- OFFSET=" << offset << " boundary=" << (boundary > 0) << std::endl; - // !!!!!!!!!!!!!!!!!!!!!!! NOT YET CHECKED !!!!!!!!!!!!!!!!!!!!!!!!!!!!! - // TODO: See what these tests are doing and fix/change/remove them! 
+ PixelData mCpu(m, true); + timer.start_timer("CPU mean Y-DIR"); + lis.calc_sat_mean_y(mCpu, offset, (boundary > 0)); + timer.stop_timer(); -// TEST(LocalIntensityScaleCudaTest, 1D_Y_DIR) { -// { // OFFSET=0 -// -// PixelData m(8, 1, 1, 0); -// float dataIn[] = {3,6,9,12,15,18,21,24}; -// float expect[] = {3,6,9,12,15,18,21,24}; -// -// initFromZYXarray(m, dataIn); -// -// calcMean(m, 0, MEAN_Y_DIR); -// -// ASSERT_TRUE(compare(m, expect, 0.05)); -// } -// { // OFFSET=1 -// -// PixelData m(8, 1, 1, 0); -// float dataIn[] = {1, 2, 3, 4, 5, 6, 7, 8}; -// float expect[] = {1.5, 2, 3, 4, 5, 6, 7, 7.5}; -// -// initFromZYXarray(m, dataIn); -// -// calcMean(m, 1, MEAN_Y_DIR); -// -// ASSERT_TRUE(compare(m, expect, 0.05)); -// } -// { // OFFSET=2 (+symmetricity check) -// -// PixelData m(8, 1, 1, 0); -// float dataIn[] = {3,6,9,12,15,18,21,24}; -// float expect[] = {6, 7.5, 9, 12, 15, 18, 19.5, 21}; -// -// initFromZYXarray(m, dataIn); -// -// calcMean(m, 2, MEAN_Y_DIR); -// -// ASSERT_TRUE(compare(m, expect, 0.05)); -// -// // check if data in opposite order gives same result -// float dataIn2[] = {24,21,18,15,12,9,6,3}; -// float expect2[] = {21, 19.5, 18, 15,12, 9, 7.5, 6}; -// -// initFromZYXarray(m, dataIn2); -// -// calcMean(m, 2, MEAN_Y_DIR); -// -// ASSERT_TRUE(compare(m, expect2, 0.05)); -// } -// } + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean Y-DIR"); + calcMean(mGpu, offset, MEAN_Y_DIR, (boundary > 0)); + timer.stop_timer(); + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + TEST(LocalIntensityScaleCudaTest, GPU_CPU_VS_PRECOMPUTED_VALUES_Y_DIR) { + // Input params + using T = float; -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) { -// APRTimer timer(true); -// PixelData m = getRandInitializedMesh(33, 31, 13); -// -// LocalIntensityScale lis; -// for (int offset = 0; offset < 6; ++offset) { -// // Run on CPU -// PixelData mCpu(m, true); -// timer.start_timer("CPU mean ALL-DIR"); -// 
lis.calc_sat_mean_y(mCpu, offset); -// lis.calc_sat_mean_x(mCpu, offset); -// lis.calc_sat_mean_z(mCpu, offset); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean ALL-DIR"); -// calcMean(mGpu, offset); -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.01), 0); -// } -// } + for (int b = 0; b <= 1; b++) { + for (int len = 5; len <= 45; len += 20) { + for (int offset = 0; offset <= 6 && offset < len; offset++) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = b > 0; + bool useRandomNumbers = r > 0; +// std::cout << "========================> len=" << len << " offset=" << offset << " hasBoundary=" << hasBoundary << " useRandomNumbers=" << useRandomNumbers << std::endl; - //@KG: The CPU code doesn't work for uint16 --> overflow will likely result. + auto t = generateInputAndExpected(len, offset, hasBoundary, useRandomNumbers); + auto input = std::get<0>(t); + auto expected = std::get<1>(t); + PixelData m(len, 1, 1, 0); + initFromZYXarray(m, input.data()); + PixelData expectedMesh(len, 1, 1, 0); + initFromZYXarray(expectedMesh, expected.data()); + + APRTimer timer(false); + LocalIntensityScale lis; + + // Run on CPU old-impl + timer.start_timer("CPU Y-DIR"); + PixelData mCpu(m, true); + lis.calc_sat_mean_y(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU Y-DIR"); + calcMean(mGpu, offset, MEAN_Y_DIR, hasBoundary); + timer.stop_timer(); + + // expectedMesh because of different order of calculation will have small floating-point differences + // comparing to CPU or GPU fast implementation, anyway GPU and CPU should have exactly same values! 
+ EXPECT_EQ(compareMeshes(expectedMesh, mGpu, 0.00001), 0) << "---!!!!!!--- GPU values does not match"; + EXPECT_EQ(compareMeshes(expectedMesh, mCpu, 0.00001), 0) << "---!!!!!!--- CPU values does not match"; + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0) << "---!!!!!!--- CPU vs GPU values does not match"; + } + } + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS) { + APRTimer timer(false); + PixelData m = getRandInitializedMesh(33, 32, 31); + + LocalIntensityScale lis; + for (int boundary = 0; boundary <= 1; boundary++) { + for (int offset = 0; offset <= 6; ++offset) { + bool hasBoundary = (boundary > 0); +// std::cout << "========================> " << " offset=" << offset << " hasBoundary=" << hasBoundary << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean ALL-DIR"); + lis.calc_sat_mean_y(mCpu, offset, hasBoundary); + lis.calc_sat_mean_x(mCpu, offset, hasBoundary); + lis.calc_sat_mean_z(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + timer.start_timer("GPU mean ALL-DIR"); + calcMean(mGpu, offset, MEAN_ALL_DIR, hasBoundary); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS_UINT16) { + APRTimer timer(false); + PixelData m = getRandInitializedMesh(33, 31, 13); + + LocalIntensityScale lis; + for (int boundary = 0; boundary <= 1; boundary++) { + for (int offset = 0; offset <= 6; ++offset) { + bool hasBoundary = (boundary > 0); +// std::cout << "========================> " << " offset=" << offset << " hasBoundary=" << hasBoundary << std::endl; + + // Run on CPU + PixelData mCpu(m, true); + timer.start_timer("CPU mean ALL-DIR"); + lis.calc_sat_mean_y(mCpu, offset, hasBoundary); + lis.calc_sat_mean_x(mCpu, offset, hasBoundary); + lis.calc_sat_mean_z(mCpu, offset, hasBoundary); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + 
timer.start_timer("GPU mean ALL-DIR"); + calcMean(mGpu, offset, MEAN_ALL_DIR, hasBoundary); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + + + // ------------------------------------------------------------------------ + // Below tests are not yet fixed. + // ------------------------------------------------------------------------ -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_ALL_DIRS_UINT16) { -// APRTimer timer(true); -// PixelData m = getRandInitializedMesh(33, 31, 13); -// -// LocalIntensityScale lis; -// for (int offset = 0; offset < 6; ++offset) { -// // Run on CPU -// PixelData mCpu(m, true); -// timer.start_timer("CPU mean ALL-DIR"); -// lis.calc_sat_mean_y(mCpu, offset); -// lis.calc_sat_mean_x(mCpu, offset); -// lis.calc_sat_mean_z(mCpu, offset); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// timer.start_timer("GPU mean ALL-DIR"); -// calcMean(mGpu, offset); -// timer.stop_timer(); -// -// // Compare results -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 1), 0); -// } -// } // TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { // APRTimer timer(true); -// PixelData m = getRandInitializedMesh(31, 33, 13, 25, 10); +// PixelData m = getRandInitializedMesh(5, 5, 1, 25, 10, true); // // APRParameters params; // params.sigma_th = 1; // params.sigma_th_max = 2; -// params.reflect_bc_lis = false; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. +// params.reflect_bc_lis = true; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. // // // Run on CPU // PixelData mCpu(m, true); @@ -652,9 +554,19 @@ namespace { // getLocalIntensityScale(mGpu, mGpuTemp, params); // timer.stop_timer(); // +// m.printMeshT(1); +// mCpu.printMeshT(1); +// mGpu.printMeshT(1); +// // // Compare results // //EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0.01), 0); //this is not needed these values are not required. 
-// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); +// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0000), 0); +// +// +// PixelData padd; +// paddPixels(m, padd, 2, 2, 0); +// m.printMeshT(1); +// padd.printMeshT(1); // } From 2cdf3fe6e9dcf39151feeadddfc68c3bb92c7287 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 16 Mar 2023 12:52:23 +0100 Subject: [PATCH 16/59] Whole LIS pipeline is matching exactly CPU implementation + tests updated --- src/algorithm/LocalIntensityScale.cu | 71 +++++++++++++++---- src/data_structures/Mesh/PixelData.cu | 21 +++++- src/data_structures/Mesh/PixelDataCuda.h | 28 ++++++-- src/data_structures/Mesh/paddPixelData.cuh | 81 ++++++++++++++++++++++ src/misc/CudaTools.cuh | 10 ++- test/LocalIntensityScaleCudaTest.cpp | 81 ++++++++++------------ test/TestTools.hpp | 2 +- 7 files changed, 224 insertions(+), 70 deletions(-) create mode 100644 src/data_structures/Mesh/paddPixelData.cuh diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 057e4de2..ee563e33 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -11,7 +11,7 @@ //#include #include "misc/CudaTools.cuh" - +#include "data_structures/Mesh/paddPixelData.cuh" /** * Calculates mean in Y direction @@ -393,18 +393,18 @@ void runMeanZdir(T* cudaImage, int offset, size_t x_num, size_t y_num, size_t z_ meanZdir<<>>(cudaImage, offset, x_num, y_num, z_num, boundaryReflect); } -template -void runMean(T *cudaImage, const PixelData &image, int offsetX, int offsetY, int offsetZ, TypeOfMeanFlags flags, cudaStream_t aStream, bool boundaryReflect = false) { +template +void runMean(T *cudaImage, const PixelDataDim dim, int offsetX, int offsetY, int offsetZ, TypeOfMeanFlags flags, cudaStream_t aStream, bool boundaryReflect = false) { if (flags & MEAN_Y_DIR) { - runMeanYdir(cudaImage, offsetY, image.x_num, image.y_num, image.z_num, aStream, boundaryReflect); + runMeanYdir(cudaImage, offsetY, dim.x, dim.y, dim.z, 
aStream, boundaryReflect); } if (flags & MEAN_X_DIR) { - runMeanXdir(cudaImage, offsetX, image.x_num, image.y_num, image.z_num, aStream, boundaryReflect); + runMeanXdir(cudaImage, offsetX, dim.x, dim.y, dim.z, aStream, boundaryReflect); } if (flags & MEAN_Z_DIR) { - runMeanZdir(cudaImage, offsetZ, image.x_num, image.y_num, image.z_num, aStream, boundaryReflect); + runMeanZdir(cudaImage, offsetZ, dim.x, dim.y, dim.z, aStream, boundaryReflect); } } @@ -444,9 +444,9 @@ __global__ void rescaleAndThreshold(T *data, size_t len, float varRescale, float size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { float rescaled = varRescale * data[idx]; - if (rescaled < sigmaThreshold) { - rescaled = (rescaled < sigmaThresholdMax) ? max_th : sigmaThreshold; - } +// if (rescaled < sigmaThreshold) { +// rescaled = (rescaled < sigmaThresholdMax) ? max_th : sigmaThreshold; +// } data[idx] = rescaled; } } @@ -470,12 +470,53 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete size_t win_x2 = var_win[4]; size_t win_z2 = var_win[5]; + + // TODO: !!!!!!!!!! handle constant_intensity_scale parameter - it is another thing that changed since last GPU pipeline impl. + // rescaleAndThreshold - currently there is no thresholding as in new CPU code (should it be permanent?) 
+ // --------- CUDA ---------------- - runCopy1D(cudaImage, cudaTemp, image.mesh.size(), aStream); - runMean(cudaImage, image, win_x, win_y, win_z, MEAN_ALL_DIR, aStream, par.reflect_bc_lis); - runAbsDiff1D(cudaImage, cudaTemp, image.mesh.size(), aStream); - runMean(cudaImage, image, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream, par.reflect_bc_lis); - runRescaleAndThreshold(cudaImage, image.mesh.size(), var_rescale, par.sigma_th, par.sigma_th_max, aStream); + + // padd + CudaMemoryUniquePtr paddedImage; + CudaMemoryUniquePtr paddedTemp; + PixelDataDim paddSize(std::max(win_y, win_y2), std::max(win_x, win_x2), std::max(win_z, win_z2)); + PixelDataDim imageSize = image.getDimension(); + PixelDataDim paddedImageSize = imageSize + paddSize + paddSize; // padding on both ends of each dimension + + S *ci = cudaImage; + S *ct = cudaTemp; + PixelDataDim dim = image.getDimension(); + + if (par.reflect_bc_lis) { + // padd + S *mem = nullptr; + checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); + paddedImage.reset(mem); + mem = nullptr; + checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); + paddedTemp.reset(mem); + + runPaddPixels(cudaImage, paddedImage.get(), imageSize, paddedImageSize, paddSize, aStream); + runPaddPixels(cudaTemp, paddedTemp.get(), imageSize, paddedImageSize, paddSize, aStream); + + ci = paddedImage.get(); + ct = paddedTemp.get(); + dim = paddedImageSize; + } + + + runCopy1D(ci, ct, dim.size(), aStream); + runMean(ci, dim, win_x, win_y, win_z, MEAN_ALL_DIR, aStream, false); + runAbsDiff1D(ci, ct, dim.size(), aStream); + runMean(ci, dim, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream, false); + runRescaleAndThreshold(ci, dim.size(), var_rescale, par.sigma_th, par.sigma_th_max, aStream); + + if (par.reflect_bc_lis) { + // unpadd + runUnpaddPixels(ci, cudaImage, paddedImageSize, imageSize, paddSize, aStream); + runUnpaddPixels(ct, cudaTemp, paddedImageSize, imageSize, paddSize, aStream); + } + } template void 
runLocalIntensityScalePipeline(const PixelData&, const APRParameters&, float*, float*, cudaStream_t); @@ -489,7 +530,7 @@ void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags, bool bound ScopedCudaMemHandler, H2D | D2H> cudaImage(image); APRTimer timer(true); // timer.start_timer("GpuDeviceTimeFull"); - runMean(cudaImage.get(), image, offset, offset, offset, flags, 0, boundaryReflect); + runMean(cudaImage.get(), image.getDimension(), offset, offset, offset, flags, 0, boundaryReflect); // timer.stop_timer(); } diff --git a/src/data_structures/Mesh/PixelData.cu b/src/data_structures/Mesh/PixelData.cu index fd27f4d5..35924482 100644 --- a/src/data_structures/Mesh/PixelData.cu +++ b/src/data_structures/Mesh/PixelData.cu @@ -10,11 +10,14 @@ #include "misc/CudaTools.cuh" #include "downsample.cuh" -#include +#include "paddPixelData.cuh" + // explicit instantiation of handled types template void downsampleMeanCuda(const PixelData&, PixelData&); template void downsampleMaxCuda(const PixelData&, PixelData&); +template void paddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize); +template void unpaddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize); template void downsampleMeanCuda(const PixelData &input, PixelData &output) { @@ -31,3 +34,19 @@ void downsampleMaxCuda(const PixelData &input, PixelData &output) { runDownsampleMax(in.get(), out.get(), input.x_num, input.y_num, input.z_num, 0); }; + +template +void paddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize) { + ScopedCudaMemHandler, H2D> inputData(input); + ScopedCudaMemHandler, D2H> outputData(output); + + runPaddPixels(inputData.get(), outputData.get(), input.getDimension(), output.getDimension(), padSize, 0); +}; + +template +void unpaddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize) { + ScopedCudaMemHandler, H2D> inputData(input); + ScopedCudaMemHandler, D2H> 
outputData(output); + + runUnpaddPixels(inputData.get(), outputData.get(), input.getDimension(), output.getDimension(), padSize, 0); +}; diff --git a/src/data_structures/Mesh/PixelDataCuda.h b/src/data_structures/Mesh/PixelDataCuda.h index 34f7a56c..97f2144e 100644 --- a/src/data_structures/Mesh/PixelDataCuda.h +++ b/src/data_structures/Mesh/PixelDataCuda.h @@ -1,17 +1,35 @@ -// -// Created by Krzysztof Gonciarz on 4/9/18. -// - #ifndef LIBAPR_PIXELDATACUDA_H #define LIBAPR_PIXELDATACUDA_H #include "PixelData.hpp" + template void downsampleMeanCuda(const PixelData &aInput, PixelData &aOutput); template void downsampleMaxCuda(const PixelData &input, PixelData &output); -#endif //LIBAPR_PIXELDATACUDA_H +/** + * Copies data from input to output (which is bigger by pad size) reflecting around the edge pixels. + * @tparam T + * @param input + * @param output + * @param padSize + */ +template +void paddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize); + +/** + * Copies data from input to output (which is smaller by pad size). 
+ * @tparam T + * @param input + * @param output + * @param padSize + */ +template +void unpaddPixelsCuda(const PixelData &input, PixelData &output, const PixelDataDim &padSize); + +#endif + diff --git a/src/data_structures/Mesh/paddPixelData.cuh b/src/data_structures/Mesh/paddPixelData.cuh new file mode 100644 index 00000000..dae96d79 --- /dev/null +++ b/src/data_structures/Mesh/paddPixelData.cuh @@ -0,0 +1,81 @@ +#ifndef LIBAPR_PADDPIXELDATA_CUH +#define LIBAPR_PADDPIXELDATA_CUH + + +#include "data_structures/Mesh/PixelData.hpp" + + +template +__global__ void paddPixels(const T* input, T *output, const PixelDataDim inputSize, const PixelDataDim outputSize, const PixelDataDim padSize) { + size_t yIdx = blockIdx.y * blockDim.y + threadIdx.y; + size_t xIdx = blockIdx.x * blockDim.x + threadIdx.x; + size_t zIdx = blockIdx.z * blockDim.z + threadIdx.z; + + // copy data to output (padded) cube + if (yIdx < outputSize.y && xIdx < outputSize.x && zIdx < outputSize.z) { + + // output cube index + size_t outputIdx = (zIdx * outputSize.x + xIdx) * outputSize.y + yIdx; + + // input cube index + int yIn = yIdx - padSize.y; + if (yIn < 0) yIn = -yIn; // reflected boundary on LHS + if (yIn >= inputSize.y) yIn -= 2 * (yIn - (inputSize.y - 1)); // reflected boundary on RHS + + int xIn = xIdx - padSize.x; + if (xIn < 0) xIn = -xIn; // reflected boundary on LHS + if (xIn >= inputSize.x) xIn -= 2 * (xIn - (inputSize.x - 1)); // reflected boundary on RHS + + int zIn = zIdx - padSize.z; + if (zIn < 0) zIn = -zIn; // reflected boundary on LHS + if (zIn >= inputSize.z) zIn -= 2 * (zIn - (inputSize.z - 1)); // reflected boundary on RHS + + size_t inputIdx = (zIn * inputSize.x + xIn) * inputSize.y + yIn; + + output[outputIdx] = input[inputIdx]; + } +} + +template +void runPaddPixels(const T* input, T *output, const PixelDataDim &inputSize, const PixelDataDim &outputSize, const PixelDataDim &padSize, cudaStream_t aStream) { + dim3 threadsPerBlock(1, 64, 1); + dim3 numBlocks((outputSize.x + 
threadsPerBlock.x - 1) / threadsPerBlock.x, + (outputSize.y + threadsPerBlock.y - 1) / threadsPerBlock.y, + (outputSize.z + threadsPerBlock.z - 1) / threadsPerBlock.z); + + paddPixels<<>>(input, output, inputSize, outputSize, padSize); +} + +template +__global__ void unpaddPixels(const T* input, T *output, const PixelDataDim inputSize, const PixelDataDim outputSize, const PixelDataDim padSize) { + size_t yIdx = blockIdx.y * blockDim.y + threadIdx.y; + size_t xIdx = blockIdx.x * blockDim.x + threadIdx.x; + size_t zIdx = blockIdx.z * blockDim.z + threadIdx.z; + + // copy data to output (unpadded) cube + if (yIdx < outputSize.y && xIdx < outputSize.x && zIdx < outputSize.z) { + + // output cube index + size_t outputIdx = (zIdx * outputSize.x + xIdx) * outputSize.y + yIdx; + + // input cube index (map coordinates of output cube to internal cube of padded cube) + int yIn = yIdx + padSize.y; + int xIn = xIdx + padSize.x; + int zIn = zIdx + padSize.z; + size_t inputIdx = (zIn * inputSize.x + xIn) * inputSize.y + yIn; + + output[outputIdx] = input[inputIdx]; + } +} + +template +void runUnpaddPixels(const T* input, T *output, const PixelDataDim &inputSize, const PixelDataDim &outputSize, const PixelDataDim &padSize, cudaStream_t aStream) { + dim3 threadsPerBlock(1, 64, 1); + dim3 numBlocks((outputSize.x + threadsPerBlock.x - 1) / threadsPerBlock.x, + (outputSize.y + threadsPerBlock.y - 1) / threadsPerBlock.y, + (outputSize.z + threadsPerBlock.z - 1) / threadsPerBlock.z); + + unpaddPixels<<>>(input, output, inputSize, outputSize, padSize); +} + +#endif diff --git a/src/misc/CudaTools.cuh b/src/misc/CudaTools.cuh index 558f730a..bb17e5fa 100644 --- a/src/misc/CudaTools.cuh +++ b/src/misc/CudaTools.cuh @@ -94,12 +94,18 @@ public: // Useful type for keeping CUDA allocated memory (which is released with cudaFree) -template +cudaError_t CUDARTAPI deleter(void *devPtr) { + //std::cout << "cudaFree() called...\n"; + return cudaFree(devPtr); +} + +template struct CudaMemoryUniquePtr 
: public std::unique_ptr { using std::unique_ptr::unique_ptr; // inheriting other constructors - explicit CudaMemoryUniquePtr(T *aMemory = nullptr) : std::unique_ptr(aMemory, &cudaFree) {} + explicit CudaMemoryUniquePtr(T *aMemory = nullptr) : std::unique_ptr(aMemory, &deleter) {} }; + /** * Directions for sending data between Host and Device */ diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index d2ca284b..2fa4f60c 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -4,7 +4,7 @@ #include "algorithm/LocalIntensityScaleCuda.h" #include "algorithm/LocalIntensityScale.hpp" #include "TestTools.hpp" - +#include "data_structures/Mesh/PixelDataCuda.h" namespace { @@ -522,52 +522,41 @@ namespace { } } + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { + APRTimer timer(false); + + for (int boundary = 0; boundary <= 1; ++boundary) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = (boundary > 0); + bool useRandomNumbers = (r > 0); + + PixelData m = getRandInitializedMesh(31, 33, 32, 25, 10, !useRandomNumbers); + + APRParameters params; + params.sigma_th = 1; + params.sigma_th_max = 2; + params.reflect_bc_lis = hasBoundary; + + // Run on CPU + PixelData mCpu(m, true); + PixelData mCpuTemp(m, false); + timer.start_timer("CPU LIS FULL"); + LocalIntensityScale().get_local_intensity_scale(mCpu, mCpuTemp, params); + timer.stop_timer(); - // ------------------------------------------------------------------------ - // Below tests are not yet fixed. - // ------------------------------------------------------------------------ - - -// TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE) { -// APRTimer timer(true); -// PixelData m = getRandInitializedMesh(5, 5, 1, 25, 10, true); -// -// APRParameters params; -// params.sigma_th = 1; -// params.sigma_th_max = 2; -// params.reflect_bc_lis = true; //#TODO: @KG: The CPU pipeline uses this to true, so needs to now be implimented. 
-// -// // Run on CPU -// PixelData mCpu(m, true); -// PixelData mCpuTemp(m, false); -// timer.start_timer("CPU LIS FULL"); -// -// LocalIntensityScale localIntensityScale; -// -// localIntensityScale.get_local_intensity_scale(mCpu, mCpuTemp, params); -// timer.stop_timer(); -// -// // Run on GPU -// PixelData mGpu(m, true); -// PixelData mGpuTemp(m, false); -// timer.start_timer("GPU LIS ALL-DIR"); -// getLocalIntensityScale(mGpu, mGpuTemp, params); -// timer.stop_timer(); -// -// m.printMeshT(1); -// mCpu.printMeshT(1); -// mGpu.printMeshT(1); -// -// // Compare results -// //EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0.01), 0); //this is not needed these values are not required. -// EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0000), 0); -// -// -// PixelData padd; -// paddPixels(m, padd, 2, 2, 0); -// m.printMeshT(1); -// padd.printMeshT(1); -// } + // Run on GPU + PixelData mGpu(m, true); + PixelData mGpuTemp(m, false); + timer.start_timer("GPU LIS FULL"); + getLocalIntensityScale(mGpu, mGpuTemp, params); + timer.stop_timer(); + + // Compare results + EXPECT_EQ(compareMeshes(mCpuTemp, mGpuTemp, 0), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } #endif // APR_USE_CUDA diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 4ec15afe..b533674d 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -159,7 +159,7 @@ inline PixelData getRandInitializedMesh(int y, int x, int z, float multiplier #pragma omp parallel for default(shared) #endif for (size_t i = 0; i < m.mesh.size(); ++i) { - m.mesh[i] = useIdxNumbers ? i : dist(mt) * multiplier + offset; + m.mesh[i] = useIdxNumbers ? 
i + 1 : dist(mt) * multiplier + offset; } return m; } From e093c01acd691b6ecf2e3a9adf25f0233b67bdbf Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 16 Mar 2023 14:13:48 +0100 Subject: [PATCH 17/59] Quick fix of linking error --- src/misc/CudaTools.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/misc/CudaTools.cuh b/src/misc/CudaTools.cuh index bb17e5fa..155ce317 100644 --- a/src/misc/CudaTools.cuh +++ b/src/misc/CudaTools.cuh @@ -94,7 +94,7 @@ public: // Useful type for keeping CUDA allocated memory (which is released with cudaFree) -cudaError_t CUDARTAPI deleter(void *devPtr) { +static cudaError_t CUDARTAPI deleter(void *devPtr) { //std::cout << "cudaFree() called...\n"; return cudaFree(devPtr); } From 053380d267bc06491e8404fa6a2d60f7a65903b7 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 16 Mar 2023 15:26:58 +0100 Subject: [PATCH 18/59] maximum error diff. GPU vs CPU for compute gradient set to 0 --- test/ComputeGradientCudaTest.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index 8bb06106..83502a62 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ -56,7 +56,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } } } @@ -98,7 +98,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } } } @@ -140,7 +140,7 @@ namespace { timer.stop_timer(); //Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.0001, 2), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } } } @@ -173,7 +173,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } @@ -201,7 +201,7 @@ namespace { 
timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Z_RND_CUDA) { @@ -224,7 +224,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } TEST(ComputeInverseBspline, CALC_INV_BSPLINE_Y_RND_CUDA) { @@ -247,7 +247,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.00001), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } TEST(ComputeInverseBspline, CALC_INV_BSPLINE_FULL_XYZ_DIR_RND_CUDA) { @@ -272,7 +272,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu, 0.000001), 0); + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); } // ======================================================================== @@ -301,7 +301,7 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(grad, gradCuda, 0.0000001), 0); + EXPECT_EQ(compareMeshes(grad, gradCuda, 0), 0); } @@ -354,9 +354,9 @@ namespace { timer.stop_timer(); // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpuImage, mGpuImage, 0.0000001), 0); - EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0.0000001), 0); - EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0.0000001), 0); + EXPECT_EQ(compareMeshes(mCpuImage, mGpuImage, 0), 0); + EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0), 0); + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); } #endif // APR_USE_CUDA From 97cf75e2b82ff09612b41218272b7e8e445f936e Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 17 Mar 2023 10:31:48 +0100 Subject: [PATCH 19/59] rescaleAndThreshold in now only rescaling (to reflect changed in CPU side) --- src/algorithm/LocalIntensityScale.cu | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git 
a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index ee563e33..11eb0275 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -439,23 +439,19 @@ void runAbsDiff1D(T *data, const T *reference, size_t len, cudaStream_t aStream) } template -__global__ void rescaleAndThreshold(T *data, size_t len, float varRescale, float sigmaThreshold, float sigmaThresholdMax) { - const float max_th = 60000.0; +__global__ void rescale(T *data, size_t len, float varRescale) { size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { float rescaled = varRescale * data[idx]; -// if (rescaled < sigmaThreshold) { -// rescaled = (rescaled < sigmaThresholdMax) ? max_th : sigmaThreshold; -// } data[idx] = rescaled; } } template -void runRescaleAndThreshold(T *data, size_t len, float varRescale, float sigma, float sigmaMax, cudaStream_t aStream) { +void runRescale(T *data, size_t len, float varRescale, cudaStream_t aStream) { dim3 threadsPerBlock(64); dim3 numBlocks((len + threadsPerBlock.x - 1) / threadsPerBlock.x); - rescaleAndThreshold <<< numBlocks, threadsPerBlock, 0, aStream >>> (data, len, varRescale, sigma, sigmaMax); + rescale <<< numBlocks, threadsPerBlock, 0, aStream >>>(data, len, varRescale); } template @@ -472,7 +468,7 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete // TODO: !!!!!!!!!! handle constant_intensity_scale parameter - it is another thing that changed since last GPU pipeline impl. - // rescaleAndThreshold - currently there is no thresholding as in new CPU code (should it be permanent?) + // rescale - currently there is no thresholding as in new CPU code (should it be permanent?) 
// --------- CUDA ---------------- @@ -509,7 +505,7 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete runMean(ci, dim, win_x, win_y, win_z, MEAN_ALL_DIR, aStream, false); runAbsDiff1D(ci, ct, dim.size(), aStream); runMean(ci, dim, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream, false); - runRescaleAndThreshold(ci, dim.size(), var_rescale, par.sigma_th, par.sigma_th_max, aStream); + runRescale(ci, dim.size(), var_rescale, aStream); if (par.reflect_bc_lis) { // unpadd From 83c2a3104be1253573fe26d76d69c767c3e08f9b Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 17 Mar 2023 10:33:25 +0100 Subject: [PATCH 20/59] rescaleAndThreshold in now only rescaling (to reflect changed in CPU side) --- src/algorithm/LocalIntensityScale.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 11eb0275..cf4b29f1 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -468,7 +468,6 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete // TODO: !!!!!!!!!! handle constant_intensity_scale parameter - it is another thing that changed since last GPU pipeline impl. - // rescale - currently there is no thresholding as in new CPU code (should it be permanent?) 
// --------- CUDA ---------------- From 5b5a719411b98cf0d0141eca8e8452ee6f7957c2 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 17 Mar 2023 16:20:24 +0100 Subject: [PATCH 21/59] constant_intensity_scale handling in LIS added for GPU --- src/algorithm/LocalIntensityScale.cu | 119 +++++++++++++++++--------- src/algorithm/LocalIntensityScale.hpp | 2 + test/LocalIntensityScaleCudaTest.cpp | 37 ++++++++ 3 files changed, 119 insertions(+), 39 deletions(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index cf4b29f1..64e4c710 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -454,11 +454,43 @@ void runRescale(T *data, size_t len, float varRescale, cudaStream_t aStream) { rescale <<< numBlocks, threadsPerBlock, 0, aStream >>>(data, len, varRescale); } +template +__global__ void constantScale(S *image, size_t len) { + // This is totally naive and slow implementation (only 1 thread is used) just to have CPU + // code implemented in CUDA. This code will not be run in any normal usage of APR + // and it is just here for sanity check and or super small images cases (like few pixels) + // so DO NOT TRY TO OPTIMIZE IT - use your time for something more productive or have + // some beers... still better than writing fast version of this code. + + float min_val = 660000; + double sum = 0; + + for (size_t i = 0; i < len; ++i) { + float tmp = image[i]; + + sum += tmp; + if (tmp < min_val) min_val = tmp; + } + + float scale_val = (float) (sum / (float)len - min_val); + + for (size_t i = 0; i < len; ++i) { + image[i] = scale_val; + } +} + +template +void runConstantScale(S *image, PixelDataDim &dim) { + // Check kernel description for further info! 
+ constantScale<<<1, 1>>>(image, dim.size()); +} + template void runLocalIntensityScalePipeline(const PixelData &image, const APRParameters &par, S *cudaImage, S *cudaTemp, cudaStream_t aStream) { float var_rescale; std::vector var_win; - LocalIntensityScale().get_window_alt(var_rescale, var_win, par,image); + auto lis = LocalIntensityScale(); + lis.get_window_alt(var_rescale, var_win, par, image); size_t win_y = var_win[0]; size_t win_x = var_win[1]; size_t win_z = var_win[2]; @@ -467,51 +499,60 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete size_t win_z2 = var_win[5]; - // TODO: !!!!!!!!!! handle constant_intensity_scale parameter - it is another thing that changed since last GPU pipeline impl. - // --------- CUDA ---------------- + bool constant_scale = false; - // padd - CudaMemoryUniquePtr paddedImage; - CudaMemoryUniquePtr paddedTemp; - PixelDataDim paddSize(std::max(win_y, win_y2), std::max(win_x, win_x2), std::max(win_z, win_z2)); - PixelDataDim imageSize = image.getDimension(); - PixelDataDim paddedImageSize = imageSize + paddSize + paddSize; // padding on both ends of each dimension - - S *ci = cudaImage; - S *ct = cudaTemp; - PixelDataDim dim = image.getDimension(); - - if (par.reflect_bc_lis) { - // padd - S *mem = nullptr; - checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); - paddedImage.reset(mem); - mem = nullptr; - checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); - paddedTemp.reset(mem); - - runPaddPixels(cudaImage, paddedImage.get(), imageSize, paddedImageSize, paddSize, aStream); - runPaddPixels(cudaTemp, paddedTemp.get(), imageSize, paddedImageSize, paddSize, aStream); - - ci = paddedImage.get(); - ct = paddedTemp.get(); - dim = paddedImageSize; + if (par.constant_intensity_scale || (lis.number_active_dimensions == 0)) { + // include the case where the local intensity scale doesn't make sense due to the image being to small. 
+ // (This is for just edge cases and sanity checking) + constant_scale = true; } + PixelDataDim imageSize = image.getDimension(); - runCopy1D(ci, ct, dim.size(), aStream); - runMean(ci, dim, win_x, win_y, win_z, MEAN_ALL_DIR, aStream, false); - runAbsDiff1D(ci, ct, dim.size(), aStream); - runMean(ci, dim, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream, false); - runRescale(ci, dim.size(), var_rescale, aStream); + if (!constant_scale) { + CudaMemoryUniquePtr paddedImage; + CudaMemoryUniquePtr paddedTemp; + PixelDataDim paddSize(std::max(win_y, win_y2), std::max(win_x, win_x2), std::max(win_z, win_z2)); + PixelDataDim paddedImageSize = imageSize + paddSize + paddSize; // padding on both ends of each dimension + + S *ci = cudaImage; + S *ct = cudaTemp; + PixelDataDim dim = image.getDimension(); + + if (par.reflect_bc_lis) { + // padd + S *mem = nullptr; + checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); + paddedImage.reset(mem); + mem = nullptr; + checkCuda(cudaMalloc(&mem, sizeof(S) * paddedImageSize.size())); + paddedTemp.reset(mem); + + runPaddPixels(cudaImage, paddedImage.get(), imageSize, paddedImageSize, paddSize, aStream); + runPaddPixels(cudaTemp, paddedTemp.get(), imageSize, paddedImageSize, paddSize, aStream); + + ci = paddedImage.get(); + ct = paddedTemp.get(); + dim = paddedImageSize; + } - if (par.reflect_bc_lis) { - // unpadd - runUnpaddPixels(ci, cudaImage, paddedImageSize, imageSize, paddSize, aStream); - runUnpaddPixels(ct, cudaTemp, paddedImageSize, imageSize, paddSize, aStream); + // Run LIS pipeline + runCopy1D(ci, ct, dim.size(), aStream); + runMean(ci, dim, win_x, win_y, win_z, MEAN_ALL_DIR, aStream, false); + runAbsDiff1D(ci, ct, dim.size(), aStream); + runMean(ci, dim, win_x2, win_y2, win_z2, MEAN_ALL_DIR, aStream, false); + runRescale(ci, dim.size(), var_rescale, aStream); + + if (par.reflect_bc_lis) { + // unpadd + runUnpaddPixels(ci, cudaImage, paddedImageSize, imageSize, paddSize, aStream); + runUnpaddPixels(ct, cudaTemp, 
paddedImageSize, imageSize, paddSize, aStream); + } + } + else { + runConstantScale(cudaImage, imageSize); } - } template void runLocalIntensityScalePipeline(const PixelData&, const APRParameters&, float*, float*, cudaStream_t); diff --git a/src/algorithm/LocalIntensityScale.hpp b/src/algorithm/LocalIntensityScale.hpp index 3f7fffef..e576efd5 100644 --- a/src/algorithm/LocalIntensityScale.hpp +++ b/src/algorithm/LocalIntensityScale.hpp @@ -16,6 +16,8 @@ class LocalIntensityScale { bool active_x = true; bool active_z = true; +public: + int number_active_dimensions = 3; diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index 2fa4f60c..ce6ff111 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -558,6 +558,43 @@ namespace { } } + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE_CONSTANT_SCALE) { + APRTimer timer(false); + + for (int boundary = 0; boundary <= 1; ++boundary) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = (boundary > 0); + bool useRandomNumbers = (r > 0); + + PixelData m = getRandInitializedMesh(31, 33, 32, 25, 10, !useRandomNumbers); + + APRParameters params; + params.sigma_th = 1; + params.sigma_th_max = 2; + params.reflect_bc_lis = hasBoundary; + params.constant_intensity_scale = true; + + // Run on CPU + PixelData mCpu(m, true); + PixelData mCpuTemp(m, false); + timer.start_timer("CPU LIS FULL"); + LocalIntensityScale().get_local_intensity_scale(mCpu, mCpuTemp, params); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + PixelData mGpuTemp(m, false); + timer.start_timer("GPU LIS FULL"); + getLocalIntensityScale(mGpu, mGpuTemp, params); + timer.stop_timer(); + + // Compare results + // NOTE: mCpuTemp and mGpuTemp are not checked since in case of + // constant_intensity_scale they are not set to any value + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } #endif // APR_USE_CUDA } From 5d0375ad59f6d9d9a2223084aecc2cc325a62dbe 
Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 20 Mar 2023 11:24:04 +0100 Subject: [PATCH 22/59] Removed unused threshold functions --- src/algorithm/ComputeGradientCuda.cu | 62 --------------------------- src/algorithm/ComputeGradientCuda.hpp | 4 -- test/ComputeGradientTest.cpp | 54 ----------------------- 3 files changed, 120 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 0a6e5507..bc7beed7 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -158,49 +158,6 @@ namespace { } } -/** - * Thresholds output basing on input values. When input is <= thresholdLevel then output is set to 0 and is not changed otherwise. - * @param input - * @param output - * @param length - len of input/output arrays - * @param thresholdLevel - */ -template -__global__ void threshold(const T *input, S *output, size_t length, float thresholdLevel) { - size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; - if (idx < length) { - if (input[idx] <= thresholdLevel) { output[idx] = 0; } - } -} - -template -void runThreshold(ImgType *cudaImage, T *cudaGrad, size_t x_num, size_t y_num, size_t z_num, float Ip_th, cudaStream_t aStream) { - dim3 threadsPerBlock(64); - dim3 numBlocks((x_num * y_num * z_num + threadsPerBlock.x - 1)/threadsPerBlock.x); - threshold<<>>(cudaImage, cudaGrad, x_num * y_num * z_num, Ip_th); -}; - -/** - * Thresholds input array to have minimum thresholdLevel. 
- * @param input - * @param length - len of input/output arrays - * @param thresholdLevel - */ -template -__global__ void thresholdImg(T *input, size_t length, float thresholdLevel) { - size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; - if (idx < length) { - if (input[idx] < thresholdLevel) { input[idx] = thresholdLevel; } - } -} - -template -void runThresholdImg(T *cudaImage, size_t x_num, size_t y_num, size_t z_num, float Ip_th_offset, cudaStream_t aStream) { - dim3 threadsPerBlock(64); - dim3 numBlocks((x_num * y_num * z_num + threadsPerBlock.x - 1) / threadsPerBlock.x); - thresholdImg<<< numBlocks, threadsPerBlock, 0, aStream >>> (cudaImage, x_num * y_num * z_num, Ip_th_offset); -}; - template void getGradientCuda(const PixelData &image, PixelData &local_scale_temp, ImgType *cudaImage, ImgType *cudaGrad, float *cudalocal_scale_temp, @@ -497,25 +454,6 @@ void getGradient(PixelData &image, PixelData &grad_temp, Pixel splineCudaX, splineCudaY, splineCudaZ, boundary.get(), bspline_offset, par, 0); } -// explicit instantiation of handled types -template void thresholdImg(PixelData &, const float); -template -void thresholdImg(PixelData &image, const float threshold) { - ScopedCudaMemHandler, H2D | D2H> cudaImage(image); - - runThresholdImg(cudaImage.get(), image.x_num, image.y_num, image.z_num, threshold, 0); -} - -// explicit instantiation of handled types -template void thresholdGradient(PixelData &, const PixelData &, const float); -template -void thresholdGradient(PixelData &output, const PixelData &input, const float Ip_th) { - ScopedCudaMemHandler, H2D> cudaInput(input); - ScopedCudaMemHandler, H2D | D2H> cudaOutput(output); - - runThreshold(cudaInput.get(), cudaOutput.get(), input.x_num, input.y_num, input.z_num, Ip_th, 0); -} - void cudaDownsampledGradient(PixelData &input, PixelData &grad, const float hx, const float hy, const float hz) { ScopedCudaMemHandler, H2D | D2H> cudaInput(input); ScopedCudaMemHandler, D2H> cudaGrad(grad); diff --git 
a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp index 36bb70b1..1fcf088b 100644 --- a/src/algorithm/ComputeGradientCuda.hpp +++ b/src/algorithm/ComputeGradientCuda.hpp @@ -32,10 +32,6 @@ template void computeLevelsCuda(const PixelData &grad_temp, PixelData &local_scale_temp, int maxLevel, float relError, float dx = 1, float dy = 1, float dz = 1); template void getGradient(PixelData &image, PixelData &grad_temp, PixelData &local_scale_temp, PixelData &local_scale_temp2, float bspline_offset, const APRParameters &par); -template -void thresholdImg(PixelData &image, const float threshold); -template -void thresholdGradient(PixelData &output, const PixelData &input, const float Ip_th); void cudaDownsampledGradient(PixelData &input, PixelData &grad, const float hx, const float hy, const float hz); template diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index 527815f0..9ba510e6 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -648,60 +648,6 @@ namespace { #ifdef APR_USE_CUDA - TEST(ComputeThreshold, CALC_THRESHOLD_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData m = getRandInitializedMesh(31, 33, 13); - PixelData g = getRandInitializedMesh(31, 33, 13); - float thresholdLevel = 1; - - // Calculate bspline on CPU - PixelData mCpu(g, true); - timer.start_timer("CPU threshold"); - ComputeGradient().threshold_gradient(mCpu, m, thresholdLevel); - - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(g, true); - timer.start_timer("GPU threshold"); - thresholdGradient(mGpu, m, thresholdLevel); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - TEST(ComputeThreshold, CALC_THRESHOLD_IMG_RND_CUDA) { - APRTimer timer(true); - - // Generate random mesh - using ImgType = float; - PixelData g = getRandInitializedMesh(31, 33, 13, 1, true); - - float thresholdLevel = 10; - - // 
Calculate bspline on CPU - PixelData mCpu(g, true); - timer.start_timer("CPU threshold"); - for (size_t i = 0; i < mCpu.mesh.size(); ++i) { - if (mCpu.mesh[i] <= (thresholdLevel)) { mCpu.mesh[i] = thresholdLevel; } - } - timer.stop_timer(); - - // Calculate bspline on GPU - PixelData mGpu(g, true); - timer.start_timer("GPU threshold"); - thresholdImg(mGpu, thresholdLevel); - timer.stop_timer(); - - // Compare GPU vs CPU - EXPECT_EQ(compareMeshes(mCpu, mGpu), 0); - } - - // TODO: This test will be fixed as soon as CUDA pipeline is updated. // Currently turning it off to have testable rest of CUDA impl. // TEST(ComputeThreshold, FULL_PIPELINE_TEST) { From 53ef94baa6c748bcebb35ce11cdc5d1ddb267823 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 20 Mar 2023 13:39:51 +0100 Subject: [PATCH 23/59] FullPipeline test moved to new file --- src/algorithm/ComputeGradientCuda.cu | 3 +- test/CMakeLists.txt | 1 + test/ComputeGradientCudaTest.cpp | 2 +- test/ComputeGradientTest.cpp | 74 -------------------------- test/FullPipelineCudaTest.cpp | 79 ++++++++++++++++++++++++++++ 5 files changed, 83 insertions(+), 76 deletions(-) create mode 100644 test/FullPipelineCudaTest.cpp diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index bc7beed7..87ebdaa5 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -266,6 +266,7 @@ public: CurrentTime ct; uint64_t start = ct.microseconds(); image.copyH2D(); + checkCuda(cudaStreamSynchronize(iStream)); std::cout << "SEND time: " << ct.microseconds() - start << std::endl; } @@ -273,7 +274,7 @@ public: CurrentTime ct; uint64_t start = ct.microseconds(); local_scale_temp.copyD2H(); - cudaStreamSynchronize(iStream); + checkCuda(cudaStreamSynchronize(iStream)); std::cout << "RCV time: " << ct.microseconds() - start << std::endl; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 2918f2c5..193ce405 100644 --- a/test/CMakeLists.txt +++ 
b/test/CMakeLists.txt @@ -16,6 +16,7 @@ if(APR_USE_CUDA) buildTarget(testAPRCuda APRTestCuda.cpp) buildTarget(testComputeGradientCuda ComputeGradientCudaTest.cpp) buildTarget(testLocalIntensityScaleCuda LocalIntensityScaleCudaTest.cpp) + buildTarget(testFullPipelineCuda FullPipelineCudaTest.cpp) endif() if(APR_BUILD_EXAMPLES) diff --git a/test/ComputeGradientCudaTest.cpp b/test/ComputeGradientCudaTest.cpp index 83502a62..588c5ea3 100644 --- a/test/ComputeGradientCudaTest.cpp +++ b/test/ComputeGradientCudaTest.cpp @@ -314,7 +314,7 @@ namespace { // Generate random mesh using ImageType = uint16_t; - PixelData input_image = getRandInitializedMesh(11, 13, 15, 15, 20); + PixelData input_image = getRandInitializedMesh(33, 35, 37, 15, 20); PixelData &image_temp = input_image; PixelData grad_temp; // should be a down-sampled image diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index 9ba510e6..ca60fca3 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -2,13 +2,10 @@ * Created by Krzysztof Gonciarz 2018 */ #include -#include #include #include "data_structures/Mesh/PixelData.hpp" #include "algorithm/ComputeGradient.hpp" -#include "algorithm/ComputeGradientCuda.hpp" #include -#include "algorithm/APRConverter.hpp" #include "TestTools.hpp" namespace { @@ -641,77 +638,6 @@ namespace { ASSERT_TRUE(compare(m, expect, 0.01)); } - - // ======================= CUDA ======================================= - // ======================= CUDA ======================================= - // ======================= CUDA ======================================= - -#ifdef APR_USE_CUDA - - // TODO: This test will be fixed as soon as CUDA pipeline is updated. - // Currently turning it off to have testable rest of CUDA impl. 
-// TEST(ComputeThreshold, FULL_PIPELINE_TEST) { -// APRTimer timer(true); -// -// // Generate random mesh -// using ImageType = float; -// PixelData input_image = getRandInitializedMesh(310, 330, 32, 25); -// int maxLevel = ceil(std::log2(330)); -// -// PixelData &image_temp = input_image; -// -// PixelData grad_temp; // should be a down-sampled image -// grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); -// PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors -// local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// PixelData local_scale_temp2; -// local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// -// PixelData grad_temp_GPU; // should be a down-sampled image -// grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); -// PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors -// local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// PixelData local_scale_temp2_GPU; -// local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); -// -// -// APRParameters par; -// par.lambda = 3; -// par.Ip_th = 10; -// par.sigma_th = 0; -// par.sigma_th_max = 0; -// par.dx = 1; -// par.dy = 1; -// par.dz = 1; -// -// ComputeGradient computeGradient; -// LocalIntensityScale localIntensityScale; -// LocalParticleCellSet localParticleSet; -// -// // Calculate bspline on CPU -// PixelData mCpuImage(image_temp, true); -// timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); -// computeGradient.get_gradient(mCpuImage, grad_temp, local_scale_temp, par); -// 
localIntensityScale.get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); -// localParticleSet.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); -// timer.stop_timer(); -// -// // Calculate bspline on GPU -// PixelData mGpuImage(image_temp, true); -// timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); -// GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); -// gpt.doAll(); -// timer.stop_timer(); -// -// // Compare GPU vs CPU -// // allow some differences since float point diffs -// // TODO: It would be much better to count number of diffs with delta==1 and allow some of these -// EXPECT_TRUE(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0.01) < 29); -// } - - -#endif // APR_USE_CUDA - } int main(int argc, char **argv) { diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp new file mode 100644 index 00000000..31312080 --- /dev/null +++ b/test/FullPipelineCudaTest.cpp @@ -0,0 +1,79 @@ + +#include + +#include "algorithm/LocalIntensityScaleCuda.h" +#include "algorithm/LocalIntensityScale.hpp" +#include "algorithm/ComputeGradient.hpp" +#include "algorithm/ComputeGradientCuda.hpp" +#include "TestTools.hpp" +#include "data_structures/Mesh/PixelDataCuda.h" +#include "algorithm/APRConverter.hpp" + +namespace { +#ifdef APR_USE_CUDA + + TEST(ComputeThreshold, FULL_PIPELINE_TEST) { + APRTimer timer(true); + + // Generate random mesh + using ImageType = float; + PixelData input_image = getRandInitializedMesh(310, 330, 32, 25); + int maxLevel = ceil(std::log2(330)); + + PixelData grad_temp; // should be a down-sampled image + grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); + PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + 
PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + + PixelData grad_temp_GPU; // should be a down-sampled image + grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); + PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + PixelData local_scale_temp2_GPU; + local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + + // Prepare parameters + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate bspline on CPU + PixelData mCpuImage(input_image, true); + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + timer.stop_timer(); + + // Calculate bspline on GPU + PixelData mGpuImage(input_image, true); + +// timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); +// GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); +// gpt.doAll(); + + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + } + + +#endif // APR_USE_CUDA +} + + 
+int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file From ac2c22e54f32f40afb6fe5a9030cb9c6bf57ea96 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 20 Mar 2023 13:49:25 +0100 Subject: [PATCH 24/59] PixelDataDim updated with maximum dimension lenght and nuber of dimensions --- src/data_structures/Mesh/PixelData.hpp | 2 ++ test/MeshDataTest.cpp | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index 68de3b00..d3867ed7 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -37,6 +37,8 @@ struct PixelDataDim { constexpr PixelDataDim(size_t y, size_t x, size_t z) : y(y), x(x), z(z) {} size_t size() const { return y * x * z; } + size_t maxDimSize() const { return std::max(x, std::max(y, z)); } + int numOfDimensions() const { return (int)(x > 1) + (int)(y > 1) + (int)(z > 1); } PixelDataDim operator+(const PixelDataDim &rhs) const { return {y + rhs.y, x + rhs.x, z + rhs.z}; } PixelDataDim operator-(const PixelDataDim &rhs) const { return {y - rhs.y, x - rhs.x, z - rhs.z}; } diff --git a/test/MeshDataTest.cpp b/test/MeshDataTest.cpp index a3c4bec6..20b1bbe3 100644 --- a/test/MeshDataTest.cpp +++ b/test/MeshDataTest.cpp @@ -35,6 +35,7 @@ namespace { ASSERT_EQ(d.x, 20); ASSERT_EQ(d.z, 30); ASSERT_EQ(d.size(), 10*20*30); + ASSERT_EQ(d.maxDimSize(), 30); } { // adding int to all dims @@ -81,6 +82,16 @@ namespace { ASSERT_FALSE(x == z); ASSERT_TRUE(x != z); } + { // number of dimensions + const PixelDataDim x = {2, 3, 5}; + const PixelDataDim y = {2, 1, 5}; + const PixelDataDim z = {1, 4, 1}; + const PixelDataDim w = {1, 1, 1}; + ASSERT_EQ(x.numOfDimensions(), 3); + ASSERT_EQ(y.numOfDimensions(), 2); + ASSERT_EQ(z.numOfDimensions(), 1); + ASSERT_EQ(w.numOfDimensions(), 0); + } } TEST_F(VectorDataTest, InitTest) { From 
122a96a13524e956e9383f4263f1dc041cf081fe Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 20 Mar 2023 16:06:41 +0100 Subject: [PATCH 25/59] GradLisLevels test working now --- test/FullPipelineCudaTest.cpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 31312080..ede7ee12 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -9,16 +9,17 @@ #include "data_structures/Mesh/PixelDataCuda.h" #include "algorithm/APRConverter.hpp" + namespace { #ifdef APR_USE_CUDA - TEST(ComputeThreshold, FULL_PIPELINE_TEST) { + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS) { APRTimer timer(true); - // Generate random mesh + // Generate random mesh - keep it large enough to catch all possible computation errors using ImageType = float; - PixelData input_image = getRandInitializedMesh(310, 330, 32, 25); - int maxLevel = ceil(std::log2(330)); + PixelData input_image = getRandInitializedMesh(1000, 1000, 1000, 13); + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); PixelData grad_temp; // should be a down-sampled image grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); @@ -54,11 +55,7 @@ namespace { // Calculate bspline on GPU PixelData mGpuImage(input_image, true); - -// timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); -// GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); -// gpt.doAll(); - + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); From 6a5db358d6a79092bb669212458125a7c9efec6f Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 24 Mar 2023 13:02:12 +0100 
Subject: [PATCH 26/59] full pipeline tests fixed --- src/algorithm/ComputeGradientCuda.cu | 90 +++++++++++++------------ src/algorithm/ComputeGradientCuda.hpp | 2 +- src/algorithm/LocalIntensityScale.cu | 16 +++-- src/algorithm/LocalIntensityScaleCuda.h | 1 - src/algorithm/bsplineXdir.cuh | 2 +- src/algorithm/bsplineYdir.cuh | 2 +- src/algorithm/bsplineZdir.cuh | 2 +- src/data_structures/Mesh/PixelData.hpp | 8 +++ test/FullPipelineCudaTest.cpp | 58 +++++++++++++++- 9 files changed, 125 insertions(+), 56 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 87ebdaa5..4db49d4d 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -130,11 +130,11 @@ namespace { }; } - auto transferSpline(BsplineParams &aParams) { - ScopedCudaMemHandler bc1(aParams.bc1.get(), aParams.k0); - ScopedCudaMemHandler bc2(aParams.bc2.get(), aParams.k0); - ScopedCudaMemHandler bc3(aParams.bc3.get(), aParams.k0); - ScopedCudaMemHandler bc4(aParams.bc4.get(), aParams.k0); + auto transferSpline(BsplineParams &aParams, cudaStream_t aStream) { + ScopedCudaMemHandler bc1(aParams.bc1.get(), aParams.k0, aStream); + ScopedCudaMemHandler bc2(aParams.bc2.get(), aParams.k0, aStream); + ScopedCudaMemHandler bc3(aParams.bc3.get(), aParams.k0, aStream); + ScopedCudaMemHandler bc4(aParams.bc4.get(), aParams.k0, aStream); return std::pair { BsplineParamsCuda { @@ -164,10 +164,13 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc BsplineParamsCuda &px, BsplineParamsCuda &py, BsplineParamsCuda &pz, float *boundary, float bspline_offset, const APRParameters &par, cudaStream_t aStream) { + // TODO: Used PixelDataDim in all methods below and change input parameter from image to imageDim + runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream); runBsplineXdir(cudaImage, image.getDimension(), px, aStream); runBsplineZdir(cudaImage, image.getDimension(), pz, aStream); + 
runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream); @@ -195,6 +198,7 @@ public: } }; + template template class GpuProcessingTask::GpuProcessingTaskImpl { @@ -234,11 +238,11 @@ class GpuProcessingTask::GpuProcessingTaskImpl { public: - GpuProcessingTaskImpl(const PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) : - iCpuImage(image), + GpuProcessingTaskImpl(const PixelData &inputImage, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) : + iCpuImage(inputImage), iCpuLevels(levels), iStream(getStream()), - image (image, iStream), + image (inputImage, iStream), gradient (levels, iStream), local_scale_temp (levels, iStream), local_scale_temp2 (levels, iStream), @@ -247,19 +251,17 @@ public: iMaxLevel(maxLevel), // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. // Should be fixed when other parts of pipeline are ready. 
- params(prepareBsplineStuff((size_t)image.x_num, parameters.lambda, tolerance)), + params(prepareBsplineStuff((size_t)inputImage.x_num, parameters.lambda, tolerance)), bc1(params.bc1.get(), params.k0, iStream), bc2(params.bc2.get(), params.k0, iStream), bc3(params.bc3.get(), params.k0, iStream), bc4(params.bc4.get(), params.k0, iStream), - boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)image.x_num * (size_t)image.z_num}, + boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num}, boundary{nullptr, boundaryLen, iStream} { // std::cout << "\n=============== GpuProcessingTaskImpl ===================\n\n"; std::cout << iCpuImage << std::endl; std::cout << iCpuLevels << std::endl; - std::cout << "\n\n\n"; - } void sendDataToGpu() { @@ -286,13 +288,13 @@ public: // In principle this is OK and correct but would be faster (for processing series of same size images) if // they would be calculated in constructor of GpuProcessingTaskImpl class (once). 
BsplineParams px = prepareBsplineStuff(iCpuImage.x_num, iParameters.lambda, tolerance); - auto cudax = transferSpline(px); + auto cudax = transferSpline(px, iStream); auto splineCudaX = cudax.first; BsplineParams py = prepareBsplineStuff(iCpuImage.y_num, iParameters.lambda, tolerance); - auto cuday = transferSpline(py); + auto cuday = transferSpline(py, iStream); auto splineCudaY = cuday.first; BsplineParams pz = prepareBsplineStuff(iCpuImage.z_num, iParameters.lambda, tolerance); - auto cudaz = transferSpline(pz); + auto cudaz = transferSpline(pz, iStream); auto splineCudaZ = cudaz.first; getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), @@ -315,7 +317,7 @@ public: }; template -GpuProcessingTask::GpuProcessingTask(PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) +GpuProcessingTask::GpuProcessingTask(const PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) : impl{new GpuProcessingTaskImpl(image, levels, parameters, bspline_offset, maxLevel)} {std::cout << "GpuProcessingTask\n";} template @@ -359,28 +361,27 @@ template void cudaFilterBsplineFull(PixelData &input, float lambda, float tolerance, TypeOfRecBsplineFlags flags, int maxFilterLen) { cudaStream_t aStream = 0; - - ScopedCudaMemHandler, D2H | H2D> cudaInput(input); + ScopedCudaMemHandler, D2H | H2D> cudaInput(input, aStream); APRTimer timer(false); timer.start_timer("GpuDeviceTimeFull"); if (flags & BSPLINE_Y_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.y_num, lambda, tolerance, maxFilterLen); - auto cuda = transferSpline(p); + auto cuda = transferSpline(p, aStream); auto splineCuda = cuda.first; int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * input.x_num * input.z_num; - ScopedCudaMemHandler boundary(nullptr, boundaryLen); // allocate memory on device + ScopedCudaMemHandler boundary(nullptr, boundaryLen, aStream); // allocate 
memory on device runBsplineYdir(cudaInput.get(), input.getDimension(), splineCuda, boundary.get(), aStream); } if (flags & BSPLINE_X_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.x_num, lambda, tolerance, maxFilterLen); - auto cuda = transferSpline(p); + auto cuda = transferSpline(p, aStream); auto splineCuda = cuda.first; runBsplineXdir(cudaInput.get(), input.getDimension(), splineCuda, aStream); } if (flags & BSPLINE_Z_DIR) { BsplineParams p = prepareBsplineStuff((size_t)input.z_num, lambda, tolerance, maxFilterLen); - auto cuda = transferSpline(p); + auto cuda = transferSpline(p, aStream); auto splineCuda = cuda.first; runBsplineZdir(cudaInput.get(), input.getDimension(), splineCuda, aStream); } @@ -391,16 +392,18 @@ void cudaFilterBsplineFull(PixelData &input, float lambda, float tolera template void cudaInverseBspline(PixelData &, TypeOfInvBsplineFlags); template void cudaInverseBspline(PixelData &input, TypeOfInvBsplineFlags flags) { - ScopedCudaMemHandler, H2D | D2H> cudaInput(input); + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D | D2H> cudaInput(input, aStream); if (flags & INV_BSPLINE_Y_DIR) { - runInvBsplineYdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, 0); + runInvBsplineYdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, aStream); } if (flags & INV_BSPLINE_X_DIR) { - runInvBsplineXdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, 0); + runInvBsplineXdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, aStream); } if (flags & INV_BSPLINE_Z_DIR) { - runInvBsplineZdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, 0); + runInvBsplineZdir(cudaInput.get(), input.x_num, input.y_num, input.z_num, aStream); } } @@ -408,13 +411,14 @@ void cudaInverseBspline(PixelData &input, TypeOfInvBsplineFlags flags) template void computeLevelsCuda(const PixelData &, PixelData &, int, float, float, float, float); template void computeLevelsCuda(const PixelData &grad_temp, PixelData &local_scale_temp, 
int maxLevel, float relError, float dx, float dy, float dz) { - ScopedCudaMemHandler, H2D> cudaGrad(grad_temp); - ScopedCudaMemHandler, D2H | H2D> cudaLis(local_scale_temp); + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D> cudaGrad(grad_temp, aStream); + ScopedCudaMemHandler, D2H | H2D> cudaLis(local_scale_temp, aStream); float min_dim = std::min(dy, std::min(dx, dz)); float level_factor = pow(2, maxLevel) * min_dim; const float mult_const = level_factor/relError; - cudaStream_t aStream = 0; runComputeLevels(cudaGrad.get(), cudaLis.get(), grad_temp.mesh.size(), mult_const, aStream); } @@ -424,17 +428,17 @@ template void getGradient(PixelData &, PixelData &, PixelDat template void getGradient(PixelData &image, PixelData &grad_temp, PixelData &local_scale_temp, PixelData &local_scale_temp2, float bspline_offset, const APRParameters &par) { - ScopedCudaMemHandler, D2H | H2D> cudaImage(image); - ScopedCudaMemHandler, D2H | H2D> cudaGrad(grad_temp); - ScopedCudaMemHandler, D2H> cudalocal_scale_temp(local_scale_temp); - ScopedCudaMemHandler, D2H> cudalocal_scale_temp2(local_scale_temp2); + cudaStream_t aStream = 0; + ScopedCudaMemHandler, D2H | H2D> cudaImage(image, aStream); + ScopedCudaMemHandler, D2H | H2D> cudaGrad(grad_temp, aStream); + ScopedCudaMemHandler, D2H> cudalocal_scale_temp(local_scale_temp, aStream); + ScopedCudaMemHandler, D2H> cudalocal_scale_temp2(local_scale_temp2, aStream); int boundaryLen = (2 /*two first elements*/ + 2 /* two last elements */) * image.x_num * image.z_num; - ScopedCudaMemHandler boundary(nullptr, boundaryLen); + ScopedCudaMemHandler boundary(nullptr, boundaryLen, aStream); float tolerance = 0.0001; - // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. // Should be fixed when other parts of pipeline are ready. 
@@ -442,22 +446,24 @@ void getGradient(PixelData &image, PixelData &grad_temp, Pixel BsplineParams px = prepareBsplineStuff(image.x_num, par.lambda, tolerance); - auto cudax = transferSpline(px); + auto cudax = transferSpline(px, aStream); auto splineCudaX = cudax.first; BsplineParams py = prepareBsplineStuff(image.y_num, par.lambda, tolerance); - auto cuday = transferSpline(py); + auto cuday = transferSpline(py, aStream); auto splineCudaY = cuday.first; BsplineParams pz = prepareBsplineStuff(image.z_num, par.lambda, tolerance); - auto cudaz = transferSpline(pz); + auto cudaz = transferSpline(pz, aStream); auto splineCudaZ = cudaz.first; getGradientCuda(image, local_scale_temp, cudaImage.get(), cudaGrad.get(), cudalocal_scale_temp.get(), - splineCudaX, splineCudaY, splineCudaZ, boundary.get(), bspline_offset, par, 0); + splineCudaX, splineCudaY, splineCudaZ, boundary.get(), bspline_offset, par, aStream); } void cudaDownsampledGradient(PixelData &input, PixelData &grad, const float hx, const float hy, const float hz) { - ScopedCudaMemHandler, H2D | D2H> cudaInput(input); - ScopedCudaMemHandler, D2H> cudaGrad(grad); + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D | D2H> cudaInput(input, aStream); + ScopedCudaMemHandler, D2H> cudaGrad(grad, aStream); - runKernelGradient(cudaInput.get(), cudaGrad.get(), input.getDimension(), grad.getDimension(), hx, hy, hz, 0); + runKernelGradient(cudaInput.get(), cudaGrad.get(), input.getDimension(), grad.getDimension(), hx, hy, hz, aStream); } diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp index 1fcf088b..a8ebe1bf 100644 --- a/src/algorithm/ComputeGradientCuda.hpp +++ b/src/algorithm/ComputeGradientCuda.hpp @@ -42,7 +42,7 @@ class GpuProcessingTask { public: - GpuProcessingTask(PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel); + GpuProcessingTask(const PixelData &image, PixelData &levels, const APRParameters ¶meters, float 
bspline_offset, int maxLevel); ~GpuProcessingTask(); GpuProcessingTask(GpuProcessingTask&&); diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 64e4c710..2b5c186d 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -563,11 +563,11 @@ template void runLocalIntensityScalePipeline(const PixelData // TODO: should be moved somewhere template void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags, bool boundaryReflect) { - ScopedCudaMemHandler, H2D | D2H> cudaImage(image); - APRTimer timer(true); -// timer.start_timer("GpuDeviceTimeFull"); + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D | D2H> cudaImage(image, aStream); + runMean(cudaImage.get(), image.getDimension(), offset, offset, offset, flags, 0, boundaryReflect); -// timer.stop_timer(); } // explicit instantiation of handled types @@ -577,9 +577,11 @@ template void calcMean(PixelData&, int, TypeOfMeanFlags, bool); template void getLocalIntensityScale(PixelData &image, PixelData &temp, const APRParameters &par) { - ScopedCudaMemHandler, H2D | D2H> cudaImage(image); - ScopedCudaMemHandler, D2H> cudaTemp(temp); + cudaStream_t aStream = 0; + + ScopedCudaMemHandler, H2D | D2H> cudaImage(image, aStream); + ScopedCudaMemHandler, D2H> cudaTemp(temp, aStream); - runLocalIntensityScalePipeline(image, par, cudaImage.get(), cudaTemp.get(), 0); + runLocalIntensityScalePipeline(image, par, cudaImage.get(), cudaTemp.get(), aStream); } template void getLocalIntensityScale(PixelData&, PixelData&, const APRParameters&); diff --git a/src/algorithm/LocalIntensityScaleCuda.h b/src/algorithm/LocalIntensityScaleCuda.h index 135e5927..f572d5e5 100644 --- a/src/algorithm/LocalIntensityScaleCuda.h +++ b/src/algorithm/LocalIntensityScaleCuda.h @@ -15,7 +15,6 @@ constexpr TypeOfMeanFlags MEAN_X_DIR = 0x02; constexpr TypeOfMeanFlags MEAN_Z_DIR = 0x04; constexpr TypeOfMeanFlags MEAN_ALL_DIR = MEAN_Y_DIR | MEAN_X_DIR | MEAN_Z_DIR; -// 
TODO: remember to revert by default boundaryReflect=true (or check with CPU code what is current 'default'). template void calcMean(PixelData &image, int offset, TypeOfMeanFlags flags = MEAN_ALL_DIR, bool boundaryReflect = false); diff --git a/src/algorithm/bsplineXdir.cuh b/src/algorithm/bsplineXdir.cuh index 89fd3fc6..1df52a80 100644 --- a/src/algorithm/bsplineXdir.cuh +++ b/src/algorithm/bsplineXdir.cuh @@ -137,7 +137,7 @@ void runBsplineXdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaSt // access it but this is enough for us to know that somewhere in one on more kernels overflow was detected. bool isErrorDetected = false; { - ScopedCudaMemHandler error(&isErrorDetected, 1); + ScopedCudaMemHandler error(&isErrorDetected, 1, aStream); bsplineXdir <<>>(cudaImage, dim, p, error.get()); } diff --git a/src/algorithm/bsplineYdir.cuh b/src/algorithm/bsplineYdir.cuh index e8aa5bdf..b487cb63 100644 --- a/src/algorithm/bsplineYdir.cuh +++ b/src/algorithm/bsplineYdir.cuh @@ -249,7 +249,7 @@ void runBsplineYdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, float size_t sharedMemSize = (2 /*bc vectors*/) * (p.k0) * sizeof(float) + numOfThreads * (p.k0) * sizeof(float); bool isErrorDetected = false; { - ScopedCudaMemHandler error(&isErrorDetected, 1); + ScopedCudaMemHandler error(&isErrorDetected, 1, aStream); bsplineYdirBoundary <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get()); sharedMemSize = numOfThreads * blockWidth * sizeof(float); bsplineYdirProcess <<< numBlocks, threadsPerBlock, sharedMemSize, aStream >>>(cudaImage, dim, p, boundary, error.get()); diff --git a/src/algorithm/bsplineZdir.cuh b/src/algorithm/bsplineZdir.cuh index c8ba6688..43550ff8 100644 --- a/src/algorithm/bsplineZdir.cuh +++ b/src/algorithm/bsplineZdir.cuh @@ -139,7 +139,7 @@ void runBsplineZdir(T *cudaImage, PixelDataDim dim, BsplineParamsCuda &p, cudaSt // access it but this is enough for us to know that somewhere in one on 
more kernels overflow was detected. bool isErrorDetected = false; { - ScopedCudaMemHandler error(&isErrorDetected, 1); + ScopedCudaMemHandler error(&isErrorDetected, 1, aStream); bsplineZdir <<>> (cudaImage, dim, p, error.get()); } diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index d3867ed7..9b68458d 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -725,6 +725,10 @@ public : init(y_num_ds, x_num_ds, z_num_ds, aUsePinnedMemory); } + void initDownsampled(const PixelDataDim &dim, bool aUsePinnedMemory) { + initDownsampled(dim.y, dim.x, dim.z, aUsePinnedMemory); + } + /** * Initializes mesh with size of half of provided dimensions (rounding up if not divisible by 2) and initialize values * @param aSizeOfY @@ -740,6 +744,10 @@ public : initWithValue(y_num_ds, x_num_ds, z_num_ds, aInitVal, aUsePinnedMemory); } + void initDownsampled(const PixelDataDim &dim, T aInitVal, bool aUsePinnedMemory) { + initDownsampled(dim.y, dim.x, dim.z, aInitVal, aUsePinnedMemory); + } + /** * Initializes mesh with size of half of provided mesh dimensions (rounding up if not divisible by 2) * @param aMesh - mesh used to get dimensions diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index ede7ee12..6528227a 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -18,7 +18,7 @@ namespace { // Generate random mesh - keep it large enough to catch all possible computation errors using ImageType = float; - PixelData input_image = getRandInitializedMesh(1000, 1000, 1000, 13); + PixelData input_image = getRandInitializedMesh(100, 100, 100, 13); int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); PixelData grad_temp; // should be a down-sampled image @@ -65,7 +65,61 @@ namespace { EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); } + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_GPT) { + APRTimer 
timer(true); + + // Generate random mesh - keep it large enough to catch all possible computation errors + using ImageType = float; + constexpr PixelDataDim dim{333, 1000, 333}; + PixelData input_image = getRandInitializedMesh(dim, 99, 0, false); + int maxLevel = ceil(std::log2(dim.maxDimSize())); + PixelData grad_temp; // should be a down-sampled image + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp.initDownsampled(dim,false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + PixelData grad_temp_GPU; // should be a down-sampled image + grad_temp_GPU.initDownsampled(dim, 0, false); + PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp_GPU.initDownsampled(dim, false); + PixelData local_scale_temp2_GPU; + local_scale_temp2_GPU.initDownsampled(dim, false); + + // Prepare parameters + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate bspline on CPU + PixelData mCpuImage(input_image, true); + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + timer.stop_timer(); + + + // Calculate bspline on GPU + PixelData mGpuImage(input_image, true); + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + + { + GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); + gpt.doAll(); + } + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same 
result + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + } #endif // APR_USE_CUDA } @@ -73,4 +127,4 @@ namespace { int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -} \ No newline at end of file +} From 4088e9d8bc043af0451807804e9dc0715c8fab89 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 20 Jul 2023 13:31:38 +0200 Subject: [PATCH 27/59] Changes from old branches added + modified to GenInfo instead of APRAccess --- src/algorithm/OVPC.h | 4 +- test/CMakeLists.txt | 1 + test/PullingSchemeTest.cpp | 303 +++++++++++++++++++++++++++++-------- 3 files changed, 246 insertions(+), 62 deletions(-) diff --git a/src/algorithm/OVPC.h b/src/algorithm/OVPC.h index f8e975ac..6925f325 100644 --- a/src/algorithm/OVPC.h +++ b/src/algorithm/OVPC.h @@ -9,7 +9,7 @@ #include #include "data_structures/Mesh/PixelData.hpp" -#include "data_structures/APR/APRAccess.hpp" +#include "data_structures/APR/GenInfo.hpp" #include "algorithm/PullingScheme.hpp" @@ -33,7 +33,7 @@ class OVPC { public: template - OVPC(const APRAccess &aAprAccess, const PixelData &aInputLevels) { + OVPC(const GenInfo &aAprAccess, const PixelData &aInputLevels) { // Level Max is one less since we are working on downsampled version iLevelMax = aAprAccess.l_max - 1; iLevelMin = aAprAccess.l_min; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 193ce405..aeb66421 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -17,6 +17,7 @@ if(APR_USE_CUDA) buildTarget(testComputeGradientCuda ComputeGradientCudaTest.cpp) buildTarget(testLocalIntensityScaleCuda LocalIntensityScaleCudaTest.cpp) buildTarget(testFullPipelineCuda FullPipelineCudaTest.cpp) + buildTarget(testPullingSchemeCuda PullingSchemeTest.cpp) endif() if(APR_BUILD_EXAMPLES) diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index f72897cd..a84fcdf4 100644 --- a/test/PullingSchemeTest.cpp +++ b/test/PullingSchemeTest.cpp @@ -4,13 +4,13 @@ #include 
#include "data_structures/Mesh/PixelData.hpp" -//TODO: only APRAccess.hpp should be included here but currently because of dependencies it does not work :( -#include "data_structures/APR/APR.hpp" -#include "algorithm/APRConverter.hpp" -//#include "data_structures/APR/APRAccess.hpp" +#include "data_structures/APR/access/APRAccessStructures.hpp" #include "algorithm/PullingScheme.hpp" +#include "algorithm/OVPC.h" #include "TestTools.hpp" + #ifdef APR_USE_CUDA +#include "algorithm/PullingSchemeCuda.hpp" #include "algorithm/ComputeGradientCuda.hpp" #endif @@ -18,7 +18,7 @@ namespace { template PixelData generateLevels(const PixelData &dimsMesh, int maxLevel) { PixelData levels(dimsMesh, false); - for (size_t i = 0; i < levels.mesh.size(); ++i) { + for (int i = 0; i < levels.mesh.size(); ++i) { levels.mesh[i] = ( i/2 ) % (maxLevel + 2); } // std::cout << "LEVELS: " << std::endl; @@ -26,86 +26,269 @@ namespace { return levels; } -// void printParticleCellTree(const std::vector> &particleCellTree) { -// for (int l = 0; l < particleCellTree.size(); ++l) { -// auto &tree = particleCellTree[l]; -// std::cout << "------ 1level=" << l << " " << tree << std::endl; -// tree.printMesh(3,0); -// } -// } + template + void printParticleCellTree(const std::vector> &particleCellTree) { + for (int l = 0; l < particleCellTree.size(); ++l) { + auto &tree = particleCellTree[l]; +// std::cout << "-- level = " << l << ", " << tree << std::endl; + tree.printMeshT(3,0); + } + } - TEST(PullingSchemeTest, Init) { + template + inline int compareParticleCellTrees(const PixelData &expected, const PixelData &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { + int cnt = 0; + int numOfParticles = 0; + for (size_t i = 0; i < expected.mesh.size(); ++i) { + if (expected.mesh[i] < 8) { + if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError || std::isnan(expected.mesh[i]) || + std::isnan(tested.mesh[i])) { + if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { + std::cout << 
"ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; + } + cnt++; + } + if (expected.mesh[i] > 0) numOfParticles++; + } + } + std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << " Particles:" << numOfParticles << std::endl; + return cnt; + } - GenInfo aprInfo; + TEST(PullingSchemeTest, NEWvsOLD) { + GenInfo access; + access.l_max = 9; + access.l_min = 1; + access.org_dims[0] = std::pow(2, access.l_max); + access.org_dims[1] = std::pow(2, access.l_max); + access.org_dims[2] = std::pow(2, access.l_max); + int l = access.l_max - 1; - aprInfo.l_max = 4; - aprInfo.l_min = 2; - aprInfo.org_dims[0] = 8; - aprInfo.org_dims[1] = 16; - aprInfo.org_dims[2] = 1; + PixelData levels = getRandInitializedMesh(access.org_dims[0]/2,access.org_dims[1]/2,access.org_dims[2]/2, access.l_max + 1); + PixelData levels2(levels, true); +// float values[] = {1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1}; +// initFromZYXarray(levels, values); - PullingScheme ps; - ps.initialize_particle_cell_tree(aprInfo); - std::vector> &pctree = ps.getParticleCellTree(); - - // TEST: check if zeroed and correct number of levels - ASSERT_EQ(aprInfo.l_max, pctree.size()); // all levels [0, access.level_max - 1] - for (size_t l = 0; l < pctree.size(); ++l) { - auto &tree = pctree[l]; - for (auto &e : tree.mesh) { - ASSERT_EQ(0, e); - } - } +// levels.printMeshT(3, 1); - // Generate mesh with test levels - PixelData levels = generateLevels(pctree[aprInfo.l_max - 1], aprInfo.l_max); + APRTimer t(true); - // Fill particle cell tree with levels - int l_max = aprInfo.l_max - 1; - int l_min = aprInfo.l_min; + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(access); + int l_max = access.l_max - 1; + int l_min = access.l_min; ps.fill(l_max, levels); - - PixelData levelsDS; + PixelData levelsDS; for(int l_ = l_max 
- 1; l_ >= l_min; l_--){ - //down sample the resolution level k, using a max reduction downsample(levels, levelsDS, [](const float &x, const float &y) -> float { return std::max(x, y); }, [](const float &x) -> float { return x; }, true); ps.fill(l_,levelsDS); levels.swap(levelsDS); } + ps.pulling_scheme_main(); + t.stop_timer(); + + t.start_timer("OVPC1"); + OVPC nps(access, levels2); + t.stop_timer(); + t.start_timer("OVPC2"); + nps.generateTree(); + t.stop_timer(); + +// printParticleCellTree(nps.getParticleCellTree()); +// printParticleCellTree(ps.getParticleCellTree()); + + for (l = l_min; l <= l_max; ++l) + compareParticleCellTrees(ps.getParticleCellTree()[l], nps.getParticleCellTree()[l]); + + } + +// TEST(PullingSchemeTest, Init) { +// +// GenInfo access; +// access.l_max = 5; +// access.l_min = 1; +// access.org_dims[0] = 32; +// access.org_dims[1] = 1; +// access.org_dims[2] = 1; // +// PullingScheme ps; +// ps.initialize_particle_cell_tree(access); +// std::vector> &pctree = ps.getParticleCellTree(); +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> Initialized tree:\n"; // printParticleCellTree(pctree); -// ps.fill_neighbours(l_max); -// pctree[l_max].printMesh(3, 0); +// std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; +// +// // TEST: check if zeroed and correct number of levels +// ASSERT_EQ(access.l_max, pctree.size()); // all levels [0, access.level_max - 1] +// for (int l = 0; l < pctree.size(); ++l) { +// auto &tree = pctree[l]; +// for (auto &e : tree.mesh) { +// ASSERT_EQ(0, e); +// } +// } +// +// // Generate mesh with test levels +// PixelData levels(pctree.back(), false);// = generateLevels(pctree[access.l_max - 1], access.l_max); +//// float values[] = {4, 1, 1, 1, 1, 1, 1, 2}; +// float values[] = {1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1}; +// initFromZYXarray(levels, values); +// +// +// OVPC nps(access, levels); +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS1:\n"; +// printParticleCellTree(nps.getParticleCellTree()); +// 
std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS1:\n"; +// nps.generateTree(); +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS2:\n"; +// printParticleCellTree(nps.getParticleCellTree()); +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS2:\n"; +// // Fill particle cell tree with levels +// int l_max = access.l_max - 1; +// int l_min = access.l_min; +// ps.fill(l_max, levels); +// +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> LEVELS:\n"; +// levels.printMeshT(3,0); +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> LEVELS:\n"; +// +// PixelData levelsDS; +// for(int l_ = l_max - 1; l_ >= l_min; l_--){ +// //down sample the resolution level k, using a max reduction +// downsample(levels, levelsDS, +// [](const float &x, const float &y) -> float { return std::max(x, y); }, +// [](const float &x) -> float { return x; }, true); +// levelsDS.printMeshT(3, 0); +// ps.fill(l_,levelsDS); +// levelsDS.printMeshT(3,0); +// levels.swap(levelsDS); +// } +// +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> Filled tree:\n"; +// printParticleCellTree(pctree); +// std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; +// +//// ps.fill_neighbours(l_max); +//// pctree[l_max].printMesh(3, 0); +// +// // ps.pulling_scheme_main(); +// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> MAIN tree:\n"; // printParticleCellTree(pctree); - } +// std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; +// +// access.initialize_structure_from_particle_cell_tree(false, ps.getParticleCellTree()); +// std::cout << "NUM OF PARTICLES: " << access.get_total_number_particles() << std::endl; +// +// +// APRIterator apr_iterator(access); +// std::cout << "Total number of particles: " << apr_iterator.total_number_particles() << std::endl; +// +// int prev = 0; +// for (unsigned int level = apr_iterator.level_min(); level <= apr_iterator.level_max(); ++level) { +// std::cout << "Level: " << level << std::endl; +// int w = (int) (std::pow(2, 5-level) * 3); +// for (int z = 0; z < apr_iterator.spatial_index_z_max(level); ++z) { +// 
for (int x = 0; x < apr_iterator.spatial_index_x_max(level); ++x) { +// for (apr_iterator.set_new_lzx(level, z, x); apr_iterator.global_index() < apr_iterator.end_index; apr_iterator.set_iterator_to_particle_next_particle()) { +// for (int i = prev; i < apr_iterator.y(); ++i ) std::cout << std::setw(w) << "."; +// std::cout << std::setw(w) << apr_iterator.y(); +// prev = apr_iterator.y() + 1; +// } +// for (int pp = prev; pp < apr_iterator.spatial_index_y_max(level); ++pp) +// std::cout << std::setw(w) << "."; +// +// prev = 0; +// std::cout << std::endl; +// } +// std::cout << std::endl; +// } +// } +// +// } + #ifdef APR_USE_CUDA - TEST(PullingSchemeTest, computeLevels) { - using ImgType = float; - const int maxLevel = 3; - const float relError = 0.1; +// TEST(PullingSchemeTest, computeLevels) { +// using ImgType = float; +// const int maxLevel = 3; +// const float relError = 0.1; +// +// PixelData grad = getRandInitializedMesh(10, 20, 33); +// PixelData localIntensityScaleCpu = getRandInitializedMesh(10, 20, 33); +// +// PixelData localIntensityScaleGpu(localIntensityScaleCpu, true); +// PixelData elo(localIntensityScaleCpu, true); +// APRTimer timer(true); +// +// timer.start_timer("CPU Levels"); +// APRConverter().computeLevels(grad, localIntensityScaleCpu, maxLevel, relError); +// timer.stop_timer(); +// +// timer.start_timer("GPU Levels"); +// computeLevelsCuda(grad, localIntensityScaleGpu, maxLevel, relError); +// timer.stop_timer(); +// +// EXPECT_EQ(compareMeshes(localIntensityScaleCpu, localIntensityScaleGpu), 0); +// } - PixelData grad = getRandInitializedMesh(10, 20, 33); - PixelData localIntensityScaleCpu = getRandInitializedMesh(10, 20, 33); - PixelData localIntensityScaleGpu(localIntensityScaleCpu, true); - PixelData elo(localIntensityScaleCpu, true); - APRTimer timer(true); - LocalParticleCellSet localParticleCellSet; + TEST(PullingSchemeTest, DS) { + GenInfo access; + access.l_max = 11; + access.l_min = 1; + access.org_dims[0] = std::pow(2, 
access.l_max)/2; + access.org_dims[1] = std::pow(2, access.l_max)/2; + access.org_dims[2] = std::pow(2, access.l_max); - timer.start_timer("CPU PS FULL"); - localParticleCellSet.computeLevels(grad, localIntensityScaleCpu, maxLevel, relError,1,1,1); - timer.stop_timer(); - timer.start_timer("GPU PS FULL"); - computeLevelsCuda(grad, localIntensityScaleGpu, maxLevel, relError); - timer.stop_timer(); + PixelData levels = getRandInitializedMesh(access.org_dims[0]/2,access.org_dims[1]/2,access.org_dims[2]/2, access.l_max + 1); + PixelData levels2(levels, true); + + // PixelData levels(16,1,1); +// float values[] = {4, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 2}; +// initFromZYXarray(levels, values); + + APRTimer t(true); + if (false) { + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(access); + int l_max = access.l_max - 1; + int l_min = access.l_min; + ps.fill(l_max, levels2); + PixelData levelsDS; + for (int l_ = l_max - 1; l_ >= l_min; l_--) { + downsample(levels, levelsDS, + [](const float &x, const float &y) -> float { return std::max(x, y); }, + [](const float &x) -> float { return x; }, true); + ps.fill(l_, levelsDS); + levels2.swap(levelsDS); + } + t.stop_timer(); + } + { + t.start_timer("CUDA"); + int levelMax = access.l_max - 1; + int levelMin = access.l_min; + PixelData ds(levels.y_num, levels.x_num, levels.z_num * (levelMax - levelMin + 1), 0); + std::cout << levels << std::endl; +// std::cout << ds << std::endl; + computeOVPC(levels, ds, levelMin, levelMax); +// ds.printMeshT(3,1); + t.stop_timer(); + } + { + t.start_timer("OVPC1"); + OVPC nps(access, levels); + nps.generateTree(); + t.stop_timer(); +// printParticleCellTree(nps.getParticleCellTree()); + } + } - EXPECT_EQ(compareMeshes(localIntensityScaleCpu, localIntensityScaleGpu), 0); - } #endif } From b8f250404dfe697eaea8a5730927f2dfde00c668 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 21 Jul 2023 13:00:56 +0200 Subject: [PATCH 28/59] Added debug printout to 
GenInfo --- src/data_structures/APR/GenInfo.hpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/data_structures/APR/GenInfo.hpp b/src/data_structures/APR/GenInfo.hpp index f8fd090e..ba8ccb3a 100644 --- a/src/data_structures/APR/GenInfo.hpp +++ b/src/data_structures/APR/GenInfo.hpp @@ -5,6 +5,11 @@ #ifndef LIBAPR_GENINFO_HPP #define LIBAPR_GENINFO_HPP + +#include +#include +#include + //Note this function sets up the domain for the APR for a given input size. class GenInfo { @@ -97,6 +102,25 @@ class GenInfo { z_num[l] = ceil(z_org / cellSize); } } + + friend std::ostream & operator<<(std::ostream &os, const GenInfo &gi) { + os << "GenInfo {\n"; + os << " Original dimensions(y/x/z): [" << gi.org_dims[0] << ", " << gi.org_dims[1] << ", " << gi.org_dims[2] << "]\n"; + os << " Number of dimensions: " << static_cast(gi.number_dimensions) << "\n"; + os << " l_min, l_max: {" << gi.l_min << " - " << gi.l_max << "}\n"; + os << " total number of particles: " << gi.total_number_particles << "\n"; + os << " y_num, x_num, z_num:\n"; + for (int l = gi.l_min; l <= gi.l_max; ++l) { + os << " level [" << l << "] = " << gi.y_num[l] << ", " << gi.x_num[l] << ", " << gi.z_num[l] << "\n"; + } + os << " level_size:\n"; + for (int l = gi.l_min; l <= gi.l_max; ++l) { + os << " level " << l << ": " << gi.level_size[l] << "\n"; + } + os << "}"; + + return os; + } }; From 6400a9a83a07343cdc1b7c979d4bf1478e858137 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 11 Aug 2023 15:35:05 +0200 Subject: [PATCH 29/59] Moved old CUDA tests to new file --- test/CMakeLists.txt | 2 +- test/PullingSchemeCudaTest.cpp | 93 ++++++++++++++++++++++++++++++++++ test/PullingSchemeTest.cpp | 84 ------------------------------ 3 files changed, 94 insertions(+), 85 deletions(-) create mode 100644 test/PullingSchemeCudaTest.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index aeb66421..e1f2817e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -17,7 
+17,7 @@ if(APR_USE_CUDA) buildTarget(testComputeGradientCuda ComputeGradientCudaTest.cpp) buildTarget(testLocalIntensityScaleCuda LocalIntensityScaleCudaTest.cpp) buildTarget(testFullPipelineCuda FullPipelineCudaTest.cpp) - buildTarget(testPullingSchemeCuda PullingSchemeTest.cpp) + buildTarget(testPullingSchemeCuda PullingSchemeCudaTest.cpp) endif() if(APR_BUILD_EXAMPLES) diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp new file mode 100644 index 00000000..5dedc414 --- /dev/null +++ b/test/PullingSchemeCudaTest.cpp @@ -0,0 +1,93 @@ +#include + +#include "algorithm/PullingScheme.hpp" +#include "algorithm/OVPC.h" + +#include "algorithm/PullingSchemeCuda.hpp" +#include "algorithm/ComputeGradientCuda.hpp" + +#include "TestTools.hpp" + +// TEST(PullingSchemeTest, computeLevels) { +// using ImgType = float; +// const int maxLevel = 3; +// const float relError = 0.1; +// +// PixelData grad = getRandInitializedMesh(10, 20, 33); +// PixelData localIntensityScaleCpu = getRandInitializedMesh(10, 20, 33); +// +// PixelData localIntensityScaleGpu(localIntensityScaleCpu, true); +// PixelData elo(localIntensityScaleCpu, true); +// APRTimer timer(true); +// +// timer.start_timer("CPU Levels"); +// APRConverter().computeLevels(grad, localIntensityScaleCpu, maxLevel, relError); +// timer.stop_timer(); +// +// timer.start_timer("GPU Levels"); +// computeLevelsCuda(grad, localIntensityScaleGpu, maxLevel, relError); +// timer.stop_timer(); +// +// EXPECT_EQ(compareMeshes(localIntensityScaleCpu, localIntensityScaleGpu), 0); +// } + + + +TEST(PullingSchemeTest, DS) { + GenInfo access; + access.l_max = 11; + access.l_min = 1; + access.org_dims[0] = std::pow(2, access.l_max)/2; + access.org_dims[1] = std::pow(2, access.l_max)/2; + access.org_dims[2] = std::pow(2, access.l_max); + + + PixelData levels = getRandInitializedMesh(access.org_dims[0]/2,access.org_dims[1]/2,access.org_dims[2]/2, access.l_max + 1); + PixelData levels2(levels, true); + + // PixelData 
levels(16,1,1); + // float values[] = {4, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 2}; + // initFromZYXarray(levels, values); + + APRTimer t(true); + if (false) { + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(access); + int l_max = access.l_max - 1; + int l_min = access.l_min; + ps.fill(l_max, levels2); + PixelData levelsDS; + for (int l_ = l_max - 1; l_ >= l_min; l_--) { + downsample(levels, levelsDS, + [](const float &x, const float &y) -> float { return std::max(x, y); }, + [](const float &x) -> float { return x; }, true); + ps.fill(l_, levelsDS); + levels2.swap(levelsDS); + } + t.stop_timer(); + } + { + t.start_timer("CUDA"); + int levelMax = access.l_max - 1; + int levelMin = access.l_min; + PixelData ds(levels.y_num, levels.x_num, levels.z_num * (levelMax - levelMin + 1), 0); + std::cout << levels << std::endl; + // std::cout << ds << std::endl; + computeOVPC(levels, ds, levelMin, levelMax); + // ds.printMeshT(3,1); + t.stop_timer(); + } + { + t.start_timer("OVPC1"); + OVPC nps(access, levels); + nps.generateTree(); + t.stop_timer(); + // printParticleCellTree(nps.getParticleCellTree()); + } +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index a84fcdf4..50fef13c 100644 --- a/test/PullingSchemeTest.cpp +++ b/test/PullingSchemeTest.cpp @@ -9,10 +9,6 @@ #include "algorithm/OVPC.h" #include "TestTools.hpp" -#ifdef APR_USE_CUDA -#include "algorithm/PullingSchemeCuda.hpp" -#include "algorithm/ComputeGradientCuda.hpp" -#endif namespace { template @@ -209,87 +205,7 @@ namespace { // // } -#ifdef APR_USE_CUDA -// TEST(PullingSchemeTest, computeLevels) { -// using ImgType = float; -// const int maxLevel = 3; -// const float relError = 0.1; -// -// PixelData grad = getRandInitializedMesh(10, 20, 33); -// PixelData localIntensityScaleCpu = getRandInitializedMesh(10, 20, 33); -// -// PixelData 
localIntensityScaleGpu(localIntensityScaleCpu, true); -// PixelData elo(localIntensityScaleCpu, true); -// APRTimer timer(true); -// -// timer.start_timer("CPU Levels"); -// APRConverter().computeLevels(grad, localIntensityScaleCpu, maxLevel, relError); -// timer.stop_timer(); -// -// timer.start_timer("GPU Levels"); -// computeLevelsCuda(grad, localIntensityScaleGpu, maxLevel, relError); -// timer.stop_timer(); -// -// EXPECT_EQ(compareMeshes(localIntensityScaleCpu, localIntensityScaleGpu), 0); -// } - - - - TEST(PullingSchemeTest, DS) { - GenInfo access; - access.l_max = 11; - access.l_min = 1; - access.org_dims[0] = std::pow(2, access.l_max)/2; - access.org_dims[1] = std::pow(2, access.l_max)/2; - access.org_dims[2] = std::pow(2, access.l_max); - - - PixelData levels = getRandInitializedMesh(access.org_dims[0]/2,access.org_dims[1]/2,access.org_dims[2]/2, access.l_max + 1); - PixelData levels2(levels, true); - - // PixelData levels(16,1,1); -// float values[] = {4, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 2}; -// initFromZYXarray(levels, values); - - APRTimer t(true); - if (false) { - t.start_timer("PS1"); - PullingScheme ps; - ps.initialize_particle_cell_tree(access); - int l_max = access.l_max - 1; - int l_min = access.l_min; - ps.fill(l_max, levels2); - PixelData levelsDS; - for (int l_ = l_max - 1; l_ >= l_min; l_--) { - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, true); - ps.fill(l_, levelsDS); - levels2.swap(levelsDS); - } - t.stop_timer(); - } - { - t.start_timer("CUDA"); - int levelMax = access.l_max - 1; - int levelMin = access.l_min; - PixelData ds(levels.y_num, levels.x_num, levels.z_num * (levelMax - levelMin + 1), 0); - std::cout << levels << std::endl; -// std::cout << ds << std::endl; - computeOVPC(levels, ds, levelMin, levelMax); -// ds.printMeshT(3,1); - t.stop_timer(); - } - { - t.start_timer("OVPC1"); - OVPC nps(access, levels); - 
nps.generateTree(); - t.stop_timer(); -// printParticleCellTree(nps.getParticleCellTree()); - } - } -#endif } int main(int argc, char **argv) { From 4b35b8eac84364ca56b2ea3fecd085c722556101 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 11 Aug 2023 15:35:49 +0200 Subject: [PATCH 30/59] Moved old CUDA tests to new file --- test/PullingSchemeCudaTest.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index 5dedc414..c956a53f 100644 --- a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -8,7 +8,7 @@ #include "TestTools.hpp" -// TEST(PullingSchemeTest, computeLevels) { +// TEST(PullingSchemeCudaTest, computeLevels) { // using ImgType = float; // const int maxLevel = 3; // const float relError = 0.1; @@ -33,7 +33,7 @@ -TEST(PullingSchemeTest, DS) { +TEST(PullingSchemeCudaTest, DS) { GenInfo access; access.l_max = 11; access.l_min = 1; From 1ed5d4f91ca708ef75f408a348d40fb9578ac415 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 30 Oct 2023 12:43:34 +0100 Subject: [PATCH 31/59] Added CUDA_ARCHITECTURES set to OFF (keep current behaviour) to suppress warning --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4513e07f..d0aee009 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -237,6 +237,7 @@ if(APR_BUILD_STATIC_LIB) # generate static library used as a intermediate step in generating fat lib set(STATIC_TARGET_NAME staticLib) add_library(${STATIC_TARGET_NAME} STATIC $ ${APR_CUDA_SOURCE_FILES}) + set_property(TARGET ${STATIC_TARGET_NAME} PROPERTY CUDA_ARCHITECTURES OFF) target_compile_features(${STATIC_TARGET_NAME} PUBLIC cxx_std_14) set_target_properties(${STATIC_TARGET_NAME} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) set_target_properties(${STATIC_TARGET_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION OFF) @@ -258,7 +259,7 @@ if(APR_BUILD_SHARED_LIB) # generate fat shared 
library set(SHARED_TARGET_NAME sharedLib) add_library(${SHARED_TARGET_NAME} SHARED $ ${APR_CUDA_SOURCE_FILES}) - + set_property(TARGET ${SHARED_TARGET_NAME} PROPERTY CUDA_ARCHITECTURES OFF) target_include_directories(${SHARED_TARGET_NAME} PUBLIC $ $) set_target_properties(${SHARED_TARGET_NAME} PROPERTIES OUTPUT_NAME ${LIBRARY_NAME}) set_target_properties(${SHARED_TARGET_NAME} PROPERTIES LIBRARY_OUTPUT_NAME ${LIBRARY_NAME}) From 93ac1206537dec79cc193571134474581ecaa410 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 8 Nov 2023 17:07:39 +0100 Subject: [PATCH 32/59] Temporary test updated to print particles using LinearAccess iterator --- src/data_structures/APR/GenInfo.hpp | 10 ++ test/PullingSchemeTest.cpp | 204 +++++++++++++++++++++++++++- test/TestTools.hpp | 2 +- 3 files changed, 214 insertions(+), 2 deletions(-) diff --git a/src/data_structures/APR/GenInfo.hpp b/src/data_structures/APR/GenInfo.hpp index ba8ccb3a..e506100a 100644 --- a/src/data_structures/APR/GenInfo.hpp +++ b/src/data_structures/APR/GenInfo.hpp @@ -34,6 +34,11 @@ class GenInfo { std::vector level_size; // precomputation of the size of each level, used by the iterators. 
+ //initialize the information given the original dimensions + void init(const PixelDataDim &dim) { + init(dim.y, dim.x, dim.z); + } + //initialize the information given the original dimensions void init(uint64_t y_org,uint64_t x_org,uint64_t z_org){ @@ -69,6 +74,11 @@ class GenInfo { } } + //initialize the information given the original dimensions + void init_tree(const PixelDataDim &dim){ + init_tree(dim.y, dim.x, dim.z); + } + //initialize the information given the original dimensions void init_tree(uint64_t y_org,uint64_t x_org,uint64_t z_org){ diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index 50fef13c..83d97366 100644 --- a/test/PullingSchemeTest.cpp +++ b/test/PullingSchemeTest.cpp @@ -8,6 +8,7 @@ #include "algorithm/PullingScheme.hpp" #include "algorithm/OVPC.h" #include "TestTools.hpp" +#include "algorithm/APRConverter.hpp" namespace { @@ -24,7 +25,7 @@ namespace { template void printParticleCellTree(const std::vector> &particleCellTree) { - for (int l = 0; l < particleCellTree.size(); ++l) { + for (uint64_t l = 0; l < particleCellTree.size(); ++l) { auto &tree = particleCellTree[l]; // std::cout << "-- level = " << l << ", " << tree << std::endl; tree.printMeshT(3,0); @@ -51,6 +52,207 @@ namespace { return cnt; } + // ------------------------------------------------------------------------ + + TEST(PullingSchemeTest, DeleteMeAfterDeevelopment) { + // TODO: delete me after development + // Full 'get apr' pipeline to test imp. 
on different stages + // Useful during debugging and can be removed once finished + + // Prepare input data (image) + int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; + +// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; + // PS values for above 'image': int values[] = {4,0,0,0, 0,0,0,0, 4,0,0,0, 0,0,0,0}; + + int len = sizeof(values)/sizeof(int); + PixelData data(len, 1, 1); + initFromZYXarray(data, values); + std::cout << "----- Input image:\n"; + data.printMeshT(3, 1); + + // Produce APR + APR apr; + APRConverter aprConverter; + aprConverter.par.rel_error = 0.01; + aprConverter.par.lambda = 0.1; + aprConverter.get_apr(apr, data); + + // Print information about APR and all particles + std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; + for (int l = apr.level_min(); l <= apr.level_max(); ++l) { + std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; + } + std::cout << "APR particles z x y level:\n"; + auto it = apr.iterator(); + for (int level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + } + } + } + } + std::cout << std::endl; + + // Sample input + ParticleData particleIntensities; + particleIntensities.sample_image(apr, data); + + // Reconstruct image from particles + PixelData reconstructImg; + APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); + std::cout << "----- Reconstructed image:"< levelImg; + APRReconstruction::reconstruct_level(apr, levelImg); + std::cout << "----- Image levels:" << std::endl; + levelImg.printMeshT(3, 1); + + // Show intensities and levels of each particle + std::cout << "----- Particle intensities:\n"; + for (uint64_t i = 0; i < particleIntensities.size(); 
i++) std::cout << particleIntensities.data[i] << " "; + std::cout << std::endl; + + particleIntensities.fill_with_levels(apr); + + std::cout << "----- Particle levels:\n"; + for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; + std::cout << std::endl; + + // Show some general information about generated APR + double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); + std::cout << std::endl; + std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; + std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; + } + + TEST(PullingSchemeTest, PullingScheme1D) { + + //int values[] = {4,4,1,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 }; +// int values[] = {3,2,2,2, 2,2,1,1}; +// int values[] = {3,0,0,0, 0,0,0,0}; +// int values[] = {3,0,0,0, 0,0,0,0}; +// int values[] = {4,0,0,0, 0,0,0,0, 4,0,0,0, 0,0,0,0}; + int values[] = {0,2,2,3, 4,5,6,7}; + int len = sizeof(values)/sizeof(int); + PixelData levels(len ,1, 1); + initFromZYXarray(levels, values); + levels.printMeshT(3, 1); + + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y * 2, dim.x, dim.z); // time two in y-direction since PS container is downsized. 
+ std::cout << gi << std::endl; + + APRTimer t(true); + + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + int l_max = gi.l_max - 1; + int l_min = gi.l_min; + std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; + ps.fill(l_max, levels); + std::cout << "LEVEL: " << l_max << std::endl; levels.printMeshT(3, 1); + PixelData levelsDS; + for(int l = l_max - 1; l >= l_min; l--){ + downsample(levels, levelsDS, + [](const float &x, const float &y) -> float { return std::max(x, y); }, + [](const float &x) -> float { return x; }, true); + ps.fill(l, levelsDS); + std::cout << "LEVEL: " << l << std::endl; levelsDS.printMeshT(3, 1); + levels.swap(levelsDS); + } + printParticleCellTree(ps.getParticleCellTree()); + ps.pulling_scheme_main(); + t.stop_timer(); + + std::cout << "----------PS:\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "-------------\n"; + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + std::cout << "1\n"; + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + std::cout << "2\n"; + LinearIterator it(linearAccess, gi); + + std::cout << "===========================\n"; + for (int level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + } + } + } + } + std::cout << std::endl; + } + + TEST(PullingSchemeTest, Simple) { + GenInfo gi; + // TODO: Investigate why OVPC fails if one of the dimension is equal to 1 + // Investigate why sub-dimension in printParticleCellTree is different in OVPC nad PS + gi.init(8, 1, 2); + + std::cout << gi << std::endl; + + PixelData levels = getRandInitializedMesh( + std::ceil(gi.org_dims[0]/2), + 
std::ceil(gi.org_dims[1]/2), + std::ceil(gi.org_dims[2]/2), + gi.l_max + 1); + PixelData levels2(levels, true); +// float values[] = {1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1}; +// initFromZYXarray(levels, values); + +// levels.printMeshT(3, 1); + + APRTimer t(true); + + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + int l_max = gi.l_max - 1; + int l_min = gi.l_min; + ps.fill(l_max, levels); + PixelData levelsDS; + for(int l_ = l_max - 1; l_ >= l_min; l_--){ + downsample(levels, levelsDS, + [](const float &x, const float &y) -> float { return std::max(x, y); }, + [](const float &x) -> float { return x; }, true); + ps.fill(l_,levelsDS); + levels.swap(levelsDS); + } + ps.pulling_scheme_main(); + t.stop_timer(); + + t.start_timer("OVPC1"); + OVPC nps(gi, levels2); + t.stop_timer(); + t.start_timer("OVPC2"); + nps.generateTree(); + t.stop_timer(); + + std::cout << "----------OVPC:\n"; + printParticleCellTree(nps.getParticleCellTree()); + std::cout << "----------PS:\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "-------------\n"; + + for (int l = l_min; l <= l_max; ++l) + compareParticleCellTrees(ps.getParticleCellTree()[l], nps.getParticleCellTree()[l]); + + } + + TEST(PullingSchemeTest, NEWvsOLD) { GenInfo access; access.l_max = 9; diff --git a/test/TestTools.hpp b/test/TestTools.hpp index b533674d..491599aa 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -46,7 +46,7 @@ inline bool compare(PixelData &mesh, const float *data, const float epsilon) } template -inline bool initFromZYXarray(PixelData &mesh, T *data) { +inline bool initFromZYXarray(PixelData &mesh, const T *data) { size_t dataIdx = 0; for (int z = 0; z < mesh.z_num; ++z) { for (int y = 0; y < mesh.y_num; ++y) { From 70543d2e6e3677616e6e0a192e809c9633d14d01 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 30 Nov 2023 11:28:44 +0100 Subject: [PATCH 33/59] TODO about some 
problems with edge case --- src/data_structures/APR/access/LinearAccess.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/data_structures/APR/access/LinearAccess.hpp b/src/data_structures/APR/access/LinearAccess.hpp index 5f92c0ef..b170fd2c 100644 --- a/src/data_structures/APR/access/LinearAccess.hpp +++ b/src/data_structures/APR/access/LinearAccess.hpp @@ -226,6 +226,10 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet initialize_xz_linear(); //edge case + // TODO: Don't know why we need that edge case but it would be good if it run properly + // For example 'genInfo->total_number_particles' is not set, maybe other values are not set either but + // it need to be investigated or this edge case removed (?) - if level_max() <= 2 then there are no many particles + // anyway so any code should be fast enough... if(level_max()<=2){ // For performance reasons and clarity of the code, it doesn't make sense here to handle these cases. Below assumes there is atleast levels <=2; From dd3d448e0851807b1852b33cb4dd29dde4c65566 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 6 Dec 2023 17:22:25 +0100 Subject: [PATCH 34/59] Fixed test where out of range idx was given --- test/ComputeGradientTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ComputeGradientTest.cpp b/test/ComputeGradientTest.cpp index ca60fca3..0d822357 100644 --- a/test/ComputeGradientTest.cpp +++ b/test/ComputeGradientTest.cpp @@ -369,7 +369,7 @@ namespace { 0.0000000000, 0.2193282992, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.2930246294, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000, 0.0000000000 }; // put values in corners - m(1, 1, 4) = 1; + m(0, 1, 2) = 1; // Calculate bspline on CPU PixelData mCpu(m, true); From 1a112ecf9515524c24b90980bad7663fc13a0abe Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 
13 Dec 2023 18:28:38 +0100 Subject: [PATCH 35/59] Pulling Scheme tests (and OVPC on CPU) finished. --- src/data_structures/Mesh/PixelData.hpp | 10 + test/PullingSchemeCudaTest.cpp | 144 +++++ test/PullingSchemeTest.cpp | 799 +++++++++++++++---------- 3 files changed, 626 insertions(+), 327 deletions(-) diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index 9b68458d..e0a037f0 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -513,6 +513,16 @@ public : * @return element @(y, x, z) */ T& operator()(int y, int x, int z) { + // TODO: In number of places during running tests below check shows problems. + // Investigate and try to fix. Such check in future probably should be permanent + // to discover all problems rather than hiding them. +#ifndef NDEBUG // with Cmake we need to use double neg. condition since there is not ifdef DEBUG defined :( + if ((y < 0 || y >= y_num) || (x < 0 || x >= x_num) || (z < 0 || z >= z_num)) { +// std::cerr << "Provided coordinates=(" << y << ", " << x << ", " << z; +// std::cerr << ") while PixelData size=(" << y_num << ", " << x_num << ", " << z_num << ")" << std::endl; +// throw std::runtime_error("Provided (y,x,z) coordinates are out of range!"); + } +#endif y = std::min(y, y_num-1); x = std::min(x, x_num-1); z = std::min(z, z_num-1); diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index c956a53f..5ca6f3cc 100644 --- a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -5,9 +5,153 @@ #include "algorithm/PullingSchemeCuda.hpp" #include "algorithm/ComputeGradientCuda.hpp" +#include "algorithm/APRConverter.hpp" #include "TestTools.hpp" + + +TEST(PullingSchemeTest, DeleteMeAfterDevelopment) { + // TODO: delete me after development + // Full 'get apr' pipeline to test imp. 
on different stages + // Useful during debugging and can be removed once finished + + // Prepare input data (image) + int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; + // PS input values = 5 0 0 0 0 0 0 0 + +// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; +// PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0}; + + int len = sizeof(values)/sizeof(int); + PixelData data(len, 1, 1); + initFromZYXarray(data, values); + std::cout << "----- Input image:\n"; + data.printMeshT(3, 1); + + // Produce APR + APR apr; + APRConverter aprConverter; + aprConverter.par.rel_error = 0.1; + aprConverter.par.lambda = 0.1; + aprConverter.par.sigma_th = 0.0001; + aprConverter.par.neighborhood_optimization = true; + aprConverter.get_apr(apr, data); + + // Print information about APR and all particles + std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; + for (int l = apr.level_min(); l <= apr.level_max(); ++l) { + std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; + } + std::cout << "APR particles z x y level:\n"; + auto it = apr.iterator(); + for (int level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + } + } + } + } + std::cout << std::endl; + + // Sample input + ParticleData particleIntensities; + particleIntensities.sample_image(apr, data); + + // Reconstruct image from particles + PixelData reconstructImg; + APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); + std::cout << "----- Reconstructed image:"< levelImg; + APRReconstruction::reconstruct_level(apr, levelImg); + std::cout << "----- Image levels:" << std::endl; + levelImg.printMeshT(3, 1); + + // 
Show intensities and levels of each particle + std::cout << "----- Particle intensities:\n"; + for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; + std::cout << std::endl; + + particleIntensities.fill_with_levels(apr); + + std::cout << "----- Particle levels:\n"; + for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; + std::cout << std::endl; + + // Show some general information about generated APR + double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); + std::cout << std::endl; + std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; + std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; +} + + + +TEST(PullingSchemeTest, PullingScheme1D) { + + int values[] = {0,0,0,5, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + + PixelData levels(3,3,3, 0); + levels(2,2,2) = 11; + +// initFromZYXarray(levels, values); + levels.printMeshT(3, 1); + + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + std::cout << "Levels dim: " << dim << std::endl; + gi.init(dim.y * 2, dim.x * 2, dim.z * 2); // time two in y-direction since PS container is downsized. 
+ std::cout << gi << std::endl; + + APRTimer t(true); + + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + int l_max = gi.l_max - 1; + int l_min = gi.l_min; + std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; + + fillPS(ps, levels); + + std::cout << "---------- Filled PS tree\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "---------------\n"; + + ps.pulling_scheme_main(); + t.stop_timer(); + + std::cout << "----------PS:\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "-------------\n"; + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + std::cout << "1\n"; + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + std::cout << "2\n"; + LinearIterator it(linearAccess, gi); + + std::cout << "===========================\n"; + for (int level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + } + } + } + } + std::cout << std::endl; +} + // TEST(PullingSchemeCudaTest, computeLevels) { // using ImgType = float; // const int maxLevel = 3; diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index 83d97366..e1347b1c 100644 --- a/test/PullingSchemeTest.cpp +++ b/test/PullingSchemeTest.cpp @@ -8,21 +8,19 @@ #include "algorithm/PullingScheme.hpp" #include "algorithm/OVPC.h" #include "TestTools.hpp" -#include "algorithm/APRConverter.hpp" + namespace { - template - PixelData generateLevels(const PixelData &dimsMesh, int maxLevel) { - PixelData levels(dimsMesh, false); - for (int i = 0; i < levels.mesh.size(); ++i) { - levels.mesh[i] = ( i/2 ) % (maxLevel + 2); - } -// std::cout << "LEVELS: " << std::endl; - 
levels.printMesh(3, 0); - return levels; - } + // ================================================================================================================= + // ======== Some test helpers + // ================================================================================================================= + + /** + * Prints PCT + * @param particleCellTree + */ template void printParticleCellTree(const std::vector> &particleCellTree) { for (uint64_t l = 0; l < particleCellTree.size(); ++l) { @@ -32,381 +30,528 @@ namespace { } } - template - inline int compareParticleCellTrees(const PixelData &expected, const PixelData &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { - int cnt = 0; - int numOfParticles = 0; - for (size_t i = 0; i < expected.mesh.size(); ++i) { - if (expected.mesh[i] < 8) { - if (std::abs(expected.mesh[i] - tested.mesh[i]) > maxError || std::isnan(expected.mesh[i]) || - std::isnan(tested.mesh[i])) { - if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; - } - cnt++; + // Class for storing expected values for one element of Particle Cell Tree (output of Pulling Scheme) + class LevelData { + public: + int level; + int y; + int x; + int z; + uint8_t expectedType; // seed, boundary, filler... + }; + + /** + * Verify computed Particle Cell Tree (PCT) vs expected values + * Expected values should list all data for types=1,2,3 (seed, boundary filler) which are used to generate particles: + * {levels, y,x,z(position), type} + * All other values are ignored (and used by Pulling Scheme (PS) only for intermediate calculations) + * @param aPCT - PCT produces by PS (note: values in PCT will be changed during verification!) 
+ * @param expectedValues expected values + * @return true if correct, false otherwise + */ + template + bool verifyParticleCellTree(std::vector> &aPCT, const std::vector &expectedValues) { + + const uint8_t AlreadyCheckedMark = 255; + const uint8_t MaxValueOfImportantType = FILLER_TYPE; // All types above are used by PS during computation phase only + + for (const auto &r : expectedValues) { + // std::cout << r.level << " " << r.y << "," << r.x << "," << r.z << " " << (int)r.expectedType << std::endl; + + auto &v = aPCT[r.level](r.y, r.x, r.z); + // Add dim. checks for accessing pct + if (v == r.expectedType) { + v = AlreadyCheckedMark; } - if (expected.mesh[i] > 0) numOfParticles++; + else { + std::cout << "Error! Data at (" << r.y << "," << r.x << "," << r.z << ") expected = " << (int)r.expectedType << " got = " << (int)v << std::endl; + return false; } } - std::cout << "Number of errors / all points: " << cnt << " / " << expected.mesh.size() << " Particles:" << numOfParticles << std::endl; - return cnt; - } - - // ------------------------------------------------------------------------ - - TEST(PullingSchemeTest, DeleteMeAfterDeevelopment) { - // TODO: delete me after development - // Full 'get apr' pipeline to test imp. on different stages - // Useful during debugging and can be removed once finished - // Prepare input data (image) - int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; + for (int level = 0; level < aPCT.size(); level++) { + auto &d = aPCT[level]; + auto y_num = d.y_num; + auto x_num = d.x_num; + auto z_num = d.z_num; + + for (int j = 0; j < z_num; j++) { + for (int i = 0; i < x_num; i++) { + for (int k = 0; k < y_num; k++) { + const auto &v = d(k, i, j); + if (v != AlreadyCheckedMark && v <= MaxValueOfImportantType && v > 0) { + std::cout << "Error! Data on level = " << level << " at (" << k << "," << i << "," << j << ") with value = " << (int)v << " not verified or bad!" 
<< std::endl; + return false; + } + } + } + } + } -// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; - // PS values for above 'image': int values[] = {4,0,0,0, 0,0,0,0, 4,0,0,0, 0,0,0,0}; + return true; + } - int len = sizeof(values)/sizeof(int); - PixelData data(len, 1, 1); - initFromZYXarray(data, values); - std::cout << "----- Input image:\n"; - data.printMeshT(3, 1); - - // Produce APR - APR apr; - APRConverter aprConverter; - aprConverter.par.rel_error = 0.01; - aprConverter.par.lambda = 0.1; - aprConverter.get_apr(apr, data); - - // Print information about APR and all particles - std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; - for (int l = apr.level_min(); l <= apr.level_max(); ++l) { - std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; - } - std::cout << "APR particles z x y level:\n"; - auto it = apr.iterator(); - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + /** + * Compare + * @param expected - expected levels + * @param tested - levels to verify + * @param maxError + * @param maxNumOfErrPrinted - how many error outputs should be printed + * @return + */ + template + inline int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, int maxNumOfErrPrinted = 3) { + int cntGlobal = 0; + for (int level = 0; level < expected.size(); level++) { + int cnt = 0; + int numOfParticles = 0; + for (size_t i = 0; i < expected[level].mesh.size(); ++i) { + if (expected[level].mesh[i] < 8 && tested[level].mesh[i] <= FILLER_TYPE) { + if (std::abs(expected[level].mesh[i] - tested[level].mesh[i]) > 0 || std::isnan(expected[level].mesh[i]) || + std::isnan(tested[level].mesh[i])) { + 
if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { + std::cout << "Level: " << level <<" ERROR expected vs tested mesh: " << (float) expected[level].mesh[i] << " vs " + << (float) tested[level].mesh[i] << " IDX:" << tested[level].getStrIndex(i) << std::endl; + } + cnt++; } + if (expected[level].mesh[i] > 0) numOfParticles++; } } + cntGlobal += cnt; + if (cnt > 0) std::cout << "Level: " << level << ", Number of errors / all points: " << cnt << " / " << expected[level].mesh.size() << " Particles:" << numOfParticles << std::endl; + } + return cntGlobal; + } + + template + void fillPS(PullingScheme &aPS, PixelData &levels) { + auto l_max = aPS.pct_level_max(); + auto l_min = aPS.pct_level_min(); + +// std::cout << "LEVEL: " << l_max << std::endl; levels.printMeshT(3, 1); + + aPS.fill(l_max, levels); + PixelData levelsDS; + for (int l = l_max - 1; l >= l_min; l--) { + downsample(levels, levelsDS, + [](const float &x, const float &y) -> float { return std::max(x, y); }, + [](const float &x) -> float { return x; }, true); + aPS.fill(l, levelsDS); +// std::cout << "LEVEL: " << l << std::endl; levelsDS.printMeshT(3, 1); + levels.swap(levelsDS); } - std::cout << std::endl; - - // Sample input - ParticleData particleIntensities; - particleIntensities.sample_image(apr, data); - - // Reconstruct image from particles - PixelData reconstructImg; - APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); - std::cout << "----- Reconstructed image:"< levelImg; - APRReconstruction::reconstruct_level(apr, levelImg); - std::cout << "----- Image levels:" << std::endl; - levelImg.printMeshT(3, 1); - - // Show intensities and levels of each particle - std::cout << "----- Particle intensities:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - particleIntensities.fill_with_levels(apr); - - std::cout << "----- Particle levels:\n"; - for (uint64_t i = 0; i < 
particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - // Show some general information about generated APR - double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); - std::cout << std::endl; - std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; - std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; } - TEST(PullingSchemeTest, PullingScheme1D) { + // ================================================================================================================= + // ======== Pulling Scheme algorithm tests + // ================================================================================================================= + TEST(PullingSchemeTest, PullingScheme1D_Ydir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(len, 1, 1); // <-- Y-dir + initFromZYXarray(levels, values); // <-- Y-dir + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, dim.x, dim.z); + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("PS - initialize with data"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + fillPS(ps, levels); + t.stop_timer(); + + t.start_timer("PS - compute"); + ps.pulling_scheme_main(); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 1,0,0, 2}, + {3, 2,0,0, 3}, + {3, 3,0,0, 3}, - //int values[] = {4,4,1,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 }; -// int values[] = {3,2,2,2, 2,2,1,1}; -// int values[] = {3,0,0,0, 0,0,0,0}; -// int values[] = {3,0,0,0, 
0,0,0,0}; -// int values[] = {4,0,0,0, 0,0,0,0, 4,0,0,0, 0,0,0,0}; - int values[] = {0,2,2,3, 4,5,6,7}; + {2, 2,0,0, 3}, + {2, 3,0,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } + + TEST(PullingSchemeTest, PullingScheme1D_Xdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; int len = sizeof(values)/sizeof(int); - PixelData levels(len ,1, 1); + PixelData levels(1, len, 1); // <-- X-dir initFromZYXarray(levels, values); - levels.printMeshT(3, 1); + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 GenInfo gi; const PixelDataDim dim = levels.getDimension(); - gi.init(dim.y * 2, dim.x, dim.z); // time two in y-direction since PS container is downsized. - std::cout << gi << std::endl; + gi.init(dim.y, 2 * dim.x, dim.z); // <-- X-dir - APRTimer t(true); + // Initialize all needed objects + APRTimer t(false); - t.start_timer("PS1"); + t.start_timer("PS - initialize with data"); PullingScheme ps; ps.initialize_particle_cell_tree(gi); - int l_max = gi.l_max - 1; - int l_min = gi.l_min; - std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; - ps.fill(l_max, levels); - std::cout << "LEVEL: " << l_max << std::endl; levels.printMeshT(3, 1); - PixelData levelsDS; - for(int l = l_max - 1; l >= l_min; l--){ - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, true); - ps.fill(l, levelsDS); - std::cout << "LEVEL: " << l << std::endl; levelsDS.printMeshT(3, 1); - levels.swap(levelsDS); - } - printParticleCellTree(ps.getParticleCellTree()); + fillPS(ps, levels); + t.stop_timer(); + + t.start_timer("PS - compute"); ps.pulling_scheme_main(); t.stop_timer(); - std::cout << "----------PS:\n"; - 
printParticleCellTree(ps.getParticleCellTree()); - std::cout << "-------------\n"; - - LinearAccess linearAccess; - linearAccess.genInfo = &gi; - APRParameters par; - std::cout << "1\n"; - linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); - std::cout << "2\n"; - LinearIterator it(linearAccess, gi); - - std::cout << "===========================\n"; - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; - } - } - } - } - std::cout << std::endl; + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,1,0, 2}, + {3, 0,2,0, 3}, + {3, 0,3,0, 3} , + + {2, 0,2,0, 3}, + {2, 0,3,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); } - TEST(PullingSchemeTest, Simple) { + TEST(PullingSchemeTest, PullingScheme1D_Zdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, 1, len); // <-- Z-dir + initFromZYXarray(levels, values); + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 GenInfo gi; - // TODO: Investigate why OVPC fails if one of the dimension is equal to 1 - // Investigate why sub-dimension in printParticleCellTree is different in OVPC nad PS - gi.init(8, 1, 2); + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, dim.x, 2 * dim.z); // <-- Z-dir + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("PS - initialize with data"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + fillPS(ps, levels); + t.stop_timer(); + + t.start_timer("PS - compute"); + ps.pulling_scheme_main(); + t.stop_timer(); + + // 
List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,0,1, 2}, + {3, 0,0,2, 3}, + {3, 0,0,3, 3} , - std::cout << gi << std::endl; + {2, 0,0,2, 3}, + {2, 0,0,3, 3} + }; - PixelData levels = getRandInitializedMesh( - std::ceil(gi.org_dims[0]/2), - std::ceil(gi.org_dims[1]/2), - std::ceil(gi.org_dims[2]/2), - gi.l_max + 1); - PixelData levels2(levels, true); -// float values[] = {1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1}; -// initFromZYXarray(levels, values); + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } -// levels.printMeshT(3, 1); + TEST(PullingSchemeTest, PullingScheme3D_smallCube) { + // Prepare input data for PS + PixelData levels(3, 3, 3); + levels(2, 2, 2) = 3; - APRTimer t(true); + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, 2 * dim.x, 2 * dim.z); - t.start_timer("PS1"); + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("PS - initialize with data"); PullingScheme ps; ps.initialize_particle_cell_tree(gi); - int l_max = gi.l_max - 1; - int l_min = gi.l_min; - ps.fill(l_max, levels); - PixelData levelsDS; - for(int l_ = l_max - 1; l_ >= l_min; l_--){ - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, true); - ps.fill(l_,levelsDS); - levels.swap(levelsDS); - } + fillPS(ps, levels); + t.stop_timer(); + + t.start_timer("PS - compute"); ps.pulling_scheme_main(); t.stop_timer(); - t.start_timer("OVPC1"); - OVPC nps(gi, levels2); + // List of expected types + std::vector ev = { + {2, 0,0,0, 3}, + {2, 0,1,0, 3}, + {2, 0,2,0, 3}, + {2, 1,0,0, 3}, + {2, 1,1,0, 3}, + {2, 1,2,0, 3}, + {2, 2,0,0, 3}, + {2, 2,1,0, 3}, + {2, 2,2,0, 3}, + + 
{2, 0,0,1, 3}, + {2, 0,1,1, 3}, + {2, 0,2,1, 3}, + {2, 1,0,1, 3}, + {2, 1,1,1, 2}, + {2, 1,2,1, 2}, + {2, 2,0,1, 3}, + {2, 2,1,1, 2}, + {2, 2,2,1, 2}, + + {2, 0,0,2, 3}, + {2, 0,1,2, 3}, + {2, 0,2,2, 3}, + {2, 1,0,2, 3}, + {2, 1,1,2, 2}, + {2, 1,2,2, 2}, + {2, 2,0,2, 3}, + {2, 2,1,2, 2}, + {2, 2,2,2, 1}, + + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } + + // ================================================================================================================= + // ======== OVPC - Optimal Valid Particle Cell - alternative version of original Pulling Scheme algorithm + // ================================================================================================================= + TEST(PullingSchemeTest, OVPC_Ydir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(len, 1, 1); // <-- Y-dir + initFromZYXarray(levels, values); // <-- Y-dir + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, dim.x, dim.z); + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPC - initialize"); + OVPC ps(gi, levels); t.stop_timer(); - t.start_timer("OVPC2"); - nps.generateTree(); + t.start_timer("OVPC - compute"); + ps.generateTree(); t.stop_timer(); - std::cout << "----------OVPC:\n"; - printParticleCellTree(nps.getParticleCellTree()); - std::cout << "----------PS:\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "-------------\n"; + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 1,0,0, 2}, + {3, 2,0,0, 3}, + {3, 3,0,0, 3}, - for (int l = l_min; l <= l_max; ++l) - compareParticleCellTrees(ps.getParticleCellTree()[l], nps.getParticleCellTree()[l]); + {2, 2,0,0, 3}, + {2, 
3,0,0, 3} + }; + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); } + TEST(PullingSchemeTest, OVPC_Xdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, len, 1); // <-- X-dir + initFromZYXarray(levels, values); - TEST(PullingSchemeTest, NEWvsOLD) { - GenInfo access; - access.l_max = 9; - access.l_min = 1; - access.org_dims[0] = std::pow(2, access.l_max); - access.org_dims[1] = std::pow(2, access.l_max); - access.org_dims[2] = std::pow(2, access.l_max); - int l = access.l_max - 1; + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, 2 * dim.x, dim.z); // <-- X-dir - PixelData levels = getRandInitializedMesh(access.org_dims[0]/2,access.org_dims[1]/2,access.org_dims[2]/2, access.l_max + 1); - PixelData levels2(levels, true); -// float values[] = {1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1}; -// initFromZYXarray(levels, values); + // Initialize all needed objects + APRTimer t(false); -// levels.printMeshT(3, 1); + t.start_timer("OVPC - initialize"); + OVPC ps(gi, levels); + t.stop_timer(); + t.start_timer("OVPC - compute"); + ps.generateTree(); + t.stop_timer(); - APRTimer t(true); + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,1,0, 2}, + {3, 0,2,0, 3}, + {3, 0,3,0, 3} , - t.start_timer("PS1"); - PullingScheme ps; - ps.initialize_particle_cell_tree(access); - int l_max = access.l_max - 1; - int l_min = access.l_min; - ps.fill(l_max, levels); - PixelData levelsDS; - for(int l_ = l_max - 1; l_ >= l_min; l_--){ - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, true); - 
ps.fill(l_,levelsDS); - levels.swap(levelsDS); - } - ps.pulling_scheme_main(); - t.stop_timer(); + {2, 0,2,0, 3}, + {2, 0,3,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } + + TEST(PullingSchemeTest, OVPC_Zdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, 1, len); // <-- Z-dir + initFromZYXarray(levels, values); + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, dim.x, 2 * dim.z); // <-- Z-dir + + // Initialize all needed objects + APRTimer t(false); - t.start_timer("OVPC1"); - OVPC nps(access, levels2); + t.start_timer("OVPC - initialize"); + OVPC ps(gi, levels); t.stop_timer(); - t.start_timer("OVPC2"); - nps.generateTree(); + t.start_timer("OVPC - compute"); + ps.generateTree(); t.stop_timer(); -// printParticleCellTree(nps.getParticleCellTree()); -// printParticleCellTree(ps.getParticleCellTree()); + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,0,1, 2}, + {3, 0,0,2, 3}, + {3, 0,0,3, 3} , - for (l = l_min; l <= l_max; ++l) - compareParticleCellTrees(ps.getParticleCellTree()[l], nps.getParticleCellTree()[l]); + {2, 0,0,2, 3}, + {2, 0,0,3, 3} + }; + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); } -// TEST(PullingSchemeTest, Init) { -// -// GenInfo access; -// access.l_max = 5; -// access.l_min = 1; -// access.org_dims[0] = 32; -// access.org_dims[1] = 1; -// access.org_dims[2] = 1; -// -// PullingScheme ps; -// ps.initialize_particle_cell_tree(access); -// std::vector> &pctree = ps.getParticleCellTree(); -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> Initialized tree:\n"; -// printParticleCellTree(pctree); -// std::cout << 
"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; -// -// // TEST: check if zeroed and correct number of levels -// ASSERT_EQ(access.l_max, pctree.size()); // all levels [0, access.level_max - 1] -// for (int l = 0; l < pctree.size(); ++l) { -// auto &tree = pctree[l]; -// for (auto &e : tree.mesh) { -// ASSERT_EQ(0, e); -// } -// } -// -// // Generate mesh with test levels -// PixelData levels(pctree.back(), false);// = generateLevels(pctree[access.l_max - 1], access.l_max); -//// float values[] = {4, 1, 1, 1, 1, 1, 1, 2}; -// float values[] = {1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1}; -// initFromZYXarray(levels, values); -// -// -// OVPC nps(access, levels); -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS1:\n"; -// printParticleCellTree(nps.getParticleCellTree()); -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS1:\n"; -// nps.generateTree(); -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS2:\n"; -// printParticleCellTree(nps.getParticleCellTree()); -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> NPS2:\n"; -// // Fill particle cell tree with levels -// int l_max = access.l_max - 1; -// int l_min = access.l_min; -// ps.fill(l_max, levels); -// -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> LEVELS:\n"; -// levels.printMeshT(3,0); -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> LEVELS:\n"; -// -// PixelData levelsDS; -// for(int l_ = l_max - 1; l_ >= l_min; l_--){ -// //down sample the resolution level k, using a max reduction -// downsample(levels, levelsDS, -// [](const float &x, const float &y) -> float { return std::max(x, y); }, -// [](const float &x) -> float { return x; }, true); -// levelsDS.printMeshT(3, 0); -// ps.fill(l_,levelsDS); -// levelsDS.printMeshT(3,0); -// levels.swap(levelsDS); -// } -// -// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> Filled tree:\n"; -// printParticleCellTree(pctree); -// std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; -// -//// ps.fill_neighbours(l_max); -//// pctree[l_max].printMesh(3, 0); -// -// -// ps.pulling_scheme_main(); -// 
std::cout << ">>>>>>>>>>>>>>>>>>>>>>>> MAIN tree:\n"; -// printParticleCellTree(pctree); -// std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"; -// -// access.initialize_structure_from_particle_cell_tree(false, ps.getParticleCellTree()); -// std::cout << "NUM OF PARTICLES: " << access.get_total_number_particles() << std::endl; -// -// -// APRIterator apr_iterator(access); -// std::cout << "Total number of particles: " << apr_iterator.total_number_particles() << std::endl; -// -// int prev = 0; -// for (unsigned int level = apr_iterator.level_min(); level <= apr_iterator.level_max(); ++level) { -// std::cout << "Level: " << level << std::endl; -// int w = (int) (std::pow(2, 5-level) * 3); -// for (int z = 0; z < apr_iterator.spatial_index_z_max(level); ++z) { -// for (int x = 0; x < apr_iterator.spatial_index_x_max(level); ++x) { -// for (apr_iterator.set_new_lzx(level, z, x); apr_iterator.global_index() < apr_iterator.end_index; apr_iterator.set_iterator_to_particle_next_particle()) { -// for (int i = prev; i < apr_iterator.y(); ++i ) std::cout << std::setw(w) << "."; -// std::cout << std::setw(w) << apr_iterator.y(); -// prev = apr_iterator.y() + 1; -// } -// for (int pp = prev; pp < apr_iterator.spatial_index_y_max(level); ++pp) -// std::cout << std::setw(w) << "."; -// -// prev = 0; -// std::cout << std::endl; -// } -// std::cout << std::endl; -// } -// } -// -// } + TEST(PullingSchemeTest, OVPC_smallCube) { + // Prepare input data for PS + PixelData levels(3, 3, 3); + levels(2, 2, 2) = 3; + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, 2 * dim.x, 2 * dim.z); + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPC - initialize"); + OVPC ps(gi, levels); + t.stop_timer(); + t.start_timer("OVPC - compute"); + ps.generateTree(); + 
t.stop_timer(); + + // List of expected types + std::vector ev = { + {2, 0,0,0, 3}, + {2, 0,1,0, 3}, + {2, 0,2,0, 3}, + {2, 1,0,0, 3}, + {2, 1,1,0, 3}, + {2, 1,2,0, 3}, + {2, 2,0,0, 3}, + {2, 2,1,0, 3}, + {2, 2,2,0, 3}, + + {2, 0,0,1, 3}, + {2, 0,1,1, 3}, + {2, 0,2,1, 3}, + {2, 1,0,1, 3}, + {2, 1,1,1, 2}, + {2, 1,2,1, 2}, + {2, 2,0,1, 3}, + {2, 2,1,1, 2}, + {2, 2,2,1, 2}, + + {2, 0,0,2, 3}, + {2, 0,1,2, 3}, + {2, 0,2,2, 3}, + {2, 1,0,2, 3}, + {2, 1,1,2, 2}, + {2, 1,2,2, 2}, + {2, 2,0,2, 3}, + {2, 2,1,2, 2}, + {2, 2,2,2, 1}, + + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(ps.getParticleCellTree(), ev)); + } + + + // ================================================================================================================= + // ======== PS vs OVPC + // ================================================================================================================= + + TEST(PullingSchemeTest, PSvsOVPC) { + // Generates random levels in a 3D cube and then compares generated output levels in PS and OVPC + GenInfo gi; + gi.init(255, 257, 199); + + // Generate random levels for PS and OVPC + PixelData levels(std::ceil(gi.org_dims[0]/2.0), + std::ceil(gi.org_dims[1]/2.0), + std::ceil(gi.org_dims[2]/2.0), + 0); + // Add a few particles only - it will end up with Pulling Scheme generate particles on (almost) all + // levels - good case to compare with OVPC + const int numOfParticles = 3; + std::srand(std::time(nullptr)); + for (int i = 0; i < numOfParticles; ++i) { + levels(std::rand() % levels.y_num, std::rand() % levels.x_num, std::rand() % levels.z_num) = gi.l_max; + } + PixelData levelsOVPC(levels, true); // just copy 'levels' + APRTimer t(false); + + // Run test methods and compare results + t.start_timer("OVPC - init"); + OVPC nps(gi, levelsOVPC); + t.stop_timer(); + t.start_timer("OVPC compute"); + nps.generateTree(); + t.stop_timer(); + + + t.start_timer("PS - init"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + 
fillPS(ps, levels); + t.stop_timer(); + t.start_timer("PS - compute"); + ps.pulling_scheme_main(); + t.stop_timer(); + ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), nps.getParticleCellTree()), 0); + } } From 64ca641a49da879a66ce04a750388d9ca1cf1b5e Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 14 Dec 2023 09:25:08 +0100 Subject: [PATCH 36/59] Fixes for tests --- test/PullingSchemeCudaTest.cpp | 62 ++++++++++++++++++++++++++++++++++ test/PullingSchemeTest.cpp | 12 +++---- 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index 5ca6f3cc..afeb59f1 100644 --- a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -9,7 +9,69 @@ #include "TestTools.hpp" +/** + * Prints PCT + * @param particleCellTree + */ +template +void printParticleCellTree(const std::vector> &particleCellTree) { + for (uint64_t l = 0; l < particleCellTree.size(); ++l) { + auto &tree = particleCellTree[l]; +// std::cout << "-- level = " << l << ", " << tree << std::endl; + tree.printMeshT(3,0); + } +} +/** + * Compare + * @param expected - expected levels + * @param tested - levels to verify + * @param maxError + * @param maxNumOfErrPrinted - how many error outputs should be printed + * @return + */ +template +int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, int maxNumOfErrPrinted = 3) { + int cntGlobal = 0; + for (size_t level = 0; level < expected.size(); level++) { + int cnt = 0; + int numOfParticles = 0; + for (size_t i = 0; i < expected[level].mesh.size(); ++i) { + if (expected[level].mesh[i] < 8 && tested[level].mesh[i] <= FILLER_TYPE) { + if (std::abs(expected[level].mesh[i] - tested[level].mesh[i]) > 0 || std::isnan(expected[level].mesh[i]) || + std::isnan(tested[level].mesh[i])) { + if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { + std::cout << "Level: " << level <<" ERROR expected vs tested mesh: " << (float) 
expected[level].mesh[i] << " vs " + << (float) tested[level].mesh[i] << " IDX:" << tested[level].getStrIndex(i) << std::endl; + } + cnt++; + } + if (expected[level].mesh[i] > 0) numOfParticles++; + } + } + cntGlobal += cnt; + if (cnt > 0) std::cout << "Level: " << level << ", Number of errors / all points: " << cnt << " / " << expected[level].mesh.size() << " Particles:" << numOfParticles << std::endl; + } + return cntGlobal; +} +template +void fillPS(PullingScheme &aPS, PixelData &levels) { + auto l_max = aPS.pct_level_max(); + auto l_min = aPS.pct_level_min(); + +// std::cout << "LEVEL: " << l_max << std::endl; levels.printMeshT(3, 1); + + aPS.fill(l_max, levels); + PixelData levelsDS; + for (int l = l_max - 1; l >= l_min; l--) { + downsample(levels, levelsDS, + [](const float &x, const float &y) -> float { return std::max(x, y); }, + [](const float &x) -> float { return x; }, true); + aPS.fill(l, levelsDS); +// std::cout << "LEVEL: " << l << std::endl; levelsDS.printMeshT(3, 1); + levels.swap(levelsDS); + } +} TEST(PullingSchemeTest, DeleteMeAfterDevelopment) { // TODO: delete me after development diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index e1347b1c..c7c66b63 100644 --- a/test/PullingSchemeTest.cpp +++ b/test/PullingSchemeTest.cpp @@ -64,12 +64,12 @@ namespace { v = AlreadyCheckedMark; } else { - std::cout << "Error! Data at (" << r.y << "," << r.x << "," << r.z << ") expected = " << (int)r.expectedType << " got = " << (int)v << std::endl; + std::cout << "Error! 
Data on level=" << r.level << " at (" << r.y << "," << r.x << "," << r.z << ") expected=" << (int)r.expectedType << " got=" << (int)v << std::endl; return false; } } - for (int level = 0; level < aPCT.size(); level++) { + for (size_t level = 0; level < aPCT.size(); level++) { auto &d = aPCT[level]; auto y_num = d.y_num; auto x_num = d.x_num; @@ -100,9 +100,9 @@ namespace { * @return */ template - inline int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, int maxNumOfErrPrinted = 3) { + int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, int maxNumOfErrPrinted = 3) { int cntGlobal = 0; - for (int level = 0; level < expected.size(); level++) { + for (size_t level = 0; level < expected.size(); level++) { int cnt = 0; int numOfParticles = 0; for (size_t i = 0; i < expected[level].mesh.size(); ++i) { @@ -271,7 +271,7 @@ namespace { TEST(PullingSchemeTest, PullingScheme3D_smallCube) { // Prepare input data for PS - PixelData levels(3, 3, 3); + PixelData levels(3, 3, 3, 0); levels(2, 2, 2) = 3; // Prepare GenInfo structure - @@ -450,7 +450,7 @@ namespace { TEST(PullingSchemeTest, OVPC_smallCube) { // Prepare input data for PS - PixelData levels(3, 3, 3); + PixelData levels(3, 3, 3, 0); levels(2, 2, 2) = 3; // Prepare GenInfo structure - From 9f31bfda2863740ebcb331060e34e1b81abd88cb Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 9 Jan 2024 11:27:04 +0100 Subject: [PATCH 37/59] Fixed OVPC - clamping values of input levels is necessary --- src/algorithm/OVPC.cu | 53 ++++++++++--------- src/algorithm/OVPC.h | 3 +- .../APR/access/RandomAccess.hpp | 4 +- src/data_structures/Mesh/ImagePatch.hpp | 2 +- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index f568212b..9794df2b 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -1,39 +1,28 @@ #include "PullingSchemeCuda.hpp" #include -#include -//#include -#include #include 
"misc/CudaTools.cuh" #include "data_structures/Mesh/downsample.cuh" +#include "algorithm/OVPC.h" -namespace { - using ElementType = uint8_t; - static constexpr int BIT_SHIFT = 6; - static constexpr ElementType OVPC_SEED = 1; - static constexpr ElementType OVPC_BOUNDARY = 2; - static constexpr ElementType OVPC_FILLER = 3; - - static constexpr ElementType SEED_MASK = OVPC_SEED << BIT_SHIFT; - static constexpr ElementType BOUNDARY_MASK = OVPC_BOUNDARY << BIT_SHIFT; - static constexpr ElementType FILLER_MASK = OVPC_FILLER << BIT_SHIFT; - static constexpr ElementType MASK = 0x03 << BIT_SHIFT; -} template -__global__ void copy1D(const T *input, S *output, size_t length) { +__global__ void copyAndClampLevels(const T *input, S *output, size_t length, int levelMin, int levelMax) { size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; if (idx < length) { - output[idx] = input[idx]; + T v = input[idx]; + if (v > levelMax) v = levelMax; + if (v < levelMin) v = levelMin; + output[idx] = v; } } template -void runCopy1D(T *inputData, S *outputData, size_t lenght, cudaStream_t aStream) { +void runCopyAndClampLevels(T *inputData, S *outputData, size_t lenght, int levelMin, int levelMax, cudaStream_t aStream) { dim3 threadsPerBlock(128); dim3 numBlocks((lenght + threadsPerBlock.x - 1)/threadsPerBlock.x); - copy1D<<>>(inputData, outputData, lenght); + copyAndClampLevels<<>>(inputData, outputData, lenght, levelMin, levelMax); }; @@ -57,7 +46,7 @@ __global__ void oneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int lev for (int x = xmin; x <= xmax; ++x) { for (int y = ymin; y <= ymax; ++y) { const size_t idx = z * xLen * yLen + x * yLen + y; - T currentLevel = ~MASK & data[idx]; + T currentLevel = ~OVPC::MASK & data[idx]; if (currentLevel > level) { ok = false; break; } else if (currentLevel == level) neig = true; } @@ -66,9 +55,9 @@ __global__ void oneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int lev if (ok) { const size_t idx = zi * xLen * yLen + xi * yLen + 
yi; T status = data[idx]; - if (status == level) data[idx] |= SEED_MASK; - else if (neig) data[idx] |= BOUNDARY_MASK; - else data[idx] |= FILLER_MASK; + if (status == level) data[idx] |= OVPC::SEED; + else if (neig) data[idx] |= OVPC::BOUNDARY; + else data[idx] |= OVPC::FILLER; } } @@ -103,11 +92,11 @@ __global__ void secondPhase(T *data, T *child, size_t xLen, size_t yLen, size_t for (int x = xmin; x <= xmax; ++x) { for (int y = ymin; y <= ymax; ++y) { size_t children_index = z * xLenc * yLenc + x * yLenc + y; - child[children_index] = status >= (OVPC_SEED << BIT_SHIFT) ? 0 : child[children_index] >> BIT_SHIFT; + child[children_index] = status >= (OVPC::OVPC_SEED << OVPC::BIT_SHIFT) ? 0 : child[children_index] >> OVPC::BIT_SHIFT; } } } - if (isLevelMax) data[zi * xLen * yLen + xi * yLen + yi] = status >> BIT_SHIFT; + if (isLevelMax) data[zi * xLen * yLen + xi * yLen + yi] = status >> OVPC::BIT_SHIFT; } template @@ -124,9 +113,19 @@ template void computeOVPC(const PixelData&, PixelData&, template void computeOVPC(const PixelData &input, PixelData &output, int levelMin, int levelMax) { + + ScopedCudaMemHandler, H2D> in(input); ScopedCudaMemHandler, D2H> mem(output); + + CudaTimer t(true, "OVPCCUDA"); + + t.start_timer("wait"); + waitForCuda(); + t.stop_timer(); + + t.start_timer("ALL"); // TODO: This is not needed later - just for having clear debug //cudaMemset(mem.get(), 0, mem.getNumOfBytes()); @@ -157,7 +156,7 @@ void computeOVPC(const PixelData &input, PixelData &output, int levelMin, zDS = ceil(zDS/2.0); } - runCopy1D(in.get(), levels[levelMax], in.getSize(), 0); + runCopyAndClampLevels(in.get(), levels[levelMax], in.getSize(), levelMin, levelMax, 0); for (int l = levelMax - 1; l >= levelMin; --l) { runDownsampleMax(levels[l + 1], levels[l], xSize[l + 1], ySize[l + 1], zSize[l + 1], 0); @@ -172,4 +171,6 @@ void computeOVPC(const PixelData &input, PixelData &output, int levelMin, for (int l = levelMax - 1; l >= levelMin; --l) { runSecondPhase(levels[l], 
levels[l+1], xSize[l], ySize[l], zSize[l], xSize[l+1], ySize[l+1], zSize[l+1], l == levelMin, 0); } + waitForCuda(); + t.stop_timer(); }; diff --git a/src/algorithm/OVPC.h b/src/algorithm/OVPC.h index 6925f325..e0c8a67b 100644 --- a/src/algorithm/OVPC.h +++ b/src/algorithm/OVPC.h @@ -14,6 +14,8 @@ class OVPC { + +public: // Element big enouth to keep all the levels + 2 highest bits for type // for uint8_t we have [ 2 bit - type(empty, seed, boundary, filler) | 6 bit - level(0-63) ] using ElementType = uint8_t; @@ -31,7 +33,6 @@ class OVPC { int iLevelMin; std::vector> iParticleCellTree; -public: template OVPC(const GenInfo &aAprAccess, const PixelData &aInputLevels) { // Level Max is one less since we are working on downsampled version diff --git a/src/data_structures/APR/access/RandomAccess.hpp b/src/data_structures/APR/access/RandomAccess.hpp index 0daf7a54..aa8f67bc 100644 --- a/src/data_structures/APR/access/RandomAccess.hpp +++ b/src/data_structures/APR/access/RandomAccess.hpp @@ -1210,7 +1210,7 @@ inline void RandomAccess::initialize_tree_access(RandomAccess& APROwn_access, st } -void RandomAccess::init_data_structure_tree(RandomAccess& APROwn_access, SparseGaps>& y_begin){ +inline void RandomAccess::init_data_structure_tree(RandomAccess& APROwn_access, SparseGaps>& y_begin){ uint64_t cumsum = 0; APRTimer apr_timer(false); @@ -1423,7 +1423,7 @@ inline void RandomAccess::initialize_tree_access_sparse(RandomAccess& APROwn_acc } -void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(APRParameters& apr_parameters, SparseGaps &p_map) { +inline void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(APRParameters& apr_parameters, SparseGaps &p_map) { // // Initialize the new structure; // diff --git a/src/data_structures/Mesh/ImagePatch.hpp b/src/data_structures/Mesh/ImagePatch.hpp index a249efdd..01d27fd3 100644 --- a/src/data_structures/Mesh/ImagePatch.hpp +++ b/src/data_structures/Mesh/ImagePatch.hpp @@ -38,7 +38,7 @@ struct 
ImagePatch { }; -void initPatchGlobal(ImagePatch& patch, int z_begin_global, int z_end_global, int x_begin_global, int x_end_global, int y_begin_global, int y_end_global) { +inline void initPatchGlobal(ImagePatch& patch, int z_begin_global, int z_end_global, int x_begin_global, int x_end_global, int y_begin_global, int y_end_global) { patch.z_begin_global = z_begin_global; patch.x_begin_global = x_begin_global; patch.y_begin_global = y_begin_global; From 2707207b90cbeb6a11425d3b1ada1b65b8566de8 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 5 Feb 2024 16:18:14 +0100 Subject: [PATCH 38/59] Updated OVPC (PS) for CUDA - now it gives correct ans same results as PS CPU ver. --- src/algorithm/LocalParticleCellSet.hpp | 3 + src/algorithm/OVPC.cu | 78 ++++-- src/algorithm/OVPC.h | 4 +- src/algorithm/PullingScheme.hpp | 35 ++- src/algorithm/PullingSchemeCuda.hpp | 3 +- test/PullingSchemeCudaTest.cpp | 338 ++++++++++++++++++------- test/PullingSchemeTest.cpp | 23 +- 7 files changed, 339 insertions(+), 145 deletions(-) diff --git a/src/algorithm/LocalParticleCellSet.hpp b/src/algorithm/LocalParticleCellSet.hpp index 7935076b..f20e08c1 100644 --- a/src/algorithm/LocalParticleCellSet.hpp +++ b/src/algorithm/LocalParticleCellSet.hpp @@ -49,6 +49,9 @@ inline int __builtin_clz(unsigned int x) #endif +#include "algorithm/PullingScheme.hpp" +#include "algorithm/PullingSchemeSparse.hpp" + class LocalParticleCellSet { public: diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index 9794df2b..070c4d81 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -27,7 +27,7 @@ void runCopyAndClampLevels(T *inputData, S *outputData, size_t lenght, int level template -__global__ void oneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int level) { +__global__ void firstStep(T *data, size_t xLen, size_t yLen, size_t zLen, int level) { const int xi = (blockIdx.x * blockDim.x) + threadIdx.x; const int yi = (blockIdx.y * blockDim.y) + threadIdx.y; const int zi 
= (blockIdx.z * blockDim.z) + threadIdx.z; @@ -40,39 +40,38 @@ __global__ void oneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int lev int zmin = zi > 0 ? zi - 1 : 0; int zmax = zi < zLen - 1 ? zi + 1 : zLen - 1; - bool ok = true; - bool neig = false; + bool hasNeighHigherLevel = false; + bool hasNeighSameLevel = false; for (int z = zmin; z <= zmax; ++z) { for (int x = xmin; x <= xmax; ++x) { for (int y = ymin; y <= ymax; ++y) { const size_t idx = z * xLen * yLen + x * yLen + y; T currentLevel = ~OVPC::MASK & data[idx]; - if (currentLevel > level) { ok = false; break; } - else if (currentLevel == level) neig = true; + if (currentLevel > level) { hasNeighHigherLevel = true; break; } + else if (currentLevel == level) hasNeighSameLevel = true; } } } - if (ok) { + if (!hasNeighHigherLevel) { const size_t idx = zi * xLen * yLen + xi * yLen + yi; T status = data[idx]; if (status == level) data[idx] |= OVPC::SEED; - else if (neig) data[idx] |= OVPC::BOUNDARY; + else if (hasNeighSameLevel) data[idx] |= OVPC::BOUNDARY; else data[idx] |= OVPC::FILLER; } } template -void runOneLevel(T *data, size_t xLen, size_t yLen, size_t zLen, int level, cudaStream_t aStream) { +void runFirstStep(T *data, size_t xLen, size_t yLen, size_t zLen, int level, cudaStream_t aStream) { dim3 threadsPerBlock(1, 128, 1); dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x, (yLen + threadsPerBlock.y - 1) / threadsPerBlock.y, (zLen + threadsPerBlock.z - 1) / threadsPerBlock.z); -// dim3 numBlocks((xLen * yLen * zLen + threadsPerBlock.x - 1)/threadsPerBlock.x); - oneLevel<<>>(data, xLen, yLen, zLen, level); + firstStep<<>>(data, xLen, yLen, zLen, level); }; template -__global__ void secondPhase(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMax) { +__global__ void secondStep(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMin) { const int xi = 
(blockIdx.x * blockDim.x) + threadIdx.x; const int yi = (blockIdx.y * blockDim.y) + threadIdx.y; const int zi = (blockIdx.z * blockDim.z) + threadIdx.z; @@ -96,16 +95,16 @@ __global__ void secondPhase(T *data, T *child, size_t xLen, size_t yLen, size_t } } } - if (isLevelMax) data[zi * xLen * yLen + xi * yLen + yi] = status >> OVPC::BIT_SHIFT; + if (isLevelMin) data[zi * xLen * yLen + xi * yLen + yi] = status >> OVPC::BIT_SHIFT; } template -void runSecondPhase(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMax, cudaStream_t aStream) { +void runSecondStep(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, size_t xLenc, size_t yLenc, size_t zLenc, bool isLevelMax, cudaStream_t aStream) { dim3 threadsPerBlock(1, 128, 1); dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x, (yLen + threadsPerBlock.y - 1) / threadsPerBlock.y, (zLen + threadsPerBlock.z - 1) / threadsPerBlock.z); - secondPhase<<>>(data, child, xLen, yLen, zLen, xLenc, yLenc, zLenc, isLevelMax); + secondStep<<>>(data, child, xLen, yLen, zLen, xLenc, yLenc, zLenc, isLevelMax); }; // explicit instantiation of handled types @@ -114,6 +113,8 @@ template void computeOVPC(const PixelData&, PixelData&, template void computeOVPC(const PixelData &input, PixelData &output, int levelMin, int levelMax) { + // TODO: Depending on implementation of computing particles (next step after OVPC) some port of this method + // might be useful. Leaving it here rigtht now just in case. If not needed in next steps DELETE IT. 
ScopedCudaMemHandler, H2D> in(input); ScopedCudaMemHandler, D2H> mem(output); @@ -156,6 +157,7 @@ void computeOVPC(const PixelData &input, PixelData &output, int levelMin, zDS = ceil(zDS/2.0); } + runCopyAndClampLevels(in.get(), levels[levelMax], in.getSize(), levelMin, levelMax, 0); for (int l = levelMax - 1; l >= levelMin; --l) { @@ -165,12 +167,54 @@ void computeOVPC(const PixelData &input, PixelData &output, int levelMin, // ================== Phase 1 - top to down for (int l = levelMin; l <= levelMax; ++l) { - runOneLevel(levels[l], xSize[l], ySize[l], zSize[l], l, 0); + runFirstStep(levels[l], xSize[l], ySize[l], zSize[l], l, 0); } // ================== Phase 1 - down to top for (int l = levelMax - 1; l >= levelMin; --l) { - runSecondPhase(levels[l], levels[l+1], xSize[l], ySize[l], zSize[l], xSize[l+1], ySize[l+1], zSize[l+1], l == levelMin, 0); + runSecondStep(levels[l], levels[l+1], xSize[l], ySize[l], zSize[l], xSize[l+1], ySize[l+1], zSize[l+1], l == levelMin, 0); } waitForCuda(); t.stop_timer(); -}; +} + +// explicit instantiation of handled types +template void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax); +template void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax); + +/** + * CUDA implementation of Pullin Scheme (OVPC - Optimal Valid Particle Cell set). 
+ * @tparam T - type of input levels + * @tparam S - type of output Particle Cell Tree + * @param input - input levels computed in earlier stages + * @param pct - Particle Cell Tree - as input is used for dimensions of each level, will be filled with computed + * Pulling Scheme as a output + * @param levelMin - min level of APR + * @param levelMax - max level of APR + */ +template +void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax) { + // Copy input to CUDA mem and prepare CUDA representation of particle cell tree which will be filled after computing + // all steps + ScopedCudaMemHandler, H2D> in(input); + std::vector, D2H>> w; + for (int l = 0; l <= levelMax; ++l) { + w.push_back(std::move(ScopedCudaMemHandler, D2H>(pct[l]))); + } + + // feel the highes level of PCT with provided levels and clamp values to be within [levelMin, levelMax] range + runCopyAndClampLevels(in.get(), w[levelMax].get(), in.getSize(), levelMin, levelMax, 0); + + // Downsample with max reduction to levelMin to fill the rest of the tree + for (int l = levelMax - 1; l >= levelMin; --l) { + runDownsampleMax(w[l + 1].get(), w[l].get(), pct[l + 1].x_num, pct[l + 1].y_num, pct[l + 1].z_num, 0); + } + + // ================== Phase 1 - top to down + for (int l = levelMin; l <= levelMax; ++l) { + runFirstStep(w[l].get(), pct[l].x_num, pct[l].y_num, pct[l].z_num, l, 0); + } + // ================== Phase 1 - down to top + for (int l = levelMax - 1; l >= levelMin; --l) { + runSecondStep(w[l].get(), w[l+1].get(), pct[l].x_num, pct[l].y_num, pct[l].z_num, pct[l + 1].x_num, pct[l + 1].y_num, pct[l + 1].z_num, l == levelMin, 0); + } +} diff --git a/src/algorithm/OVPC.h b/src/algorithm/OVPC.h index e0c8a67b..f55bfee3 100644 --- a/src/algorithm/OVPC.h +++ b/src/algorithm/OVPC.h @@ -44,8 +44,8 @@ class OVPC { iParticleCellTree[iLevelMax].init(aInputLevels.y_num, aInputLevels.x_num, aInputLevels.z_num); fillLevel(iLevelMax, aInputLevels); - // Downsample with max 
reduction to levelMin to fill the rest of the tree - for(int level = iLevelMax - 1; level >= iLevelMin; --level) { + // Downsample with max reduction to levelMin to fill rest of the tree + for (int level = iLevelMax - 1; level >= iLevelMin; --level) { downsample(iParticleCellTree[level + 1], iParticleCellTree[level], [](const float &x, const float &y) -> float { return std::max(x, y); }, [](const float &x) -> float { return x; }, true); diff --git a/src/algorithm/PullingScheme.hpp b/src/algorithm/PullingScheme.hpp index 58ae9ee2..c6756df6 100644 --- a/src/algorithm/PullingScheme.hpp +++ b/src/algorithm/PullingScheme.hpp @@ -51,13 +51,13 @@ for(jn = j * 2; jn < j * 2 + children_boundaries[0]; jn++) \ class PullingScheme { - double powr(uint64_t num,uint64_t pow2){ +public: + + static double powr(uint64_t num,uint64_t pow2){ //return (uint64_t) std::round(std::pow(num,pow2)); return std::round(pow(num,pow2)); } - -public: template void fill(float k, const PixelData &input); @@ -65,6 +65,7 @@ class PullingScheme { void fill_patch(float level, const PixelData &input, ImagePatch& patch); void pulling_scheme_main(); + static std::vector> generateParticleCellTree(const GenInfo &aprInfo); void initialize_particle_cell_tree(const GenInfo &aprInfo); std::vector>& getParticleCellTree() { return particle_cell_tree; } @@ -86,6 +87,25 @@ class PullingScheme { int l_max; }; + +inline std::vector> PullingScheme::generateParticleCellTree(const GenInfo &aprInfo) { + int l_max = aprInfo.l_max - 1; + int l_min = aprInfo.l_min; + + std::vector> pct; + pct.resize(l_max + 1); + + for (int l = l_min; l <= l_max; ++l) { + pct[l].initWithValue(ceil(aprInfo.org_dims[0] / PullingScheme::powr(2.0, l_max - l + 1)), + ceil(aprInfo.org_dims[1] / PullingScheme::powr(2.0, l_max - l + 1)), + ceil(aprInfo.org_dims[2] / PullingScheme::powr(2.0, l_max - l + 1)), + EMPTY); + + } + + return pct; +} + /** * Initializes particle_cell_tree up to level (max - 1) */ @@ -93,14 +113,7 @@ inline void 
PullingScheme::initialize_particle_cell_tree(const GenInfo &aprInfo) l_max = aprInfo.l_max - 1; l_min = aprInfo.l_min; - particle_cell_tree.resize(l_max + 1); - - for (int l = l_min; l <= l_max; ++l) { - particle_cell_tree[l].initWithValue(ceil(aprInfo.org_dims[0] / powr(2.0, l_max - l + 1)), - ceil(aprInfo.org_dims[1] / powr(2.0, l_max - l + 1)), - ceil(aprInfo.org_dims[2] / powr(2.0, l_max - l + 1)), - EMPTY); - } + particle_cell_tree = generateParticleCellTree(aprInfo); } /** diff --git a/src/algorithm/PullingSchemeCuda.hpp b/src/algorithm/PullingSchemeCuda.hpp index 79a23560..f98c0883 100644 --- a/src/algorithm/PullingSchemeCuda.hpp +++ b/src/algorithm/PullingSchemeCuda.hpp @@ -12,6 +12,7 @@ using TreeElementType = uint8_t; template void computeOVPC(const PixelData &input, PixelData &output, int levelMin, int levelMax); - +template +void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax); #endif //LIBAPR_PULLINGSCHEMECUDA_HPP diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index afeb59f1..20eae2e1 100644 --- a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -6,9 +6,72 @@ #include "algorithm/PullingSchemeCuda.hpp" #include "algorithm/ComputeGradientCuda.hpp" #include "algorithm/APRConverter.hpp" +#include "algorithm/LocalParticleCellSet.hpp" #include "TestTools.hpp" + +// Class for storing expected values for one element of Particle Cell Tree (output of Pulling Scheme) +class LevelData { +public: + int level; + int y; + int x; + int z; + uint8_t expectedType; // seed, boundary, filler... 
+}; + +/** + * Verify computed Particle Cell Tree (PCT) vs expected values + * Expected values should list all data for types=1,2,3 (seed, boundary filler) which are used to generate particles: + * {levels, y,x,z(position), type} + * All other values are ignored (and used by Pulling Scheme (PS) only for intermediate calculations) + * @param aPCT - PCT produces by PS (note: values in PCT will be changed during verification!) + * @param expectedValues expected values + * @return true if correct, false otherwise + */ +template +bool verifyParticleCellTree(std::vector> &aPCT, const std::vector &expectedValues) { + + const uint8_t AlreadyCheckedMark = 255; + const uint8_t MaxValueOfImportantType = FILLER_TYPE; // All types above are used by PS during computation phase only + + for (const auto &r : expectedValues) { + // std::cout << r.level << " " << r.y << "," << r.x << "," << r.z << " " << (int)r.expectedType << std::endl; + + auto &v = aPCT[r.level](r.y, r.x, r.z); + // Add dim. checks for accessing pct + if (v == r.expectedType) { + v = AlreadyCheckedMark; + } + else { + std::cout << "Error! Data on level=" << r.level << " at (" << r.y << "," << r.x << "," << r.z << ") expected=" << (int)r.expectedType << " got=" << (int)v << std::endl; + return false; + } + } + + for (size_t level = 0; level < aPCT.size(); level++) { + auto &d = aPCT[level]; + auto y_num = d.y_num; + auto x_num = d.x_num; + auto z_num = d.z_num; + + for (int j = 0; j < z_num; j++) { + for (int i = 0; i < x_num; i++) { + for (int k = 0; k < y_num; k++) { + const auto &v = d(k, i, j); + if (v != AlreadyCheckedMark && v <= MaxValueOfImportantType && v > 0) { + std::cout << "Error! Data on level = " << level << " at (" << k << "," << i << "," << j << ") with value = " << (int)v << " not verified or bad!" 
<< std::endl; + return false; + } + } + } + } + } + + return true; +} + /** * Prints PCT * @param particleCellTree @@ -56,24 +119,11 @@ int compareParticleCellTrees(const std::vector> &expected, const st template void fillPS(PullingScheme &aPS, PixelData &levels) { - auto l_max = aPS.pct_level_max(); - auto l_min = aPS.pct_level_min(); - -// std::cout << "LEVEL: " << l_max << std::endl; levels.printMeshT(3, 1); - - aPS.fill(l_max, levels); - PixelData levelsDS; - for (int l = l_max - 1; l >= l_min; l--) { - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, true); - aPS.fill(l, levelsDS); -// std::cout << "LEVEL: " << l << std::endl; levelsDS.printMeshT(3, 1); - levels.swap(levelsDS); - } + PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); + LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); } -TEST(PullingSchemeTest, DeleteMeAfterDevelopment) { +TEST(PullingSchemeTest, DeleteMeAfterDevelopment_fullAprPipeline) { // TODO: delete me after development // Full 'get apr' pipeline to test imp. on different stages // Useful during debugging and can be removed once finished @@ -154,12 +204,14 @@ TEST(PullingSchemeTest, DeleteMeAfterDevelopment) { -TEST(PullingSchemeTest, PullingScheme1D) { - +TEST(PullingSchemeTest, DeleteMeAfterDevelopment_PS) { + // TODO: delete me after development + // Runs PS to test imp. on different stages + // Useful during debugging and can be removed once finished int values[] = {0,0,0,5, 0,0,0,0}; int len = sizeof(values)/sizeof(int); - PixelData levels(3,3,3, 0); + PixelData levels(3,3,3, 0); levels(2,2,2) = 11; // initFromZYXarray(levels, values); @@ -171,7 +223,7 @@ TEST(PullingSchemeTest, PullingScheme1D) { gi.init(dim.y * 2, dim.x * 2, dim.z * 2); // time two in y-direction since PS container is downsized. 
std::cout << gi << std::endl; - APRTimer t(true); + APRTimer t(false); t.start_timer("PS1"); PullingScheme ps; @@ -214,83 +266,177 @@ TEST(PullingSchemeTest, PullingScheme1D) { std::cout << std::endl; } -// TEST(PullingSchemeCudaTest, computeLevels) { -// using ImgType = float; -// const int maxLevel = 3; -// const float relError = 0.1; -// -// PixelData grad = getRandInitializedMesh(10, 20, 33); -// PixelData localIntensityScaleCpu = getRandInitializedMesh(10, 20, 33); -// -// PixelData localIntensityScaleGpu(localIntensityScaleCpu, true); -// PixelData elo(localIntensityScaleCpu, true); -// APRTimer timer(true); -// -// timer.start_timer("CPU Levels"); -// APRConverter().computeLevels(grad, localIntensityScaleCpu, maxLevel, relError); -// timer.stop_timer(); -// -// timer.start_timer("GPU Levels"); -// computeLevelsCuda(grad, localIntensityScaleGpu, maxLevel, relError); -// timer.stop_timer(); -// -// EXPECT_EQ(compareMeshes(localIntensityScaleCpu, localIntensityScaleGpu), 0); -// } - - - -TEST(PullingSchemeCudaTest, DS) { - GenInfo access; - access.l_max = 11; - access.l_min = 1; - access.org_dims[0] = std::pow(2, access.l_max)/2; - access.org_dims[1] = std::pow(2, access.l_max)/2; - access.org_dims[2] = std::pow(2, access.l_max); - - - PixelData levels = getRandInitializedMesh(access.org_dims[0]/2,access.org_dims[1]/2,access.org_dims[2]/2, access.l_max + 1); - PixelData levels2(levels, true); - - // PixelData levels(16,1,1); - // float values[] = {4, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 2}; - // initFromZYXarray(levels, values); - - APRTimer t(true); - if (false) { - t.start_timer("PS1"); - PullingScheme ps; - ps.initialize_particle_cell_tree(access); - int l_max = access.l_max - 1; - int l_min = access.l_min; - ps.fill(l_max, levels2); - PixelData levelsDS; - for (int l_ = l_max - 1; l_ >= l_min; l_--) { - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, 
true); - ps.fill(l_, levelsDS); - levels2.swap(levelsDS); - } - t.stop_timer(); - } - { - t.start_timer("CUDA"); - int levelMax = access.l_max - 1; - int levelMin = access.l_min; - PixelData ds(levels.y_num, levels.x_num, levels.z_num * (levelMax - levelMin + 1), 0); - std::cout << levels << std::endl; - // std::cout << ds << std::endl; - computeOVPC(levels, ds, levelMin, levelMax); - // ds.printMeshT(3,1); - t.stop_timer(); - } - { - t.start_timer("OVPC1"); - OVPC nps(access, levels); - nps.generateTree(); - t.stop_timer(); - // printParticleCellTree(nps.getParticleCellTree()); + +TEST(PullingSchemeTest, PSvsOVPCCUDA) { + // Generates random levels in a 3D cube and then compares generated output levels in PS and OVPC + GenInfo gi; + gi.init(255, 257, 199); + + // Generate random levels for PS and OVPC + PixelData levels(std::ceil(gi.org_dims[0]/2.0), + std::ceil(gi.org_dims[1]/2.0), + std::ceil(gi.org_dims[2]/2.0), + 0); + // Add a few particles only - it will end up with Pulling Scheme generate particles on (almost) all + // levels - good case to compare with OVPC + const int numOfParticles = 3; + std::srand(std::time(nullptr)); + for (int i = 0; i < numOfParticles; ++i) { + levels(std::rand() % levels.y_num, std::rand() % levels.x_num, std::rand() % levels.z_num) = gi.l_max; } + PixelData levelsOVPC(levels, true); // just copy 'levels' + PixelData levelsPS(levels, true); + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("PS - init"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + fillPS(ps, levelsPS); + t.stop_timer(); + t.start_timer("PS - compute"); + ps.pulling_scheme_main(); + t.stop_timer(); + + // Run test methods and compare results + t.start_timer("OVPCCUDA - init"); + int levelMax = gi.l_max - 1; + int levelMin = gi.l_min; + std::vector> pct = PullingScheme::generateParticleCellTree(gi); + t.stop_timer(); + t.start_timer("OVPCCUDA - compute"); + computeOvpcCuda(levelsOVPC, pct, levelMin, levelMax); + 
t.stop_timer(); + + // -------------- Verify result + ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), pct), 0); +} + +TEST(PullingSchemeTest, OVPCCUDA_Ydir) { + // Prepare input data for PS + float values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(len, 1, 1); // <-- Y-dir + initFromZYXarray(levels, values); // <-- Y-dir + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(2 * dim.y, dim.x, dim.z); // <-- Y-dir + + int levelMax = gi.l_max - 1; + int levelMin = gi.l_min; + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPCCUDA - initialize"); + std::vector> pct = PullingScheme::generateParticleCellTree(gi); + t.stop_timer(); + + t.start_timer("OVPCCUDA - compute"); + computeOvpcCuda(levels, pct, levelMin, levelMax); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 1,0,0, 2}, + {3, 2,0,0, 3}, + {3, 3,0,0, 3}, + + {2, 2,0,0, 3}, + {2, 3,0,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(pct, ev)); +} + +TEST(PullingSchemeTest, OVPCCUDA_Xdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, len, 1); // <-- X-dir + initFromZYXarray(levels, values); + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, 2 * dim.x, dim.z); // <-- X-dir + + int levelMax = gi.l_max - 1; + int levelMin = gi.l_min; + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPCCUDA - initialize"); + std::vector> pct = PullingScheme::generateParticleCellTree(gi); + t.stop_timer(); + + 
t.start_timer("OVPCCUDA - compute"); + computeOvpcCuda(levels, pct, levelMin, levelMax); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,1,0, 2}, + {3, 0,2,0, 3}, + {3, 0,3,0, 3} , + + {2, 0,2,0, 3}, + {2, 0,3,0, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(pct, ev)); +} + +TEST(PullingSchemeTest, OVPCCUDA_Zdir) { + // Prepare input data for PS + int values[] = {9,0,0,0, 0,0,0,0}; + int len = sizeof(values)/sizeof(int); + PixelData levels(1, 1, len); // <-- Z-dir + initFromZYXarray(levels, values); + + // Prepare GenInfo structure - + // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + gi.init(dim.y, dim.x, 2 * dim.z); // <-- Z-dir + + int levelMax = gi.l_max - 1; + int levelMin = gi.l_min; + + // Initialize all needed objects + APRTimer t(false); + + t.start_timer("OVPCCUDA - initialize"); + std::vector> pct = PullingScheme::generateParticleCellTree(gi); + t.stop_timer(); + + t.start_timer("OVPCCUDA - compute"); + computeOvpcCuda(levels, pct, levelMin, levelMax); + t.stop_timer(); + + // List of expected types + std::vector ev = { + {3, 0,0,0, 1}, + {3, 0,0,1, 2}, + {3, 0,0,2, 3}, + {3, 0,0,3, 3} , + + {2, 0,0,2, 3}, + {2, 0,0,3, 3} + }; + + // -------------- Verify result + EXPECT_TRUE(verifyParticleCellTree(pct, ev)); } int main(int argc, char **argv) { diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index c7c66b63..be922d90 100644 --- a/test/PullingSchemeTest.cpp +++ b/test/PullingSchemeTest.cpp @@ -8,7 +8,7 @@ #include "algorithm/PullingScheme.hpp" #include "algorithm/OVPC.h" #include "TestTools.hpp" - +#include "algorithm/LocalParticleCellSet.hpp" namespace { @@ -126,21 +126,8 @@ namespace { template void fillPS(PullingScheme &aPS, PixelData &levels) { - auto l_max = aPS.pct_level_max(); - auto l_min = aPS.pct_level_min(); - 
-// std::cout << "LEVEL: " << l_max << std::endl; levels.printMeshT(3, 1); - - aPS.fill(l_max, levels); - PixelData levelsDS; - for (int l = l_max - 1; l >= l_min; l--) { - downsample(levels, levelsDS, - [](const float &x, const float &y) -> float { return std::max(x, y); }, - [](const float &x) -> float { return x; }, true); - aPS.fill(l, levelsDS); -// std::cout << "LEVEL: " << l << std::endl; levelsDS.printMeshT(3, 1); - levels.swap(levelsDS); - } + PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); + LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); } // ================================================================================================================= @@ -157,7 +144,7 @@ namespace { // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 GenInfo gi; const PixelDataDim dim = levels.getDimension(); - gi.init(2 * dim.y, dim.x, dim.z); + gi.init(2 * dim.y, dim.x, dim.z); // <-- Y-dir // Initialize all needed objects APRTimer t(false); @@ -345,7 +332,7 @@ namespace { // remember: data for PS is downsampled so is representing image twice bigger so Y-dir size need to be multiplied by 2 GenInfo gi; const PixelDataDim dim = levels.getDimension(); - gi.init(2 * dim.y, dim.x, dim.z); + gi.init(2 * dim.y, dim.x, dim.z); // <-- Y-dir // Initialize all needed objects APRTimer t(false); From 3cb4529d73a015d8f135b7fae5973c9aac15363c Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 16 Feb 2024 16:23:48 +0100 Subject: [PATCH 39/59] PullingSchemeCudaTest finished, added init file for LinearAcccess test --- src/algorithm/LocalParticleCellSet.hpp | 1 + .../APR/access/LinearAccess.hpp | 23 +-- test/CMakeLists.txt | 1 + test/LinearAccessTest.cpp | 185 ++++++++++++++++++ test/PullingSchemeCudaTest.cpp | 148 +------------- 5 files changed, 197 insertions(+), 161 deletions(-) create mode 100644 
test/LinearAccessTest.cpp diff --git a/src/algorithm/LocalParticleCellSet.hpp b/src/algorithm/LocalParticleCellSet.hpp index f20e08c1..f834805a 100644 --- a/src/algorithm/LocalParticleCellSet.hpp +++ b/src/algorithm/LocalParticleCellSet.hpp @@ -51,6 +51,7 @@ inline int __builtin_clz(unsigned int x) #include "algorithm/PullingScheme.hpp" #include "algorithm/PullingSchemeSparse.hpp" +#include "io/TiffUtils.hpp" class LocalParticleCellSet { diff --git a/src/data_structures/APR/access/LinearAccess.hpp b/src/data_structures/APR/access/LinearAccess.hpp index b170fd2c..b00a02b0 100644 --- a/src/data_structures/APR/access/LinearAccess.hpp +++ b/src/data_structures/APR/access/LinearAccess.hpp @@ -226,40 +226,31 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet initialize_xz_linear(); //edge case - // TODO: Don't know why we need that edge case but it would be good if it run properly - // For example 'genInfo->total_number_particles' is not set, maybe other values are not set either but - // it need to be investigated or this edge case removed (?) - if level_max() <= 2 then there are no many particles - // anyway so any code should be fast enough... if(level_max()<=2){ // For performance reasons and clarity of the code, it doesn't make sense here to handle these cases. 
Below assumes there is atleast levels <=2; //just initialize full resolution const auto level_start = level_xz_vec[level_max()]; - uint64_t counter = 0; + uint64_t particleCounter = 0; for (int z = 0; z < z_num(level_max()); ++z) { for (int x = 0; x < x_num(level_max()); ++x) { const size_t offset_pc_data = z * x_num(level_max()) + x; - for (int y = 0; y < y_num(level_max()); ++y) { - - counter++; - } - xz_end_vec[level_start + offset_pc_data] = counter; + particleCounter += y_num(level_max()); + xz_end_vec[level_start + offset_pc_data] = particleCounter; } } - y_vec.resize(counter); - counter = 0; + genInfo->total_number_particles = xz_end_vec.back(); + y_vec.resize(genInfo->total_number_particles); + size_t idx = 0; for (int z = 0; z < z_num(level_max()); ++z) { for (int x = 0; x < x_num(level_max()); ++x) { - for (int y = 0; y < y_num(level_max()); ++y) { - y_vec[counter] = y; - counter++; + y_vec[idx++] = y; } } } - return; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6ac7e381..8df65d9d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -11,6 +11,7 @@ buildTarget(testComputeGradient ComputeGradientTest.cpp) buildTarget(testLocalIntensityScale LocalIntensityScaleTest.cpp) buildTarget(testPullingScheme PullingSchemeTest.cpp) buildTarget(testAPRParameters APRParametersTest.cpp) +buildTarget(testLinearAccess LinearAccessTest.cpp) #APR GPU Tests if(APR_USE_CUDA) diff --git a/test/LinearAccessTest.cpp b/test/LinearAccessTest.cpp new file mode 100644 index 00000000..766307b9 --- /dev/null +++ b/test/LinearAccessTest.cpp @@ -0,0 +1,185 @@ +#include + +#include "algorithm/PullingScheme.hpp" +#include "algorithm/LocalParticleCellSet.hpp" +#include "algorithm/APRConverter.hpp" + +#include "TestTools.hpp" + +template +void fillPS(PullingScheme &aPS, PixelData &levels) { + PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); + LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, 
APRParameters()); +} + +/** + * Prints PCT + * @param particleCellTree + */ +template +void printParticleCellTree(const std::vector> &particleCellTree) { + for (uint64_t l = 0; l < particleCellTree.size(); ++l) { + auto &tree = particleCellTree[l]; +// std::cout << "-- level = " << l << ", " << tree << std::endl; + tree.printMeshT(3,0); + } +} + +TEST(PullingSchemeTest, DeleteMeAfterDevelopment_fullAprPipeline) { + // TODO: delete me after development + // Full 'get apr' pipeline to test imp. on different stages + // Useful during debugging and can be removed once finished + + // Prepare input data (image) + int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; + // PS input values = 5 0 0 0 0 0 0 0 + +// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; +// PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0}; + + int len = sizeof(values)/sizeof(int); + PixelData data(len, 1, 1); + initFromZYXarray(data, values); + std::cout << "----- Input image:\n"; + data.printMeshT(3, 1); + + // Produce APR + APR apr; + APRConverter aprConverter; + aprConverter.par.rel_error = 0.1; + aprConverter.par.lambda = 0.1; + aprConverter.par.sigma_th = 0.0001; + aprConverter.par.neighborhood_optimization = true; + aprConverter.get_apr(apr, data); + + // Print information about APR and all particles + std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; + for (int l = apr.level_min(); l <= apr.level_max(); ++l) { + std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; + } + std::cout << "APR particles z x y level:\n"; + auto it = apr.iterator(); + for (int level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; 
+ } + } + } + } + std::cout << std::endl; + + // Sample input + ParticleData particleIntensities; + particleIntensities.sample_image(apr, data); + + // Reconstruct image from particles + PixelData reconstructImg; + APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); + std::cout << "----- Reconstructed image:"< levelImg; + APRReconstruction::reconstruct_level(apr, levelImg); + std::cout << "----- Image levels:" << std::endl; + levelImg.printMeshT(3, 1); + + // Show intensities and levels of each particle + std::cout << "----- Particle intensities:\n"; + for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; + std::cout << std::endl; + + particleIntensities.fill_with_levels(apr); + + std::cout << "----- Particle levels:\n"; + for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; + std::cout << std::endl; + + // Show some general information about generated APR + double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); + std::cout << std::endl; + std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; + std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; +} + + + +TEST(PullingSchemeTest, DeleteMeAfterDevelopment_PS) { + // TODO: delete me after development + // Runs PS to test imp. 
on different stages + // Useful during debugging and can be removed once finished +// int values[] = {0,0,0,5, 0,0,0,0}; +// int len = sizeof(values)/sizeof(int); + + PixelData levels(2, 2, 2, 0); + levels(0,0,0) = 4; + +// initFromZYXarray(levels, values); + std::cout << "---------------\n"; + levels.printMeshT(3, 1); + std::cout << "---------------\n"; + + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + std::cout << "Levels dim: " << dim << std::endl; + gi.init(dim.y * 2, dim.x * 2, dim.z * 2); // time two in y-direction since PS container is downsized. + std::cout << gi << std::endl; + + APRTimer t(false); + + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + int l_max = gi.l_max - 1; + int l_min = gi.l_min; + std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; + + fillPS(ps, levels); + + std::cout << "---------- Filled PS tree\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "---------------\n"; + + ps.pulling_scheme_main(); + t.stop_timer(); + + std::cout << "----------PS:\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "-------------\n"; + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + + std::cout << gi << std::endl; + auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << " "; std::cout << std::endl; }; + prt(linearAccess.y_vec); + prt(linearAccess.xz_end_vec); + prt(linearAccess.level_xz_vec); + + LinearIterator it(linearAccess, gi); + for (int l = 0; l <= 3; l++) { + std::cout << it.particles_level_begin(l) << " " << it.particles_level_end(l) << std::endl; + } + std::cout << "NumOfParticles: " << gi.total_number_particles << std::endl; + + std::cout << "===========================\n"; + for (int 
level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + } + } + } + } + std::cout << std::endl; +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index 20eae2e1..53eec162 100644 --- a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -5,12 +5,10 @@ #include "algorithm/PullingSchemeCuda.hpp" #include "algorithm/ComputeGradientCuda.hpp" -#include "algorithm/APRConverter.hpp" #include "algorithm/LocalParticleCellSet.hpp" #include "TestTools.hpp" - // Class for storing expected values for one element of Particle Cell Tree (output of Pulling Scheme) class LevelData { public: @@ -84,6 +82,7 @@ void printParticleCellTree(const std::vector> &particleCellTree) { tree.printMeshT(3,0); } } + /** * Compare * @param expected - expected levels @@ -119,153 +118,12 @@ int compareParticleCellTrees(const std::vector> &expected, const st template void fillPS(PullingScheme &aPS, PixelData &levels) { - PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); + PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); } -TEST(PullingSchemeTest, DeleteMeAfterDevelopment_fullAprPipeline) { - // TODO: delete me after development - // Full 'get apr' pipeline to test imp. 
on different stages - // Useful during debugging and can be removed once finished - - // Prepare input data (image) - int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; - // PS input values = 5 0 0 0 0 0 0 0 - -// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; -// PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0}; - - int len = sizeof(values)/sizeof(int); - PixelData data(len, 1, 1); - initFromZYXarray(data, values); - std::cout << "----- Input image:\n"; - data.printMeshT(3, 1); - - // Produce APR - APR apr; - APRConverter aprConverter; - aprConverter.par.rel_error = 0.1; - aprConverter.par.lambda = 0.1; - aprConverter.par.sigma_th = 0.0001; - aprConverter.par.neighborhood_optimization = true; - aprConverter.get_apr(apr, data); - - // Print information about APR and all particles - std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; - for (int l = apr.level_min(); l <= apr.level_max(); ++l) { - std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; - } - std::cout << "APR particles z x y level:\n"; - auto it = apr.iterator(); - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; - } - } - } - } - std::cout << std::endl; - - // Sample input - ParticleData particleIntensities; - particleIntensities.sample_image(apr, data); - - // Reconstruct image from particles - PixelData reconstructImg; - APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); - std::cout << "----- Reconstructed image:"< levelImg; - APRReconstruction::reconstruct_level(apr, levelImg); - std::cout << "----- Image levels:" << std::endl; - levelImg.printMeshT(3, 1); - - // 
Show intensities and levels of each particle - std::cout << "----- Particle intensities:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - particleIntensities.fill_with_levels(apr); - - std::cout << "----- Particle levels:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - // Show some general information about generated APR - double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); - std::cout << std::endl; - std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; - std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; -} - - - -TEST(PullingSchemeTest, DeleteMeAfterDevelopment_PS) { - // TODO: delete me after development - // Runs PS to test imp. on different stages - // Useful during debugging and can be removed once finished - int values[] = {0,0,0,5, 0,0,0,0}; - int len = sizeof(values)/sizeof(int); - - PixelData levels(3,3,3, 0); - levels(2,2,2) = 11; - -// initFromZYXarray(levels, values); - levels.printMeshT(3, 1); - - GenInfo gi; - const PixelDataDim dim = levels.getDimension(); - std::cout << "Levels dim: " << dim << std::endl; - gi.init(dim.y * 2, dim.x * 2, dim.z * 2); // time two in y-direction since PS container is downsized. 
- std::cout << gi << std::endl; - - APRTimer t(false); - - t.start_timer("PS1"); - PullingScheme ps; - ps.initialize_particle_cell_tree(gi); - int l_max = gi.l_max - 1; - int l_min = gi.l_min; - std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; - - fillPS(ps, levels); - - std::cout << "---------- Filled PS tree\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "---------------\n"; - - ps.pulling_scheme_main(); - t.stop_timer(); - - std::cout << "----------PS:\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "-------------\n"; - - LinearAccess linearAccess; - linearAccess.genInfo = &gi; - APRParameters par; - std::cout << "1\n"; - linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); - std::cout << "2\n"; - LinearIterator it(linearAccess, gi); - - std::cout << "===========================\n"; - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; - } - } - } - } - std::cout << std::endl; -} +// ------------------------------------------------------------------------------------------------------------------------------------------- TEST(PullingSchemeTest, PSvsOVPCCUDA) { // Generates random levels in a 3D cube and then compares generated output levels in PS and OVPC From 027e52aaac7068a0457a46d6f3c665d2136efe91 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 21 Feb 2024 18:29:49 +0100 Subject: [PATCH 40/59] Finished LinearAccess tests (for linear structure only), added draft for LinearAccessCuda --- test/CMakeLists.txt | 1 + test/LinearAccessCudaTest.cpp | 182 ++++++++++++++++++ test/LinearAccessTest.cpp | 339 +++++++++++++++++++--------------- test/TestTools.hpp | 6 +- 4 
files changed, 381 insertions(+), 147 deletions(-) create mode 100644 test/LinearAccessCudaTest.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8df65d9d..d3377fb0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -20,6 +20,7 @@ if(APR_USE_CUDA) buildTarget(testLocalIntensityScaleCuda LocalIntensityScaleCudaTest.cpp) buildTarget(testFullPipelineCuda FullPipelineCudaTest.cpp) buildTarget(testPullingSchemeCuda PullingSchemeCudaTest.cpp) + buildTarget(testLinearAccessCuda LinearAccessCudaTest.cpp) endif() if(APR_BUILD_EXAMPLES) diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp new file mode 100644 index 00000000..bae93233 --- /dev/null +++ b/test/LinearAccessCudaTest.cpp @@ -0,0 +1,182 @@ +#include + +#include "algorithm/LocalParticleCellSet.hpp" +#include "algorithm/PullingScheme.hpp" +#include "algorithm/APRConverter.hpp" + +#include "TestTools.hpp" + + +template +void fillPS(PullingScheme &aPS, PixelData &levels) { + PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); + LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); +} + +/** + * Prints PCT + * @param particleCellTree + */ +template +void printParticleCellTree(const std::vector> &particleCellTree) { + for (uint64_t l = 0; l < particleCellTree.size(); ++l) { + auto &tree = particleCellTree[l]; +// std::cout << "-- level = " << l << ", " << tree << std::endl; + tree.printMeshT(3,0); + } +} + +TEST(LinearAccessTest, DeleteMeAfterDevelopment_fullAprPipeline) { + // TODO: delete me after development + // Full 'get apr' pipeline to test imp. 
on different stages + // Useful during debugging and can be removed once finished + + // Prepare input data (image) + int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; + // PS input values = 5 0 0 0 0 0 0 0 + +// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; +// PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0}; + + int len = sizeof(values)/sizeof(int); + PixelData data(len, 1, 1); + initFromZYXarray(data, values); + std::cout << "----- Input image:\n"; + data.printMeshT(3, 1); + + // Produce APR + APR apr; + APRConverter aprConverter; + aprConverter.par.rel_error = 0.1; + aprConverter.par.lambda = 0.1; + aprConverter.par.sigma_th = 0.0001; + aprConverter.par.neighborhood_optimization = true; + aprConverter.get_apr(apr, data); + + // Print information about APR and all particles + std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; + for (int l = apr.level_min(); l <= apr.level_max(); ++l) { + std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; + } + std::cout << "APR particles z x y level:\n"; + auto it = apr.iterator(); + for (int level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + } + } + } + } + std::cout << std::endl; + + // Sample input + ParticleData particleIntensities; + particleIntensities.sample_image(apr, data); + + // Reconstruct image from particles + PixelData reconstructImg; + APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); + std::cout << "----- Reconstructed image:"< levelImg; + APRReconstruction::reconstruct_level(apr, levelImg); + std::cout << "----- Image levels:" << std::endl; + levelImg.printMeshT(3, 1); + + // 
Show intensities and levels of each particle + std::cout << "----- Particle intensities:\n"; + for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; + std::cout << std::endl; + + particleIntensities.fill_with_levels(apr); + + std::cout << "----- Particle levels:\n"; + for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; + std::cout << std::endl; + + // Show some general information about generated APR + double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); + std::cout << std::endl; + std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; + std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; +} + + + +TEST(LinearAccessTest, DeleteMeAfterDevelopment_PS) { + // TODO: delete me after development + // Runs PS to test imp. on different stages + // Useful during debugging and can be removed once finished +// int values[] = {0,0,0,5, 0,0,0,0}; +// int len = sizeof(values)/sizeof(int); + + PixelData levels(3, 4,4, 0); + levels(0,0,0) = 4; + +// initFromZYXarray(levels, values); + std::cout << "---------------\n"; + levels.printMeshT(3, 1); + std::cout << "---------------\n"; + + GenInfo gi; + const PixelDataDim dim = levels.getDimension(); + std::cout << "Levels dim: " << dim << std::endl; + gi.init(dim.y * 2, dim.x * 1, dim.z * 1); // time two in y-direction since PS container is downsized. 
+ std::cout << gi << std::endl; + + APRTimer t(false); + + t.start_timer("PS1"); + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + int l_max = gi.l_max - 1; + int l_min = gi.l_min; + std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; + + fillPS(ps, levels); + + std::cout << "---------- Filled PS tree\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "---------------\n"; + + ps.pulling_scheme_main(); + t.stop_timer(); + + std::cout << "----------PS:\n"; + printParticleCellTree(ps.getParticleCellTree()); + std::cout << "-------------\n"; + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = false; + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + + std::cout << gi << std::endl; + auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; }; + prt(linearAccess.y_vec); + prt(linearAccess.xz_end_vec); + prt(linearAccess.level_xz_vec); + + LinearIterator it(linearAccess, gi); + for (int l = 0; l <= 3; l++) { + std::cout << it.particles_level_begin(l) << " " << it.particles_level_end(l) << std::endl; + } + std::cout << "NumOfParticles: " << gi.total_number_particles << std::endl; + + std::cout << "===========================\n"; + for (int level = it.level_min(); level <= it.level_max(); ++level) { + for (int z = 0; z < it.z_num(level); z++) { + for (int x = 0; x < it.x_num(level); ++x) { + for (it.begin(level, z, x); it < it.end(); it++) { + std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; + } + } + } + } + std::cout << std::endl; +} \ No newline at end of file diff --git a/test/LinearAccessTest.cpp b/test/LinearAccessTest.cpp index 766307b9..00ef84be 100644 --- a/test/LinearAccessTest.cpp +++ b/test/LinearAccessTest.cpp @@ -2,181 
+2,230 @@ #include "algorithm/PullingScheme.hpp" #include "algorithm/LocalParticleCellSet.hpp" -#include "algorithm/APRConverter.hpp" #include "TestTools.hpp" -template -void fillPS(PullingScheme &aPS, PixelData &levels) { - PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); - LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); -} /** - * Prints PCT - * @param particleCellTree + * Create PCT with provided data + * @param aprInfo + * @param levels complete list of values from level min to level max in form { {level, min, values}, ..., {level, max, values} } + * @return Particle Cell Tree with values */ -template -void printParticleCellTree(const std::vector> &particleCellTree) { - for (uint64_t l = 0; l < particleCellTree.size(); ++l) { - auto &tree = particleCellTree[l]; -// std::cout << "-- level = " << l << ", " << tree << std::endl; - tree.printMeshT(3,0); - } -} +auto makePCT(const GenInfo &aprInfo, std::initializer_list> levels) { + auto pct = PullingScheme::generateParticleCellTree(aprInfo); + + + int l = aprInfo.l_min; -TEST(PullingSchemeTest, DeleteMeAfterDevelopment_fullAprPipeline) { - // TODO: delete me after development - // Full 'get apr' pipeline to test imp. 
on different stages - // Useful during debugging and can be removed once finished - - // Prepare input data (image) - int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; - // PS input values = 5 0 0 0 0 0 0 0 - -// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; -// PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0}; - - int len = sizeof(values)/sizeof(int); - PixelData data(len, 1, 1); - initFromZYXarray(data, values); - std::cout << "----- Input image:\n"; - data.printMeshT(3, 1); - - // Produce APR - APR apr; - APRConverter aprConverter; - aprConverter.par.rel_error = 0.1; - aprConverter.par.lambda = 0.1; - aprConverter.par.sigma_th = 0.0001; - aprConverter.par.neighborhood_optimization = true; - aprConverter.get_apr(apr, data); - - // Print information about APR and all particles - std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; - for (int l = apr.level_min(); l <= apr.level_max(); ++l) { - std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; + // PS levels range is [l_max - 1, l_min] + if (((aprInfo.l_max - 1) - aprInfo.l_min + 1) != (int) levels.size()) { + throw std::runtime_error("Wrong number of level data provided!"); } - std::cout << "APR particles z x y level:\n"; - auto it = apr.iterator(); - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; - } - } + for (auto &level : levels) { + if (pct[l].getDimension().size() != level.size()) { + std::cerr << "Provided data for level=" << l << " differs from level size " << pct[l].getDimension().size() << " vs. 
" << level.size() << std::endl; + std::cerr << aprInfo << std::endl; + throw std::runtime_error("Not this time..."); } + std::copy(level.begin(), level.end(), pct[l].mesh.begin()); + l++; } - std::cout << std::endl; - - // Sample input - ParticleData particleIntensities; - particleIntensities.sample_image(apr, data); - - // Reconstruct image from particles - PixelData reconstructImg; - APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); - std::cout << "----- Reconstructed image:"< levelImg; - APRReconstruction::reconstruct_level(apr, levelImg); - std::cout << "----- Image levels:" << std::endl; - levelImg.printMeshT(3, 1); - - // Show intensities and levels of each particle - std::cout << "----- Particle intensities:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - particleIntensities.fill_with_levels(apr); - - std::cout << "----- Particle levels:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - // Show some general information about generated APR - double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); - std::cout << std::endl; - std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; - std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; + + return pct; +} + +TEST(LinearAccessTest, optimizationForSmallLevels) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(4, 3, 2); + auto pct = makePCT(gi, {{1, 2, 3, 4}}); // In that case values of PCT are not important (all dense particle data will be generated anyway) + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + 
par.neighborhood_optimization = true; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; // all 'y' particles for each xz + std::vector expected_xz_end_vec = {0, 0, 0, 4, 8, 12, 16, 20, 24}; + std::vector expected_level_xz_vec = {1, 1, 3, 9}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); } +TEST(LinearAccessTest, yDirNeighbourhoodOptTrue) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(16, 1, 1); + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); -TEST(PullingSchemeTest, DeleteMeAfterDevelopment_PS) { - // TODO: delete me after development - // Runs PS to test imp. on different stages - // Useful during debugging and can be removed once finished -// int values[] = {0,0,0,5, 0,0,0,0}; -// int len = sizeof(values)/sizeof(int); + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = true; - PixelData levels(2, 2, 2, 0); - levels(0,0,0) = 4; + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); -// initFromZYXarray(levels, values); - std::cout << "---------------\n"; - levels.printMeshT(3, 1); - std::cout << "---------------\n"; + // ---- Verify output + std::vector expected_y_vec = {2, 3, 1, 2, 3, 0, 1}; + std::vector expected_xz_end_vec = {0, 0, 2, 5, 7}; + std::vector expected_level_xz_vec = {1, 1, 2, 3, 4, 5}; + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); +} + +TEST(LinearAccessTest, 
yDirNeighbourhoodOptFalse) { + + // --- Create input data structures and objects GenInfo gi; - const PixelDataDim dim = levels.getDimension(); - std::cout << "Levels dim: " << dim << std::endl; - gi.init(dim.y * 2, dim.x * 2, dim.z * 2); // time two in y-direction since PS container is downsized. - std::cout << gi << std::endl; + gi.init(16, 1, 1); + + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = false; - APRTimer t(false); + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); - t.start_timer("PS1"); - PullingScheme ps; - ps.initialize_particle_cell_tree(gi); - int l_max = gi.l_max - 1; - int l_min = gi.l_min; - std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; + // ---- Verify output + std::vector expected_y_vec = {2, 3, 2, 3, 0, 1, 2, 3}; + std::vector expected_xz_end_vec = {0, 0, 2, 4, 8}; + std::vector expected_level_xz_vec = {1, 1, 2, 3, 4, 5}; - fillPS(ps, levels); + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); +} - std::cout << "---------- Filled PS tree\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "---------------\n"; +TEST(LinearAccessTest, xDirNeighbourhoodOptTrue) { - ps.pulling_scheme_main(); - t.stop_timer(); + // --- Create input data structures and objects + GenInfo gi; + gi.init(1, 16, 1); - std::cout << "----------PS:\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "-------------\n"; + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); LinearAccess linearAccess; linearAccess.genInfo = &gi; APRParameters par; - 
linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + par.neighborhood_optimization = true; - std::cout << gi << std::endl; - auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << " "; std::cout << std::endl; }; - prt(linearAccess.y_vec); - prt(linearAccess.xz_end_vec); - prt(linearAccess.level_xz_vec); + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); - LinearIterator it(linearAccess, gi); - for (int l = 0; l <= 3; l++) { - std::cout << it.particles_level_begin(l) << " " << it.particles_level_end(l) << std::endl; - } - std::cout << "NumOfParticles: " << gi.total_number_particles << std::endl; - - std::cout << "===========================\n"; - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; - } - } - } - } - std::cout << std::endl; + // ---- Verify output + std::vector expected_y_vec = {0, 0, 0, 0, 0, 0, 0}; + std::vector expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; + std::vector expected_level_xz_vec = {1, 1, 3, 7, 15, 31}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); +} + +TEST(LinearAccessTest, xDirNeighbourhoodOptFalse) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(1, 16, 1); + + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = false; + + // --- Method 
under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 0, 0, 0, 0, 0, 0, 0}; + std::vector expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + std::vector expected_level_xz_vec = {1, 1, 3, 7, 15, 31}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); +} + +TEST(LinearAccessTest, zDirNeighbourhoodOptTrue) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(1, 1, 16); + + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = true; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 0, 0, 0, 0, 0, 0}; + std::vector expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; + std::vector expected_level_xz_vec = {1, 1, 3, 7, 15, 31}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); +} + +TEST(LinearAccessTest, zDirNeighbourhoodOptFalse) { + + // --- Create input data structures and objects + GenInfo gi; + gi.init(1, 1, 16); + + auto pct = makePCT(gi, {{0, 0}, + {0, 0, 3, 3}, + {1, 2, 3, 3, 0, 0, 0, 0}}); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = false; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + + // ---- Verify output + std::vector 
expected_y_vec = {0, 0, 0, 0, 0, 0, 0, 0}; + std::vector expected_xz_end_vec = {0, 0, 0, 0, 0, 1, 2, 2, 2, 3, 4, 4, 4, 4, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; + std::vector expected_level_xz_vec = {1, 1, 3, 7, 15, 31}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); } int main(int argc, char **argv) { diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 491599aa..2baa2369 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -126,14 +126,16 @@ inline int64_t compareParticles(const ParticleTypeA &expected, const ParticleTyp } for (size_t i = 0; i < expected.size(); ++i) { - if (std::abs(expected[i] - tested[i]) > maxError) { + if (std::abs((double)(expected[i] - tested[i])) > maxError) { if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested particle: " << (float)expected[i] << " vs " << (float)tested[i] << " IDX:" << i << std::endl; } cnt++; } } - std::cout << "Number of errors / all points: " << cnt << " / " << expected.size() << std::endl; + if (cnt != 0) { + std::cout << "Number of errors / all points: " << cnt << " / " << expected.size() << std::endl; + } return cnt; } From e83b9527e2dec702776a71e00ffff6d81409b568 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 23 Feb 2024 10:25:53 +0100 Subject: [PATCH 41/59] Check also total_number_particles in LinearAccess test --- test/LinearAccessCudaTest.cpp | 16 ++++++++++------ test/LinearAccessTest.cpp | 14 ++++++++++++++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp index bae93233..24e961fd 100644 --- a/test/LinearAccessCudaTest.cpp +++ b/test/LinearAccessCudaTest.cpp @@ -26,7 +26,7 @@ void printParticleCellTree(const std::vector> 
&particleCellTree) { } } -TEST(LinearAccessTest, DeleteMeAfterDevelopment_fullAprPipeline) { +TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_fullAprPipeline) { // TODO: delete me after development // Full 'get apr' pipeline to test imp. on different stages // Useful during debugging and can be removed once finished @@ -106,15 +106,14 @@ TEST(LinearAccessTest, DeleteMeAfterDevelopment_fullAprPipeline) { } - -TEST(LinearAccessTest, DeleteMeAfterDevelopment_PS) { +TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_PS) { // TODO: delete me after development // Runs PS to test imp. on different stages // Useful during debugging and can be removed once finished // int values[] = {0,0,0,5, 0,0,0,0}; // int len = sizeof(values)/sizeof(int); - PixelData levels(3, 4,4, 0); + PixelData levels(3, 1, 1, 0); levels(0,0,0) = 4; // initFromZYXarray(levels, values); @@ -153,7 +152,7 @@ TEST(LinearAccessTest, DeleteMeAfterDevelopment_PS) { LinearAccess linearAccess; linearAccess.genInfo = &gi; APRParameters par; - par.neighborhood_optimization = false; + par.neighborhood_optimization = true; linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); std::cout << gi << std::endl; @@ -179,4 +178,9 @@ TEST(LinearAccessTest, DeleteMeAfterDevelopment_PS) { } } std::cout << std::endl; -} \ No newline at end of file +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/LinearAccessTest.cpp b/test/LinearAccessTest.cpp index 00ef84be..b6c67db8 100644 --- a/test/LinearAccessTest.cpp +++ b/test/LinearAccessTest.cpp @@ -58,6 +58,8 @@ TEST(LinearAccessTest, optimizationForSmallLevels) { EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); } TEST(LinearAccessTest, 
yDirNeighbourhoodOptTrue) { @@ -86,6 +88,8 @@ TEST(LinearAccessTest, yDirNeighbourhoodOptTrue) { EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); } TEST(LinearAccessTest, yDirNeighbourhoodOptFalse) { @@ -114,6 +118,8 @@ TEST(LinearAccessTest, yDirNeighbourhoodOptFalse) { EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); } TEST(LinearAccessTest, xDirNeighbourhoodOptTrue) { @@ -142,6 +148,8 @@ TEST(LinearAccessTest, xDirNeighbourhoodOptTrue) { EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); } TEST(LinearAccessTest, xDirNeighbourhoodOptFalse) { @@ -170,6 +178,8 @@ TEST(LinearAccessTest, xDirNeighbourhoodOptFalse) { EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); } TEST(LinearAccessTest, zDirNeighbourhoodOptTrue) { @@ -198,6 +208,8 @@ TEST(LinearAccessTest, zDirNeighbourhoodOptTrue) { EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + 
+ EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); } TEST(LinearAccessTest, zDirNeighbourhoodOptFalse) { @@ -226,6 +238,8 @@ TEST(LinearAccessTest, zDirNeighbourhoodOptFalse) { EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); } int main(int argc, char **argv) { From 2cc5bcabd534274477f0a47ab69b50e3453d244f Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 2 Aug 2024 15:33:24 +0200 Subject: [PATCH 42/59] LinearAccessCuda implemented (it is not used yet in CUDA pipeline) --- CMakeLists.txt | 1 + src/algorithm/PullingScheme.hpp | 7 + .../APR/access/LinearAccess.hpp | 39 +- .../APR/access/LinearAccessCuda.cu | 593 ++++++++++++++++++ .../APR/access/LinearAccessCuda.hpp | 17 + src/data_structures/Mesh/PixelData.hpp | 15 +- test/LinearAccessCudaTest.cpp | 227 ++++++- test/PullingSchemeCudaTest.cpp | 33 - test/PullingSchemeTest.cpp | 33 - test/TestTools.hpp | 40 +- 10 files changed, 905 insertions(+), 100 deletions(-) create mode 100644 src/data_structures/APR/access/LinearAccessCuda.cu create mode 100644 src/data_structures/APR/access/LinearAccessCuda.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7daa68a3..56cd98ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -226,6 +226,7 @@ if(APR_USE_CUDA) src/algorithm/LocalIntensityScale.cu src/algorithm/OVPC.cu src/data_structures/APR/access/GPUAccess.cu + src/data_structures/APR/access/LinearAccessCuda.cu src/numerics/miscCuda.cu src/numerics/APRDownsampleGPU.cu src/numerics/PixelNumericsGPU.cu diff --git a/src/algorithm/PullingScheme.hpp b/src/algorithm/PullingScheme.hpp index c6756df6..05b0b723 100644 --- a/src/algorithm/PullingScheme.hpp +++ b/src/algorithm/PullingScheme.hpp @@ -13,14 +13,21 @@ #include "data_structures/Mesh/ImagePatch.hpp" #include 
+// Main types #define EMPTY 0 #define SEED_TYPE 1 #define BOUNDARY_TYPE 2 #define FILLER_TYPE 3 + +// Type used in linear/random access +#define UPSAMPLING_SEED_TYPE 4 + +// Types specific for this implementation of Pulling Scheme (OVPC is not using them) #define ASCENDANT 8 #define PROPOGATE 15 #define ASCENDANTNEIGHBOUR 16 + #define NEIGHBOURLOOP(jn,in,kn, boundaries) \ for(jn = boundaries[0][0]; jn < boundaries[0][1]; jn++) \ for(in = boundaries[1][0]; in < boundaries[1][1]; in++) \ diff --git a/src/data_structures/APR/access/LinearAccess.hpp b/src/data_structures/APR/access/LinearAccess.hpp index b00a02b0..b92476c2 100644 --- a/src/data_structures/APR/access/LinearAccess.hpp +++ b/src/data_structures/APR/access/LinearAccess.hpp @@ -11,6 +11,7 @@ #include "data_structures/Mesh/PixelData.hpp" #include "algorithm/APRParameters.hpp" +#include "algorithm/PullingScheme.hpp" #include "APRAccessStructures.hpp" @@ -225,6 +226,9 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet initialize_xz_linear(); + // ********************************************************************************************************************* + // FULL RESOLUTION + // ********************************************************************************************************************* //edge case if(level_max()<=2){ // For performance reasons and clarity of the code, it doesn't make sense here to handle these cases. 
Below assumes there is atleast levels <=2; @@ -254,10 +258,11 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet return; } - // ======================================================================== + // ********************************************************************************************************************* + // FIRST STEP + // ********************************************************************************************************************* apr_timer.start_timer("first_step"); - const uint8_t UPSAMPLING_SEED_TYPE = 4; const uint8_t seed_us = UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization for (int level = level_min()+1; level < level_max(); ++level) { const size_t xLen = genInfo->x_num[level]; @@ -288,7 +293,9 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet } apr_timer.stop_timer(); - // ======================================================================== + // ********************************************************************************************************************* + // SECOND STEP + // ********************************************************************************************************************* apr_timer.start_timer("second_step"); @@ -323,14 +330,15 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet } } + +// ********************************************************************************************************************* +// SECOND STEP LAST LEVEL +// +// l_max - 1 is special as it also has the l_max information that then needs to be upsampled. +// ********************************************************************************************************************* std::vector temp_max_xz; temp_max_xz.resize(genInfo->z_num[genInfo->l_max - 1]*genInfo->x_num[genInfo->l_max - 1],0); - /* - * l_max - 1 is special as it also has the l_max information that then needs to be upsampled. 
- * - */ - size_t l_minus_1 = genInfo->l_max - 1; const size_t xLen = genInfo->x_num[l_minus_1]; const size_t zLen = genInfo->z_num[l_minus_1]; @@ -404,6 +412,11 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet apr_timer.stop_timer(); + + // ********************************************************************************************************************* + // THIRD STEP - Get Y values + // ********************************************************************************************************************* + apr_timer.start_timer("init y"); genInfo->total_number_particles = xz_end_vec.back(); @@ -447,10 +460,11 @@ inline void LinearAccess::initialize_linear_structure(APRParameters& apr_paramet } } - /* - * l_max - 1 is special as it also has the l_max information that then needs to be upsampled. - * - */ + // ********************************************************************************************************************* + // 4th STEP LAST LEVEL + // + // l_max - 1 is special as it also has the l_max information that then needs to be upsampled. 
+ // ********************************************************************************************************************* #ifdef HAVE_OPENMP @@ -540,7 +554,6 @@ inline void LinearAccess::initialize_linear_structure_sparse(APRParameters& apr_ // ======================================================================== apr_timer.start_timer("first_step"); - const uint8_t UPSAMPLING_SEED_TYPE = 4; const uint8_t seed_us = UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization for (int level = level_min()+1; level < level_max(); ++level) { const size_t xLen = genInfo->x_num[level]; diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu new file mode 100644 index 00000000..8ce7e347 --- /dev/null +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -0,0 +1,593 @@ +#include "LinearAccessCuda.hpp" + +#include "misc/CudaTools.cuh" + +// CUDA version of GenInfo structure +typedef struct GenInfoCuda_t { + int l_min; + int l_max; + + int *org_dims; // fixed size: [3] + + uint8_t number_dimensions; + + int *x_num; + int *y_num; + int *z_num; + + // this differs from original GenInfo structure + // since we need to be able to send data back from GPU to CPU + uint64_t *total_number_particles; + + int *level_size; + + uint64_t get_total_number_particles() const { return *total_number_particles; } + + __device__ int level_max() const { return l_max; } + __device__ int level_min() const { return l_min; } + +} GenInfoCuda; + +// ----------------------------- + +/* + * Class for easy transfering to/from GPU of GenInfo structure. 
+ */ +class GenInfoGpuAccess { + GenInfo &gi; + + cudaStream_t iStream; + + ScopedCudaMemHandler org_dims; + ScopedCudaMemHandler x_num; + ScopedCudaMemHandler y_num; + ScopedCudaMemHandler z_num; + ScopedCudaMemHandler total_number_particles; + ScopedCudaMemHandler level_size; + + +public: + GenInfoGpuAccess(GenInfo &genInfo, cudaStream_t cudaStream) : + gi(genInfo), + iStream(cudaStream), + org_dims(gi.org_dims, 3, iStream), + x_num(gi.x_num.data(), gi.x_num.size(), iStream), + y_num(gi.y_num.data(), gi.y_num.size(), iStream), + z_num(gi.z_num.data(), gi.z_num.size(), iStream), + total_number_particles(&gi.total_number_particles, 1, iStream), + level_size(gi.level_size.data(), gi.level_size.size(), iStream) + { + } + + GenInfoCuda getGenInfoCuda() { + GenInfoCuda gic; + + gic.l_min = gi.l_min; + gic.l_max = gi.l_max; + gic.org_dims = org_dims.get(); + gic.number_dimensions = gi.number_dimensions; + gic.x_num = x_num.get(); + gic.y_num = y_num.get(); + gic.z_num = z_num.get(); + gic.total_number_particles = total_number_particles.get(); + gic.level_size = level_size.get(); + + return gic; + } + + ~GenInfoGpuAccess() { + copyDtoH(); + } + + void copyHtoD() { + // The only data that can change between CPU & GPU (the rest values are fixed based on input image dimension) + total_number_particles.copyH2D(); + } + + void copyDtoH() { + // The only data that can change between CPU & GPU (the rest values are fixed based on input image dimension) + total_number_particles.copyD2H(); + } +}; + +typedef ScopedCudaMemHandler, H2D | D2H> ParticleCellTreeLevelCuda; +typedef std::vector ParticleCellTreeCuda; + +// ********************************************************************************************************************* +// FULL RESOLUTION +// ********************************************************************************************************************* +/** + * Handle edge case for #levels <= 2 + * For performance reasons and clarity of the code, + * it doesn't 
make sense here to handle these cases. + * Below assumes there is at least levels <=2; + * @param level_xz + * @param xz_end + * @param y + * @param gic - cuda version of GenInfo + */ +__global__ void fullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, GenInfoCuda gic) { + + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const unsigned levelMax = gic.level_max(); + const uint64_t xMax = gic.x_num[levelMax]; + const uint64_t yMax = gic.y_num[levelMax]; + const uint64_t zMax = gic.z_num[levelMax]; + + + if (x < xMax && z < zMax) { + const uint64_t levelStart = level_xz[levelMax]; + uint64_t offset_pc_data = z * xMax + x; + uint64_t particleCounter = (1 + x + z * xMax) * yMax; + + xz_end[levelStart + offset_pc_data] = particleCounter; + + for (int i = 0; i < yMax; ++i) { + uint64_t idx = (xMax * z + x) * yMax + i; + y[idx] = i; + } + } + + if (x == 0 && z == 0) { + *gic.total_number_particles = xMax * yMax * zMax; + } +} + +void runFullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, GenInfo &gi, GenInfoGpuAccess &giga, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + + dim3 numBlocks( (gi.x_num[gi.l_max] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[gi.l_max] + threadsPerBlock.z - 1)/threadsPerBlock.z); + fullResolution<<>>(level_xz, xz_end, y, giga.getGenInfoCuda()); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runFullResolution failed"); + } +} + + +// ********************************************************************************************************************* +// FIRST STEP +// ********************************************************************************************************************* + +constexpr uint8_t UPSAMPLING_SEED_TYPE = 4; +static constexpr uint8_t seed_us = 
UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization + + +__global__ void firstStep(uint8_t *prevLevel, uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic) { + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const uint64_t xLen = gic.x_num[level]; + const uint64_t yLen = gic.y_num[level]; + const uint64_t zLen = gic.z_num[level]; + const uint64_t xLenDS = gic.x_num[level - 1]; + const uint64_t yLenDS = gic.y_num[level - 1]; + + if (x < xLen && z < zLen) { + const size_t offset_part_map_ds = (x / 2) * yLenDS + (z / 2) * yLenDS * xLenDS; + const size_t offset_part_map = x * yLen + z * yLen * xLen; + + for (size_t y = 0; y < yLenDS; ++y) { + uint8_t status = prevLevel[offset_part_map_ds + y]; + if (status > 0 && status <= min_type) { + currLevel[offset_part_map + 2 * y] = seed_us; // 2 * y + currLevel[offset_part_map + min(2 * y + 1, yLen - 1)] = seed_us; // 2 * y + 1 + } + } + } +} + +void runFirstStep(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + + for (int level = gi.l_min + 1; level < gi.l_max; ++level) { + dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); + auto &p_mapPrev = p_map[level - 1]; + auto &p_mapCurr = p_map[level]; + firstStep<<>>(p_mapPrev.get(), p_mapCurr.get(), level, min_type, giga.getGenInfoCuda()); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runFirstStep failed"); + } +} + + +// ********************************************************************************************************************* +// SECOND STEP +// 
********************************************************************************************************************* + + +__global__ void secondStep(const uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end) { + const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + const uint64_t xLen = gic.x_num[level]; + const uint64_t yLen = gic.y_num[level]; + const uint64_t zLen = gic.z_num[level]; + + const uint64_t level_start = level_xz[level]; + + if (x < xLen && z < zLen) { + const size_t offset_pc_data = z * xLen + x; + const size_t offset_part_map = yLen * offset_pc_data; + + uint64_t counter = 0; + + for (size_t y = 0; y < yLen; ++y) { + uint8_t status = currLevel[offset_part_map + y]; + if (status > min_type && status <= UPSAMPLING_SEED_TYPE) { + counter++; + } + } + + xz_end[level_start + offset_pc_data] = counter; + } +} + +void runSecondStep(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, cudaStream_t aStream) { + dim3 threadsPerBlock(32, 1, 1); + + for (int level = gi.l_min; level < gi.l_max - 1; ++level) { + dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, + 1, + (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); + auto &p_mapCurr = p_map[level]; + secondStep<<>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("----------------------------------Error: %s\n", cudaGetErrorString(err)); + throw std::runtime_error("runSecondStep failed"); + } +} + + +// ********************************************************************************************************************* +// SECOND STEP LAST LEVEL +// +// l_max - 1 is special as it also has the l_max information that then needs to be upsampled. 
+// *********************************************************************************************************************
+
+
+__global__ void secondStepLastLevel(const uint8_t *currLevel, int level_minus_1, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end) { // one thread per (x, z) column of level_minus_1
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
+    const uint64_t xLen = gic.x_num[level_minus_1];
+    const uint64_t yLen = gic.y_num[level_minus_1];
+    const uint64_t zLen = gic.z_num[level_minus_1];
+
+    const uint64_t xLen_m = gic.x_num[level_minus_1 + 1]; // level max
+    const uint64_t yLen_m = gic.y_num[level_minus_1 + 1]; // level max
+    const uint64_t zLen_m = gic.z_num[level_minus_1 + 1]; // level max
+
+    const uint64_t level_start = level_xz[level_minus_1];
+    const uint64_t level_start_m = level_xz[level_minus_1 + 1]; // level max
+
+
+    if (x < xLen && z < zLen) {
+        const size_t offset_pc_data = z * xLen + x;
+        const size_t offset_part_map = yLen * offset_pc_data;
+
+        uint64_t counter = 0;   // particles that stay on level_minus_1
+        uint64_t counter_l = 0; // particles that are written for level max
+
+        for (size_t y = 0; y < yLen; ++y) {
+            uint8_t status = currLevel[offset_part_map + y];
+            if (status > min_type && status <= UPSAMPLING_SEED_TYPE) {
+                counter++;
+            }
+            else if (status > 0 && status <= min_type) { // counted for level max: one particle, plus a second when 2*y + 1 still fits in yLen_m
+                counter_l++;
+
+                if ((2 * y) < (yLen_m - 1)) {
+                    counter_l++;
+                }
+            }
+        }
+
+        xz_end[level_start + offset_pc_data] = counter;
+
+        // In the original CPU code the value of counter_l is kept in a temporary buffer and only later
+        // written to the xz_end vector. Here it is written directly, without a temporary buffer.
+        for (size_t dz = 0; dz <= 1; dz++) {
+            for (size_t dx = 0; dx <= 1; dx++) {
+                size_t uz = 2 * z + dz; // upsampled z
+                size_t ux = 2 * x + dx; // upsampled x
+                if (uz < zLen_m && ux < xLen_m) {
+                    const size_t offset_pc_data_m = uz * xLen_m + ux;
+                    xz_end[level_start_m + offset_pc_data_m] = counter_l;
+                }
+            }
+        }
+
+    }
+}
+
+__global__ void secondStepCountParticles(GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint64_t counter_total) { // launched with <<<1, 1>>>; level_xz is not used here
+    // std::partial_sum on one CUDA core naive implementation
+    size_t sum = xz_end[0];
+    for (size_t i = 1; i < counter_total; i++) {
+        sum += xz_end[i];
+        xz_end[i] = sum;
+    }
+
+    *gic.total_number_particles = xz_end[counter_total -1]; // last running-sum entry is the total particle count
+}
+
+void runSecondStepLastLevel(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint64_t counter_total, cudaStream_t aStream) {
+    dim3 threadsPerBlock(32, 1, 1);
+    dim3 numBlocks( (gi.x_num[gi.l_max - 1] + threadsPerBlock.x - 1)/threadsPerBlock.x,
+                    1,
+                    (gi.z_num[gi.l_max - 1] + threadsPerBlock.z - 1)/threadsPerBlock.z);
+
+    int level = gi.l_max - 1;
+    auto &p_mapCurr = p_map[level];
+    secondStepLastLevel<<<numBlocks, threadsPerBlock, 0, aStream>>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end);
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("----------------------------------Error: %s\n", cudaGetErrorString(err));
+        throw std::runtime_error("runSecondStepLastLevel #1 failed");
+    }
+
+    secondStepCountParticles<<<1, 1, 0, aStream>>>(giga.getGenInfoCuda(), level_xz, xz_end, counter_total);
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("----------------------------------Error: %s\n", cudaGetErrorString(err));
+        throw std::runtime_error("runSecondStepLastLevel #2 failed");
+    }
+}
+
+
+// *********************************************************************************************************************
+// THIRD STEP - Get Y values
+// 
*********************************************************************************************************************
+
+
+__global__ void getYvalues(const uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec) { // one thread per (x, z) column of 'level'
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
+    const uint64_t xLen = gic.x_num[level];
+    const uint64_t yLen = gic.y_num[level];
+    const uint64_t zLen = gic.z_num[level];
+
+    const uint64_t level_start = level_xz[level];
+
+    if (x < xLen && z < zLen) {
+        const size_t offset_pc_data = z * xLen + x;
+        const size_t offset_part_map = yLen * offset_pc_data;
+
+        uint64_t counter = 0;
+
+        uint64_t offset_y = xz_end[level_start + offset_pc_data - 1]; // entry of the preceding column is used as the first y_vec slot of this one
+
+        for (size_t y = 0; y < yLen; ++y) {
+            uint8_t status = currLevel[offset_part_map + y];
+            if (status > min_type && status <= UPSAMPLING_SEED_TYPE) { // same selection as in secondStep, so counts and written values agree
+                y_vec[counter + offset_y] = y;
+                counter++;
+            }
+        }
+    }
+}
+
+void runGetYvalues(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, cudaStream_t aStream) {
+    dim3 threadsPerBlock(32, 1, 1);
+
+    for (int level = gi.l_min; level < gi.l_max - 1; ++level) { // level l_max - 1 is handled by runFourthStep
+        dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x,
+                        1,
+                        (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z);
+        auto &p_mapCurr = p_map[level];
+        getYvalues<<<numBlocks, threadsPerBlock, 0, aStream>>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec);
+    }
+
+    cudaError_t err = cudaGetLastError(); // checked once, after all per-level launches have been queued
+    if (err != cudaSuccess) {
+        printf("----------------------------------Error: %s\n", cudaGetErrorString(err));
+        throw std::runtime_error("runGetYvalues failed");
+    }
+}
+
+
+// *********************************************************************************************************************
+// 4th STEP LAST LEVEL
+//
+// l_max - 1 is special as 
it also has the l_max information that then needs to be upsampled.
+// *********************************************************************************************************************
+
+
+__global__ void fourthStep(const uint8_t *currLevel, int level_minus_1, uint8_t min_type, GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec) { // one thread per (x, z) column of level_minus_1
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
+    const uint64_t xLen = gic.x_num[level_minus_1];
+    const uint64_t yLen = gic.y_num[level_minus_1];
+    const uint64_t zLen = gic.z_num[level_minus_1];
+
+    const uint64_t xLen_m = gic.x_num[level_minus_1 + 1]; // level max
+    const uint64_t yLen_m = gic.y_num[level_minus_1 + 1]; // level max
+
+    const uint64_t level_start_minus_1 = level_xz[level_minus_1];
+    const uint64_t level_start_m = level_xz[level_minus_1 + 1]; // level max
+
+
+    if (x < xLen && z < zLen) {
+        const size_t offset_pc_data = z * xLen + x;
+
+        const size_t offset_pc_data_m = (z*2) * xLen_m + x * 2; // only the (2z, 2x) column of level max is filled here
+        const size_t offset_part_map = yLen * offset_pc_data; // current level
+
+        uint64_t counter = 0;
+        uint64_t counter_l = 0;
+
+        uint64_t offset_y = xz_end[level_start_minus_1 + offset_pc_data - 1];
+        uint64_t offset_y_m = xz_end[level_start_m + offset_pc_data_m -1];
+
+        for (size_t y = 0; y < yLen; ++y) {
+            uint8_t status = currLevel[offset_part_map + y];
+            if (status > min_type && status <= UPSAMPLING_SEED_TYPE) {
+                y_vec[counter + offset_y] = y;
+                counter++;
+            }
+            else if (status > 0 && status <= min_type) { // written for level max as 2*y and, when in range, 2*y + 1
+                y_vec[counter_l + offset_y_m] = 2*y;
+                counter_l++;
+
+                if ((2 * y) < (yLen_m - 1)) {
+                    y_vec[counter_l + offset_y_m] = 2*y + 1;
+                    counter_l++;
+                }
+            }
+        }
+    }
+}
+
+__global__ void fourthStepLastLevel(GenInfoCuda gic, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec) { // one thread per (x, z) column of level max
+    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
+
+    int maxLevel = gic.level_max();
+    const uint64_t xLen_m = gic.x_num[maxLevel]; // level max
+    const uint64_t zLen_m = gic.z_num[maxLevel]; // level max
+
+    const uint64_t level_start_m = level_xz[maxLevel];
+
+
+    if (x < xLen_m && z < zLen_m) {
+
+        // first check if it's not already there
+        if ( ((z % 2) != 0) || ((x % 2) != 0) ) { // columns with even x and even z were already written by fourthStep
+            const size_t offset_pc_data_m = z * xLen_m + x;
+            const size_t offset_pc_data_m_f = (z/2) * 2 * xLen_m + (x/2) * 2; // the even (x, z) column to copy from
+
+            uint64_t offset_y_b_f = xz_end[level_start_m + offset_pc_data_m_f - 1];
+            uint64_t offset_y_e_f = xz_end[level_start_m + offset_pc_data_m_f];
+            uint64_t offset_y_b = xz_end[level_start_m + offset_pc_data_m - 1];
+
+            for (uint64_t idx = offset_y_b_f; idx < offset_y_e_f; ++idx) {
+                y_vec[offset_y_b++] = y_vec[idx];
+            }
+        }
+
+    }
+}
+
+void runFourthStep(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, uint64_t counter_total, cudaStream_t aStream) {
+    dim3 threadsPerBlock(32, 1, 1);
+    dim3 numBlocks( (gi.x_num[gi.l_max] + threadsPerBlock.x - 1)/threadsPerBlock.x, // sized for level max; both kernels bounds-check x and z themselves
+                    1,
+                    (gi.z_num[gi.l_max] + threadsPerBlock.z - 1)/threadsPerBlock.z);
+
+    int level = gi.l_max - 1;
+    auto &p_mapCurr = p_map[level];
+    fourthStep<<<numBlocks, threadsPerBlock, 0, aStream>>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec);
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("----------------------------------Error: %s\n", cudaGetErrorString(err));
+        throw std::runtime_error("runFourthStep #1 failed");
+    }
+
+    fourthStepLastLevel<<<numBlocks, threadsPerBlock, 0, aStream>>>(giga.getGenInfoCuda(), level_xz, xz_end, y_vec);
+
+    cudaError_t err2 = cudaGetLastError();
+    if (err2 != cudaSuccess) {
+        printf("----------------------------------Error: %s\n", cudaGetErrorString(err2)); // report the status of the second launch, not the first
+        throw std::runtime_error("runFourthStep #2 failed");
+    }
+}
+
+
+// *********************************************************************************************************************
+// MAIN FUNC 
TO CALL - implements logic of LinearAccess::initialize_linear_structure CPU func. +// ********************************************************************************************************************* + + +/* + * This function does everything: + * - creates CPU structures + * - copies everything to GPU + * - runs computation of all linear-structures + * - copies it back to CPU + * - returns all the structures + * + * In current shape it is a good function for testing implementation rather than using it in production code. + * Production code should use parts of it and work on pre-allocated memory - probably in GpuProcessingTask. + */ +LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, APRParameters &apr_parameters, std::vector> &pct) { + // Copy input to CUDA mem and prepare CUDA representation of particle cell tree which will be filled after computing + // all steps + ParticleCellTreeCuda p_map; + for (auto &p : pct) { + p_map.emplace_back(std::move(ParticleCellTreeLevelCuda(p))); + } + + uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2; + + VectorData y_vec(true); + VectorData xz_end_vec(true); + VectorData level_xz_vec(true); + + // initialize_xz_linear() - CPU impl. + uint64_t counter_total = 1; //the buffer val to allow -1 calls without checking. + level_xz_vec.resize(gi.l_max + 2, 0); //includes a buffer for -1 calls, and therefore needs to be called with level + 1; + level_xz_vec[0] = 1; //allowing for the offset. + for (int i = 0; i <= gi.l_max; ++i) { + counter_total += gi.x_num[i] * gi.z_num[i]; + level_xz_vec[i + 1] = counter_total; + } + xz_end_vec.resize(counter_total, 0); + +// auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; }; +// prt(y_vec); +// prt(xz_end_vec); +// prt(level_xz_vec); + + // TODO: This is a temporary solution. 
+ // Since in CPU code size of y_vec is calculated 'on the fly' and in CUDA code it would be much better + // to have pre-allocated memory for that - currently y_vec is pre-allocated to have maximum size. This is not + // optimal but always working solution. If a better idea pops up - it will be changed. + size_t maxYvecSize = static_cast<size_t>(gi.x_num[gi.l_max]) * gi.y_num[gi.l_max] * gi.z_num[gi.l_max]; // widen first so the product is evaluated in size_t + y_vec.resize(maxYvecSize); + + cudaStream_t aStream = nullptr; + { + ScopedCudaMemHandler y_vec_cuda(y_vec.data(), y_vec.size()); + ScopedCudaMemHandler xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size()); + ScopedCudaMemHandler level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size()); + GenInfoGpuAccess giga(gi, aStream); + if (gi.l_max <= 2) { + runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), gi, giga, aStream); + } + else { + runFirstStep(gi, giga, p_map, min_type, aStream); + runSecondStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), aStream); + runSecondStepLastLevel(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), counter_total, aStream); + runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), aStream); + runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda.get(), counter_total, aStream); + } + } + + // TODO: Resized back to correct size, should it be initialized to this size in the first place or pre-allocation for + // full size is more than enough? (for example in case of computing particles for multiple frames with same resolution + // we can get different size of particles for each frame - with preallocated buffer we can do all of them on it). 
+ y_vec.resize(gi.total_number_particles); + + + LinearAccessCudaStructs lac; + lac.y_vec.swap(y_vec); + lac.xz_end_vec.swap(xz_end_vec); + lac.level_xz_vec.swap(level_xz_vec); + + return lac; +} diff --git a/src/data_structures/APR/access/LinearAccessCuda.hpp b/src/data_structures/APR/access/LinearAccessCuda.hpp new file mode 100644 index 00000000..53dfd001 --- /dev/null +++ b/src/data_structures/APR/access/LinearAccessCuda.hpp @@ -0,0 +1,17 @@ +#ifndef APR_LINEARACCESSCUDA_HPP +#define APR_LINEARACCESSCUDA_HPP + +#include "algorithm/APRParameters.hpp" +#include "data_structures/Mesh/PixelData.hpp" +#include "data_structures/APR/GenInfo.hpp" + +typedef struct { + VectorData y_vec; + VectorData xz_end_vec; + VectorData level_xz_vec; +} LinearAccessCudaStructs; + +LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, APRParameters &apr_parameters, std::vector> &pct); + + +#endif //APR_LINEARACCESSCUDA_HPP diff --git a/src/data_structures/Mesh/PixelData.hpp b/src/data_structures/Mesh/PixelData.hpp index e0a037f0..f0127920 100644 --- a/src/data_structures/Mesh/PixelData.hpp +++ b/src/data_structures/Mesh/PixelData.hpp @@ -149,10 +149,6 @@ public : usePinnedMemory = usePinned; } - void setUsePinnedMemory(bool usePinned){ - usePinnedMemory = usePinned; - } - inline uint64_t size() const{ return vec.size(); } @@ -283,8 +279,19 @@ public : std::swap(usePinnedMemory, aObj.usePinnedMemory); std::swap(vecMemory, aObj.vecMemory); vec.swap(aObj.vec); +#ifdef APR_USE_CUDA + std::swap(vecMemoryPinned, aObj.vecMemoryPinned); +#endif } + VectorData(VectorData &&aObj) { + usePinnedMemory = aObj.usePinnedMemory; + vecMemory.swap(aObj.vecMemory); + vec = std::move(aObj.vec); +#ifdef APR_USE_CUDA + vecMemoryPinned =std::move(aObj.vecMemoryPinned); +#endif + } /** * Apply unary operator to each element in parallel, writing the result to VectorData 'output'. 
diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp index 24e961fd..7d1c4059 100644 --- a/test/LinearAccessCudaTest.cpp +++ b/test/LinearAccessCudaTest.cpp @@ -3,27 +3,98 @@ #include "algorithm/LocalParticleCellSet.hpp" #include "algorithm/PullingScheme.hpp" #include "algorithm/APRConverter.hpp" +#include "data_structures/APR/access/LinearAccessCuda.hpp" #include "TestTools.hpp" - -template -void fillPS(PullingScheme &aPS, PixelData &levels) { - PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); - LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); -} +namespace { + template + void fillPS(PullingScheme &aPS, PixelData &levels) { + PixelData levelsDS(ceil(levels.y_num / 2.0), ceil(levels.x_num / 2.0), ceil(levels.z_num / 2.0)); + LocalParticleCellSet().get_local_particle_cell_set(aPS, levels, levelsDS, APRParameters()); + } /** * Prints PCT * @param particleCellTree */ -template -void printParticleCellTree(const std::vector> &particleCellTree) { - for (uint64_t l = 0; l < particleCellTree.size(); ++l) { - auto &tree = particleCellTree[l]; -// std::cout << "-- level = " << l << ", " << tree << std::endl; - tree.printMeshT(3,0); + template + void printParticleCellTree(const std::vector> &particleCellTree) { + for (uint64_t l = 0; l < particleCellTree.size(); ++l) { + auto &tree = particleCellTree[l]; + tree.printMeshT(3, 0); + } + } + + /** + * Create PCT with provided data + * @param aprInfo + * @param levels complete list of values from level min to level max in form { {level, min, values}, ..., {level, max, values} } + * if levels are not provided PCT with EMPTY values is returned + * @return Particle Cell Tree with values (or with EMPTY if levels are not provided) + */ + auto makePCT(const GenInfo &aprInfo, std::initializer_list> levels) { + auto pct = PullingScheme::generateParticleCellTree(aprInfo); + + // Fill particle cell tree only if levels provided - 
otherwise return tree with EMPTY values + if (levels.size() != 0) { + + int l = aprInfo.l_min; + // PS levels range is [l_max - 1, l_min] + if (((aprInfo.l_max - 1) - aprInfo.l_min + 1) != (int) levels.size()) { + throw std::runtime_error("Wrong number of level data provided!"); + } + for (auto &level: levels) { + if (pct[l].getDimension().size() != level.size()) { + std::cerr << "Provided data for level=" << l << " differs from level size " + << pct[l].getDimension().size() << " vs. " << level.size() << std::endl; + std::cerr << aprInfo << std::endl; + throw std::runtime_error("Not this time..."); + } + std::copy(level.begin(), level.end(), pct[l].mesh.begin()); + l++; + } + } + return pct; + } + + // Copy PCT - copies only existing levels of it. + auto copyPCT(const std::vector> &pct) { + std::vector> copy; + copy.resize(pct.size()); + + for (int l = 0; l < pct.size(); ++l) { + copy[l].initWithResize(pct[l].y_num, pct[l].x_num, pct[l].z_num); + // Copy only existing levels + if (pct[l].z_num > 0) copy[l].copyFromMesh(pct[l]); + } + + return copy; } + + // Create random Particle Cell Tree with dimensions specified in 'gi' with given number of particles. 
+ auto makeRandomPCT(const GenInfo &gi, int numOfParticles = 3) { + PullingScheme ps; + ps.initialize_particle_cell_tree(gi); + + // Generate random levels for PS and OVPC + PixelData levels(std::ceil(gi.org_dims[0]/2.0), + std::ceil(gi.org_dims[1]/2.0), + std::ceil(gi.org_dims[2]/2.0), + 0); + int seed = std::time(nullptr); + std::srand(seed); + for (int i = 0; i < numOfParticles; ++i) { + int modulo = (gi.l_max - gi.l_min); + if (modulo == 0) modulo = 1; + levels(std::rand() % levels.y_num, std::rand() % levels.x_num, std::rand() % levels.z_num) = std::rand() % modulo + gi.l_min; + } + fillPS(ps, levels); + ps.pulling_scheme_main(); + + return copyPCT(ps.getParticleCellTree()); + } + } TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_fullAprPipeline) { @@ -113,8 +184,8 @@ TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_PS) { // int values[] = {0,0,0,5, 0,0,0,0}; // int len = sizeof(values)/sizeof(int); - PixelData levels(3, 1, 1, 0); - levels(0,0,0) = 4; + PixelData levels(8, 1, 1, 0); + levels(5,0,0) = 1; // initFromZYXarray(levels, values); std::cout << "---------------\n"; @@ -145,6 +216,7 @@ TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_PS) { ps.pulling_scheme_main(); t.stop_timer(); + // Useful during debugging and can be removed once finished std::cout << "----------PS:\n"; printParticleCellTree(ps.getParticleCellTree()); std::cout << "-------------\n"; @@ -180,6 +252,133 @@ TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_PS) { std::cout << std::endl; } +// ********************************************************************************************************************* +// Tests of CUDA implementation of LinearAccess +// ********************************************************************************************************************* + + +TEST(LinearAccessCudaTest, optimizationForSmallLevels) { + // Tests optimized part of LinearAccess returning full-resolution for levels <= 2 + + // --- Create input data structures and objects + 
GenInfo gi; + gi.init(4, 3, 2); + auto pct = makePCT(gi, {}); // In that case values of PCT are not important (all dense particle data will be generated anyway) + + APRParameters par; + par.neighborhood_optimization = true; + + // --- Method under test + auto linearAccess = initializeLinearStructureCuda(gi, par, pct); + + // ---- Verify output + std::vector expected_y_vec = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; // all 'y' particles for each xz + std::vector expected_xz_end_vec = {0, 0, 0, 4, 8, 12, 16, 20, 24}; + std::vector expected_level_xz_vec = {1, 1, 3, 9}; + + EXPECT_EQ(compareParticles(expected_y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(expected_xz_end_vec, linearAccess.xz_end_vec), 0); + // Useful during debugging and can be removed once finished + EXPECT_EQ(compareParticles(expected_level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(gi.total_number_particles, expected_y_vec.size()); + EXPECT_EQ(gi.total_number_particles, 4 * 3 * 2); +} + +TEST(LinearAccessCudaTest, optimizationForSmallLevelsVScpu) { + // Tests optimized part of LinearAccess returning full-resolution for levels <= 2 for all possible combination of xyz + // For bigger xyz 'optimized' part of code is not used + + for (int x = 1; x <= 4; ++x) { + for (int y = 1; y <= 4; ++y) { + for (int z = 1; z <= 4; ++z) { + std::cout << "< ============================================= " << x << " " << y << " "<< z << std::endl; + // --- Create input data structures and objects + GenInfo gi; + gi.init(y, x, z); + std::cout << gi << std::endl; + auto pct = makePCT(gi, {}); // In that case values of PCT are not important (all dense particle data will be generated anyway) + GenInfo giGpu; + giGpu.init(y, x, z); + auto pctGpu = makePCT(giGpu, {}); // In that case values of PCT are not important (all dense particle data will be generated anyway) + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + 
par.neighborhood_optimization = true; + + // --- Method under test + linearAccess.initialize_linear_structure(par, pct); + auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pctGpu); + + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.xz_end_vec, linearAccess.xz_end_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0); + + EXPECT_EQ(giGpu.total_number_particles, gi.total_number_particles); + EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); + } + } + } + +} + +TEST(LinearAccessCudaTest, testGPUvsCPUforDifferentSizes) { + + for (int x : {1, 2, 4, 100, 255}) { + for (int y : {1, 2, 4, 100, 256}) { + for (int z : {1, 2, 4, 100, 257}) { +// std::cout << "< ============================================= " << y << " " << x << " "<< z << std::endl; + + // ----------- Create input data structures and objects + GenInfo gi; + gi.init(y, x, z); + + auto pct = makeRandomPCT(gi, 133); + + auto pctCpu = copyPCT(pct); + auto pctGpu = copyPCT(pct); + + GenInfo giGpu; + giGpu.init(y, x, z); + + LinearAccess linearAccess; + linearAccess.genInfo = &gi; + APRParameters par; + par.neighborhood_optimization = true; + + + // --------- methods under test + APRTimer t(false); + t.start_timer("__________________________ CPU"); + // --- Method under test + linearAccess.initialize_linear_structure(par, pctCpu); + t.stop_timer(); + + t.start_timer("_________________________ GPU"); + auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pctGpu); + t.stop_timer(); + + + // ----------- verify results + + // LinearAccess changes PCT - compare if changes in CPU and GPU side are same + EXPECT_EQ(compareParticleCellTrees(pctCpu, pctGpu), 0); + + // Test if returned structures have same data + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, 
linearAccess.level_xz_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + + EXPECT_EQ(giGpu.total_number_particles, gi.total_number_particles); + EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); + } + } + } + +} + + int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index 53eec162..5c4ebcb3 100644 --- a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -83,39 +83,6 @@ void printParticleCellTree(const std::vector> &particleCellTree) { } } -/** - * Compare - * @param expected - expected levels - * @param tested - levels to verify - * @param maxError - * @param maxNumOfErrPrinted - how many error outputs should be printed - * @return - */ -template -int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, int maxNumOfErrPrinted = 3) { - int cntGlobal = 0; - for (size_t level = 0; level < expected.size(); level++) { - int cnt = 0; - int numOfParticles = 0; - for (size_t i = 0; i < expected[level].mesh.size(); ++i) { - if (expected[level].mesh[i] < 8 && tested[level].mesh[i] <= FILLER_TYPE) { - if (std::abs(expected[level].mesh[i] - tested[level].mesh[i]) > 0 || std::isnan(expected[level].mesh[i]) || - std::isnan(tested[level].mesh[i])) { - if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "Level: " << level <<" ERROR expected vs tested mesh: " << (float) expected[level].mesh[i] << " vs " - << (float) tested[level].mesh[i] << " IDX:" << tested[level].getStrIndex(i) << std::endl; - } - cnt++; - } - if (expected[level].mesh[i] > 0) numOfParticles++; - } - } - cntGlobal += cnt; - if (cnt > 0) std::cout << "Level: " << level << ", Number of errors / all points: " << cnt << " / " << expected[level].mesh.size() << " Particles:" << numOfParticles << std::endl; - } - return cntGlobal; -} - template void fillPS(PullingScheme 
&aPS, PixelData &levels) { PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); diff --git a/test/PullingSchemeTest.cpp b/test/PullingSchemeTest.cpp index be922d90..eeee9718 100644 --- a/test/PullingSchemeTest.cpp +++ b/test/PullingSchemeTest.cpp @@ -91,39 +91,6 @@ namespace { return true; } - /** - * Compare - * @param expected - expected levels - * @param tested - levels to verify - * @param maxError - * @param maxNumOfErrPrinted - how many error outputs should be printed - * @return - */ - template - int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, int maxNumOfErrPrinted = 3) { - int cntGlobal = 0; - for (size_t level = 0; level < expected.size(); level++) { - int cnt = 0; - int numOfParticles = 0; - for (size_t i = 0; i < expected[level].mesh.size(); ++i) { - if (expected[level].mesh[i] < 8 && tested[level].mesh[i] <= FILLER_TYPE) { - if (std::abs(expected[level].mesh[i] - tested[level].mesh[i]) > 0 || std::isnan(expected[level].mesh[i]) || - std::isnan(tested[level].mesh[i])) { - if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { - std::cout << "Level: " << level <<" ERROR expected vs tested mesh: " << (float) expected[level].mesh[i] << " vs " - << (float) tested[level].mesh[i] << " IDX:" << tested[level].getStrIndex(i) << std::endl; - } - cnt++; - } - if (expected[level].mesh[i] > 0) numOfParticles++; - } - } - cntGlobal += cnt; - if (cnt > 0) std::cout << "Level: " << level << ", Number of errors / all points: " << cnt << " / " << expected[level].mesh.size() << " Particles:" << numOfParticles << std::endl; - } - return cntGlobal; - } - template void fillPS(PullingScheme &aPS, PixelData &levels) { PixelData levelsDS(ceil(levels.y_num/2.0), ceil(levels.x_num/2.0), ceil(levels.z_num/2.0)); diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 2baa2369..c6accd9a 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -120,9 +120,9 @@ inline int compareMeshes(const 
PixelData &expected, const PixelData &teste template inline int64_t compareParticles(const ParticleTypeA &expected, const ParticleTypeB &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 10) { int64_t cnt = 0; - if(expected.size() != tested.size()) { - std::cerr << "ERROR compareParticles: sizes differ!" << std::endl; - cnt++; + if (expected.size() != tested.size()) { + std::cerr << "ERROR compareParticles: sizes differs! " << expected.size() << " vs. " << tested.size() << std::endl; + return 1; // Return any number > 0 to indicate an error } for (size_t i = 0; i < expected.size(); ++i) { @@ -139,6 +139,40 @@ inline int64_t compareParticles(const ParticleTypeA &expected, const ParticleTyp return cnt; } +/** + * Compares two Particle Cell Trees + * @param expected - expected levels + * @param tested - levels to verify + * @param maxError + * @param maxNumOfErrPrinted - how many error outputs should be printed + * @param maxTypeCompared - maximum type to be compared + * @return + */ +template +int compareParticleCellTrees(const std::vector> &expected, const std::vector> &tested, bool printErrors = true, int maxNumOfErrPrinted = 3, uint8_t maxTypeCompared = FILLER_TYPE) { + int cntGlobal = 0; + for (size_t level = 0; level < expected.size(); level++) { + int cnt = 0; + int numOfParticles = 0; + for (size_t i = 0; i < expected[level].mesh.size(); ++i) { + if (expected[level].mesh[i] < 8 && tested[level].mesh[i] <= maxTypeCompared) { + if (std::abs(expected[level].mesh[i] - tested[level].mesh[i]) > 0 || std::isnan(expected[level].mesh[i]) || + std::isnan(tested[level].mesh[i])) { + if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { + std::cout << "Level: " << level <<" ERROR expected vs tested mesh: " << (float) expected[level].mesh[i] << " vs " + << (float) tested[level].mesh[i] << " IDX:" << tested[level].getStrIndex(i) << std::endl; + } + cnt++; + } + if (expected[level].mesh[i] > 0) numOfParticles++; + } + } + cntGlobal += cnt; + if (cnt > 0 && 
printErrors) std::cout << "Level: " << level << ", Number of errors / all points: " << cnt << " / " << expected[level].mesh.size() << " Particles:" << numOfParticles << std::endl; + } + return cntGlobal; +} + /** * Generates mesh with provided dims with random values in range [0, 1] * multiplier + offset * @param y From e1b63d7a6d95b1f52d7d9a3d366cfbaa064a537b Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 2 Aug 2024 16:17:35 +0200 Subject: [PATCH 43/59] Compiler warnings fixed --- src/data_structures/APR/access/RandomAccess.hpp | 4 ++-- test/APRTest.cpp | 2 +- test/LinearAccessCudaTest.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/data_structures/APR/access/RandomAccess.hpp b/src/data_structures/APR/access/RandomAccess.hpp index aa8f67bc..18366d99 100644 --- a/src/data_structures/APR/access/RandomAccess.hpp +++ b/src/data_structures/APR/access/RandomAccess.hpp @@ -1513,7 +1513,7 @@ inline void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(AP gap.global_index_begin_offset = 0; uint64_t counter = 0; - uint16_t prev_y = -2; //init + uint16_t prev_y = 65534; // Originally = -2 which is 65534 when assigned to uint16 - removing compiler error //init auto& mesh = p_map.data[i][offset_pc_data][0].mesh; @@ -1577,7 +1577,7 @@ inline void RandomAccess::initialize_structure_from_particle_cell_tree_sparse(AP auto& mesh = p_map.data[i][offset_pc_data1][0].mesh; - uint16_t prev_y = -2; //init + uint16_t prev_y = 65534; // Originally = -2 which is 65534 when assigned to uint16 - removing compiler error //init //SPARSE iteration for (auto it=mesh.begin(); it!=mesh.end(); ++it) { diff --git a/test/APRTest.cpp b/test/APRTest.cpp index 33ea37d6..83071a7f 100644 --- a/test/APRTest.cpp +++ b/test/APRTest.cpp @@ -134,7 +134,7 @@ bool compare_two_iterators(Iterator1& it1, Iterator2& it2, int maxNumOfErrPrinte uint64_t counter_1 = 0; uint64_t counter_2 = 0; - uint64_t errors = 0; + int64_t errors = 0; for (int level = 
it1.level_min(); level <= it1.level_max(); ++level) { for (int z = 0; z < it1.z_num(level); z++) { diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp index 7d1c4059..1b7dee46 100644 --- a/test/LinearAccessCudaTest.cpp +++ b/test/LinearAccessCudaTest.cpp @@ -63,7 +63,7 @@ namespace { std::vector> copy; copy.resize(pct.size()); - for (int l = 0; l < pct.size(); ++l) { + for (size_t l = 0; l < pct.size(); ++l) { copy[l].initWithResize(pct[l].y_num, pct[l].x_num, pct[l].z_num); // Copy only existing levels if (pct[l].z_num > 0) copy[l].copyFromMesh(pct[l]); From 4c88fae902b2469940a13411d2102d728f72654f Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 6 Aug 2024 12:23:10 +0200 Subject: [PATCH 44/59] Removed debug outputs from LinearAccessCuda test. --- test/LinearAccessCudaTest.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp index 1b7dee46..84cf8730 100644 --- a/test/LinearAccessCudaTest.cpp +++ b/test/LinearAccessCudaTest.cpp @@ -292,11 +292,11 @@ TEST(LinearAccessCudaTest, optimizationForSmallLevelsVScpu) { for (int x = 1; x <= 4; ++x) { for (int y = 1; y <= 4; ++y) { for (int z = 1; z <= 4; ++z) { - std::cout << "< ============================================= " << x << " " << y << " "<< z << std::endl; +// std::cout << "< ============================================= " << x << " " << y << " "<< z << std::endl; // --- Create input data structures and objects GenInfo gi; gi.init(y, x, z); - std::cout << gi << std::endl; + auto pct = makePCT(gi, {}); // In that case values of PCT are not important (all dense particle data will be generated anyway) GenInfo giGpu; giGpu.init(y, x, z); From 169cd9dc4ce043cd536ce4d097350f8130cc5662 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 6 Aug 2024 17:28:36 +0200 Subject: [PATCH 45/59] Added two more test for full pipeline (including PS, and LinearAccess) --- test/FullPipelineCudaTest.cpp | 182 
++++++++++++++++++++++++++++++---- 1 file changed, 163 insertions(+), 19 deletions(-) diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 6528227a..eb88b850 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -5,6 +5,8 @@ #include "algorithm/LocalIntensityScale.hpp" #include "algorithm/ComputeGradient.hpp" #include "algorithm/ComputeGradientCuda.hpp" +#include "algorithm/PullingSchemeCuda.hpp" +#include "data_structures/APR/access/LinearAccessCuda.hpp" #include "TestTools.hpp" #include "data_structures/Mesh/PixelDataCuda.h" #include "algorithm/APRConverter.hpp" @@ -18,22 +20,24 @@ namespace { // Generate random mesh - keep it large enough to catch all possible computation errors using ImageType = float; - PixelData input_image = getRandInitializedMesh(100, 100, 100, 13); + constexpr PixelDataDim dim{333, 1000, 333}; + PixelData input_image = getRandInitializedMesh(dim, 13); int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); - PixelData grad_temp; // should be a down-sampled image - grad_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); - PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors - local_scale_temp.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); PixelData local_scale_temp2; - local_scale_temp2.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + local_scale_temp2.initDownsampled(dim, false); - PixelData grad_temp_GPU; // should be a down-sampled image - grad_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, 0, false); - PixelData 
local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors - local_scale_temp_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); - PixelData local_scale_temp2_GPU; - local_scale_temp2_GPU.initDownsampled(input_image.y_num, input_image.x_num, input_image.z_num, false); + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData grad_temp_GPU (grad_temp, true); + PixelData local_scale_temp_GPU(local_scale_temp, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true); // Prepare parameters APRParameters par; @@ -45,16 +49,14 @@ namespace { par.dy = 1; par.dz = 1; - // Calculate bspline on CPU - PixelData mCpuImage(input_image, true); + // Calculate pipeline on CPU timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); timer.stop_timer(); - // Calculate bspline on GPU - PixelData mGpuImage(input_image, true); + // Calculate pipeline on GPU timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); @@ -65,7 +67,149 @@ namespace { EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); } - TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_GPT) { + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_PS) { + APRTimer timer(true); + + // Generate random mesh - keep it large enough to catch all possible computation errors + using ImageType = float; + constexpr PixelDataDim dim{333, 1000, 333}; + PixelData input_image = 
getRandInitializedMesh(dim, 13); + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData grad_temp_GPU (grad_temp, true); + PixelData local_scale_temp_GPU(local_scale_temp, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true); + + // Prepare parameters and APR info structures + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + GenInfo aprInfo; + aprInfo.init(input_image.getDimension()); + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet lpcs = LocalParticleCellSet(); + lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + PullingScheme ps; + ps.initialize_particle_cell_tree(aprInfo); + lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); + ps.pulling_scheme_main(); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + int levelMax = aprInfo.l_max - 1; + int levelMin = aprInfo.l_min; + std::vector> pct 
= PullingScheme::generateParticleCellTree(aprInfo); + computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), pct), 0); + } + + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_PS_LINEARACCESS) { + APRTimer timer(true); + + // Generate random mesh - keep it large enough to catch all possible computation errors + using ImageType = float; + constexpr PixelDataDim dim{333, 1000, 333}; + PixelData input_image = getRandInitializedMesh(dim, 13); + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData grad_temp_GPU (grad_temp, true); + PixelData local_scale_temp_GPU(local_scale_temp, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true); + + // Prepare parameters and APR info structures + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + par.neighborhood_optimization = true; + + GenInfo aprInfo; + aprInfo.init(input_image.getDimension()); + GenInfo giGpu; + giGpu.init(input_image.getDimension()); + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet lpcs = LocalParticleCellSet(); + lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, 
par.dy, par.dz); + PullingScheme ps; + ps.initialize_particle_cell_tree(aprInfo); + lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); + ps.pulling_scheme_main(); + LinearAccess linearAccess; + linearAccess.genInfo = &aprInfo; + + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + int levelMax = giGpu.l_max - 1; + int levelMin = giGpu.l_min; + std::vector> pct = PullingScheme::generateParticleCellTree(giGpu); + computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); + auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pct); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + // Test if returned structures have same data + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + + EXPECT_EQ(aprInfo.total_number_particles, giGpu.total_number_particles); + EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); + } + + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_GpuProcessingTask) { APRTimer timer(true); // Generate random mesh - keep it large enough to catch all possible computation errors @@ -98,7 +242,7 @@ namespace { par.dy = 1; par.dz = 1; - // Calculate bspline on CPU + // Calculate pipeline on CPU PixelData mCpuImage(input_image, true); timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); @@ 
-107,7 +251,7 @@ namespace { timer.stop_timer(); - // Calculate bspline on GPU + // Calculate pipeline on GPU PixelData mGpuImage(input_image, true); timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); From dadf92f1a813355371ee415e4a43831a3a19acf6 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 8 Aug 2024 13:00:33 +0200 Subject: [PATCH 46/59] -ffast-math must be removed - some optimizations still make GPU and CPU computations different --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 56cd98ec..9cf047e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -174,14 +174,14 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 ") if(CMAKE_COMPILER_IS_GNUCC) - set(CMAKE_CXX_FLAGS_RELEASE "-O4 -ffast-math -fno-unsafe-math-optimizations") + set(CMAKE_CXX_FLAGS_RELEASE "-O4") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -pedantic") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Bdynamic") if(NOT WIN32) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ldl -lz") endif() elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -ffast-math -fno-unsafe-math-optimizations") + set(CMAKE_CXX_FLAGS_RELEASE "-O3") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -pedantic") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lz") endif() From 27a8dc3a552f8a8e9de9d45d1ae2d59599fcefc6 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 8 Aug 2024 13:02:24 +0200 Subject: [PATCH 47/59] (nasty) fix for computeLevels in CUDA - added TODO to make it more reliable in future --- src/algorithm/ComputePullingScheme.cuh | 9 +++++++-- test/TestTools.hpp | 26 ++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/algorithm/ComputePullingScheme.cuh b/src/algorithm/ComputePullingScheme.cuh index 28450f30..51b88143 100644 --- a/src/algorithm/ComputePullingScheme.cuh +++ b/src/algorithm/ComputePullingScheme.cuh @@ -9,8 +9,13 @@ 
template __global__ void computeLevels(const T *grad, float *lis, size_t len, float mult_const) { size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; if (idx < len) { - //divide gradient magnitude by Local Intensity Scale (first step in calculating the Local Resolution Estimate L(y), minus constants) - uint32_t d = (grad[idx] / lis[idx]) * mult_const; + // divide gradient magnitude by Local Intensity Scale (first step in calculating the Local Resolution Estimate L(y), minus constants) + // TODO: This part is using a "trick" to convert first to int and then to uint32_t + // Without that some numbers on CPU and GPU are converted to different values... + // For example -6507.28 without conversion to int is converted to 0 but in CPU we got huge value. + // Anyway - both CPU & GPU sides should be checked and maybe some better way of it should be + // used - currently we've got undefined result of such operation. + uint32_t d = (int)((grad[idx] / lis[idx]) * mult_const); //incorporate other factors and compute the level of the Particle Cell, effectively construct LPC L_n lis[idx] = (d == 0) ? 0 : 31 - __clz(d); // fast log2 } diff --git a/test/TestTools.hpp b/test/TestTools.hpp index c6accd9a..53d6ff55 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -67,7 +67,7 @@ inline bool initFromZYXarray(PixelData &mesh, const T *data) { * @return number of errors detected */ template -inline int compareMeshes(const PixelData &expected, const PixelData &tested, double maxError = 0.0001, int maxNumOfErrPrinted = 3) { +inline int compareMeshes(const PixelData &expected, const PixelData &tested, double maxError = 0, int maxNumOfErrPrinted = 3) { if (expected.getDimension() != tested.getDimension()) { std::stringstream errMsg; errMsg << "Dimensions of expected and tested meshes differ! 
" << expected.getDimension() << " vs " << tested.getDimension(); @@ -86,7 +86,7 @@ inline int compareMeshes(const PixelData &expected, const PixelData &teste if (cnt < maxNumOfErrPrinted || maxNumOfErrPrinted == -1) { std::cout << std::fixed << std::setprecision(9) << "ERROR expected vs tested mesh: " << (float)expected.mesh[i] << " vs " << (float)tested.mesh[i] - << " error = " << (float)expected.mesh[i] - (float)tested.mesh[i] << " IDX:" << tested.getStrIndex(i) << std::endl; + << " error = " << (float)expected.mesh[i] - (float)tested.mesh[i] << " IDX:" << i << "=" << tested.getStrIndex(i) << std::endl; } cnt++; } @@ -213,6 +213,28 @@ inline PixelData getRandInitializedMesh(PixelDataDim dim, float multiplier = return getRandInitializedMesh(dim.y, dim.x, dim.z, multiplier, offset, useIdxNumbers); } +template +inline PixelData getMeshWithBlobInMiddle(int y, int x, int z) { + PixelData m(y, x, z, 0); + + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_real_distribution dist(0.0, 1.0); + + int count = 0; + for (int yi = (1.0/3 * y); yi < (2.0/3 * y); yi++) { + for (int xi = (1.0/3 * x); xi < (2.0/3 * x); xi++) { + for (int zi = (1.0/3 * z); zi < (2.0/3 * z); zi++) { + m(yi, xi, zi) = 30 ;//+ dist(mt) * 10; + count++; + } + } + } + std::cout << "COUNT: " << count << std::endl; + + return m; +} + struct TestBenchStats{ double inf_norm=0; From bb3b3f4e869a064f361b5535dcdbbd54c93668c5 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Fri, 9 Aug 2024 18:34:09 +0200 Subject: [PATCH 48/59] Fix for bsplineYdir for very small input images + test for full pipeline updated --- src/algorithm/bsplineYdir.cuh | 4 +- src/misc/CudaMemory.cuh | 18 +- src/misc/CudaTools.cuh | 9 +- test/FullPipelineCudaTest.cpp | 534 ++++++++++++++++++++-------------- test/TestTools.hpp | 24 +- 5 files changed, 345 insertions(+), 244 deletions(-) diff --git a/src/algorithm/bsplineYdir.cuh b/src/algorithm/bsplineYdir.cuh index b487cb63..e9905b64 100644 --- 
a/src/algorithm/bsplineYdir.cuh +++ b/src/algorithm/bsplineYdir.cuh @@ -86,7 +86,7 @@ __global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, BsplineParamsCud } int offs = i % p.k0; int work = i / p.k0; - if (work + xzIndexOfBlock < maxXZoffset) { + if (work + xzIndexOfBlock < maxXZoffset && offs < dirLen) { cache[work * p.k0 + offs] = image[workersOffset + dim.y * work + offs]; } } @@ -114,7 +114,7 @@ __global__ void bsplineYdirBoundary(T *image, PixelDataDim dim, BsplineParamsCud } int offs = i % p.k0; int work = i / p.k0; - if (work + xzIndexOfBlock < maxXZoffset) { + if (work + xzIndexOfBlock < maxXZoffset && offs < dirLen) { cache[work * p.k0 + offs] = image[workersOffset + dim.y * work + dim.y - 1 - offs]; } } diff --git a/src/misc/CudaMemory.cuh b/src/misc/CudaMemory.cuh index e237779f..fbe125e9 100644 --- a/src/misc/CudaMemory.cuh +++ b/src/misc/CudaMemory.cuh @@ -11,14 +11,20 @@ #include -inline cudaError_t checkCuda(cudaError_t result) { -#if defined(DEBUG) || defined(_DEBUG) - if (result != cudaSuccess) { - fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); - assert(result == cudaSuccess); + +// TODO: this method is duplicated in CudaTools.cuh +// Somehow including it here break compilation - fix it please. 
+#define checkCuda(ans) { cudaAssert2((ans), __FILE__, __LINE__); } +inline void cudaAssert2(cudaError_t code, const char *file, int line, bool abort=true) +{ +#if defined(DEBUG) || defined(_DEBUG) || !defined(NDEBUG) + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: (%d) %s %s %d\n", code, cudaGetErrorString(code), file, line); + assert(code == cudaSuccess); // If debugging it helps to see call tree somehow + if (abort) exit(code); } #endif - return result; } inline void* getPinnedMemory(size_t aNumOfBytes) { diff --git a/src/misc/CudaTools.cuh b/src/misc/CudaTools.cuh index 155ce317..10e4cb73 100644 --- a/src/misc/CudaTools.cuh +++ b/src/misc/CudaTools.cuh @@ -18,10 +18,11 @@ #define checkCuda(ans) { cudaAssert((ans), __FILE__, __LINE__); } inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { -#if defined(DEBUG) || defined(_DEBUG) +#if defined(DEBUG) || defined(_DEBUG) || !defined(NDEBUG) if (code != cudaSuccess) { fprintf(stderr,"GPUassert: (%d) %s %s %d\n", code, cudaGetErrorString(code), file, line); + assert(code == cudaSuccess); // If debugging it helps to see call tree somehow if (abort) exit(code); } #endif @@ -38,12 +39,6 @@ inline void printCudaDims(const dim3 &threadsPerBlock, const dim3 &numBlocks) { std::cout << "Number of threads (x/y/z): " << threadsPerBlock.x << "/" << threadsPerBlock.y << "/" << threadsPerBlock.z << std::endl; } -template -inline void getDataFromKernel(PixelData &input, size_t inputSize, ImgType *cudaInput) { - cudaMemcpy(input.mesh.get(), cudaInput, inputSize, cudaMemcpyDeviceToHost); - cudaFree(cudaInput); -} - class CudaTimer { std::vector iStartTimes; std::vector names; diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index eb88b850..41e865d3 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -15,254 +15,338 @@ namespace { #ifdef APR_USE_CUDA + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS) { + APRTimer timer(true); + + // 
Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors + using ImageType = float; + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{163, 123, 555}; + for (int d = 0; d <= 3; d++) { + auto &dim = (d % 2 == 0) ? dim1 : dim2; + PixelData input_image = (d/2 == 0) ? getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true, true); + PixelData grad_temp_GPU(grad_temp, true, true); + PixelData local_scale_temp_GPU(local_scale_temp, true, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true, true); + + // Prepare parameters + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0), 0); + } + } + 
TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS) { APRTimer timer(true); - // Generate random mesh - keep it large enough to catch all possible computation errors + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors using ImageType = float; - constexpr PixelDataDim dim{333, 1000, 333}; - PixelData input_image = getRandInitializedMesh(dim, 13); - int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); - - // Initialize CPU data structures - PixelData mCpuImage(input_image, true); - PixelData grad_temp; - grad_temp.initDownsampled(dim, 0, false); - PixelData local_scale_temp; - local_scale_temp.initDownsampled(dim, false); - PixelData local_scale_temp2; - local_scale_temp2.initDownsampled(dim, false); - - // Initialize GPU data structures to same values as CPU - PixelData mGpuImage(input_image, true); - PixelData grad_temp_GPU (grad_temp, true); - PixelData local_scale_temp_GPU(local_scale_temp, true); - PixelData local_scale_temp2_GPU(local_scale_temp2, true); - - // Prepare parameters - APRParameters par; - par.lambda = 3; - par.Ip_th = 10; - par.sigma_th = 0; - par.sigma_th_max = 0; - par.dx = 1; - par.dy = 1; - par.dz = 1; - - // Calculate pipeline on CPU - timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); - ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); - LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); - LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - timer.stop_timer(); - - // Calculate pipeline on GPU - timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); - getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); - computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - 
timer.stop_timer(); - - // Compare GPU vs CPU - expect exactly same result - EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{163, 123, 555}; + for (int d = 0; d <= 3; d++) { + auto &dim = (d%2 == 0) ? dim1 : dim2; + PixelData input_image = (d/2 == 0) ? getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true, false); + PixelData grad_temp_GPU(grad_temp, true, false); + PixelData local_scale_temp_GPU(local_scale_temp, true, false); + PixelData local_scale_temp2_GPU(local_scale_temp2, true, false); + + // Prepare parameters + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, 
par.dx, par.dy, par.dz); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + EXPECT_EQ(compareMeshes(grad_temp, grad_temp_GPU, 0), 0); + EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + } } TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_PS) { APRTimer timer(true); - // Generate random mesh - keep it large enough to catch all possible computation errors + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors using ImageType = float; - constexpr PixelDataDim dim{333, 1000, 333}; - PixelData input_image = getRandInitializedMesh(dim, 13); - int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); - - // Initialize CPU data structures - PixelData mCpuImage(input_image, true); - PixelData grad_temp; - grad_temp.initDownsampled(dim, 0, false); - PixelData local_scale_temp; - local_scale_temp.initDownsampled(dim, false); - PixelData local_scale_temp2; - local_scale_temp2.initDownsampled(dim, false); - - // Initialize GPU data structures to same values as CPU - PixelData mGpuImage(input_image, true); - PixelData grad_temp_GPU (grad_temp, true); - PixelData local_scale_temp_GPU(local_scale_temp, true); - PixelData local_scale_temp2_GPU(local_scale_temp2, true); - - // Prepare parameters and APR info structures - APRParameters par; - par.lambda = 3; - par.Ip_th = 10; - par.sigma_th = 0; - par.sigma_th_max = 0; - par.dx = 1; - par.dy = 1; - par.dz = 1; - - GenInfo aprInfo; - aprInfo.init(input_image.getDimension()); - - // Calculate pipeline on CPU - timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); - ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); - LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); - LocalParticleCellSet lpcs = LocalParticleCellSet(); - lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - PullingScheme ps; - 
ps.initialize_particle_cell_tree(aprInfo); - lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); - ps.pulling_scheme_main(); - timer.stop_timer(); - - // Calculate pipeline on GPU - timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); - getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); - computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - int levelMax = aprInfo.l_max - 1; - int levelMin = aprInfo.l_min; - std::vector> pct = PullingScheme::generateParticleCellTree(aprInfo); - computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); - timer.stop_timer(); - - // Compare GPU vs CPU - expect exactly same result - ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), pct), 0); + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{163, 123, 555}; + for (int d = 0; d <= 3; d++) { + auto &dim = (d % 2 == 0) ? dim1 : dim2; + PixelData input_image = (d / 2 == 0) ? 
getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData grad_temp_GPU(grad_temp, true); + PixelData local_scale_temp_GPU(local_scale_temp, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true); + + // Prepare parameters and APR info structures + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + GenInfo aprInfo; + aprInfo.init(input_image.getDimension()); + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet lpcs = LocalParticleCellSet(); + lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + PullingScheme ps; + ps.initialize_particle_cell_tree(aprInfo); + lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); + ps.pulling_scheme_main(); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + int levelMax = aprInfo.l_max - 1; + int levelMin = 
aprInfo.l_min; + std::vector> pct = PullingScheme::generateParticleCellTree(aprInfo); + computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), pct), 0); + } } TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_PS_LINEARACCESS) { APRTimer timer(true); - // Generate random mesh - keep it large enough to catch all possible computation errors + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors using ImageType = float; - constexpr PixelDataDim dim{333, 1000, 333}; - PixelData input_image = getRandInitializedMesh(dim, 13); - int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); - - // Initialize CPU data structures - PixelData mCpuImage(input_image, true); - PixelData grad_temp; - grad_temp.initDownsampled(dim, 0, false); - PixelData local_scale_temp; - local_scale_temp.initDownsampled(dim, false); - PixelData local_scale_temp2; - local_scale_temp2.initDownsampled(dim, false); - - // Initialize GPU data structures to same values as CPU - PixelData mGpuImage(input_image, true); - PixelData grad_temp_GPU (grad_temp, true); - PixelData local_scale_temp_GPU(local_scale_temp, true); - PixelData local_scale_temp2_GPU(local_scale_temp2, true); - - // Prepare parameters and APR info structures - APRParameters par; - par.lambda = 3; - par.Ip_th = 10; - par.sigma_th = 0; - par.sigma_th_max = 0; - par.dx = 1; - par.dy = 1; - par.dz = 1; - par.neighborhood_optimization = true; - - GenInfo aprInfo; - aprInfo.init(input_image.getDimension()); - GenInfo giGpu; - giGpu.init(input_image.getDimension()); - - // Calculate pipeline on CPU - timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); - ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); - LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); 
- LocalParticleCellSet lpcs = LocalParticleCellSet(); - lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - PullingScheme ps; - ps.initialize_particle_cell_tree(aprInfo); - lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); - ps.pulling_scheme_main(); - LinearAccess linearAccess; - linearAccess.genInfo = &aprInfo; - - linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); - timer.stop_timer(); - - // Calculate pipeline on GPU - timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); - getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); - computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - int levelMax = giGpu.l_max - 1; - int levelMin = giGpu.l_min; - std::vector> pct = PullingScheme::generateParticleCellTree(giGpu); - computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); - auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pct); - timer.stop_timer(); - - // Compare GPU vs CPU - expect exactly same result - // Test if returned structures have same data - EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); - EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0); - EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); - - EXPECT_EQ(aprInfo.total_number_particles, giGpu.total_number_particles); - EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{163, 123, 555}; + for (int d = 0; d <= 3; d++) { + auto &dim = (d % 2 == 0) ? dim1 : dim2; + PixelData input_image = (d / 2 == 0) ? 
getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + + int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); + + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData grad_temp_GPU(grad_temp, true); + PixelData local_scale_temp_GPU(local_scale_temp, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true); + + // Prepare parameters and APR info structures + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + par.neighborhood_optimization = true; + + GenInfo aprInfo; + aprInfo.init(input_image.getDimension()); + GenInfo giGpu; + giGpu.init(input_image.getDimension()); + + // Calculate pipeline on CPU + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); + LocalParticleCellSet lpcs = LocalParticleCellSet(); + lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + PullingScheme ps; + ps.initialize_particle_cell_tree(aprInfo); + lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); + ps.pulling_scheme_main(); + LinearAccess linearAccess; + linearAccess.genInfo = &aprInfo; + + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); + timer.stop_timer(); + + // Calculate pipeline on GPU + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, 
par); + getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); + computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + int levelMax = giGpu.l_max - 1; + int levelMin = giGpu.l_min; + std::vector> pct = PullingScheme::generateParticleCellTree(giGpu); + computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); + auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pct); + timer.stop_timer(); + + // Compare GPU vs CPU - expect exactly same result + // Test if returned structures have same data + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + + EXPECT_EQ(aprInfo.total_number_particles, giGpu.total_number_particles); + EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); + } } TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_GpuProcessingTask) { APRTimer timer(true); - // Generate random mesh - keep it large enough to catch all possible computation errors + // TODO: This tets fails if dim of input image is smaller than ~8 (not sure in which direction yet) + // It fails for {4,4,3} for sure and surprisingly only for mesh with blob inside... + // Investigate why it fails while it works nicely in tests above (difference must be somewhere in GpuProcessingTask). 
+ + + // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors using ImageType = float; - constexpr PixelDataDim dim{333, 1000, 333}; - PixelData input_image = getRandInitializedMesh(dim, 99, 0, false); - int maxLevel = ceil(std::log2(dim.maxDimSize())); - - PixelData grad_temp; // should be a down-sampled image - grad_temp.initDownsampled(dim, 0, false); - PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors - local_scale_temp.initDownsampled(dim,false); - PixelData local_scale_temp2; - local_scale_temp2.initDownsampled(dim, false); - - PixelData grad_temp_GPU; // should be a down-sampled image - grad_temp_GPU.initDownsampled(dim, 0, false); - PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors - local_scale_temp_GPU.initDownsampled(dim, false); - PixelData local_scale_temp2_GPU; - local_scale_temp2_GPU.initDownsampled(dim, false); - - // Prepare parameters - APRParameters par; - par.lambda = 3; - par.Ip_th = 10; - par.sigma_th = 0; - par.sigma_th_max = 0; - par.dx = 1; - par.dy = 1; - par.dz = 1; - - // Calculate pipeline on CPU - PixelData mCpuImage(input_image, true); - timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); - ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); - LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); - LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - timer.stop_timer(); - - - // Calculate pipeline on GPU - PixelData mGpuImage(input_image, true); - timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - - { - GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); - gpt.doAll(); - } - timer.stop_timer(); + constexpr PixelDataDim dim1{3, 8, 
8}; + constexpr PixelDataDim dim2{4, 4 ,3}; + for (int d = 0; d <= 3; d++) { + auto &dim = (d % 2 == 0) ? dim1 : dim2; + PixelData input_image = (d / 2 == 0) ? getRandInitializedMesh(dim, 13) : + getMeshWithBlobInMiddle(dim); + int maxLevel = ceil(std::log2(dim.maxDimSize())); + + std::cout << "--------------------------> " << dim << " " << (bool)(d/2 == 0) << std::endl; + + PixelData grad_temp; // should be a down-sampled image + grad_temp.initDownsampled(dim, 0, false); + PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp.initDownsampled(dim, false); + PixelData local_scale_temp2; + local_scale_temp2.initDownsampled(dim, false); + + // Prepare parameters + APRParameters par; + par.lambda = 3; + par.Ip_th = 10; + par.sigma_th = 0; + par.sigma_th_max = 0; + par.dx = 1; + par.dy = 1; + par.dz = 1; + + // Calculate pipeline on CPU + PixelData mCpuImage(input_image, true); + timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); + ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); + LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); +// LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + timer.stop_timer(); + + + // Calculate pipeline on GPU + PixelData mGpuImage(input_image, true); + PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + local_scale_temp_GPU.initDownsampled(dim, false); + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + + { + GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); + gpt.doAll(); + } + timer.stop_timer(); + if (dim.y < 5 ) { + local_scale_temp.printMesh(3, 2); + local_scale_temp_GPU.printMesh(3, 2); + } + // Compare GPU vs CPU - expect exactly same result + 
EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); - // Compare GPU vs CPU - expect exactly same result - EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + } } #endif // APR_USE_CUDA } diff --git a/test/TestTools.hpp b/test/TestTools.hpp index 53d6ff55..158bf2ea 100644 --- a/test/TestTools.hpp +++ b/test/TestTools.hpp @@ -213,6 +213,14 @@ inline PixelData getRandInitializedMesh(PixelDataDim dim, float multiplier = return getRandInitializedMesh(dim.y, dim.x, dim.z, multiplier, offset, useIdxNumbers); } +/** + * Generate mesh with square blob in the center of it with values randomly chosen from [20,40] range. Zero values outside. + * @tparam T + * @param y + * @param x + * @param z + * @return + */ template inline PixelData getMeshWithBlobInMiddle(int y, int x, int z) { PixelData m(y, x, z, 0); @@ -221,20 +229,28 @@ inline PixelData getMeshWithBlobInMiddle(int y, int x, int z) { std::mt19937 mt(rd()); std::uniform_real_distribution dist(0.0, 1.0); - int count = 0; for (int yi = (1.0/3 * y); yi < (2.0/3 * y); yi++) { for (int xi = (1.0/3 * x); xi < (2.0/3 * x); xi++) { for (int zi = (1.0/3 * z); zi < (2.0/3 * z); zi++) { - m(yi, xi, zi) = 30 ;//+ dist(mt) * 10; - count++; + m(yi, xi, zi) = 30 + dist(mt) * 10; } } } - std::cout << "COUNT: " << count << std::endl; return m; } +/** + * Generate mesh with square blob in the center of it with values randomly chosen from [20,40] range. Zero values outside. 
+ * @tparam T + * @param dim + * @return + */ +template +inline PixelData getMeshWithBlobInMiddle(const PixelDataDim &dim) { + return getMeshWithBlobInMiddle(dim.y, dim.x, dim.z); +} + struct TestBenchStats{ double inf_norm=0; From a8c4d77c33c925f518d4f441ab4ae0858e090e1d Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 14 Aug 2024 15:30:18 +0200 Subject: [PATCH 49/59] Fixed Local Intensity Scale (LIS) for super small inputs --- src/algorithm/LocalIntensityScale.cu | 6 ++--- test/FullPipelineCudaTest.cpp | 33 ++++++++++++------------- test/LocalIntensityScaleCudaTest.cpp | 37 ++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 20 deletions(-) diff --git a/src/algorithm/LocalIntensityScale.cu b/src/algorithm/LocalIntensityScale.cu index 2b5c186d..1593b5ab 100644 --- a/src/algorithm/LocalIntensityScale.cu +++ b/src/algorithm/LocalIntensityScale.cu @@ -480,9 +480,9 @@ __global__ void constantScale(S *image, size_t len) { } template -void runConstantScale(S *image, PixelDataDim &dim) { +void runConstantScale(S *image, PixelDataDim &dim, cudaStream_t aStream) { // Check kernel description for further info! - constantScale<<<1, 1>>>(image, dim.size()); + constantScale<<<1, 1, 0, aStream>>>(image, dim.size()); } template @@ -551,7 +551,7 @@ void runLocalIntensityScalePipeline(const PixelData &image, const APRParamete } } else { - runConstantScale(cudaImage, imageSize); + runConstantScale(cudaImage, imageSize, aStream); } } diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 41e865d3..a92abc08 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -26,7 +26,6 @@ namespace { auto &dim = (d % 2 == 0) ? dim1 : dim2; PixelData input_image = (d/2 == 0) ? 
getRandInitializedMesh(dim, 13) : getMeshWithBlobInMiddle(dim); - int maxLevel = ceil(std::log2(input_image.getDimension().maxDimSize())); // Initialize CPU data structures PixelData mCpuImage(input_image, true); @@ -292,23 +291,29 @@ namespace { // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors using ImageType = float; - constexpr PixelDataDim dim1{3, 8, 8}; - constexpr PixelDataDim dim2{4, 4 ,3}; + constexpr PixelDataDim dim1{4, 4, 3}; + constexpr PixelDataDim dim2{163, 123, 555}; for (int d = 0; d <= 3; d++) { auto &dim = (d % 2 == 0) ? dim1 : dim2; PixelData input_image = (d / 2 == 0) ? getRandInitializedMesh(dim, 13) : getMeshWithBlobInMiddle(dim); int maxLevel = ceil(std::log2(dim.maxDimSize())); - std::cout << "--------------------------> " << dim << " " << (bool)(d/2 == 0) << std::endl; - - PixelData grad_temp; // should be a down-sampled image + // Initialize CPU data structures + PixelData mCpuImage(input_image, true); + PixelData grad_temp; grad_temp.initDownsampled(dim, 0, false); - PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors + PixelData local_scale_temp; local_scale_temp.initDownsampled(dim, false); PixelData local_scale_temp2; local_scale_temp2.initDownsampled(dim, false); + // Initialize GPU data structures to same values as CPU + PixelData mGpuImage(input_image, true); + PixelData grad_temp_GPU(grad_temp, true); + PixelData local_scale_temp_GPU(local_scale_temp, true); + PixelData local_scale_temp2_GPU(local_scale_temp2, true); + // Prepare parameters APRParameters par; par.lambda = 3; @@ -318,31 +323,25 @@ namespace { par.dx = 1; par.dy = 1; par.dz = 1; + par.neighborhood_optimization = true; // Calculate pipeline on CPU - PixelData mCpuImage(input_image, true); timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, 
par); LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); -// LocalParticleCellSet().computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + LocalParticleCellSet lpcs = LocalParticleCellSet(); + lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); timer.stop_timer(); // Calculate pipeline on GPU - PixelData mGpuImage(input_image, true); - PixelData local_scale_temp_GPU; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors - local_scale_temp_GPU.initDownsampled(dim, false); timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - { GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); gpt.doAll(); } timer.stop_timer(); - if (dim.y < 5 ) { - local_scale_temp.printMesh(3, 2); - local_scale_temp_GPU.printMesh(3, 2); - } + // Compare GPU vs CPU - expect exactly same result EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); diff --git a/test/LocalIntensityScaleCudaTest.cpp b/test/LocalIntensityScaleCudaTest.cpp index ce6ff111..39f8ff22 100644 --- a/test/LocalIntensityScaleCudaTest.cpp +++ b/test/LocalIntensityScaleCudaTest.cpp @@ -558,6 +558,43 @@ namespace { } } + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE_SUPER_SMALL) { + // In case of very small input image like 2x2x2 constant scale is being used + APRTimer timer(false); + + for (int boundary = 0; boundary <= 1; ++boundary) { + for (int r = 0; r <= 1; r++) { + bool hasBoundary = (boundary > 0); + bool useRandomNumbers = (r > 0); + + PixelData m = getRandInitializedMesh(2,2,2, 25, 10, !useRandomNumbers); + + APRParameters params; + params.sigma_th = 1; + params.sigma_th_max = 2; + params.reflect_bc_lis = hasBoundary; + + // Run on CPU + PixelData mCpu(m, true); + PixelData mCpuTemp(m, false); + timer.start_timer("CPU LIS FULL"); + 
LocalIntensityScale().get_local_intensity_scale(mCpu, mCpuTemp, params); + mCpu.printMesh(3,2); + timer.stop_timer(); + + // Run on GPU + PixelData mGpu(m, true); + PixelData mGpuTemp(m, false); + timer.start_timer("GPU LIS FULL"); + getLocalIntensityScale(mGpu, mGpuTemp, params); + timer.stop_timer(); + + // Compare results - only mGPU mattters since mGpuTemp in case of constant scale is not modified + EXPECT_EQ(compareMeshes(mCpu, mGpu, 0), 0); + } + } + } + TEST(LocalIntensityScaleCudaTest, GPU_VS_CPU_FULL_PIPELINE_CONSTANT_SCALE) { APRTimer timer(false); From e6e43274859466c9b574026f9dd2d49914f5170a Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Mon, 19 Aug 2024 17:09:16 +0200 Subject: [PATCH 50/59] ParticleCellTreeCuda is now main stuff for CUDA --- src/algorithm/OVPC.cu | 125 ++++++++++++---------------- src/algorithm/PullingSchemeCuda.hpp | 7 +- test/FullPipelineCudaTest.cpp | 14 ++-- test/PullingSchemeCudaTest.cpp | 30 ++----- 4 files changed, 67 insertions(+), 109 deletions(-) diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index 070c4d81..c68fd63f 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -107,79 +107,53 @@ void runSecondStep(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, siz secondStep<<>>(data, child, xLen, yLen, zLen, xLenc, yLenc, zLenc, isLevelMax); }; -// explicit instantiation of handled types -template void computeOVPC(const PixelData&, PixelData&, int, int); - -template -void computeOVPC(const PixelData &input, PixelData &output, int levelMin, int levelMax) { - - // TODO: Depending on implementation of computing particles (next step after OVPC) some port of this method - // might be useful. Leaving it here rigtht now just in case. If not needed in next steps DELETE IT. 
- - ScopedCudaMemHandler, H2D> in(input); - ScopedCudaMemHandler, D2H> mem(output); - - - CudaTimer t(true, "OVPCCUDA"); - - t.start_timer("wait"); - waitForCuda(); - t.stop_timer(); - - t.start_timer("ALL"); - // TODO: This is not needed later - just for having clear debug - //cudaMemset(mem.get(), 0, mem.getNumOfBytes()); - - // =============== Create pyramid - std::vector levels(levelMax + 1, nullptr); - std::vector xSize(levelMax + 1); - std::vector ySize(levelMax + 1); - std::vector zSize(levelMax + 1); - - int xDS = input.x_num; - int yDS = input.y_num; - int zDS = input.z_num; - - size_t offset = 0; - for (int l = levelMax; l >= levelMin; --l) { - levels[l] = reinterpret_cast(mem.get()) + offset; - xSize[l] = xDS; - ySize[l] = yDS; - zSize[l] = zDS; - - offset += xDS * yDS * zDS * sizeof(TreeElementType); - // round up to 16-bytes - const size_t alignemet = 16; - offset = ((offset + alignemet - 1) / alignemet ) * alignemet; +class ParticleCellTreeCuda { + ScopedCudaMemHandler mem; + std::vector startOffsets; + GenInfo gi; + size_t numOfElements = 0; + cudaStream_t stream = nullptr; + +public: + + ParticleCellTreeCuda(const GenInfo &aprInfo, const cudaStream_t aStream) : gi(aprInfo), stream(aStream) { + // Calculate size of needed memory for PCT and offsets for particular levels + int l_max = aprInfo.l_max - 1; + int l_min = aprInfo.l_min; + + startOffsets.resize(l_max + 1, 0); + + for (int l = l_min; l <= l_max; ++l) { + auto yLen = ceil(aprInfo.org_dims[0] / PullingScheme::powr(2.0, l_max - l + 1)); + auto xLen = ceil(aprInfo.org_dims[1] / PullingScheme::powr(2.0, l_max - l + 1)); + auto zLen = ceil(aprInfo.org_dims[2] / PullingScheme::powr(2.0, l_max - l + 1)); + size_t levelSize = yLen * xLen * zLen; + startOffsets[l] = numOfElements; + numOfElements += levelSize; + } - xDS = ceil(xDS/2.0); - yDS = ceil(yDS/2.0); - zDS = ceil(zDS/2.0); + // Initialize memory, it is not binded to any CPU memory so we provide nullptr + mem.initialize(nullptr, numOfElements, 
stream); + cudaMemsetAsync(mem.get(), EMPTY, numOfElements, stream); } + inline uint8_t* operator[](size_t level) { return mem.get() + startOffsets[level]; } - runCopyAndClampLevels(in.get(), levels[levelMax], in.getSize(), levelMin, levelMax, 0); + auto getPCTcpu() { + std::vector> pct = PullingScheme::generateParticleCellTree(gi); + for (int i = gi.l_min; i < gi.l_max; ++i) { + checkCuda(cudaMemcpyAsync(pct[i].mesh.get(), (*this)[i], pct[i].mesh.size(), cudaMemcpyDeviceToHost, stream)); + } + checkCuda(cudaStreamSynchronize(stream)); - for (int l = levelMax - 1; l >= levelMin; --l) { - runDownsampleMax(levels[l + 1], levels[l], xSize[l + 1], ySize[l + 1], zSize[l + 1], 0); + return pct; } +}; - // ================== Phase 1 - top to down - for (int l = levelMin; l <= levelMax; ++l) { - runFirstStep(levels[l], xSize[l], ySize[l], zSize[l], l, 0); - } - // ================== Phase 1 - down to top - for (int l = levelMax - 1; l >= levelMin; --l) { - runSecondStep(levels[l], levels[l+1], xSize[l], ySize[l], zSize[l], xSize[l+1], ySize[l+1], zSize[l+1], l == levelMin, 0); - } - waitForCuda(); - t.stop_timer(); -} - // explicit instantiation of handled types -template void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax); -template void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax); +template std::vector> computeOvpcCuda(const PixelData&, const GenInfo&); +template std::vector> computeOvpcCuda(const PixelData&, const GenInfo&); /** * CUDA implementation of Pullin Scheme (OVPC - Optimal Valid Particle Cell set). 
@@ -191,30 +165,33 @@ template void computeOvpcCuda(const PixelData &input, std::vector -void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax) { +template +std::vector> computeOvpcCuda(const PixelData &input, const GenInfo &gi) { // Copy input to CUDA mem and prepare CUDA representation of particle cell tree which will be filled after computing // all steps + + ParticleCellTreeCuda pct(gi, 0 /*stream*/); + int levelMin = gi.l_min; + int levelMax = gi.l_max - 1; + ScopedCudaMemHandler, H2D> in(input); - std::vector, D2H>> w; - for (int l = 0; l <= levelMax; ++l) { - w.push_back(std::move(ScopedCudaMemHandler, D2H>(pct[l]))); - } // feel the highes level of PCT with provided levels and clamp values to be within [levelMin, levelMax] range - runCopyAndClampLevels(in.get(), w[levelMax].get(), in.getSize(), levelMin, levelMax, 0); + runCopyAndClampLevels(in.get(), pct[levelMax], in.getSize(), levelMin, levelMax, 0); // Downsample with max reduction to levelMin to fill the rest of the tree for (int l = levelMax - 1; l >= levelMin; --l) { - runDownsampleMax(w[l + 1].get(), w[l].get(), pct[l + 1].x_num, pct[l + 1].y_num, pct[l + 1].z_num, 0); + runDownsampleMax(pct[l + 1], pct[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], 0); } // ================== Phase 1 - top to down for (int l = levelMin; l <= levelMax; ++l) { - runFirstStep(w[l].get(), pct[l].x_num, pct[l].y_num, pct[l].z_num, l, 0); + runFirstStep(pct[l], gi.x_num[l], gi.y_num[l], gi.z_num[l], l, 0); } // ================== Phase 1 - down to top for (int l = levelMax - 1; l >= levelMin; --l) { - runSecondStep(w[l].get(), w[l+1].get(), pct[l].x_num, pct[l].y_num, pct[l].z_num, pct[l + 1].x_num, pct[l + 1].y_num, pct[l + 1].z_num, l == levelMin, 0); + runSecondStep(pct[l], pct[l+1], gi.x_num[l], gi.y_num[l], gi.z_num[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], l == levelMin, 0); } + + return pct.getPCTcpu(); } diff --git a/src/algorithm/PullingSchemeCuda.hpp 
b/src/algorithm/PullingSchemeCuda.hpp index f98c0883..236c260f 100644 --- a/src/algorithm/PullingSchemeCuda.hpp +++ b/src/algorithm/PullingSchemeCuda.hpp @@ -7,12 +7,11 @@ #include "data_structures/Mesh/PixelData.hpp" +#include "data_structures/APR/GenInfo.hpp" using TreeElementType = uint8_t; -template -void computeOVPC(const PixelData &input, PixelData &output, int levelMin, int levelMax); -template -void computeOvpcCuda(const PixelData &input, std::vector> &pct, int levelMin, int levelMax); +template +std::vector> computeOvpcCuda(const PixelData &input, const GenInfo &gi); #endif //LIBAPR_PULLINGSCHEMECUDA_HPP diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index a92abc08..95c2b07c 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -10,6 +10,7 @@ #include "TestTools.hpp" #include "data_structures/Mesh/PixelDataCuda.h" #include "algorithm/APRConverter.hpp" +#include "misc/CudaTools.cuh" namespace { @@ -186,10 +187,7 @@ namespace { getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); computeLevelsCuda(grad_temp_GPU, local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - int levelMax = aprInfo.l_max - 1; - int levelMin = aprInfo.l_min; - std::vector> pct = PullingScheme::generateParticleCellTree(aprInfo); - computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); + auto pct = computeOvpcCuda(local_scale_temp_GPU, aprInfo); timer.stop_timer(); // Compare GPU vs CPU - expect exactly same result @@ -197,6 +195,9 @@ namespace { } } + + + TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_PS_LINEARACCESS) { APRTimer timer(true); @@ -263,10 +264,7 @@ namespace { getGradient(mGpuImage, grad_temp_GPU, local_scale_temp_GPU, local_scale_temp2_GPU, 0, par); getLocalIntensityScale(local_scale_temp_GPU, local_scale_temp2_GPU, par); computeLevelsCuda(grad_temp_GPU, 
local_scale_temp_GPU, maxLevel, par.rel_error, par.dx, par.dy, par.dz); - int levelMax = giGpu.l_max - 1; - int levelMin = giGpu.l_min; - std::vector> pct = PullingScheme::generateParticleCellTree(giGpu); - computeOvpcCuda(local_scale_temp_GPU, pct, levelMin, levelMax); + auto pct = computeOvpcCuda(local_scale_temp_GPU, giGpu); auto linearAccessGpu = initializeLinearStructureCuda(giGpu, par, pct); timer.stop_timer(); diff --git a/test/PullingSchemeCudaTest.cpp b/test/PullingSchemeCudaTest.cpp index 5c4ebcb3..bd24156e 100644 --- a/test/PullingSchemeCudaTest.cpp +++ b/test/PullingSchemeCudaTest.cpp @@ -95,7 +95,7 @@ void fillPS(PullingScheme &aPS, PixelData &levels) { TEST(PullingSchemeTest, PSvsOVPCCUDA) { // Generates random levels in a 3D cube and then compares generated output levels in PS and OVPC GenInfo gi; - gi.init(255, 257, 199); + gi.init(255, 157, 257); // Generate random levels for PS and OVPC PixelData levels(std::ceil(gi.org_dims[0]/2.0), @@ -113,7 +113,7 @@ TEST(PullingSchemeTest, PSvsOVPCCUDA) { PixelData levelsPS(levels, true); // Initialize all needed objects - APRTimer t(false); + APRTimer t(true); t.start_timer("PS - init"); PullingScheme ps; @@ -125,19 +125,15 @@ TEST(PullingSchemeTest, PSvsOVPCCUDA) { t.stop_timer(); // Run test methods and compare results - t.start_timer("OVPCCUDA - init"); - int levelMax = gi.l_max - 1; - int levelMin = gi.l_min; - std::vector> pct = PullingScheme::generateParticleCellTree(gi); - t.stop_timer(); t.start_timer("OVPCCUDA - compute"); - computeOvpcCuda(levelsOVPC, pct, levelMin, levelMax); + auto pct = computeOvpcCuda(levelsOVPC, gi); t.stop_timer(); // -------------- Verify result ASSERT_EQ(compareParticleCellTrees(ps.getParticleCellTree(), pct), 0); } + TEST(PullingSchemeTest, OVPCCUDA_Ydir) { // Prepare input data for PS float values[] = {9,0,0,0, 0,0,0,0}; @@ -157,12 +153,8 @@ TEST(PullingSchemeTest, OVPCCUDA_Ydir) { // Initialize all needed objects APRTimer t(false); - t.start_timer("OVPCCUDA - 
initialize"); - std::vector> pct = PullingScheme::generateParticleCellTree(gi); - t.stop_timer(); - t.start_timer("OVPCCUDA - compute"); - computeOvpcCuda(levels, pct, levelMin, levelMax); + auto pct = computeOvpcCuda(levels, gi); t.stop_timer(); // List of expected types @@ -199,12 +191,8 @@ TEST(PullingSchemeTest, OVPCCUDA_Xdir) { // Initialize all needed objects APRTimer t(false); - t.start_timer("OVPCCUDA - initialize"); - std::vector> pct = PullingScheme::generateParticleCellTree(gi); - t.stop_timer(); - t.start_timer("OVPCCUDA - compute"); - computeOvpcCuda(levels, pct, levelMin, levelMax); + auto pct = computeOvpcCuda(levels, gi); t.stop_timer(); // List of expected types @@ -241,12 +229,8 @@ TEST(PullingSchemeTest, OVPCCUDA_Zdir) { // Initialize all needed objects APRTimer t(false); - t.start_timer("OVPCCUDA - initialize"); - std::vector> pct = PullingScheme::generateParticleCellTree(gi); - t.stop_timer(); - t.start_timer("OVPCCUDA - compute"); - computeOvpcCuda(levels, pct, levelMin, levelMax); + auto pct = computeOvpcCuda(levels, gi); t.stop_timer(); // List of expected types From 00aac97431bb8feef27f0e0c9eaebdcf1850c86c Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 20 Aug 2024 09:30:46 +0200 Subject: [PATCH 51/59] computeOvpcCuda now using 'stream' instead of hardcoded values --- src/algorithm/OVPC.cu | 25 +++++++++++++------------ src/algorithm/PullingSchemeCuda.hpp | 2 ++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index c68fd63f..a840b7fe 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -158,39 +158,40 @@ template std::vector> computeOvpcCuda(const PixelData&, /** * CUDA implementation of Pullin Scheme (OVPC - Optimal Valid Particle Cell set). 
* @tparam T - type of input levels - * @tparam S - type of output Particle Cell Tree * @param input - input levels computed in earlier stages - * @param pct - Particle Cell Tree - as input is used for dimensions of each level, will be filled with computed - * Pulling Scheme as a output - * @param levelMin - min level of APR - * @param levelMax - max level of APR + * @param gi - GenInfo for given APR + * + * @return - PCT for CPU (copied from GPU) */ template std::vector> computeOvpcCuda(const PixelData &input, const GenInfo &gi) { // Copy input to CUDA mem and prepare CUDA representation of particle cell tree which will be filled after computing // all steps - ParticleCellTreeCuda pct(gi, 0 /*stream*/); + cudaStream_t stream = nullptr; + + ScopedCudaMemHandler, H2D> in(input, stream); + + ParticleCellTreeCuda pct(gi, stream); int levelMin = gi.l_min; int levelMax = gi.l_max - 1; - ScopedCudaMemHandler, H2D> in(input); // feel the highes level of PCT with provided levels and clamp values to be within [levelMin, levelMax] range - runCopyAndClampLevels(in.get(), pct[levelMax], in.getSize(), levelMin, levelMax, 0); + runCopyAndClampLevels(in.get(), pct[levelMax], in.getSize(), levelMin, levelMax, stream); - // Downsample with max reduction to levelMin to fill the rest of the tree + // Downsample with max reduction to levelMin to fill rest of the tree for (int l = levelMax - 1; l >= levelMin; --l) { - runDownsampleMax(pct[l + 1], pct[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], 0); + runDownsampleMax(pct[l + 1], pct[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], stream); } // ================== Phase 1 - top to down for (int l = levelMin; l <= levelMax; ++l) { - runFirstStep(pct[l], gi.x_num[l], gi.y_num[l], gi.z_num[l], l, 0); + runFirstStep(pct[l], gi.x_num[l], gi.y_num[l], gi.z_num[l], l, stream); } // ================== Phase 1 - down to top for (int l = levelMax - 1; l >= levelMin; --l) { - runSecondStep(pct[l], pct[l+1], gi.x_num[l], gi.y_num[l], 
gi.z_num[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], l == levelMin, 0); + runSecondStep(pct[l], pct[l+1], gi.x_num[l], gi.y_num[l], gi.z_num[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], l == levelMin, stream); } return pct.getPCTcpu(); diff --git a/src/algorithm/PullingSchemeCuda.hpp b/src/algorithm/PullingSchemeCuda.hpp index 236c260f..953903db 100644 --- a/src/algorithm/PullingSchemeCuda.hpp +++ b/src/algorithm/PullingSchemeCuda.hpp @@ -9,9 +9,11 @@ #include "data_structures/Mesh/PixelData.hpp" #include "data_structures/APR/GenInfo.hpp" + using TreeElementType = uint8_t; template std::vector> computeOvpcCuda(const PixelData &input, const GenInfo &gi); + #endif //LIBAPR_PULLINGSCHEMECUDA_HPP From 1fba1bcdb14044fd517b3f3104ef0041831179c5 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 20 Aug 2024 09:53:09 +0200 Subject: [PATCH 52/59] ParticleCellTreeCuda moved and handle now cpu2gpu transfer --- src/algorithm/OVPC.cu | 44 +--------------- src/algorithm/ParticleCellTreeCuda.cuh | 70 ++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 43 deletions(-) create mode 100644 src/algorithm/ParticleCellTreeCuda.cuh diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index a840b7fe..9c6e0bd6 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -5,6 +5,7 @@ #include "misc/CudaTools.cuh" #include "data_structures/Mesh/downsample.cuh" #include "algorithm/OVPC.h" +#include "algorithm/ParticleCellTreeCuda.cuh" template @@ -107,49 +108,6 @@ void runSecondStep(T *data, T *child, size_t xLen, size_t yLen, size_t zLen, siz secondStep<<>>(data, child, xLen, yLen, zLen, xLenc, yLenc, zLenc, isLevelMax); }; -class ParticleCellTreeCuda { - ScopedCudaMemHandler mem; - std::vector startOffsets; - GenInfo gi; - size_t numOfElements = 0; - cudaStream_t stream = nullptr; - -public: - - ParticleCellTreeCuda(const GenInfo &aprInfo, const cudaStream_t aStream) : gi(aprInfo), stream(aStream) { - // Calculate size of needed memory for 
PCT and offsets for particular levels - int l_max = aprInfo.l_max - 1; - int l_min = aprInfo.l_min; - - startOffsets.resize(l_max + 1, 0); - - for (int l = l_min; l <= l_max; ++l) { - auto yLen = ceil(aprInfo.org_dims[0] / PullingScheme::powr(2.0, l_max - l + 1)); - auto xLen = ceil(aprInfo.org_dims[1] / PullingScheme::powr(2.0, l_max - l + 1)); - auto zLen = ceil(aprInfo.org_dims[2] / PullingScheme::powr(2.0, l_max - l + 1)); - size_t levelSize = yLen * xLen * zLen; - startOffsets[l] = numOfElements; - numOfElements += levelSize; - } - - // Initialize memory, it is not binded to any CPU memory so we provide nullptr - mem.initialize(nullptr, numOfElements, stream); - cudaMemsetAsync(mem.get(), EMPTY, numOfElements, stream); - } - - inline uint8_t* operator[](size_t level) { return mem.get() + startOffsets[level]; } - - auto getPCTcpu() { - std::vector> pct = PullingScheme::generateParticleCellTree(gi); - for (int i = gi.l_min; i < gi.l_max; ++i) { - checkCuda(cudaMemcpyAsync(pct[i].mesh.get(), (*this)[i], pct[i].mesh.size(), cudaMemcpyDeviceToHost, stream)); - } - checkCuda(cudaStreamSynchronize(stream)); - - return pct; - } -}; - // explicit instantiation of handled types template std::vector> computeOvpcCuda(const PixelData&, const GenInfo&); diff --git a/src/algorithm/ParticleCellTreeCuda.cuh b/src/algorithm/ParticleCellTreeCuda.cuh new file mode 100644 index 00000000..9fe38273 --- /dev/null +++ b/src/algorithm/ParticleCellTreeCuda.cuh @@ -0,0 +1,70 @@ +#ifndef PARTICLE_CELL_TREE_CUDA_CUH +#define PARTICLE_CELL_TREE_CUDA_CUH + + +#include "data_structures/APR/GenInfo.hpp" +#include "algorithm/PullingScheme.hpp" + + +/* + * CUDA representation of PCT (Particle Cell Tree) + * Allocates memory and initialize it to EMPTY + * + * Allows acces to each level via subscription operator: + * ParticleCellTreeCuda pct(aprInfo); + * pct[level] + * + * getPCTcpu and uploadPCT2GPU handle interaction with CPU code (mainly for test/debug purposes). 
+ */ +class ParticleCellTreeCuda { + ScopedCudaMemHandler mem; + std::vector startOffsets; + GenInfo gi; + size_t numOfElements = 0; + cudaStream_t stream = nullptr; + +public: + + ParticleCellTreeCuda(const GenInfo &aprInfo, const cudaStream_t aStream) : gi(aprInfo), stream(aStream) { + // Calculate size of needed memory for PCT and offsets for particular levels + int l_max = aprInfo.l_max - 1; + int l_min = aprInfo.l_min; + + startOffsets.resize(l_max + 1, 0); + + for (int l = l_min; l <= l_max; ++l) { + auto yLen = ceil(aprInfo.org_dims[0] / PullingScheme::powr(2.0, l_max - l + 1)); + auto xLen = ceil(aprInfo.org_dims[1] / PullingScheme::powr(2.0, l_max - l + 1)); + auto zLen = ceil(aprInfo.org_dims[2] / PullingScheme::powr(2.0, l_max - l + 1)); + size_t levelSize = yLen * xLen * zLen; + startOffsets[l] = numOfElements; + numOfElements += levelSize; + } + + // Initialize memory, it is not binded to any CPU memory so we provide nullptr + mem.initialize(nullptr, numOfElements, stream); + cudaMemsetAsync(mem.get(), EMPTY, numOfElements, stream); + } + + inline uint8_t* operator[](size_t level) { return mem.get() + startOffsets[level]; } + + auto getPCTcpu() { + std::vector> pct = PullingScheme::generateParticleCellTree(gi); + for (int i = gi.l_min; i < gi.l_max; ++i) { + checkCuda(cudaMemcpyAsync(pct[i].mesh.get(), (*this)[i], pct[i].mesh.size(), cudaMemcpyDeviceToHost, stream)); + } + checkCuda(cudaStreamSynchronize(stream)); + + return pct; + } + + void uploadPCT2GPU(std::vector> pct) { + for (int i = gi.l_min; i < gi.l_max; ++i) { + checkCuda(cudaMemcpyAsync((*this)[i], pct[i].mesh.get(), pct[i].mesh.size(), cudaMemcpyHostToDevice, stream)); + } + checkCuda(cudaStreamSynchronize(stream)); + } +}; + + +#endif From 34742506ed6b87c5e629d63f11ec13674d6d45e6 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Tue, 20 Aug 2024 15:02:21 +0200 Subject: [PATCH 53/59] LinearAccessCuda is now using ParticleCellTreeCuda --- src/algorithm/ParticleCellTreeCuda.cuh | 9 ++- 
.../APR/access/LinearAccessCuda.cu | 56 +++++++++---------- .../APR/access/LinearAccessCuda.hpp | 2 +- 3 files changed, 37 insertions(+), 30 deletions(-) diff --git a/src/algorithm/ParticleCellTreeCuda.cuh b/src/algorithm/ParticleCellTreeCuda.cuh index 9fe38273..4f520d54 100644 --- a/src/algorithm/ParticleCellTreeCuda.cuh +++ b/src/algorithm/ParticleCellTreeCuda.cuh @@ -58,7 +58,14 @@ public: return pct; } - void uploadPCT2GPU(std::vector> pct) { + void downloadPCTfromGPU(std::vector> &pct) { + for (int i = gi.l_min; i < gi.l_max; ++i) { + checkCuda(cudaMemcpyAsync(pct[i].mesh.get(), (*this)[i], pct[i].mesh.size(), cudaMemcpyDeviceToHost, stream)); + } + checkCuda(cudaStreamSynchronize(stream)); + } + + void uploadPCT2GPU(const std::vector> &pct) { for (int i = gi.l_min; i < gi.l_max; ++i) { checkCuda(cudaMemcpyAsync((*this)[i], pct[i].mesh.get(), pct[i].mesh.size(), cudaMemcpyHostToDevice, stream)); } diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu index 8ce7e347..aeffa2c0 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.cu +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -1,6 +1,7 @@ #include "LinearAccessCuda.hpp" #include "misc/CudaTools.cuh" +#include "algorithm/ParticleCellTreeCuda.cuh" // CUDA version of GenInfo structure typedef struct GenInfoCuda_t { @@ -90,9 +91,6 @@ public: } }; -typedef ScopedCudaMemHandler, H2D | D2H> ParticleCellTreeLevelCuda; -typedef std::vector ParticleCellTreeCuda; - // ********************************************************************************************************************* // FULL RESOLUTION // ********************************************************************************************************************* @@ -134,7 +132,7 @@ __global__ void fullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint1 } } -void runFullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, GenInfo &gi, GenInfoGpuAccess 
&giga, cudaStream_t aStream) { +void runFullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, const GenInfo &gi, GenInfoGpuAccess &giga, cudaStream_t aStream) { dim3 threadsPerBlock(32, 1, 1); dim3 numBlocks( (gi.x_num[gi.l_max] + threadsPerBlock.x - 1)/threadsPerBlock.x, @@ -154,11 +152,10 @@ void runFullResolution(const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y, // FIRST STEP // ********************************************************************************************************************* -constexpr uint8_t UPSAMPLING_SEED_TYPE = 4; static constexpr uint8_t seed_us = UPSAMPLING_SEED_TYPE; //deal with the equivalence optimization -__global__ void firstStep(uint8_t *prevLevel, uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic) { +__global__ void firstStep(const uint8_t *prevLevel, uint8_t *currLevel, int level, uint8_t min_type, GenInfoCuda gic) { const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; const uint64_t xLen = gic.x_num[level]; @@ -181,16 +178,16 @@ __global__ void firstStep(uint8_t *prevLevel, uint8_t *currLevel, int level, uin } } -void runFirstStep(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, cudaStream_t aStream) { +void runFirstStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, cudaStream_t aStream) { dim3 threadsPerBlock(32, 1, 1); for (int level = gi.l_min + 1; level < gi.l_max; ++level) { dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); - auto &p_mapPrev = p_map[level - 1]; - auto &p_mapCurr = p_map[level]; - firstStep<<>>(p_mapPrev.get(), p_mapCurr.get(), level, min_type, giga.getGenInfoCuda()); + auto *p_mapPrev = p_map[level - 1]; + auto *p_mapCurr = p_map[level]; + firstStep<<>>(p_mapPrev, p_mapCurr, level, min_type, giga.getGenInfoCuda()); } cudaError_t 
err = cudaGetLastError(); @@ -232,15 +229,15 @@ __global__ void secondStep(const uint8_t *currLevel, int level, uint8_t min_type } } -void runSecondStep(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, cudaStream_t aStream) { +void runSecondStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, cudaStream_t aStream) { dim3 threadsPerBlock(32, 1, 1); for (int level = gi.l_min; level < gi.l_max - 1; ++level) { dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); - auto &p_mapCurr = p_map[level]; - secondStep<<>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end); + auto *p_mapCurr = p_map[level]; + secondStep<<>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end); } cudaError_t err = cudaGetLastError(); @@ -323,15 +320,15 @@ __global__ void secondStepCountParticles(GenInfoCuda gic, const uint64_t *level_ *gic.total_number_particles = xz_end[counter_total -1]; } -void runSecondStepLastLevel(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint64_t counter_total, cudaStream_t aStream) { +void runSecondStepLastLevel(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint64_t counter_total, cudaStream_t aStream) { dim3 threadsPerBlock(32, 1, 1); dim3 numBlocks( (gi.x_num[gi.l_max - 1] + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (gi.z_num[gi.l_max - 1] + threadsPerBlock.z - 1)/threadsPerBlock.z); int level = gi.l_max - 1; - auto &p_mapCurr = p_map[level]; - secondStepLastLevel<<>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end); + auto *p_mapCurr = p_map[level]; + 
secondStepLastLevel<<>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { @@ -381,15 +378,15 @@ __global__ void getYvalues(const uint8_t *currLevel, int level, uint8_t min_type } } -void runGetYvalues(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, cudaStream_t aStream) { +void runGetYvalues(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, cudaStream_t aStream) { dim3 threadsPerBlock(32, 1, 1); for (int level = gi.l_min; level < gi.l_max - 1; ++level) { dim3 numBlocks( (gi.x_num[level] + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (gi.z_num[level] + threadsPerBlock.z - 1)/threadsPerBlock.z); - auto &p_mapCurr = p_map[level]; - getYvalues<<>>(p_mapCurr.get(), level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec); + auto *p_mapCurr = p_map[level]; + getYvalues<<>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec); } cudaError_t err = cudaGetLastError(); @@ -482,15 +479,15 @@ __global__ void fourthStepLastLevel(GenInfoCuda gic, const uint64_t *level_xz, u } } -void runFourthStep(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, uint64_t counter_total, cudaStream_t aStream) { +void runFourthStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_map, uint8_t min_type, const uint64_t *level_xz, uint64_t *xz_end, uint16_t *y_vec, uint64_t counter_total, cudaStream_t aStream) { dim3 threadsPerBlock(32, 1, 1); dim3 numBlocks( (gi.x_num[gi.l_max] + threadsPerBlock.x - 1)/threadsPerBlock.x, 1, (gi.z_num[gi.l_max] + threadsPerBlock.z - 1)/threadsPerBlock.z); int level = gi.l_max - 1; - auto &p_mapCurr = p_map[level]; - fourthStep<<>>(p_mapCurr.get(), 
level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec); + auto *p_mapCurr = p_map[level]; + fourthStep<<>>(p_mapCurr, level, min_type, giga.getGenInfoCuda(), level_xz, xz_end, y_vec); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { @@ -524,13 +521,14 @@ void runFourthStep(GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCuda &p_ * In current shape it is a good function for testing implementation rather than using it in production code. * Production code should use parts of it and work on pre-allocated memory - probably in GpuProcessingTask. */ -LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, APRParameters &apr_parameters, std::vector> &pct) { +LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct) { + + cudaStream_t aStream = nullptr; + // Copy input to CUDA mem and prepare CUDA representation of particle cell tree which will be filled after computing // all steps - ParticleCellTreeCuda p_map; - for (auto &p : pct) { - p_map.emplace_back(std::move(ParticleCellTreeLevelCuda(p))); - } + ParticleCellTreeCuda p_map (gi, aStream); + p_map.uploadPCT2GPU(pct); uint8_t min_type = apr_parameters.neighborhood_optimization ? 1 : 2; @@ -560,7 +558,7 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, APRParameters size_t maxYvecSize = gi.x_num[gi.l_max] * gi.y_num[gi.l_max] * gi.z_num[gi.l_max]; y_vec.resize(maxYvecSize); - cudaStream_t aStream = nullptr; + { ScopedCudaMemHandler y_vec_cuda(y_vec.data(), y_vec.size()); ScopedCudaMemHandler xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size()); @@ -583,6 +581,8 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, APRParameters // we can get different size of particles for each frame - with preallocated buffer we can do all of them on it). 
y_vec.resize(gi.total_number_particles); + p_map.downloadPCTfromGPU(pct); + LinearAccessCudaStructs lac; lac.y_vec.swap(y_vec); diff --git a/src/data_structures/APR/access/LinearAccessCuda.hpp b/src/data_structures/APR/access/LinearAccessCuda.hpp index 53dfd001..51148d9e 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.hpp +++ b/src/data_structures/APR/access/LinearAccessCuda.hpp @@ -11,7 +11,7 @@ typedef struct { VectorData level_xz_vec; } LinearAccessCudaStructs; -LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, APRParameters &apr_parameters, std::vector> &pct); +LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct); #endif //APR_LINEARACCESSCUDA_HPP From 1d4e54940df6f4f149388d397ddbbcea991979eb Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 21 Aug 2024 10:27:34 +0200 Subject: [PATCH 54/59] OVPC added to GpuTask --- src/algorithm/ComputeGradientCuda.cu | 11 ++++++- src/algorithm/OVPC.cu | 31 +++++++++++++++++++ src/algorithm/ParticleCellTreeCuda.cuh | 2 +- src/algorithm/PullingSchemeCuda.hpp | 4 ++- src/data_structures/APR/GenInfo.hpp | 3 ++ .../APR/access/LinearAccessCuda.cu | 3 +- test/FullPipelineCudaTest.cpp | 6 ++-- 7 files changed, 52 insertions(+), 8 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 4db49d4d..d092a1b7 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -13,6 +13,8 @@ #include "algorithm/LocalIntensityScale.cuh" #include "misc/CudaTools.cuh" #include "misc/CudaMemory.cuh" +#include "algorithm/ParticleCellTreeCuda.cuh" +#include "algorithm/PullingSchemeCuda.hpp" #include "dsGradient.cuh" #include "invBspline.cuh" @@ -207,6 +209,7 @@ class GpuProcessingTask::GpuProcessingTaskImpl { const PixelData &iCpuImage; PixelData &iCpuLevels; const APRParameters &iParameters; + GenInfo iAprInfo; float iBsplineOffset; int iMaxLevel; @@ -227,6 
+230,8 @@ class GpuProcessingTask::GpuProcessingTaskImpl { const size_t boundaryLen; ScopedCudaMemHandler boundary; + ParticleCellTreeCuda pctc; + /** * @return newly created stream */ @@ -247,6 +252,7 @@ public: local_scale_temp (levels, iStream), local_scale_temp2 (levels, iStream), iParameters(parameters), + iAprInfo(iCpuImage.getDimension()), iBsplineOffset(bspline_offset), iMaxLevel(maxLevel), // TODO: This is wrong and done only for compile. BsplineParams has to be computed seperately for each dimension. @@ -257,7 +263,8 @@ public: bc3(params.bc3.get(), params.k0, iStream), bc4(params.bc4.get(), params.k0, iStream), boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num}, - boundary{nullptr, boundaryLen, iStream} + boundary{nullptr, boundaryLen, iStream}, + pctc(iAprInfo, iStream) { // std::cout << "\n=============== GpuProcessingTaskImpl ===================\n\n"; std::cout << iCpuImage << std::endl; @@ -308,6 +315,8 @@ public: const float mult_const = level_factor/iParameters.rel_error; runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream); std::cout << "3: " << ct.microseconds() - start << std::endl; + + computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream); } ~GpuProcessingTaskImpl() { diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index 9c6e0bd6..55656674 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -154,3 +154,34 @@ std::vector> computeOvpcCuda(const PixelData &input, const return pct.getPCTcpu(); } + +// explicit instantiation of handled types +template void computeOvpcCuda(float *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream); +template void computeOvpcCuda(int *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream); + + +template +void computeOvpcCuda(ImgType *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t stream) { + int levelMin = 
gi.l_min; + int levelMax = gi.l_max - 1; + + + // feel the highes level of PCT with provided levels and clamp values to be within [levelMin, levelMax] range + runCopyAndClampLevels(in, pct[levelMax], gi.y_num[levelMax]*gi.x_num[levelMax]*gi.z_num[levelMax], levelMin, levelMax, stream); + + // Downsample with max reduction to levelMin to fill rest of the tree + for (int l = levelMax - 1; l >= levelMin; --l) { + runDownsampleMax(pct[l + 1], pct[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], stream); + } + + // ================== Phase 1 - top to down + for (int l = levelMin; l <= levelMax; ++l) { + runFirstStep(pct[l], gi.x_num[l], gi.y_num[l], gi.z_num[l], l, stream); + } + // ================== Phase 1 - down to top + for (int l = levelMax - 1; l >= levelMin; --l) { + runSecondStep(pct[l], pct[l+1], gi.x_num[l], gi.y_num[l], gi.z_num[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], l == levelMin, stream); + } + + std::cout << "------- RUN --------------\n"; +} \ No newline at end of file diff --git a/src/algorithm/ParticleCellTreeCuda.cuh b/src/algorithm/ParticleCellTreeCuda.cuh index 4f520d54..d3bc6160 100644 --- a/src/algorithm/ParticleCellTreeCuda.cuh +++ b/src/algorithm/ParticleCellTreeCuda.cuh @@ -4,7 +4,7 @@ #include "data_structures/APR/GenInfo.hpp" #include "algorithm/PullingScheme.hpp" - +#include "misc/CudaTools.cuh" /* * CUDA representation of PCT (Particle Cell Tree) diff --git a/src/algorithm/PullingSchemeCuda.hpp b/src/algorithm/PullingSchemeCuda.hpp index 953903db..12aa81d3 100644 --- a/src/algorithm/PullingSchemeCuda.hpp +++ b/src/algorithm/PullingSchemeCuda.hpp @@ -8,12 +8,14 @@ #include "data_structures/Mesh/PixelData.hpp" #include "data_structures/APR/GenInfo.hpp" - +#include "algorithm/ParticleCellTreeCuda.cuh" using TreeElementType = uint8_t; template std::vector> computeOvpcCuda(const PixelData &input, const GenInfo &gi); +template +void computeOvpcCuda(ImgType *in, ParticleCellTreeCuda &pct, const GenInfo &gi, cudaStream_t 
stream); #endif //LIBAPR_PULLINGSCHEMECUDA_HPP diff --git a/src/data_structures/APR/GenInfo.hpp b/src/data_structures/APR/GenInfo.hpp index e506100a..7898fc97 100644 --- a/src/data_structures/APR/GenInfo.hpp +++ b/src/data_structures/APR/GenInfo.hpp @@ -34,6 +34,9 @@ class GenInfo { std::vector level_size; // precomputation of the size of each level, used by the iterators. + GenInfo() {} + GenInfo(const PixelDataDim &dim) { init(dim); } + //initialize the information given the original dimensions void init(const PixelDataDim &dim) { init(dim.y, dim.x, dim.z); diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu index aeffa2c0..2de0dd6a 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.cu +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -506,7 +506,7 @@ void runFourthStep(const GenInfo &gi, GenInfoGpuAccess &giga, ParticleCellTreeCu // ********************************************************************************************************************* -// MAIN FUNC TO CALL - implements logic of inearAccess::initialize_linear_structure CPU func. +// MAIN FUNC TO CALL - implements logic of LinearAccess::initialize_linear_structure CPU func. // ********************************************************************************************************************* @@ -581,6 +581,7 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRPara // we can get different size of particles for each frame - with preallocated buffer we can do all of them on it). 
y_vec.resize(gi.total_number_particles); + // Transfer changes to PCT from GPU to CPU (this is needed only for tests) p_map.downloadPCTfromGPU(pct); diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 95c2b07c..975ba0b2 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -238,10 +238,8 @@ namespace { par.dz = 1; par.neighborhood_optimization = true; - GenInfo aprInfo; - aprInfo.init(input_image.getDimension()); - GenInfo giGpu; - giGpu.init(input_image.getDimension()); + GenInfo aprInfo(input_image.getDimension()); + GenInfo giGpu(input_image.getDimension()); // Calculate pipeline on CPU timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); From 9ff0580050477adbc63f63c42dc9978cd16db497 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 21 Aug 2024 16:51:30 +0200 Subject: [PATCH 55/59] Full GPU pipeline works1 --- src/algorithm/ComputeGradientCuda.cu | 24 ++++++---- src/algorithm/ComputeGradientCuda.hpp | 4 +- src/data_structures/APR/GenInfo.hpp | 3 ++ .../APR/access/LinearAccessCuda.cu | 48 +++++++++++++++++++ .../APR/access/LinearAccessCuda.hpp | 3 ++ test/FullPipelineCudaTest.cpp | 42 ++++++++++++---- 6 files changed, 105 insertions(+), 19 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index d092a1b7..bee6417f 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -15,6 +15,7 @@ #include "misc/CudaMemory.cuh" #include "algorithm/ParticleCellTreeCuda.cuh" #include "algorithm/PullingSchemeCuda.hpp" +#include "data_structures/APR/access/LinearAccessCuda.hpp" #include "dsGradient.cuh" #include "invBspline.cuh" @@ -232,6 +233,9 @@ class GpuProcessingTask::GpuProcessingTaskImpl { ParticleCellTreeCuda pctc; + ScopedCudaMemHandler y_vec; // for LinearAccess + LinearAccessCudaStructs lacs; + /** * @return newly created stream */ @@ -264,7 +268,8 @@ public: bc4(params.bc4.get(), params.k0, iStream), 
boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num}, boundary{nullptr, boundaryLen, iStream}, - pctc(iAprInfo, iStream) + pctc(iAprInfo, iStream), + y_vec(nullptr, iAprInfo.getSize(), iStream) { // std::cout << "\n=============== GpuProcessingTaskImpl ===================\n\n"; std::cout << iCpuImage << std::endl; @@ -279,12 +284,13 @@ public: std::cout << "SEND time: " << ct.microseconds() - start << std::endl; } - void getDataFromGpu() { - CurrentTime ct; - uint64_t start = ct.microseconds(); - local_scale_temp.copyD2H(); - checkCuda(cudaStreamSynchronize(iStream)); - std::cout << "RCV time: " << ct.microseconds() - start << std::endl; + LinearAccessCudaStructs getDataFromGpu() { +// CurrentTime ct; +// uint64_t start = ct.microseconds(); +// local_scale_temp.copyD2H(); +// checkCuda(cudaStreamSynchronize(iStream)); +// std::cout << "RCV time: " << ct.microseconds() - start << std::endl; + return std::move(lacs); } void processOnGpu() { @@ -317,6 +323,8 @@ public: std::cout << "3: " << ct.microseconds() - start << std::endl; computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream); + computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream); + std::cout << iAprInfo << std::endl; } ~GpuProcessingTaskImpl() { @@ -339,7 +347,7 @@ template void GpuProcessingTask::sendDataToGpu() {impl->sendDataToGpu();} template -void GpuProcessingTask::getDataFromGpu() {impl->getDataFromGpu();} +LinearAccessCudaStructs GpuProcessingTask::getDataFromGpu() {return impl->getDataFromGpu();} template void GpuProcessingTask::processOnGpu() {impl->processOnGpu();} diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp index a8ebe1bf..723b6181 100644 --- a/src/algorithm/ComputeGradientCuda.hpp +++ b/src/algorithm/ComputeGradientCuda.hpp @@ -7,7 +7,7 @@ #include "data_structures/Mesh/PixelData.hpp" #include "algorithm/APRParameters.hpp" - +#include 
"data_structures/APR/access/LinearAccessCuda.hpp" // Test helpers and definitions using TypeOfRecBsplineFlags = uint16_t; @@ -47,7 +47,7 @@ class GpuProcessingTask { GpuProcessingTask(GpuProcessingTask&&); void sendDataToGpu(); - void getDataFromGpu(); + LinearAccessCudaStructs getDataFromGpu(); void processOnGpu(); void doAll(); }; diff --git a/src/data_structures/APR/GenInfo.hpp b/src/data_structures/APR/GenInfo.hpp index 7898fc97..8d5da2bd 100644 --- a/src/data_structures/APR/GenInfo.hpp +++ b/src/data_structures/APR/GenInfo.hpp @@ -37,6 +37,8 @@ class GenInfo { GenInfo() {} GenInfo(const PixelDataDim &dim) { init(dim); } + size_t getSize() const { return (size_t)y_num[l_max] * x_num[l_max] * z_num[l_max]; } + //initialize the information given the original dimensions void init(const PixelDataDim &dim) { init(dim.y, dim.x, dim.z); @@ -119,6 +121,7 @@ class GenInfo { friend std::ostream & operator<<(std::ostream &os, const GenInfo &gi) { os << "GenInfo {\n"; os << " Original dimensions(y/x/z): [" << gi.org_dims[0] << ", " << gi.org_dims[1] << ", " << gi.org_dims[2] << "]\n"; + os << " Original size: " << gi.getSize() << "\n"; os << " Number of dimensions: " << static_cast(gi.number_dimensions) << "\n"; os << " l_min, l_max: {" << gi.l_min << " - " << gi.l_max << "}\n"; os << " total number of particles: " << gi.total_number_particles << "\n"; diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu index 2de0dd6a..9e38d760 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.cu +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -592,3 +592,51 @@ LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRPara return lac; } + +void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, const APRParameters &apr_parameters, LinearAccessCudaStructs &lacs, cudaStream_t aStream) { + + uint8_t min_type = 
apr_parameters.neighborhood_optimization ? 1 : 2; + + VectorData xz_end_vec(true); + VectorData level_xz_vec(true); + + // initialize_xz_linear() - CPU impl. + uint64_t counter_total = 1; //the buffer val to allow -1 calls without checking. + level_xz_vec.resize(gi.l_max + 2, 0); //includes a buffer for -1 calls, and therefore needs to be called with level + 1; + level_xz_vec[0] = 1; //allowing for the offset. + for (int i = 0; i <= gi.l_max; ++i) { + counter_total += gi.x_num[i] * gi.z_num[i]; + level_xz_vec[i + 1] = counter_total; + } + xz_end_vec.resize(counter_total, 0); + + + { + ScopedCudaMemHandler xz_end_vec_cuda(xz_end_vec.data(), xz_end_vec.size()); + ScopedCudaMemHandler level_xz_vec_cuda(level_xz_vec.data(), level_xz_vec.size()); + GenInfoGpuAccess giga(gi, aStream); + if (gi.l_max <= 2) { + runFullResolution(level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, gi, giga, aStream); + } + else { + runFirstStep(gi, giga, p_map, min_type, aStream); + runSecondStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), aStream); + runSecondStepLastLevel(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), counter_total, aStream); + runGetYvalues(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, aStream); + runFourthStep(gi, giga, p_map, min_type, level_xz_vec_cuda.get(), xz_end_vec_cuda.get(), y_vec_cuda, counter_total, aStream); + } + } + +// auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; }; +// prt(y_vec); +// prt(xz_end_vec); +// prt(level_xz_vec); + VectorData y_vec(true); + y_vec.resize(gi.total_number_particles); + checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda, gi.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, aStream)); + checkCuda(cudaStreamSynchronize(aStream)); + + lacs.y_vec.swap(y_vec); + 
lacs.xz_end_vec.swap(xz_end_vec); + lacs.level_xz_vec.swap(level_xz_vec); +} diff --git a/src/data_structures/APR/access/LinearAccessCuda.hpp b/src/data_structures/APR/access/LinearAccessCuda.hpp index 51148d9e..27d56ab6 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.hpp +++ b/src/data_structures/APR/access/LinearAccessCuda.hpp @@ -4,6 +4,7 @@ #include "algorithm/APRParameters.hpp" #include "data_structures/Mesh/PixelData.hpp" #include "data_structures/APR/GenInfo.hpp" +#include "algorithm/ParticleCellTreeCuda.cuh" typedef struct { VectorData y_vec; @@ -13,5 +14,7 @@ typedef struct { LinearAccessCudaStructs initializeLinearStructureCuda(GenInfo &gi, const APRParameters &apr_parameters, std::vector> &pct); +void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_map, GenInfo &gi, const APRParameters &apr_parameters, LinearAccessCudaStructs &lacs, cudaStream_t aStream); + #endif //APR_LINEARACCESSCUDA_HPP diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 975ba0b2..8cc516a4 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -277,7 +277,7 @@ namespace { } } - TEST(ComputeThreshold, PIPELINE_TEST_GRADIENT_LIS_LEVELS_GpuProcessingTask) { + TEST(ComputeThreshold, FULL_PIPELINE_TEST_CPU_vs_GpuProcessingTask) { APRTimer timer(true); // TODO: This tets fails if dim of input image is smaller than ~8 (not sure in which direction yet) @@ -288,11 +288,15 @@ namespace { // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors using ImageType = float; constexpr PixelDataDim dim1{4, 4, 3}; - constexpr PixelDataDim dim2{163, 123, 555}; + constexpr PixelDataDim dim2{1024,512,512}; for (int d = 0; d <= 3; d++) { auto &dim = (d % 2 == 0) ? dim1 : dim2; PixelData input_image = (d / 2 == 0) ? 
getRandInitializedMesh(dim, 13) : - getMeshWithBlobInMiddle(dim); + getMeshWithBlobInMiddle(dim); + +// constexpr PixelDataDim dim = dim1; +// PixelData input_image = getRandInitializedMesh(dim, 13); + int maxLevel = ceil(std::log2(dim.maxDimSize())); // Initialize CPU data structures @@ -321,32 +325,52 @@ namespace { par.dz = 1; par.neighborhood_optimization = true; + GenInfo aprInfo(input_image.getDimension()); + GenInfo giGpu(input_image.getDimension()); + + // Calculate pipeline on CPU // Calculate pipeline on CPU timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); LocalParticleCellSet lpcs = LocalParticleCellSet(); lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); + PullingScheme ps; + ps.initialize_particle_cell_tree(aprInfo); + lpcs.get_local_particle_cell_set(ps, local_scale_temp, local_scale_temp2, par); + ps.pulling_scheme_main(); + LinearAccess linearAccess; + linearAccess.genInfo = &aprInfo; + linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); timer.stop_timer(); // Calculate pipeline on GPU timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - { - GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); - gpt.doAll(); - } + // { + GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); + gpt.sendDataToGpu(); + gpt.processOnGpu(); + auto linearAccessGpu = gpt.getDataFromGpu(); + giGpu.total_number_particles = linearAccessGpu.y_vec.size(); + + // } timer.stop_timer(); // Compare GPU vs CPU - expect exactly same result - EXPECT_EQ(compareMeshes(local_scale_temp, local_scale_temp_GPU, 0), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.y_vec, linearAccess.y_vec), 0); + EXPECT_EQ(compareParticles(linearAccessGpu.level_xz_vec, linearAccess.level_xz_vec), 0); + 
EXPECT_EQ(compareParticles(linearAccessGpu.xz_end_vec, linearAccess.xz_end_vec), 0); + + EXPECT_EQ(aprInfo.total_number_particles, giGpu.total_number_particles); + EXPECT_EQ(linearAccessGpu.y_vec.size(), linearAccess.y_vec.size()); } } + #endif // APR_USE_CUDA } - int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); From c10225d6bd099273086ff8e68a0234e6661e4542 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 21 Aug 2024 17:19:48 +0200 Subject: [PATCH 56/59] Some debug prints removed --- src/algorithm/ComputeGradientCuda.cu | 23 +++++++++---------- src/algorithm/OVPC.cu | 2 -- .../APR/access/LinearAccessCuda.cu | 4 ---- test/FullPipelineCudaTest.cpp | 12 +++++----- 4 files changed, 17 insertions(+), 24 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index bee6417f..9c85cd0f 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -247,6 +247,9 @@ class GpuProcessingTask::GpuProcessingTaskImpl { public: + // TODO: Remove need for passing 'levels' to GpuProcessingTask + // It was used during development to control internal computation like filters, gradient, levels etc. 
but + // once all is done there is no need for it anymore GpuProcessingTaskImpl(const PixelData &inputImage, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) : iCpuImage(inputImage), iCpuLevels(levels), @@ -272,16 +275,16 @@ public: y_vec(nullptr, iAprInfo.getSize(), iStream) { // std::cout << "\n=============== GpuProcessingTaskImpl ===================\n\n"; - std::cout << iCpuImage << std::endl; - std::cout << iCpuLevels << std::endl; +// std::cout << iCpuImage << std::endl; +// std::cout << iCpuLevels << std::endl; } void sendDataToGpu() { - CurrentTime ct; - uint64_t start = ct.microseconds(); +// CurrentTime ct; +// uint64_t start = ct.microseconds(); image.copyH2D(); - checkCuda(cudaStreamSynchronize(iStream)); - std::cout << "SEND time: " << ct.microseconds() - start << std::endl; +// checkCuda(cudaStreamSynchronize(iStream)); +// std::cout << "SEND time: " << ct.microseconds() - start << std::endl; } LinearAccessCudaStructs getDataFromGpu() { @@ -313,18 +316,14 @@ public: getGradientCuda(iCpuImage, iCpuLevels, image.get(), gradient.get(), local_scale_temp.get(), splineCudaX, splineCudaY, splineCudaZ, boundary.get(), iBsplineOffset, iParameters, iStream); - std::cout << "1: " << ct.microseconds() - start << std::endl; runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream); - std::cout << "2: " << ct.microseconds() - start << std::endl; float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz)); float level_factor = pow(2, iMaxLevel) * min_dim; const float mult_const = level_factor/iParameters.rel_error; runComputeLevels(gradient.get(), local_scale_temp.get(), iCpuLevels.mesh.size(), mult_const, iStream); - std::cout << "3: " << ct.microseconds() - start << std::endl; computeOvpcCuda(local_scale_temp.get(), pctc, iAprInfo, iStream); computeLinearStructureCuda(y_vec.get(), pctc, iAprInfo, iParameters, lacs, iStream); - std::cout << iAprInfo << 
std::endl; } ~GpuProcessingTaskImpl() { @@ -335,10 +334,10 @@ public: template GpuProcessingTask::GpuProcessingTask(const PixelData &image, PixelData &levels, const APRParameters ¶meters, float bspline_offset, int maxLevel) -: impl{new GpuProcessingTaskImpl(image, levels, parameters, bspline_offset, maxLevel)} {std::cout << "GpuProcessingTask\n";} +: impl{new GpuProcessingTaskImpl(image, levels, parameters, bspline_offset, maxLevel)} { } template -GpuProcessingTask::~GpuProcessingTask() {std::cout << "~GpuProcessingTask\n";} +GpuProcessingTask::~GpuProcessingTask() { } template GpuProcessingTask::GpuProcessingTask(GpuProcessingTask&&) = default; diff --git a/src/algorithm/OVPC.cu b/src/algorithm/OVPC.cu index 55656674..80765bca 100644 --- a/src/algorithm/OVPC.cu +++ b/src/algorithm/OVPC.cu @@ -182,6 +182,4 @@ void computeOvpcCuda(ImgType *in, ParticleCellTreeCuda &pct, const GenInfo &gi, for (int l = levelMax - 1; l >= levelMin; --l) { runSecondStep(pct[l], pct[l+1], gi.x_num[l], gi.y_num[l], gi.z_num[l], gi.x_num[l + 1], gi.y_num[l + 1], gi.z_num[l + 1], l == levelMin, stream); } - - std::cout << "------- RUN --------------\n"; } \ No newline at end of file diff --git a/src/data_structures/APR/access/LinearAccessCuda.cu b/src/data_structures/APR/access/LinearAccessCuda.cu index 9e38d760..1a876d0e 100644 --- a/src/data_structures/APR/access/LinearAccessCuda.cu +++ b/src/data_structures/APR/access/LinearAccessCuda.cu @@ -627,10 +627,6 @@ void computeLinearStructureCuda(uint16_t *y_vec_cuda, ParticleCellTreeCuda &p_ma } } -// auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; }; -// prt(y_vec); -// prt(xz_end_vec); -// prt(level_xz_vec); VectorData y_vec(true); y_vec.resize(gi.total_number_particles); checkCuda(cudaMemcpyAsync(y_vec.begin(), y_vec_cuda, gi.total_number_particles * sizeof(uint16_t), cudaMemcpyDeviceToHost, aStream)); diff --git 
a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 8cc516a4..aa706190 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -310,9 +310,7 @@ namespace { // Initialize GPU data structures to same values as CPU PixelData mGpuImage(input_image, true); - PixelData grad_temp_GPU(grad_temp, true); - PixelData local_scale_temp_GPU(local_scale_temp, true); - PixelData local_scale_temp2_GPU(local_scale_temp2, true); + PixelData local_scale_temp_GPU(local_scale_temp, false); // Prepare parameters APRParameters par; @@ -346,14 +344,16 @@ namespace { // Calculate pipeline on GPU - timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - // { GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); + cudaDeviceSynchronize(); + + timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); + // { gpt.sendDataToGpu(); gpt.processOnGpu(); auto linearAccessGpu = gpt.getDataFromGpu(); giGpu.total_number_particles = linearAccessGpu.y_vec.size(); - + cudaDeviceSynchronize(); // } timer.stop_timer(); From 6b7a87d870501b3178c901208825304c7c71d261 Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 21 Aug 2024 17:46:38 +0200 Subject: [PATCH 57/59] Test for full pipeline cleaned up --- test/FullPipelineCudaTest.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index aa706190..8f29141b 100644 --- a/test/FullPipelineCudaTest.cpp +++ b/test/FullPipelineCudaTest.cpp @@ -294,9 +294,6 @@ namespace { PixelData input_image = (d / 2 == 0) ? 
getRandInitializedMesh(dim, 13) : getMeshWithBlobInMiddle(dim); -// constexpr PixelDataDim dim = dim1; -// PixelData input_image = getRandInitializedMesh(dim, 13); - int maxLevel = ceil(std::log2(dim.maxDimSize())); // Initialize CPU data structures @@ -326,7 +323,6 @@ namespace { GenInfo aprInfo(input_image.getDimension()); GenInfo giGpu(input_image.getDimension()); - // Calculate pipeline on CPU // Calculate pipeline on CPU timer.start_timer(">>>>>>>>>>>>>>>>> CPU PIPELINE"); ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); @@ -344,17 +340,13 @@ namespace { // Calculate pipeline on GPU - GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); - cudaDeviceSynchronize(); - timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - // { + GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); gpt.sendDataToGpu(); gpt.processOnGpu(); auto linearAccessGpu = gpt.getDataFromGpu(); giGpu.total_number_particles = linearAccessGpu.y_vec.size(); cudaDeviceSynchronize(); - // } timer.stop_timer(); // Compare GPU vs CPU - expect exactly same result From 3c601be7ecb9354ba6853efb93256a52607d34fc Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Wed, 21 Aug 2024 17:50:22 +0200 Subject: [PATCH 58/59] doAll() removed from Gpu pipeline --- src/algorithm/ComputeGradientCuda.cu | 7 ------- src/algorithm/ComputeGradientCuda.hpp | 1 - 2 files changed, 8 deletions(-) diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 9c85cd0f..14d1d5d0 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -351,13 +351,6 @@ LinearAccessCudaStructs GpuProcessingTask::getDataFromGpu() {return imp template void GpuProcessingTask::processOnGpu() {impl->processOnGpu();} -template -void GpuProcessingTask::doAll() { - sendDataToGpu(); - processOnGpu(); - getDataFromGpu(); -} - // explicit instantiation of handled types template class GpuProcessingTask; template 
class GpuProcessingTask; diff --git a/src/algorithm/ComputeGradientCuda.hpp b/src/algorithm/ComputeGradientCuda.hpp index 723b6181..837d29f5 100644 --- a/src/algorithm/ComputeGradientCuda.hpp +++ b/src/algorithm/ComputeGradientCuda.hpp @@ -49,7 +49,6 @@ class GpuProcessingTask { void sendDataToGpu(); LinearAccessCudaStructs getDataFromGpu(); void processOnGpu(); - void doAll(); }; #endif //LIBAPR_COMPUTEGRADIENTCUDA_HPP From d2fd1d0f4f5afb5b72f47ed1cc49d91097004d2b Mon Sep 17 00:00:00 2001 From: Krzysztof Gonciarz Date: Thu, 22 Aug 2024 13:24:39 +0200 Subject: [PATCH 59/59] GPU pipeline now works for APRConverter! --- examples/Example_get_apr.h | 2 +- src/algorithm/APRConverter.hpp | 163 ++++---------- src/algorithm/ComputeGradient.hpp | 29 +++ src/algorithm/ComputeGradientCuda.cu | 77 +++++-- test/FullPipelineCudaTest.cpp | 10 +- test/LinearAccessCudaTest.cpp | 309 ++++++++++++++------------- 6 files changed, 290 insertions(+), 300 deletions(-) diff --git a/examples/Example_get_apr.h b/examples/Example_get_apr.h index c1be9d2b..6d787811 100644 --- a/examples/Example_get_apr.h +++ b/examples/Example_get_apr.h @@ -30,7 +30,7 @@ struct cmdLineOptions{ bool auto_parameters = false; float Ip_th = 0; - float lambda = -1; + float lambda = 3.0; float sigma_th = 0; float rel_error = 0.1; float grad_th = 1; diff --git a/src/algorithm/APRConverter.hpp b/src/algorithm/APRConverter.hpp index 404d2bf5..91858629 100644 --- a/src/algorithm/APRConverter.hpp +++ b/src/algorithm/APRConverter.hpp @@ -117,7 +117,7 @@ class APRConverter { PixelData local_scale_temp; // Used as down-sampled images for some averaging steps where it is useful to not lose precision, or get over-flow errors PixelData local_scale_temp2; - void applyParameters(APR& aAPR,APRParameters& aprParameters); + void applyParameters(APRParameters& aprParameters); template void computeL(APR& aAPR,PixelData& input_image); @@ -184,7 +184,7 @@ void APRConverter::get_apr_custom_grad_scale(APR& aAPR,PixelData::computeL(APR& 
aAPR,PixelData& input_image){ } template -void APRConverter::applyParameters(APR& aAPR,APRParameters& aprParameters) { +void APRConverter::applyParameters(APRParameters& aprParameters) { // // Apply the main parameters // @@ -265,39 +265,7 @@ void APRConverter::applyParameters(APR& aAPR,APRParameters& aprParame } fine_grained_timer.stop_timer(); - fine_grained_timer.start_timer("threshold"); - iComputeGradient.threshold_gradient(grad_temp,local_scale_temp2,aprParameters.Ip_th + bspline_offset); - fine_grained_timer.stop_timer(); - - float max_th = 60000; - -#ifdef HAVE_OPENMP -#pragma omp parallel for default(shared) -#endif - for (size_t i = 0; i < grad_temp.mesh.size(); ++i) { - - float rescaled = local_scale_temp.mesh[i]; - if (rescaled < aprParameters.sigma_th) { - rescaled = (rescaled < aprParameters.sigma_th_max) ? max_th : par.sigma_th; - local_scale_temp.mesh[i] = rescaled; - } - } - -#ifdef HAVE_LIBTIFF - if(par.output_steps) { - TiffUtils::saveMeshAsTiff(par.output_dir + "local_intensity_scale_rescaled.tif", local_scale_temp); - } -#endif - -#ifdef HAVE_OPENMP -#pragma omp parallel for default(shared) -#endif - for (size_t i = 0; i < grad_temp.mesh.size(); ++i) { - - if(grad_temp.mesh[i] < aprParameters.grad_th){ - grad_temp.mesh[i] = 0; - } - } + iComputeGradient.applyParameters(grad_temp, local_scale_temp, local_scale_temp2, aprParameters, bspline_offset); } @@ -405,7 +373,7 @@ inline bool APRConverter::get_lrf(APR &aAPR, PixelData& input_imag template inline bool APRConverter::get_ds(APR &aAPR) { - applyParameters(aAPR,par); + applyParameters(par); aAPR.parameters = par; solveForAPR(aAPR); @@ -426,104 +394,45 @@ inline bool APRConverter::get_ds(APR &aAPR) { */ template template inline bool APRConverter::get_apr_cuda(APR &aAPR, PixelData& input_image) { - if (!initPipelineAPR(aAPR, input_image)) return false; + if (!initPipelineAPR(aAPR, input_image)) return false; initPipelineMemory(input_image.y_num, input_image.x_num, input_image.z_num); - 
method_timer.start_timer("compute_gradient_magnitude_using_bsplines and local instensity scale CUDA"); - APRTimer t(true); - APRTimer d(true); - t.start_timer(" =========== ALL"); - { - - computation_timer.start_timer("init_mem"); - PixelData image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image) - - ///////////////////////////////// - /// Pipeline - //////////////////////// - // offset image by factor (this is required if there are zero areas in the background with - // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) - // Warning both of these could result in over-flow! - - if (std::is_same::value) { - bspline_offset = 100; - image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); - } else if (std::is_same::value) { - bspline_offset = 5; - image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); - } else { - image_temp.copyFromMesh(input_image); - } - - computation_timer.stop_timer(); - - std::vector> gpts; - - int numOfStreams = 1; - int repetitionsPerStream = 1; - - computation_timer.start_timer("compute_L"); - // Create streams and send initial task to do - for (int i = 0; i < numOfStreams; ++i) { - gpts.emplace_back(GpuProcessingTask(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max())); - gpts.back().sendDataToGpu(); - gpts.back().processOnGpu(); - } - computation_timer.stop_timer(); - - - for (int i = 0; i < numOfStreams * repetitionsPerStream; ++i) { - int c = i % numOfStreams; - - computation_timer.start_timer("apply_parameters"); - // get data from previous task - gpts[c].getDataFromGpu(); - - computation_timer.stop_timer(); - - // in theory we get new data and send them to task - if (i < numOfStreams * (repetitionsPerStream - 1)) { - gpts[c].sendDataToGpu(); - 
gpts[c].processOnGpu(); - } - - // Postprocess on CPU - std::cout << "--------- start CPU processing ---------- " << i << std::endl; - - computation_timer.start_timer("solve_for_apr"); - iPullingScheme.initialize_particle_cell_tree(aAPR.aprInfo); - - PixelData lst(local_scale_temp, true); - -#ifdef HAVE_LIBTIFF - if (par.output_steps){ - TiffUtils::saveMeshAsTiff(par.output_dir + "local_intensity_scale_step.tif", lst); - } -#endif + computation_timer.start_timer("init_mem"); + PixelData image_temp(input_image, false /* don't copy */, true /* pinned memory */); // global image variable useful for passing between methods, or re-using memory (should be the only full sized copy of the image) -#ifdef HAVE_LIBTIFF - if (par.output_steps){ - TiffUtils::saveMeshAsTiff(par.output_dir + "gradient_step.tif", grad_temp); - } -#endif + ///////////////////////////////// + /// Pipeline + //////////////////////// + // offset image by factor (this is required if there are zero areas in the background with + // uint16_t and uint8_t images, as the Bspline co-efficients otherwise may be negative!) + // Warning both of these could result in over-flow! 
- iLocalParticleSet.get_local_particle_cell_set(iPullingScheme,lst, local_scale_temp2,par); + if (std::is_same::value) { + bspline_offset = 100; + image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); + } else if (std::is_same::value) { + bspline_offset = 5; + image_temp.copyFromMeshWithUnaryOp(input_image, [=](const auto &a) { return (a + bspline_offset); }); + } else { + image_temp.copyFromMesh(input_image); + } - iPullingScheme.pulling_scheme_main(); + GpuProcessingTask gpt(image_temp, local_scale_temp, par, bspline_offset, aAPR.level_max()); + gpt.sendDataToGpu(); + gpt.processOnGpu(); + auto linearAccessGpu = gpt.getDataFromGpu(); - computation_timer.stop_timer(); + aAPR.aprInfo.total_number_particles = linearAccessGpu.y_vec.size(); - computation_timer.start_timer("generate_data_structures"); - generateDatastructures(aAPR); - computation_timer.stop_timer(); - } - std::cout << "Total n ENDED" << std::endl; + // generateDatastructures(aAPR) for linearAcceess for CUDA + aAPR.linearAccess.y_vec.copy(linearAccessGpu.y_vec); + aAPR.linearAccess.xz_end_vec.copy(linearAccessGpu.xz_end_vec); + aAPR.linearAccess.level_xz_vec.copy(linearAccessGpu.level_xz_vec); + aAPR.apr_initialized = true; - } - t.stop_timer(); - method_timer.stop_timer(); + std::cout << "CUDA pipeline finished!\n"; return true; } @@ -565,7 +474,7 @@ inline bool APRConverter::get_apr_cpu(APR &aAPR, PixelData &input_ method_timer.stop_timer(); } - applyParameters(aAPR,par); + applyParameters(par); computation_timer.stop_timer(); @@ -597,7 +506,7 @@ template template inline bool APRConverter::get_apr(APR &aAPR, PixelData &input_image) { // TODO: CUDA pipeline is temporarily turned off and CPU version is always chosen. // After revising a CUDA pipeline remove "#if true // " part. 
-#if true // #ifndef APR_USE_CUDA +#ifndef APR_USE_CUDA return get_apr_cpu(aAPR, input_image); #else return get_apr_cuda(aAPR, input_image); diff --git a/src/algorithm/ComputeGradient.hpp b/src/algorithm/ComputeGradient.hpp index ee5aeec8..6b682fdf 100644 --- a/src/algorithm/ComputeGradient.hpp +++ b/src/algorithm/ComputeGradient.hpp @@ -38,6 +38,35 @@ class ComputeGradient { template void calc_inv_bspline_z(PixelData &input); + template + void applyParameters(PixelData &grad_temp, PixelData &local_scale_temp, PixelData &local_scale_temp2, APRParameters &aprParameters, float bspline_offset) { + threshold_gradient(grad_temp,local_scale_temp2,aprParameters.Ip_th + bspline_offset); + + float max_th = 60000; + +#ifdef HAVE_OPENMP +#pragma omp parallel for default(shared) +#endif + for (size_t i = 0; i < grad_temp.mesh.size(); ++i) { + + float rescaled = local_scale_temp.mesh[i]; + if (rescaled < aprParameters.sigma_th) { + rescaled = (rescaled < aprParameters.sigma_th_max) ? max_th : aprParameters.sigma_th; + local_scale_temp.mesh[i] = rescaled; + } + } + +#ifdef HAVE_OPENMP +#pragma omp parallel for default(shared) +#endif + for (size_t i = 0; i < grad_temp.mesh.size(); ++i) { + + if(grad_temp.mesh[i] < aprParameters.grad_th){ + grad_temp.mesh[i] = 0; + } + } + } + struct three_temps { float temp_1, temp_2, temp_3; }; diff --git a/src/algorithm/ComputeGradientCuda.cu b/src/algorithm/ComputeGradientCuda.cu index 14d1d5d0..c4f0e849 100644 --- a/src/algorithm/ComputeGradientCuda.cu +++ b/src/algorithm/ComputeGradientCuda.cu @@ -57,6 +57,7 @@ namespace { } BsplineParams prepareBsplineStuff(size_t dimLen, float lambda, float tol, int maxFilterLen = -1) { + // Recursive Filter Implimentation for Smoothing BSplines // B-Spline Signal Processing: Part II - Efficient Design and Applications, Unser 1993 @@ -79,8 +80,8 @@ namespace { const float norm_factor = powf((1 - 2.0 * rho * cosf(omg) + powf(rho, 2)), 2); - //std::cout << std::fixed << std::setprecision(9) << "GPU: xi=" << 
xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 - // << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << std::endl; +// std::cout << std::fixed << std::setprecision(9) << "GPU: xi=" << xi << " rho=" << rho << " omg=" << omg << " gamma=" << gamma << " b1=" << b1 +// << " b2=" << b2 << " k0=" << k0 << " minLen=" << minLen << " norm_factor=" << norm_factor << " lambda=" << lambda << " tol=" << tol << std::endl; // ------- Calculating boundary conditions @@ -169,18 +170,18 @@ void getGradientCuda(const PixelData &image, PixelData &local_sc // TODO: Used PixelDataDim in all methods below and change input parameter from image to imageDim - runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream); - runBsplineXdir(cudaImage, image.getDimension(), px, aStream); - runBsplineZdir(cudaImage, image.getDimension(), pz, aStream); + if (image.y_num > 2) runBsplineYdir(cudaImage, image.getDimension(), py, boundary, aStream); + if (image.x_num > 2) runBsplineXdir(cudaImage, image.getDimension(), px, aStream); + if (image.z_num > 2) runBsplineZdir(cudaImage, image.getDimension(), pz, aStream); runKernelGradient(cudaImage, cudaGrad, image.getDimension(), local_scale_temp.getDimension(), par.dx, par.dy, par.dz, aStream); runDownsampleMean(cudaImage, cudalocal_scale_temp, image.x_num, image.y_num, image.z_num, aStream); - runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - runInvBsplineXdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); - runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + if (image.y_num > 2) runInvBsplineYdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + if (image.x_num > 2) runInvBsplineXdir(cudalocal_scale_temp, 
local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); + if (image.z_num > 2) runInvBsplineZdir(cudalocal_scale_temp, local_scale_temp.x_num, local_scale_temp.y_num, local_scale_temp.z_num, aStream); } class CurrentTime { @@ -202,6 +203,49 @@ public: }; +/** + * Thresholds output basing on input values. When input is <= thresholdLevel then output is set to 0 and is not changed otherwise. + * @param input + * @param output + * @param length - len of input/output arrays + * @param thresholdLevel + */ +template +__global__ void threshold(const T *input, S *output, size_t length, float thresholdLevel) { + size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; + if (idx < length) { + if (input[idx] <= thresholdLevel) { output[idx] = 0; } + } +} + +template +void runThreshold(ImgType *cudaImage, T *cudaGrad, size_t x_num, size_t y_num, size_t z_num, float Ip_th, cudaStream_t aStream) { + dim3 threadsPerBlock(64); + dim3 numBlocks((x_num * y_num * z_num + threadsPerBlock.x - 1)/threadsPerBlock.x); + threshold<<>>(cudaImage, cudaGrad, x_num * y_num * z_num, Ip_th); +}; + +template +__global__ void rescaleAndThreshold(T *data, size_t len, float sigmaThreshold, float sigmaThresholdMax) { + const float max_th = 60000.0; + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (idx < len) { + float rescaled = data[idx]; + if (rescaled < sigmaThreshold) { + rescaled = (rescaled < sigmaThresholdMax) ? max_th : sigmaThreshold; + } + data[idx] = rescaled; + } +} + +template +void runRescaleAndThreshold(T *data, size_t len, float sigma, float sigmaMax, cudaStream_t aStream) { + dim3 threadsPerBlock(64); + dim3 numBlocks((len + threadsPerBlock.x - 1) / threadsPerBlock.x); + rescaleAndThreshold <<< numBlocks, threadsPerBlock, 0, aStream >>> (data, len, sigma, sigmaMax); +} + + template template class GpuProcessingTask::GpuProcessingTaskImpl { @@ -264,11 +308,11 @@ public: iMaxLevel(maxLevel), // TODO: This is wrong and done only for compile. 
BsplineParams has to be computed seperately for each dimension. // Should be fixed when other parts of pipeline are ready. - params(prepareBsplineStuff((size_t)inputImage.x_num, parameters.lambda, tolerance)), - bc1(params.bc1.get(), params.k0, iStream), - bc2(params.bc2.get(), params.k0, iStream), - bc3(params.bc3.get(), params.k0, iStream), - bc4(params.bc4.get(), params.k0, iStream), +// params(prepareBsplineStuff((size_t)inputImage.x_num, parameters.lambda, tolerance)), +// bc1(params.bc1.get(), params.k0, iStream), +// bc2(params.bc2.get(), params.k0, iStream), +// bc3(params.bc3.get(), params.k0, iStream), +// bc4(params.bc4.get(), params.k0, iStream), boundaryLen{(2 /*two first elements*/ + 2 /* two last elements */) * (size_t)inputImage.x_num * (size_t)inputImage.z_num}, boundary{nullptr, boundaryLen, iStream}, pctc(iAprInfo, iStream), @@ -317,6 +361,13 @@ public: splineCudaX, splineCudaY, splineCudaZ, boundary.get(), iBsplineOffset, iParameters, iStream); runLocalIntensityScalePipeline(iCpuLevels, iParameters, local_scale_temp.get(), local_scale_temp2.get(), iStream); + + // Apply parameters from APRConverter: + runThreshold(local_scale_temp2.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.Ip_th + iBsplineOffset, iStream); + runRescaleAndThreshold(local_scale_temp.get(), iCpuLevels.mesh.size(), iParameters.sigma_th, iParameters.sigma_th_max, iStream); + runThreshold(gradient.get(), gradient.get(), iCpuLevels.x_num, iCpuLevels.y_num, iCpuLevels.z_num, iParameters.grad_th, iStream); + // TODO: automatic parameters are not implemented for GPU pipeline (yet) + float min_dim = std::min(iParameters.dy, std::min(iParameters.dx, iParameters.dz)); float level_factor = pow(2, iMaxLevel) * min_dim; const float mult_const = level_factor/iParameters.rel_error; diff --git a/test/FullPipelineCudaTest.cpp b/test/FullPipelineCudaTest.cpp index 8f29141b..913b7e09 100644 --- a/test/FullPipelineCudaTest.cpp +++ 
b/test/FullPipelineCudaTest.cpp @@ -280,11 +280,6 @@ namespace { TEST(ComputeThreshold, FULL_PIPELINE_TEST_CPU_vs_GpuProcessingTask) { APRTimer timer(true); - // TODO: This tets fails if dim of input image is smaller than ~8 (not sure in which direction yet) - // It fails for {4,4,3} for sure and surprisingly only for mesh with blob inside... - // Investigate why it fails while it works nicely in tests above (difference must be somewhere in GpuProcessingTask). - - // Generate random mesh of two sizes very small and reasonable large to catch all possible computation errors using ImageType = float; constexpr PixelDataDim dim1{4, 4, 3}; @@ -320,6 +315,8 @@ namespace { par.dz = 1; par.neighborhood_optimization = true; + float bspline_offset = 0; + GenInfo aprInfo(input_image.getDimension()); GenInfo giGpu(input_image.getDimension()); @@ -328,6 +325,7 @@ namespace { ComputeGradient().get_gradient(mCpuImage, grad_temp, local_scale_temp, par); LocalIntensityScale().get_local_intensity_scale(local_scale_temp, local_scale_temp2, par); LocalParticleCellSet lpcs = LocalParticleCellSet(); + ComputeGradient().applyParameters(grad_temp, local_scale_temp, local_scale_temp2, par, bspline_offset); lpcs.computeLevels(grad_temp, local_scale_temp, maxLevel, par.rel_error, par.dx, par.dy, par.dz); PullingScheme ps; ps.initialize_particle_cell_tree(aprInfo); @@ -341,7 +339,7 @@ namespace { // Calculate pipeline on GPU timer.start_timer(">>>>>>>>>>>>>>>>> GPU PIPELINE"); - GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, 0, maxLevel); + GpuProcessingTask gpt(mGpuImage, local_scale_temp_GPU, par, bspline_offset, maxLevel); gpt.sendDataToGpu(); gpt.processOnGpu(); auto linearAccessGpu = gpt.getDataFromGpu(); diff --git a/test/LinearAccessCudaTest.cpp b/test/LinearAccessCudaTest.cpp index 84cf8730..eb91e7bd 100644 --- a/test/LinearAccessCudaTest.cpp +++ b/test/LinearAccessCudaTest.cpp @@ -97,160 +97,163 @@ namespace { } -TEST(LinearAccessCudaTest, 
DeleteMeAfterDevelopment_fullAprPipeline) { - // TODO: delete me after development - // Full 'get apr' pipeline to test imp. on different stages - // Useful during debugging and can be removed once finished - - // Prepare input data (image) - int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; - // PS input values = 5 0 0 0 0 0 0 0 - -// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; -// PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0}; - - int len = sizeof(values)/sizeof(int); - PixelData data(len, 1, 1); - initFromZYXarray(data, values); - std::cout << "----- Input image:\n"; - data.printMeshT(3, 1); - - // Produce APR - APR apr; - APRConverter aprConverter; - aprConverter.par.rel_error = 0.1; - aprConverter.par.lambda = 0.1; - aprConverter.par.sigma_th = 0.0001; - aprConverter.par.neighborhood_optimization = true; - aprConverter.get_apr(apr, data); - - // Print information about APR and all particles - std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; - for (int l = apr.level_min(); l <= apr.level_max(); ++l) { - std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; - } - std::cout << "APR particles z x y level:\n"; - auto it = apr.iterator(); - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; - } - } - } - } - std::cout << std::endl; - - // Sample input - ParticleData particleIntensities; - particleIntensities.sample_image(apr, data); - - // Reconstruct image from particles - PixelData reconstructImg; - APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); - std::cout << "----- Reconstructed image:"< levelImg; - 
APRReconstruction::reconstruct_level(apr, levelImg); - std::cout << "----- Image levels:" << std::endl; - levelImg.printMeshT(3, 1); - - // Show intensities and levels of each particle - std::cout << "----- Particle intensities:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - particleIntensities.fill_with_levels(apr); - - std::cout << "----- Particle levels:\n"; - for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; - std::cout << std::endl; - - // Show some general information about generated APR - double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); - std::cout << std::endl; - std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; - std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; -} - - -TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_PS) { - // TODO: delete me after development - // Runs PS to test imp. on different stages - // Useful during debugging and can be removed once finished -// int values[] = {0,0,0,5, 0,0,0,0}; +// TODO: There are still problems with computing of small (like 1D images in pipeline) +// belows test can be used to trigger those errors - should be fixed + +//TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_fullAprPipeline) { +// // TODO: delete me after development +// // Full 'get apr' pipeline to test imp. 
on different stages +// // Useful during debugging and can be removed once finished +// +// // Prepare input data (image) +// int values[] = {9,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; +// // PS input values = 5 0 0 0 0 0 0 0 +// +//// int values[] = {3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 3,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }; +//// PullingScheme input values (local_scale_temp) for above 'image' = {6 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0}; +// // int len = sizeof(values)/sizeof(int); - - PixelData levels(8, 1, 1, 0); - levels(5,0,0) = 1; - -// initFromZYXarray(levels, values); - std::cout << "---------------\n"; - levels.printMeshT(3, 1); - std::cout << "---------------\n"; - - GenInfo gi; - const PixelDataDim dim = levels.getDimension(); - std::cout << "Levels dim: " << dim << std::endl; - gi.init(dim.y * 2, dim.x * 1, dim.z * 1); // time two in y-direction since PS container is downsized. - std::cout << gi << std::endl; - - APRTimer t(false); - - t.start_timer("PS1"); - PullingScheme ps; - ps.initialize_particle_cell_tree(gi); - int l_max = gi.l_max - 1; - int l_min = gi.l_min; - std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; - - fillPS(ps, levels); - - std::cout << "---------- Filled PS tree\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "---------------\n"; - - ps.pulling_scheme_main(); - t.stop_timer(); - - // Useful during debugging and can be removed once finished - std::cout << "----------PS:\n"; - printParticleCellTree(ps.getParticleCellTree()); - std::cout << "-------------\n"; - - LinearAccess linearAccess; - linearAccess.genInfo = &gi; - APRParameters par; - par.neighborhood_optimization = true; - linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); - - std::cout << gi << std::endl; - auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << 
std::endl; }; - prt(linearAccess.y_vec); - prt(linearAccess.xz_end_vec); - prt(linearAccess.level_xz_vec); - - LinearIterator it(linearAccess, gi); - for (int l = 0; l <= 3; l++) { - std::cout << it.particles_level_begin(l) << " " << it.particles_level_end(l) << std::endl; - } - std::cout << "NumOfParticles: " << gi.total_number_particles << std::endl; - - std::cout << "===========================\n"; - for (int level = it.level_min(); level <= it.level_max(); ++level) { - for (int z = 0; z < it.z_num(level); z++) { - for (int x = 0; x < it.x_num(level); ++x) { - for (it.begin(level, z, x); it < it.end(); it++) { - std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; - } - } - } - } - std::cout << std::endl; -} +// PixelData data(len, 1, 1); +// initFromZYXarray(data, values); +// std::cout << "----- Input image:\n"; +// data.printMeshT(3, 1); +// +// // Produce APR +// APR apr; +// APRConverter aprConverter; +// aprConverter.par.rel_error = 0.1; +// aprConverter.par.lambda = 0.1; +// aprConverter.par.sigma_th = 0.0001; +// aprConverter.par.neighborhood_optimization = true; +// aprConverter.get_apr(apr, data); +// +// // Print information about APR and all particles +// std::cout << "APR level min/max: " << apr.level_max() << "/" << apr.level_min() << std::endl; +// for (int l = apr.level_min(); l <= apr.level_max(); ++l) { +// std::cout << " level[" << l << "] size: " << apr.level_size(l) << std::endl; +// } +// std::cout << "APR particles z x y level:\n"; +// auto it = apr.iterator(); +// for (int level = it.level_min(); level <= it.level_max(); ++level) { +// for (int z = 0; z < it.z_num(level); z++) { +// for (int x = 0; x < it.x_num(level); ++x) { +// for (it.begin(level, z, x); it < it.end(); it++) { +// std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; +// } +// } +// } +// } +// std::cout << std::endl; +// +// // Sample input +// ParticleData particleIntensities; +// 
particleIntensities.sample_image(apr, data); +// +// // Reconstruct image from particles +// PixelData reconstructImg; +// APRReconstruction::reconstruct_constant(apr, reconstructImg, particleIntensities); +// std::cout << "----- Reconstructed image:"< levelImg; +// APRReconstruction::reconstruct_level(apr, levelImg); +// std::cout << "----- Image levels:" << std::endl; +// levelImg.printMeshT(3, 1); +// +// // Show intensities and levels of each particle +// std::cout << "----- Particle intensities:\n"; +// for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; +// std::cout << std::endl; +// +// particleIntensities.fill_with_levels(apr); +// +// std::cout << "----- Particle levels:\n"; +// for (uint64_t i = 0; i < particleIntensities.size(); i++) std::cout << particleIntensities.data[i] << " "; +// std::cout << std::endl; +// +// // Show some general information about generated APR +// double computational_ratio = (1.0 * apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) / (1.0 * apr.total_number_particles()); +// std::cout << std::endl; +// std::cout << "#pixels: " << (apr.org_dims(0) * apr.org_dims(1) * apr.org_dims(2)) << " #particles: " << (apr.total_number_particles()) << std::endl; +// std::cout << "Computational Ratio (Pixels/Particles): " << std::setprecision(2) << computational_ratio << std::endl; +//} + + +//TEST(LinearAccessCudaTest, DeleteMeAfterDevelopment_PS) { +// // TODO: delete me after development +// // Runs PS to test imp. 
on different stages +// // Useful during debugging and can be removed once finished +//// int values[] = {0,0,0,5, 0,0,0,0}; +//// int len = sizeof(values)/sizeof(int); +// +// PixelData levels(8, 1, 1, 0); +// levels(5,0,0) = 1; +// +//// initFromZYXarray(levels, values); +// std::cout << "---------------\n"; +// levels.printMeshT(3, 1); +// std::cout << "---------------\n"; +// +// GenInfo gi; +// const PixelDataDim dim = levels.getDimension(); +// std::cout << "Levels dim: " << dim << std::endl; +// gi.init(dim.y * 2, dim.x * 1, dim.z * 1); // time two in y-direction since PS container is downsized. +// std::cout << gi << std::endl; +// +// APRTimer t(false); +// +// t.start_timer("PS1"); +// PullingScheme ps; +// ps.initialize_particle_cell_tree(gi); +// int l_max = gi.l_max - 1; +// int l_min = gi.l_min; +// std::cout << "PS: max/max min/min" << l_max << " " << ps.pct_level_max() << " " << l_min << " " << ps.pct_level_min() << std::endl; +// +// fillPS(ps, levels); +// +// std::cout << "---------- Filled PS tree\n"; +// printParticleCellTree(ps.getParticleCellTree()); +// std::cout << "---------------\n"; +// +// ps.pulling_scheme_main(); +// t.stop_timer(); +// +// // Useful during debugging and can be removed once finished +// std::cout << "----------PS:\n"; +// printParticleCellTree(ps.getParticleCellTree()); +// std::cout << "-------------\n"; +// +// LinearAccess linearAccess; +// linearAccess.genInfo = &gi; +// APRParameters par; +// par.neighborhood_optimization = true; +// linearAccess.initialize_linear_structure(par, ps.getParticleCellTree()); +// +// std::cout << gi << std::endl; +// auto prt = [&](const auto& v){ std::cout << "size=" << v.size() << " data="; for (size_t i = 0; i < v.size(); i++) std::cout << v[i] << ", "; std::cout << std::endl; }; +// prt(linearAccess.y_vec); +// prt(linearAccess.xz_end_vec); +// prt(linearAccess.level_xz_vec); +// +// LinearIterator it(linearAccess, gi); +// for (int l = 0; l <= 3; l++) { +// std::cout << 
it.particles_level_begin(l) << " " << it.particles_level_end(l) << std::endl; +// } +// std::cout << "NumOfParticles: " << gi.total_number_particles << std::endl; +// +// std::cout << "===========================\n"; +// for (int level = it.level_min(); level <= it.level_max(); ++level) { +// for (int z = 0; z < it.z_num(level); z++) { +// for (int x = 0; x < it.x_num(level); ++x) { +// for (it.begin(level, z, x); it < it.end(); it++) { +// std::cout << " " << z << " " << x << " " << it.y() << " " << level << std::endl; +// } +// } +// } +// } +// std::cout << std::endl; +//} // ********************************************************************************************************************* // Tests of CUDA implementation of LinearAccess