Make VCSBeam GPU-agnostic (#56)

* changes of cuda -> gpu macros in preparation for compilation on Setonix (still todo) * cuda -> gpu * cuC to gpuC (complex operations) * conversion complated (last changes done) * cuDoubleComplex -> gpuDoubleComplex * added #include "gpu_macros.h" * further Setonix compilation issues * corrections in compilation * correction * corrections * correction * corrections * changes * changes * corr * changes * corr * changes * corr * cudaCheckErrors moved to gpu_macros.h * correction * modifications for Setonix * cleanup * moved back (mistake) * added back * corrected back ? * .cu -> .cpp * correction * CXX added * mwalib added * update * added MPI module * test * test * corrections * back * MPI linking added * error message added * temporary comments * correction * changes * depracation fixed * back * removed inlcude * removed include * cu -> gpu * HDF5 beam file added * correction * LIBPAL fixed? * temporary * linking hyper beam added * temporary comments * gpu * gpu * gpu * cu -> cpp * cu -> cpp * cu -> gpu * cudaMallocHost issue * correction * include added * getting there * update * added from setonix * compilation on Setonix corrections after Bradley's merge to the most recent version in main * improvement in CMake - added define __HIPCC__ * corrections for compilation on NVIDIA, build_nvidia.sh added * +x added * correction * cast to (char*) added to fix compilation on NVIDIA * first attempt to rectify cmake build for HIP/CUDA - incomplete! * Updates CMake recipes to compile in CUDA and HIP environments (#52) * Refactoring of CMakeLists.txt - moving contents of src/CMakeLists.txt into CMakeLists.txt - no need for two of them. - moved some instructions up (option's and find_package's) for better readability and functionality. - Why do we need to define __HIP_PLATFORM_AMD__? Maybe because these are C files! - need to figure out how to configure vcsbeam.h The current version configures correctly with the following line: `cmake -DUSE_HIP=ON -DCMAKE_CXX_COMPILER=hipcc -DCMAKE_C_COMPILER=hipcc ..` * Progess on CMakeLists.txt file.. * VCSBeam compiles on Setonix. * Removes unnecessary modification. * Modify and test the CMakeLists.txt file so that it works on NVIDIA architecture too. * Update CMakeLists.txt Added some extra comments to the file for future reference --------- Co-authored-by: Bradley Meyers <[email protected]> * Update GPU macros (#53) * Refactoring of CMakeLists.txt - moving contents of src/CMakeLists.txt into CMakeLists.txt - no need for two of them. - moved some instructions up (option's and find_package's) for better readability and functionality. - Why do we need to define __HIP_PLATFORM_AMD__? Maybe because these are C files! - need to figure out how to configure vcsbeam.h The current version configures correctly with the following line: `cmake -DUSE_HIP=ON -DCMAKE_CXX_COMPILER=hipcc -DCMAKE_C_COMPILER=hipcc ..` * Progess on CMakeLists.txt file.. * VCSBeam compiles on Setonix. * Removes unnecessary modification. * Updates gpu macros - substitutes deprecated calls - removes useless error checks. gpu* macros have built-in error checking. * Removes more duplicate error checking. * Fixes minor merging issues. * Improves build script using bash-utils (#55) - Use rocm/5.7.3 to avoid bug in rocm/5.2.3 - Use the bash-utils/devel module to write a build script to easily install software on Setonix - Notice that MWA beam model is now part of Pawsey reference datasets (/scratch/references/mwa/beam-models) * removed .cu files, as the kernels are now kept within the corresponding .cpp files for HIP compatability * fixed file permission oddity * added some extra comments for future use * small tweaks to output messaging --------- Co-authored-by: Marcin Sokolowski <[email protected]> Co-authored-by: Cristian Di Pietrantonio <[email protected]>
CIRA-Pulsars-and-Transients-Group · Sep 3, 2024 · dc94d9f · dc94d9f
1 parent eb170b2
commit dc94d9f
Show file tree

Hide file tree

Showing 23 changed files with 918 additions and 668 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,49 +1,158 @@
-cmake_minimum_required (VERSION 3.15)
-#cmake_policy(SET CMP0012 NEW)
-#cmake_policy(SET CMP0048 NEW)
+cmake_minimum_required(VERSION 3.15)
+project(vcsbeam)
 
-# Set up version number
+# Define CMake options to choose the GPU flavour
+# TODO: Is there a way to make sure these are required and mutually exclusive?
+option(USE_CUDA "Compile the code with NVIDIA GPU support." OFF)
+option(USE_HIP "Compile the code with AMD GPU support." OFF)
+
+# Find packages needed
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
+find_package(MWALIB REQUIRED)
+find_package(MPI REQUIRED)
+find_package(PAL REQUIRED)
+find_package(CFITSIO REQUIRED)
+find_package(PSRFITS_UTILS REQUIRED)
+find_package(HYPERBEAM REQUIRED)
+find_package(VDIFIO REQUIRED)
+find_package(XGPU)
+
+# Enable the support and relevant compiliation flags/config for the selected GPU language
+if(USE_CUDA)
+    enable_language(CUDA C CXX)
+    set(CMAKE_CUDA_ARCHITECTURES "native") 
+    add_definitions("-D__NVCC__")
+    set(GPU_FFTLIB cufft)
+    message(STATUS "CUDA generation enabled.")
+    message(NOTICE "Using the available CUDA 'native' architecture.")
+elseif(USE_HIP)
+    # NOTE: The enable_language(HIP ...) macro is only available from CMake 3.21.
+    # For versions < 3.21, comment out the enable_language macro and simply
+    # specify the 
+    #     -DCMAKE_CXX_COMPILER=hipcc
+    # as an option when executing cmake on the command line.
+    # enable_language(HIP CXX)
+    # TODO: Revisit this - does the enable_language option really do what we need?
+    set(GPU_FFTLIB hipfft)
+    add_definitions("-D__HIP_PLATFORM_AMD__ -D__HIPCC__")
+    message(STATUS "HIP generation enabled.")
+else()
+    message(FATAL_ERROR "Either USE_CUDA=ON or USE_HIP=ON must be specified.")
+endif()
+
+# Set up version number from Git
 execute_process(
   COMMAND bash -c "git describe --tags --long | sed 's/-/./' | sed 's/-g/_/'"
   WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
   OUTPUT_VARIABLE VCSBEAM_VERSION
   OUTPUT_STRIP_TRAILING_WHITESPACE
 )
+message(STATUS "VCSBeam ${VCSBEAM_VERSION}")
 
-message("VCSBeam ${VCSBEAM_VERSION}")
 
+# Ensure that runtime files have an installation path
 if(NOT RUNTIME_DIR)
     set(RUNTIME_DIR ${CMAKE_INSTALL_PREFIX}/bin/vcsbeam_runtime)
-endif(NOT RUNTIME_DIR)
+endif()
+message(NOTICE "Runtime files will be installed in ${RUNTIME_DIR}")
 
-project (vcsbeam LANGUAGES C CUDA)
+# Collect the source files without GPU kernels
+file(GLOB vcsbeam_c_sources 
+    "src/ascii_header.c"
+    "src/performance.c"
+    "src/filter.c"
+    "src/jones.c"
+    "src/buffer.c"
+    "src/calibration.c"
+    "src/metadata.c"
+)
 
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
+# Collect the source files _with_ GPU kernels
+file(GLOB vcsbeam_gpu_sources 
+    "src/form_beam.cpp"
+    "src/pfb.cpp"
+)
 
-find_package(CUDA REQUIRED)
-find_package(MWALIB REQUIRED)
-find_package(MPI)
-find_package(PAL)
-find_package(CFITSIO)
-find_package(PSRFITS_UTILS)
-find_package(HYPERBEAM)
-find_package(VDIFIO)
-find_package(XGPU)
+if(USE_CUDA)
+    # This sets the .cpp files, containing the kernels, as the targetted source files
+    # for the CUDA compiler (rather than the typical .cu extensions).
+    set_source_files_properties(${vcsbeam_gpu_sources} PROPERTIES LANGUAGE CUDA)
+endif()
+
+# Generate the core package library
+add_library(vcsbeam STATIC
+    ${vcsbeam_c_sources}
+    ${vcsbeam_gpu_sources}
+)
+
+# Various gates defining which source files should be available based on
+# which dependencies were found on the system.
+if(MPI_FOUND AND PAL_FOUND AND PSRFITS_UTILS_FOUND)
+    target_sources(vcsbeam PRIVATE "src/beam_psrfits.c")
+endif()
+
+if(PAL_FOUND)
+    target_sources(vcsbeam PRIVATE "src/geometry.c")
+endif()
+
+if(VDIFIO_FOUND)
+    target_sources(vcsbeam PRIVATE "src/beam_vdif.c")
+endif()
+
+if(HYPERBEAM_FOUND)
+    target_sources(vcsbeam PRIVATE "src/primary_beam.c")
+endif()
+
+# Define required components/places to look when compiling parts...
+target_include_directories(vcsbeam PUBLIC
+    ${PSRFITS_UTILS_INCLUDE_DIR}
+    ${VDIFIO_INCLUDE_DIR}
+    ${CFITSIO_INCLUDE_DIR}
+    ${PAL_INCLUDE_DIR}
+    ${CUDA_INCLUDE_DIRS}
+    ${HYPERBEAM_INCLUDE_DIR}
+    ${MWALIB_INCLUDE_DIR}
+    ${MPI_INCLUDE_PATH}
+    ${CMAKE_BINARY_DIR})
 
-configure_file(include/vcsbeam.h.in vcsbeam.h)
+target_link_libraries(vcsbeam
+    ${PSRFITS_UTILS_LIBRARY}
+    ${VDIFIO_LIBRARY}
+    ${CFITSIO_LIBRARY}
+    ${PAL_LIBRARY}
+    ${M_LIBRARY}
+    ${HYPERBEAM_LIB}
+    ${MWALIB_LIB}
+    ${MPI_C_LIBRARIES}
+    ${GPU_FFTLIB})
+
+# ... And where to install things at the end
+install(TARGETS vcsbeam
+    LIBRARY DESTINATION lib
+    PUBLIC_HEADER DESTINATION include)
+
+# Add files/directories to build with
+configure_file(include/vcsbeam.h.in ${CMAKE_BINARY_DIR}/vcsbeam.h)
+
+set_target_properties(vcsbeam
+    PROPERTIES PUBLIC_HEADER "vcsbeam.h"
+)
+
+# Add paths to hints for package finding and source compilation/linking
+include_directories("src/")
 
-add_subdirectory(src)
 add_subdirectory(app)
 add_subdirectory(utils)
 
 if(XGPU_FOUND AND CFITSIO_FOUND)
     add_subdirectory(offline_correlator)
-endif ()
+endif()
 
-message("Runtime files will be installed in ${RUNTIME_DIR}")
 
+# Install any necessary data files in the required location
 install(FILES pq_phase_correction.txt
     pfb_filter/MIRROR.dat
     pfb_filter/LSQ12.dat
     pfb_filter/FINEPFB.dat
     DESTINATION ${RUNTIME_DIR})
+
diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt
@@ -1,13 +1,11 @@
 # Fine PFB Offline
-
 # (Depends only on CUDA and MWALIB, which are global dependencies)
 add_executable(fine_pfb_offline fine_pfb_offline.c)
 target_link_libraries(fine_pfb_offline vcsbeam)
 target_include_directories(fine_pfb_offline PUBLIC ${CMAKE_BINARY_DIR})
 install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/fine_pfb_offline DESTINATION bin)
 
 # Tied array beamformer
-
 if (PAL_FOUND AND PSRFITS_UTILS_FOUND AND HYPERBEAM_FOUND AND VDIFIO_FOUND)
     add_executable(make_mwa_tied_array_beam make_mwa_tied_array_beam.c)
     target_link_libraries(make_mwa_tied_array_beam vcsbeam)
@@ -16,7 +14,6 @@ if (PAL_FOUND AND PSRFITS_UTILS_FOUND AND HYPERBEAM_FOUND AND VDIFIO_FOUND)
 endif ()
 
 # Incoherent beamformer
-
 if (PSRFITS_UTILS_FOUND)
     add_executable(make_mwa_incoh_beam make_mwa_incoh_beam.c)
     target_link_libraries(make_mwa_incoh_beam vcsbeam)
@@ -25,7 +22,6 @@ if (PSRFITS_UTILS_FOUND)
 endif ()
 
 # Primary beam response
-
 if (PAL_FOUND AND HYPERBEAM_FOUND)
     add_executable(mwa_track_primary_beam_response mwa_track_primary_beam_response.c)
     target_link_libraries(mwa_track_primary_beam_response vcsbeam)

diff --git a/app/make_mwa_incoh_beam.c b/app/make_mwa_incoh_beam.c
@@ -13,8 +13,8 @@
 
 // Non-standard dependencies
 #include <mwalib.h>
-#include <cuda_runtime.h>
 #include <mpi.h>
+#include "gpu_macros.h"
 
 // Local includes
 #include "vcsbeam.h"
@@ -90,14 +90,10 @@ int main(int argc, char **argv)
 
     allocate_input_output_arrays( (void **)&data, (void **)&d_data, data_size );
 
-    cudaMalloc( (void **)&d_incoh, incoh_size );
-    cudaCheckErrors( "cudaMalloc(d_incoh) failed" );
-    cudaMalloc( (void **)&d_offsets, nchans*sizeof(float) );
-    cudaCheckErrors( "cudaMalloc(d_offsets) failed" );
-    cudaMalloc( (void **)&d_scales,  nchans*sizeof(float) );
-    cudaCheckErrors( "cudaMalloc(d_scales) failed" );
-    cudaMalloc( (void **)&d_Iscaled, Iscaled_size );
-    cudaCheckErrors( "cudaMalloc(Iscaled) failed" );
+    gpuMalloc( (void **)&d_incoh, incoh_size );
+    gpuMalloc( (void **)&d_offsets, nchans*sizeof(float) );
+    gpuMalloc( (void **)&d_scales,  nchans*sizeof(float) );
+    gpuMalloc( (void **)&d_Iscaled, Iscaled_size );
 
     // Get pointing geometry information
     beam_geom beam_geom_vals;
@@ -179,14 +175,10 @@ int main(int argc, char **argv)
 
     free_input_output_arrays( data, d_data );
 
-    cudaFree( d_incoh );
-    cudaCheckErrors( "cudaFree(d_incoh) failed" );
-    cudaFree( d_offsets );
-    cudaCheckErrors( "cudaFree(d_offsets) failed" );
-    cudaFree( d_scales );
-    cudaCheckErrors( "cudaFree(d_scales) failed" );
-    cudaFree( d_Iscaled );
-    cudaCheckErrors( "cudaFree(d_Iscaled) failed" );
+    gpuFree( d_incoh );
+    gpuFree( d_offsets );
+    gpuFree( d_scales );
+    gpuFree( d_Iscaled );
 
     // Clean up memory associated with mwalib
     destroy_vcsbeam_context( vm );
@@ -343,7 +335,7 @@ void read_step( VoltageContext *vcs_context, uint64_t gps_second,
                 gps_second,
                 1,
                 coarse_chan_idx,
-                data,
+                (char*)data,
                 data_size,
                 error_message,
                 ERROR_MESSAGE_LEN ) != MWALIB_SUCCESS)

diff --git a/app/make_mwa_tied_array_beam.c b/app/make_mwa_tied_array_beam.c
@@ -18,6 +18,7 @@
 
 // Local includes
 #include "vcsbeam.h"
+#include "gpu_macros.h"
 
 #define MAX_COMMAND_LENGTH 1024
 
@@ -150,7 +151,7 @@ int main(int argc, char **argv)
     uintptr_t npols          = vm->obs_metadata->num_ant_pols;
     unsigned int nsamples    = vm->fine_sample_rate;
 
-    cuDoubleComplex  *data_buffer_fine;
+    gpuDoubleComplex  *data_buffer_fine;
 
     if (vm->do_inverse_pfb)
     {
@@ -339,8 +340,7 @@ int main(int argc, char **argv)
 
     if (vm->do_inverse_pfb)
     {
-        cudaFreeHost( data_buffer_vdif  );
-        cudaCheckErrors( "cudaFreeHost(data_buffer_vdif) failed" );
+        gpuHostFree( data_buffer_vdif  );
     }
 
     vmDestroyStatistics( vm );

diff --git a/app/mwa_tied_array_beam_psf.c b/app/mwa_tied_array_beam_psf.c
@@ -115,7 +115,7 @@ int main(int argc, char **argv)
     // Loop over RA
     int X_idx, Y_idx;
     double X, Y;
-    cuDoubleComplex *J = malloc( 4*sizeof(cuDoubleComplex) );
+    gpuDoubleComplex *J = (gpuDoubleComplex*)malloc( 4*sizeof(gpuDoubleComplex) );
     for (X_idx = 0; X_idx < opts.width_pixels; X_idx++)
     {
         X = X0 + X_idx*dX;

diff --git a/app/mwa_track_primary_beam_response.c b/app/mwa_track_primary_beam_response.c
@@ -86,7 +86,7 @@ int main(int argc, char **argv)
         sprintf( coord2, "ϕ" );
     }
 
-    cuDoubleComplex *J = malloc( 4*sizeof(cuDoubleComplex) ); // For the FEE beam
+    gpuDoubleComplex *J = (gpuDoubleComplex*)malloc( 4*sizeof(gpuDoubleComplex) ); // For the FEE beam
     vm.npointing = 1;
     vm.coarse_chan_idx = 0; // <-- just a dummy for initially setting up the primary beam struct
     vmCreatePrimaryBeam( &vm );
@@ -174,10 +174,10 @@ int main(int argc, char **argv)
                     IQUV[2],
                     IQUV[3],
                     array_factor,
-                    cuCreal( J[0] ), cuCimag( J[0] ),
-                    cuCreal( J[1] ), cuCimag( J[1] ),
-                    cuCreal( J[2] ), cuCimag( J[2] ),
-                    cuCreal( J[3] ), cuCimag( J[3] )
+                    gpuCreal( J[0] ), gpuCimag( J[0] ),
+                    gpuCreal( J[1] ), gpuCimag( J[1] ),
+                    gpuCreal( J[2] ), gpuCimag( J[2] ),
+                    gpuCreal( J[3] ), gpuCimag( J[3] )
                    );
 
         }

diff --git a/build_hip.sh b/build_hip.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# TO BUILD ON SETONIX -
+
+# First, you need to source the bash library.
+
+module load bash-utils
+source "${BASH_UTILS_DIR}/build_utils.sh"
+
+# Set the program name and versions, used to create the installation paths.
+PROGRAM_NAME=vcsbeam
+PROGRAM_VERSION=devel
+# the following function sets up the installation path according to the
+# cluster the script is running on and the first argument given. The argument
+# can be:
+# - "group": install the software in the group wide directory
+# - "user": install the software only for the current user
+# - "test": install the software in the current working directory
+process_build_script_input group
+
+# load all the modules required for the program to compile and run.
+# the following command also adds those module names in the modulefile
+# that this script will generate.
+
+# module use /software/projects/pawsey1045/setonix/2024.05/modules/zen3/gcc/12.2.0/
+# module_load module1/ver module2/ver ..
+module_load pal/0.9.8-yyskiux mwalib/1.3.3-qvtlpxn cfitsio/4.3.0 rocm/5.7.3 psrfits-utils/2023-10-08-ltewgrw vdifio/master-u6heigs hyperbeam/0.5.0-glmva5q
+
+module load cmake/3.27.7
+
+mkdir build
+cd build
+cmake -DUSE_HIP=ON -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR \
+    -DCMAKE_CXX_COMPILER=hipcc \
+    -DCMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3" \
+    -DHYPERBEAM_HDF5=/scratch/references/mwa/beam-models/mwa_full_embedded_element_pattern.h5 \
+    -DCMAKE_C_COMPILER=hipcc \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DPSRFITS_UTILS_ROOT_DIR=${PAWSEY_PSRFITS_UTILS_HOME} -DPAL_ROOT_DIR=${PAWSEY_PAL_HOME} ..
+
+make VERBOSE=1 -j 12
+make install
+create_modulefile
+
+# NOTE: Needs to be built on the node with the GPU available (for HIP).