
Merge pull request #21 from ANL-CESAR/hip
Addition of HIP port
jtramm authored Sep 20, 2021
2 parents 45643fc + e14fd59 commit ff7e9d4
Showing 28 changed files with 1,722 additions and 46 deletions.
6 changes: 6 additions & 0 deletions CHANGES.txt
@@ -1,3 +1,9 @@
=====================================================================
NEW IN VERSION 20
=====================================================================
- (Feature) Adds a HIP port. This port is closely derived from the
CUDA port, with only a few very minor changes.

=====================================================================
NEW IN VERSION 19
=====================================================================
3 changes: 3 additions & 0 deletions README.md
@@ -47,6 +47,9 @@ This version of XSBench is written in OpenCL, and can be used for CPU, GPU, FPGA
4. **XSBench/sycl**
This version of XSBench is written in SYCL, and can be used for CPU, GPU, FPGA, or other architectures that support OpenCL and SYCL. It was written with GPUs in mind, so if running on other architectures you may need to heavily re-optimize the code. You will also likely need to edit the makefile to supply the path to your SYCL compiler.

5. **XSBench/hip**
This version of XSBench is written in HIP for use with GPU architectures. It is derived from the CUDA version using an automatic conversion tool, with only a few small manual changes.

## Compilation

To compile XSBench with default settings, navigate to your selected source directory and use the following command:
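For context on the HIP port described above: converting the CUDA version to HIP is largely a mechanical renaming of runtime API calls, which is what an automatic hipify-style converter performs (the diff does not name the specific tool). A minimal sketch, using a hypothetical helper, of how a CUDA allocate-and-copy sequence maps onto HIP:

#include <hip/hip_runtime.h>

// Hypothetical helper illustrating the 1:1 cuda* -> hip* renaming applied by the converter.
double * copy_to_device( const double * h_buf, size_t n )
{
	double * d_buf = NULL;
	size_t sz = n * sizeof(double);
	hipMalloc( (void **) &d_buf, sz );                    // was: cudaMalloc( (void **) &d_buf, sz )
	hipMemcpy( d_buf, h_buf, sz, hipMemcpyHostToDevice ); // was: cudaMemcpy( ..., cudaMemcpyHostToDevice )
	hipDeviceSynchronize();                               // was: cudaDeviceSynchronize()
	return d_buf;
}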
2 changes: 1 addition & 1 deletion cuda/Main.cu
@@ -5,7 +5,7 @@ int main( int argc, char* argv[] )
// =====================================================================
// Initialization & Command Line Read-In
// =====================================================================
- int version = 19;
+ int version = 20;
int mype = 0;
double omp_start, omp_end;
int nprocs = 1;
6 changes: 3 additions & 3 deletions cuda/Makefile
@@ -6,7 +6,7 @@ COMPILER = nvidia
OPTIMIZE = yes
DEBUG = no
PROFILE = no
- SM_VERSION = 70
+ SM_VERSION = 80

#===============================================================================
# Program name & source code list
@@ -34,10 +34,10 @@ CFLAGS :=
# Linker Flags
LDFLAGS = -lm

- # Regular gcc Compiler
+ # NVIDIA Compiler
ifeq ($(COMPILER),nvidia)
CC = nvcc
- CFLAGS += -Xcompiler -Wall -Xcompiler -O3 -arch=sm_$(SM_VERSION) -std=c++11
+ CFLAGS += -Xcompiler -Wall -Xcompiler -O3 -arch=sm_$(SM_VERSION) -std=c++14
endif

# Debug Flags
4 changes: 2 additions & 2 deletions cuda/Simulation.cu
@@ -19,8 +19,8 @@ unsigned long long run_event_based_simulation_baseline(Inputs in, SimulationData
////////////////////////////////////////////////////////////////////////////////
if( mype == 0) printf("Running baseline event-based simulation...\n");

- int nthreads = 32;
- int nblocks = ceil( (double) in.lookups / 32.0);
+ int nthreads = 256;
+ int nblocks = ceil( (double) in.lookups / (double) nthreads);

xs_lookup_kernel_baseline<<<nblocks, nthreads>>>( in, GSD );
gpuErrchk( cudaPeekAtLastError() );
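The hunk above widens the thread block from 32 to 256 threads and derives the block count from the lookup count with a floating-point ceil. A minimal sketch of the equivalent integer ceil-division, a common alternative for the same computation (the helper name below is illustrative, not from the source):

#include <stdio.h>

// Integer ceil-division: matches ceil( (double) lookups / (double) nthreads )
// for positive inputs, without the round trip through floating point.
static int blocks_for( long lookups, int nthreads )
{
	return (int) ( (lookups + nthreads - 1) / nthreads );
}

int main( void )
{
	printf("%d\n", blocks_for(1000000, 256)); // prints 3907
	return 0;
}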
1 change: 1 addition & 0 deletions cuda/io.cu
@@ -61,6 +61,7 @@ int print_results( Inputs in, int mype, double runtime, int nprocs,
border_print();

// Print the results
printf("NOTE: Timings are estimated -- use nvprof/nsys/iprof/rocprof for formal analysis\n");
#ifdef MPI
printf("MPI ranks: %d\n", nprocs);
#endif
252 changes: 252 additions & 0 deletions hip/GridInit.hip
@@ -0,0 +1,252 @@
#include "XSbench_header.h"

// Moves all required data structures to the GPU's memory space
SimulationData move_simulation_data_to_device( Inputs in, int mype, SimulationData SD )
{
if(mype == 0) printf("Allocating and moving simulation data to GPU memory space...\n");

////////////////////////////////////////////////////////////////////////////////
// SUMMARY: Simulation Data Structure Manifest for "SD" Object
// Here we list all heap arrays (and lengths) in SD that would need to be
// offloaded manually if using an accelerator with a separate memory space
////////////////////////////////////////////////////////////////////////////////
// int * num_nucs; // Length = length_num_nucs;
// double * concs; // Length = length_concs
// int * mats; // Length = length_mats
// double * unionized_energy_array; // Length = length_unionized_energy_array
// int * index_grid; // Length = length_index_grid
// NuclideGridPoint * nuclide_grid; // Length = length_nuclide_grid
//
// Note: "unionized_energy_array" and "index_grid" can be of zero length
// depending on lookup method.
//
// Note: "Lengths" are given as the number of objects in the array, not the
// number of bytes.
////////////////////////////////////////////////////////////////////////////////
size_t sz;
size_t total_sz = 0;

// Shallow copy of CPU simulation data to GPU simulation data
SimulationData GSD = SD;

// Move data to GPU memory space
sz = GSD.length_num_nucs * sizeof(int);
gpuErrchk( hipMalloc((void **) &GSD.num_nucs, sz) );
gpuErrchk( hipMemcpy(GSD.num_nucs, SD.num_nucs, sz, hipMemcpyHostToDevice) );
total_sz += sz;

sz = GSD.length_concs * sizeof(double);
gpuErrchk( hipMalloc((void **) &GSD.concs, sz) );
gpuErrchk( hipMemcpy(GSD.concs, SD.concs, sz, hipMemcpyHostToDevice) );
total_sz += sz;

sz = GSD.length_mats * sizeof(int);
gpuErrchk( hipMalloc((void **) &GSD.mats, sz) );
gpuErrchk( hipMemcpy(GSD.mats, SD.mats, sz, hipMemcpyHostToDevice) );
total_sz += sz;

sz = GSD.length_unionized_energy_array * sizeof(double);
gpuErrchk( hipMalloc((void **) &GSD.unionized_energy_array, sz) );
gpuErrchk( hipMemcpy(GSD.unionized_energy_array, SD.unionized_energy_array, sz, hipMemcpyHostToDevice) );
total_sz += sz;

sz = GSD.length_index_grid * sizeof(int);
gpuErrchk( hipMalloc((void **) &GSD.index_grid, sz) );
gpuErrchk( hipMemcpy(GSD.index_grid, SD.index_grid, sz, hipMemcpyHostToDevice) );
total_sz += sz;

sz = GSD.length_nuclide_grid * sizeof(NuclideGridPoint);
gpuErrchk( hipMalloc((void **) &GSD.nuclide_grid, sz) );
gpuErrchk( hipMemcpy(GSD.nuclide_grid, SD.nuclide_grid, sz, hipMemcpyHostToDevice) );
total_sz += sz;

// Allocate verification array on device. This structure is not needed on CPU, so we don't
// have to copy anything over.
sz = in.lookups * sizeof(unsigned long);
gpuErrchk( hipMalloc((void **) &GSD.verification, sz) );
total_sz += sz;
GSD.length_verification = in.lookups;

// Synchronize
gpuErrchk( hipPeekAtLastError() );
gpuErrchk( hipDeviceSynchronize() );

if(mype == 0 ) printf("GPU Intialization complete. Allocated %.0lf MB of data on GPU.\n", total_sz/1024.0/1024.0 );

return GSD;

}

SimulationData grid_init_do_not_profile( Inputs in, int mype )
{
// Structure to hold all allocated simulation data arrays
SimulationData SD;

// Keep track of how much data we're allocating
size_t nbytes = 0;

// Set the initial seed value
uint64_t seed = 42;

////////////////////////////////////////////////////////////////////
// Initialize Nuclide Grids
////////////////////////////////////////////////////////////////////

if(mype == 0) printf("Intializing nuclide grids...\n");

// First, we need to initialize our nuclide grid. This comes in the form
// of a flattened 2D array that holds all the information we need to define
// the cross sections for all isotopes in the simulation.
// The grid is composed of "NuclideGridPoint" structures, which hold the
// energy level of the grid point and all associated XS data at that level.
// An array of structures (AOS) is used instead of
// a structure of arrays, as the grid points themselves are accessed in
// a random order, but all cross section interaction channels and the
// energy level are read whenever the gridpoint is accessed, meaning the
// AOS is more cache efficient.
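// For reference: the six fields initialized below imply that NuclideGridPoint
// is laid out as six contiguous doubles per grid point. The authoritative
// definition lives in XSbench_header.h (not part of this diff); the sketch
// here is only to make the AOS layout concrete:
//
//   typedef struct {
//       double energy;
//       double total_xs;
//       double elastic_xs;
//       double absorbtion_xs;
//       double fission_xs;
//       double nu_fission_xs;
//   } NuclideGridPoint;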

// Initialize Nuclide Grid
SD.length_nuclide_grid = in.n_isotopes * in.n_gridpoints;
SD.nuclide_grid = (NuclideGridPoint *) malloc( SD.length_nuclide_grid * sizeof(NuclideGridPoint));
assert(SD.nuclide_grid != NULL);
nbytes += SD.length_nuclide_grid * sizeof(NuclideGridPoint);
for( int i = 0; i < SD.length_nuclide_grid; i++ )
{
SD.nuclide_grid[i].energy = LCG_random_double(&seed);
SD.nuclide_grid[i].total_xs = LCG_random_double(&seed);
SD.nuclide_grid[i].elastic_xs = LCG_random_double(&seed);
SD.nuclide_grid[i].absorbtion_xs = LCG_random_double(&seed);
SD.nuclide_grid[i].fission_xs = LCG_random_double(&seed);
SD.nuclide_grid[i].nu_fission_xs = LCG_random_double(&seed);
}

// Sort so that each nuclide has data stored in ascending energy order.
for( int i = 0; i < in.n_isotopes; i++ )
qsort( &SD.nuclide_grid[i*in.n_gridpoints], in.n_gridpoints, sizeof(NuclideGridPoint), NGP_compare);

// error debug check
/*
for( int i = 0; i < in.n_isotopes; i++ )
{
printf("NUCLIDE %d ==============================\n", i);
for( int j = 0; j < in.n_gridpoints; j++ )
printf("E%d = %lf\n", j, SD.nuclide_grid[i * in.n_gridpoints + j].energy);
}
*/


////////////////////////////////////////////////////////////////////
// Initialize Acceleration Structure
////////////////////////////////////////////////////////////////////

if( in.grid_type == NUCLIDE )
{
SD.length_unionized_energy_array = 0;
SD.length_index_grid = 0;
}

if( in.grid_type == UNIONIZED )
{
if(mype == 0) printf("Intializing unionized grid...\n");

// Allocate space to hold the union of all nuclide energy data
SD.length_unionized_energy_array = in.n_isotopes * in.n_gridpoints;
SD.unionized_energy_array = (double *) malloc( SD.length_unionized_energy_array * sizeof(double));
assert(SD.unionized_energy_array != NULL );
nbytes += SD.length_unionized_energy_array * sizeof(double);

// Copy energy data over from the nuclide energy grid
for( int i = 0; i < SD.length_unionized_energy_array; i++ )
SD.unionized_energy_array[i] = SD.nuclide_grid[i].energy;

// Sort unionized energy array
qsort( SD.unionized_energy_array, SD.length_unionized_energy_array, sizeof(double), double_compare);

// Allocate space to hold the acceleration grid indices
SD.length_index_grid = SD.length_unionized_energy_array * in.n_isotopes;
SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int));
assert(SD.index_grid != NULL);
nbytes += SD.length_index_grid * sizeof(int);

// Generates the double indexing grid
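// idx_low[i] tracks the lower bounding index into isotope i's sorted grid,
// and energy_high[i] holds the energy of the grid point just above it.
// Because the unionized array contains every isotope's grid energies, each
// isotope's index advances at most once per unionized energy, so the whole
// table is filled in a single pass over the sorted unionized energies.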
int * idx_low = (int *) calloc( in.n_isotopes, sizeof(int));
assert(idx_low != NULL );
double * energy_high = (double *) malloc( in.n_isotopes * sizeof(double));
assert(energy_high != NULL );

for( int i = 0; i < in.n_isotopes; i++ )
energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + 1].energy;

for( long e = 0; e < SD.length_unionized_energy_array; e++ )
{
double unionized_energy = SD.unionized_energy_array[e];
for( long i = 0; i < in.n_isotopes; i++ )
{
if( unionized_energy < energy_high[i] )
SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
else if( idx_low[i] == in.n_gridpoints - 2 )
SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
else
{
idx_low[i]++;
SD.index_grid[e * in.n_isotopes + i] = idx_low[i];
energy_high[i] = SD.nuclide_grid[i * in.n_gridpoints + idx_low[i] + 1].energy;
}
}
}

free(idx_low);
free(energy_high);
}

if( in.grid_type == HASH )
{
if(mype == 0) printf("Intializing hash grid...\n");
SD.length_unionized_energy_array = 0;
SD.length_index_grid = in.hash_bins * in.n_isotopes;
SD.index_grid = (int *) malloc( SD.length_index_grid * sizeof(int));
assert(SD.index_grid != NULL);
nbytes += SD.length_index_grid * sizeof(int);

double du = 1.0 / in.hash_bins;

// For each energy level in the hash table
for( long e = 0; e < in.hash_bins; e++ )
{
double energy = e * du;

// We need to determine the bounding energy levels for all isotopes
for( long i = 0; i < in.n_isotopes; i++ )
{
SD.index_grid[e * in.n_isotopes + i] = grid_search_nuclide( in.n_gridpoints, energy, SD.nuclide_grid + i * in.n_gridpoints, 0, in.n_gridpoints-1);
}
}
}

////////////////////////////////////////////////////////////////////
// Initialize Materials and Concentrations
////////////////////////////////////////////////////////////////////
if(mype == 0) printf("Intializing material data...\n");

// Set the number of nuclides in each material
SD.num_nucs = load_num_nucs(in.n_isotopes);
SD.length_num_nucs = 12; // There are always 12 materials in XSBench

// Initialize the flattened 2D grid of material data. The grid holds
// a list of nuclide indices for each of the 12 material types. The
// grid is allocated as a full square grid, even though not all
// materials have the same number of nuclides.
SD.mats = load_mats(SD.num_nucs, in.n_isotopes, &SD.max_num_nucs);
SD.length_mats = SD.length_num_nucs * SD.max_num_nucs;

// Initialize the flattened 2D grid of nuclide concentration data. The grid holds
// a list of nuclide concentrations for each of the 12 material types. The
// grid is allocated as a full square grid, even though not all
// materials have the same number of nuclides.
SD.concs = load_concs(SD.num_nucs, SD.max_num_nucs);
SD.length_concs = SD.length_mats;

if(mype == 0) printf("Intialization complete. Allocated %.0lf MB of data on CPU.\n", nbytes/1024.0/1024.0 );

return SD;
}
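
For context, the two routines above are consumed by the host driver (Main.hip, which is not among the files shown in this excerpt). A minimal sketch, with a hypothetical wrapper name, of how they are presumably wired together:

#include "XSbench_header.h"

// Sketch only (not the actual Main.hip): build host-side data, then move it to the GPU.
// Kernel launch, verification, and cleanup are omitted.
int run_problem( Inputs in, int mype )
{
	// Build the nuclide grid, acceleration structure, and material data on the host
	SimulationData SD = grid_init_do_not_profile( in, mype );

	// Shallow-copy the structure and copy every heap array into GPU memory
	SimulationData GSD = move_simulation_data_to_device( in, mype, SD );

	// ... launch the cross-section lookup kernel with GSD ...
	return 0;
}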
