diff --git a/CHANGES.txt b/CHANGES.txt
index bc07a9c..1ddea26 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,16 @@
+=====================================================================
+NEW IN VERSION 13
+=====================================================================
+- (Feature) Added HIP port of RSBench. This port is based closely
+  off the CUDA version, and was generated using an automated code
+  conversion utility with only a few manual changes required.
+
+- Fixed threads per block for CUDA/HIP/OpenCL to all use 256 threads.
+  Other models will select this value themselves, but it may be
+  worth testing configurations maually with those models as well.
+
+- Added a warning about GPU timers to output.
+
 =====================================================================
 NEW IN VERSION 12
 =====================================================================
diff --git a/README.md b/README.md
index be30937..7476af6 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,9 @@ This version of RSBench is written in OpenCL, and can be used for CPU, GPU, FPGA
 4. **RSBench/sycl**
 This version of RSBench is written in SYCL, and can be used for CPU, GPU, FPGA, or other architectures that support OpenCL and SYCL. It was written with GPUs in mind, so if running on other architectures you may need to heavily re-optimize the code. You will also likely need to edit the makefile to supply the path to your SYCL compiler.
 
+5. **RSBench/hip**
+This version of RSBench is written in HIP for use with GPU architectures. This version is derived from CUDA using an automatic conversion tool with only a few small manual changes.
+
 ## Compilation
 
 To compile RSBench with default settings, navigate to your selected source directory and use the following command:
diff --git a/cuda/Makefile b/cuda/Makefile
index 55294cc..78dc383 100644
--- a/cuda/Makefile
+++ b/cuda/Makefile
@@ -6,7 +6,7 @@ COMPILER    = nvidia
 OPTIMIZE    = yes
 DEBUG       = no
 PROFILE     = no
-SM_VERSION  = 37
+SM_VERSION  = 80
 
 #===============================================================================
 # Program name & source code list
@@ -31,10 +31,10 @@ obj = $(source:.cu=.o)
 # Standard Flags
 CFLAGS := 
 
-# Regular gcc Compiler
+# NVIDIA Compiler
 ifeq ($(COMPILER),nvidia)
   CC = nvcc
-  CFLAGS += -Xcompiler -Wall -Xcompiler -O3 -arch=sm_$(SM_VERSION) -std=c++11
+  CFLAGS += -Xcompiler -Wall -Xcompiler -O3 -arch=sm_$(SM_VERSION) -std=c++14
 endif
 
 # Linker Flags
diff --git a/cuda/io.cu b/cuda/io.cu
index 26f132f..e505ea0 100644
--- a/cuda/io.cu
+++ b/cuda/io.cu
@@ -288,6 +288,7 @@ void print_input_summary(Input input)
 
 int validate_and_print_results(Input input, double runtime, unsigned long vhash)
 {
+  printf("NOTE: Timings are estimated -- use nvprof/nsys/iprof/rocprof for formal analysis\n");
 	printf("Runtime:               %.3lf seconds\n", runtime);
 	int lookups = 0;
 	if( input.simulation_method == HISTORY_BASED )
diff --git a/cuda/main.cu b/cuda/main.cu
index f88244a..24dd288 100644
--- a/cuda/main.cu
+++ b/cuda/main.cu
@@ -6,7 +6,7 @@ int main(int argc, char * argv[])
 	// Initialization & Command Line Read-In
 	// =====================================================================
 
-	int version = 12;
+	int version = 13;
 	double start, stop;
 	
 	// Process CLI Fields
diff --git a/cuda/simulation.cu b/cuda/simulation.cu
index 3e049b9..504729a 100644
--- a/cuda/simulation.cu
+++ b/cuda/simulation.cu
@@ -19,8 +19,8 @@ void run_event_based_simulation(Input input, SimulationData GSD, unsigned long *
 	////////////////////////////////////////////////////////////////////////////////
 	printf("Running baseline event-based simulation on device...\n");
 
-	int nthreads = 32;
-	int nblocks = ceil( (double) input.lookups / 32.0);
+	int nthreads = 256;
+	int nblocks = ceil( (double) input.lookups / (double) nthreads);
 
 	xs_lookup_kernel_baseline<<<nblocks, nthreads>>>( input, GSD );
 	gpuErrchk( cudaPeekAtLastError() );
diff --git a/hip/Makefile b/hip/Makefile
new file mode 100644
index 0000000..645e3b0
--- /dev/null
+++ b/hip/Makefile
@@ -0,0 +1,76 @@
+#===============================================================================
+# User Options
+#===============================================================================
+
+COMPILER    = amd
+OPTIMIZE    = yes
+DEBUG       = no
+PROFILE     = no
+
+#===============================================================================
+# Program name & source code list
+#===============================================================================
+
+program = rsbench
+
+source = \
+main.hip \
+simulation.hip\
+io.hip \
+init.hip \
+material.hip \
+utils.hip
+
+obj = $(source:.hip=.o)
+
+#===============================================================================
+# Sets Flags
+#===============================================================================
+
+# Standard Flags
+CFLAGS := 
+
+# AMD
+ifeq ($(COMPILER),amd)
+  CC = hipcc
+  CFLAGS += -std=c++14
+endif
+
+# Linker Flags
+LDFLAGS = -lm
+
+# Debug Flags
+ifeq ($(DEBUG),yes)
+  CFLAGS += -g -G
+  LDFLAGS += -g -G
+endif
+
+# Profiling Flags
+ifeq ($(PROFILE),yes)
+  CFLAGS += -pg
+  LDFLAGS  += -pg
+endif
+
+# Optimization Flags
+ifeq ($(OPTIMIZE),yes)
+  CFLAGS += -O3
+endif
+
+#===============================================================================
+# Targets to Build
+#===============================================================================
+
+$(program): $(obj) rsbench.h Makefile
+	$(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS)
+
+%.o: %.hip rsbench.h Makefile
+	$(CC) $(CFLAGS) -c $< -o $@
+
+clean:
+	rm -rf rsbench $(obj)
+
+edit:
+	vim -p $(source) rsbench.h
+
+run:
+	./rsbench
diff --git a/hip/init.hip b/hip/init.hip
new file mode 100644
index 0000000..0b2cedf
--- /dev/null
+++ b/hip/init.hip
@@ -0,0 +1,254 @@
+#include "rsbench.h"
+
+// Moves all required data structures to the GPU's memory space
+SimulationData move_simulation_data_to_device( Input in, SimulationData SD )
+{
+	printf("Allocating and moving simulation data to GPU memory space...\n");
+
+	size_t sz;
+	size_t total_sz = 0;
+
+	// Shallow copy of CPU simulation data to GPU simulation data
+	SimulationData GSD = SD;
+
+	// Move data to GPU memory space
+	sz = GSD.length_num_nucs * sizeof(int);
+	gpuErrchk( hipMalloc((void **) &GSD.num_nucs, sz) );
+	gpuErrchk( hipMemcpy(GSD.num_nucs, SD.num_nucs, sz, hipMemcpyHostToDevice) );
+	total_sz += sz;
+
+	sz = GSD.length_concs * sizeof(double);
+	gpuErrchk( hipMalloc((void **) &GSD.concs, sz) );
+	gpuErrchk( hipMemcpy(GSD.concs, SD.concs, sz, hipMemcpyHostToDevice) );
+	total_sz += sz;
+
+	sz = GSD.length_mats * sizeof(int);
+	gpuErrchk( hipMalloc((void **) &GSD.mats, sz) );
+	gpuErrchk( hipMemcpy(GSD.mats, SD.mats, sz, hipMemcpyHostToDevice) );
+	total_sz += sz;
+
+	sz = GSD.length_n_poles * sizeof(int);
+	gpuErrchk( hipMalloc((void **) &GSD.n_poles, sz) );
+	gpuErrchk( hipMemcpy(GSD.n_poles, SD.n_poles, sz, hipMemcpyHostToDevice) );
+	total_sz += sz;
+
+	sz = GSD.length_n_windows * sizeof(int);
+	gpuErrchk( hipMalloc((void **) &GSD.n_windows, sz) );
+	gpuErrchk( hipMemcpy(GSD.n_windows, SD.n_windows, sz, hipMemcpyHostToDevice) );
+	total_sz += sz;
+
+	sz = GSD.length_poles * sizeof(Pole);
+	gpuErrchk( hipMalloc((void **) &GSD.poles, sz) );
+	gpuErrchk( hipMemcpy(GSD.poles, SD.poles, sz, hipMemcpyHostToDevice) );
+	total_sz += sz;
+
+	sz = GSD.length_windows * sizeof(Window);
+	gpuErrchk( hipMalloc((void **) &GSD.windows, sz) );
+	gpuErrchk( hipMemcpy(GSD.windows, SD.windows, sz, hipMemcpyHostToDevice) );
+	total_sz += sz;
+
+	sz = GSD.length_pseudo_K0RS * sizeof(double);
+	gpuErrchk( hipMalloc((void **) &GSD.pseudo_K0RS, sz) );
+	gpuErrchk( hipMemcpy(GSD.pseudo_K0RS, SD.pseudo_K0RS, sz, hipMemcpyHostToDevice) );
+	total_sz += sz;
+	
+	// Allocate verification array on device. This structure is not needed on CPU, so we don't
+	// have to copy anything over.
+	sz = in.lookups * sizeof(unsigned long);
+	gpuErrchk( hipMalloc((void **) &GSD.verification, sz) );
+	total_sz += sz;
+	GSD.length_verification = in.lookups;
+	
+	// Synchronize
+	gpuErrchk( hipPeekAtLastError() );
+	gpuErrchk( hipDeviceSynchronize() );
+	
+	printf("GPU Intialization complete. Allocated %.0lf MB of data on GPU.\n", total_sz/1024.0/1024.0 );
+
+	return GSD;
+
+}
+
+SimulationData initialize_simulation( Input input )
+{
+	uint64_t seed = INITIALIZATION_SEED;
+	
+	// Get material data
+	printf("Loading Hoogenboom-Martin material data...\n");
+	SimulationData SD = get_materials( input, &seed ); 
+	
+	// Allocate & fill energy grids
+	printf("Generating resonance distributions...\n");
+	SD.n_poles = generate_n_poles( input, &seed );
+	SD.length_n_poles = input.n_nuclides;
+
+	// Allocate & fill Window grids
+	printf("Generating window distributions...\n");
+	SD.n_windows = generate_n_windows( input, &seed );
+	SD.length_n_windows = input.n_nuclides;
+
+	// Prepare full resonance grid
+	printf("Generating resonance parameter grid...\n");
+	SD.poles = generate_poles( input, SD.n_poles, &seed, &SD.max_num_poles );
+	SD.length_poles = input.n_nuclides * SD.max_num_poles;
+
+	// Prepare full Window grid
+	printf("Generating window parameter grid...\n");
+	SD.windows = generate_window_params( input, SD.n_windows, SD.n_poles, &seed, &SD.max_num_windows);
+	SD.length_windows = input.n_nuclides * SD.max_num_windows;
+
+	// Prepare 0K Resonances
+	printf("Generating 0K l_value data...\n");
+	SD.pseudo_K0RS = generate_pseudo_K0RS( input, &seed );
+	SD.length_pseudo_K0RS = input.n_nuclides * input.numL;
+
+	return SD;
+}
+
+int * generate_n_poles( Input input, uint64_t * seed )
+{
+	int total_resonances = input.avg_n_poles * input.n_nuclides;
+
+	int * R = (int *) malloc( input.n_nuclides * sizeof(int));
+	
+	// Ensure all nuclides have at least 1 resonance
+	for( int i = 0; i < input.n_nuclides; i++ )
+		R[i] = 1;
+
+	// Sample the rest
+	for( int i = 0; i < total_resonances - input.n_nuclides; i++ )
+		R[LCG_random_int(seed) % input.n_nuclides]++;
+	
+	/* Debug	
+	for( int i = 0; i < input.n_nuclides; i++ )
+		printf("R[%d] = %d\n", i, R[i]);
+	*/
+	
+	return R;
+}
+
+int * generate_n_windows( Input input, uint64_t * seed )
+{
+	int total_resonances = input.avg_n_windows * input.n_nuclides;
+
+	int * R = (int *) malloc( input.n_nuclides * sizeof(int));
+	
+	// Ensure all nuclides have at least 1 resonance
+	for( int i = 0; i < input.n_nuclides; i++ )
+		R[i] = 1;
+
+	// Sample the rest
+	for( int i = 0; i < total_resonances - input.n_nuclides; i++ )
+		R[LCG_random_int(seed) % input.n_nuclides]++;
+	
+	/* Debug	
+	for( int i = 0; i < input.n_nuclides; i++ )
+		printf("R[%d] = %d\n", i, R[i]);
+	*/
+	
+	return R;
+}
+
+Pole * generate_poles( Input input, int * n_poles, uint64_t * seed, int * max_num_poles )
+{
+	// Pole Scaling Factor -- Used to bias hitting of the fast Faddeeva
+	// region to approximately 99.5% (i.e., only 0.5% of lookups should
+	// require the full eval).
+	double f = 152.5;
+	RSComplex f_c = {f, 0};
+
+	int max_poles = -1;
+	
+	for( int i = 0; i < input.n_nuclides; i++ )
+	{
+		if( n_poles[i] > max_poles)
+			max_poles = n_poles[i];
+	}
+	*max_num_poles = max_poles;
+
+	// Allocating 2D matrix as a 1D contiguous vector
+	Pole * R = (Pole *) malloc( input.n_nuclides * max_poles * sizeof(Pole));
+	
+	// fill with data
+	for( int i = 0; i < input.n_nuclides; i++ )
+		for( int j = 0; j < n_poles[i]; j++ )
+		{
+			double r = LCG_random_double(seed);
+			double im = LCG_random_double(seed);
+			RSComplex t1 = {r, im};
+			R[i * max_poles + j].MP_EA = c_mul(f_c,t1);
+			r = LCG_random_double(seed);
+			im = LCG_random_double(seed);
+			RSComplex t2 = {f*r, im};
+			R[i * max_poles + j].MP_RT = t2;
+			r = LCG_random_double(seed);
+			im = LCG_random_double(seed);
+			RSComplex t3 = {f*r, im};
+			R[i * max_poles + j].MP_RA = t3;
+			r = LCG_random_double(seed);
+			im = LCG_random_double(seed);
+			RSComplex t4 = {f*r, im};
+			R[i * max_poles + j].MP_RF = t4;
+			R[i * max_poles + j].l_value = LCG_random_int(seed) % input.numL;
+		}
+	
+	/* Debug
+	for( int i = 0; i < input.n_nuclides; i++ )
+		for( int j = 0; j < n_poles[i]; j++ )
+			printf("R[%d][%d]: Eo = %lf lambda_o = %lf Tn = %lf Tg = %lf Tf = %lf\n", i, j, R[i * max_poles + j].Eo, R[i * max_poles + j].lambda_o, R[i * max_poles + j].Tn, R[i * max_poles + j].Tg, R[i * max_poles + j].Tf);
+	*/
+
+	return R;
+}
+
+Window * generate_window_params( Input input, int * n_windows, int * n_poles, uint64_t * seed, int * max_num_windows )
+{
+	int max_windows = -1;
+	
+	for( int i = 0; i < input.n_nuclides; i++ )
+	{
+		if( n_windows[i] > max_windows)
+			max_windows = n_windows[i];
+	}
+	*max_num_windows = max_windows;
+
+	// Allocating 2D contiguous matrix
+	Window * R = (Window *) malloc( input.n_nuclides * max_windows * sizeof(Window));
+	
+	// fill with data
+	for( int i = 0; i < input.n_nuclides; i++ )
+	{
+		int space = n_poles[i] / n_windows[i];
+		int remainder = n_poles[i] - space * n_windows[i];
+		int ctr = 0;
+		for( int j = 0; j < n_windows[i]; j++ )
+		{
+			R[i * max_windows + j].T = LCG_random_double(seed);
+			R[i * max_windows + j].A = LCG_random_double(seed);
+			R[i * max_windows + j].F = LCG_random_double(seed);
+			R[i * max_windows + j].start = ctr; 
+			R[i * max_windows + j].end = ctr + space - 1;
+
+			ctr += space;
+
+			if ( j < remainder )
+			{
+				ctr++;
+				R[i * max_windows + j].end++;
+			}
+		}
+	}
+
+	return R;
+}
+
+double * generate_pseudo_K0RS( Input input, uint64_t * seed )
+{
+	double * R = (double *) malloc( input.n_nuclides * input.numL * sizeof(double));
+
+	for( int i = 0; i < input.n_nuclides; i++)
+		for( int j = 0; j < input.numL; j++ )
+			R[i * input.numL + j] = LCG_random_double(seed);
+
+	return R;
+}
diff --git a/hip/io.hip b/hip/io.hip
new file mode 100644
index 0000000..3fad7e2
--- /dev/null
+++ b/hip/io.hip
@@ -0,0 +1,338 @@
+#include "rsbench.h"
+
+// Prints program logo
+void logo(int version)
+{
+	border_print();
+	printf(
+"                    _____   _____ ____                  _     \n"
+"                   |  __ \\ / ____|  _ \\                | |    \n"
+"                   | |__) | (___ | |_) | ___ _ __   ___| |__  \n"
+"                   |  _  / \\___ \\|  _ < / _ \\ '_ \\ / __| '_ \\ \n"
+"                   | | \\ \\ ____) | |_) |  __/ | | | (__| | | |\n"
+"                   |_|  \\_\\_____/|____/ \\___|_| |_|\\___|_| |_|\n\n"
+	);
+	border_print();
+	center_print("Developed at Argonne National Laboratory", 79);
+	char v[100];
+	sprintf(v, "Version: %d", version);
+	center_print(v, 79);
+	border_print();
+}
+
+// Prints Section titles in center of 80 char terminal
+void center_print(const char *s, int width)
+{
+	int length = strlen(s);
+	int i;
+	for (i=0; i<=(width-length)/2; i++) {
+		fputs(" ", stdout);
+	}
+	fputs(s, stdout);
+	fputs("\n", stdout);
+}
+
+void border_print(void)
+{
+	printf(
+	"==================================================================="
+	"=============\n");
+}
+
+// Prints comma separated integers - for ease of reading
+void fancy_int( int a )
+{
+    if( a < 1000 )
+        printf("%d\n",a);
+
+    else if( a >= 1000 && a < 1000000 )
+        printf("%d,%03d\n", a / 1000, a % 1000);
+
+    else if( a >= 1000000 && a < 1000000000 )
+        printf("%d,%03d,%03d\n", a / 1000000, (a % 1000000) / 1000, a % 1000 );
+
+    else if( a >= 1000000000 )
+        printf("%d,%03d,%03d,%03d\n",
+               a / 1000000000,
+               (a % 1000000000) / 1000000,
+               (a % 1000000) / 1000,
+               a % 1000 );
+    else
+        printf("%d\n",a);
+}
+
+Input read_CLI( int argc, char * argv[] )
+{
+	Input input;
+
+	// defaults to the history based simulation method
+	input.simulation_method = HISTORY_BASED;
+	// defaults to max threads on the system	
+	input.nthreads = 1;
+	// defaults to 355 (corresponding to H-M Large benchmark)
+	input.n_nuclides = 355;
+	// defaults to 300,000
+	input.particles = 300000;
+	// defaults to 34
+	input.lookups = 34;
+	// defaults to H-M Large benchmark
+	input.HM = LARGE;
+	// defaults to 3000 resonancs (avg) per nuclide
+	input.avg_n_poles = 1000;
+	// defaults to 100
+	input.avg_n_windows = 100;
+	// defaults to 4;
+	input.numL = 4;
+	// defaults to no temperature dependence (Doppler broadening)
+	input.doppler = 1;
+	// defaults to baseline simulation kernel
+	input.kernel_id = 0;
+	
+	int default_lookups = 1;
+	int default_particles = 1;
+
+	// Collect Raw Input
+	for( int i = 1; i < argc; i++ )
+	{
+		char * arg = argv[i];
+
+		// Simulation Method (-m)
+		if( strcmp(arg, "-m") == 0 )
+		{
+			char * sim_type = NULL;
+			if( ++i < argc )
+				sim_type = argv[i];
+			else
+				print_CLI_error();
+
+			if( strcmp(sim_type, "history") == 0 )
+				input.simulation_method = HISTORY_BASED;
+			else if( strcmp(sim_type, "event") == 0 )
+			{
+				input.simulation_method = EVENT_BASED;
+				// Also resets default # of lookups
+				if( default_lookups && default_particles )
+				{
+					input.lookups =  input.lookups * input.particles;
+					input.particles = 0;
+				}
+			}
+			else
+				print_CLI_error();
+		}
+		// lookups (-l)
+		else if( strcmp(arg, "-l") == 0 )
+		{
+			if( ++i < argc )
+			{
+				input.lookups = atoi(argv[i]);
+				default_lookups = 0;
+			}
+			else
+				print_CLI_error();
+		}
+		// particles (-p)
+		else if( strcmp(arg, "-p") == 0 )
+		{
+			if( ++i < argc )
+			{
+				input.particles = atoi(argv[i]);
+				default_particles = 0;
+			}
+			else
+				print_CLI_error();
+		}
+		// nuclides (-n)
+		else if( strcmp(arg, "-n") == 0 )
+		{
+			if( ++i < argc )
+				input.n_nuclides = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		// HM (-s)
+		else if( strcmp(arg, "-s") == 0 )
+		{	
+			if( ++i < argc )
+			{
+				if( strcmp(argv[i], "small") == 0 )
+					input.HM = SMALL;
+				else if ( strcmp(argv[i], "large") == 0 )
+					input.HM = LARGE;
+				else
+					print_CLI_error();
+			}
+			else
+				print_CLI_error();
+		}
+		// Doppler Broadening (Temperature Dependence)
+		else if( strcmp(arg, "-d") == 0 )
+		{	
+			input.doppler = 0;
+		}
+		// Avg number of windows per nuclide (-w)
+		else if( strcmp(arg, "-W") == 0 )
+		{
+			if( ++i < argc )
+				input.avg_n_windows = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		// Avg number of poles per nuclide (-p)
+		else if( strcmp(arg, "-P") == 0 )
+		{
+			if( ++i < argc )
+				input.avg_n_poles = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		// Kernel ID (-k)
+		else if( strcmp(arg, "-k") == 0 )
+		{
+			if( ++i < argc )
+				input.kernel_id = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		else
+			print_CLI_error();
+	}
+
+	// Validate Input
+
+	// Validate nthreads
+	if( input.nthreads < 1 )
+		print_CLI_error();
+	
+	// Validate n_isotopes
+	if( input.n_nuclides < 1 )
+		print_CLI_error();
+	
+	// Validate lookups
+	if( input.lookups < 1 )
+		print_CLI_error();
+	
+	// Validate lookups
+	if( input.avg_n_poles < 1 )
+		print_CLI_error();
+	
+	// Validate lookups
+	if( input.avg_n_windows < 1 )
+		print_CLI_error();
+	
+	// Set HM size specific parameters
+	// (defaults to large)
+	if( input.HM == SMALL )
+		input.n_nuclides = 68;
+
+	// Return input struct
+	return input;
+}
+
+void print_CLI_error(void)
+{
+	printf("Usage: ./multibench <options>\n");
+	printf("Options include:\n");
+	printf("  -s <size>        Size of H-M Benchmark to run (small, large)\n");
+	printf("  -l <lookups>     Number of Cross-section (XS) lookups per particle history\n");
+	printf("  -p <particles>   Number of particle histories\n");
+	printf("  -P <poles>       Average Number of Poles per Nuclide\n");
+	printf("  -W <poles>       Average Number of Windows per Nuclide\n");
+	printf("  -d               Disables Temperature Dependence (Doppler Broadening)\n");
+	printf("Default is equivalent to: -s large -l 34 -p 300000 -P 1000 -W 100\n");
+	printf("See readme for full description of default run values\n");
+	exit(4);
+}
+
+void print_input_summary(Input input)
+{
+	printf("Programming Model:           HIP\n");
+	// Print CUDA device name
+	hipDeviceProp_t prop;
+	int device;
+	hipGetDevice(&device);
+	hipGetDeviceProperties ( &prop, device );
+	printf("HIP Device:                  %s\n", prop.name); 
+
+	// Calculate Estimate of Memory Usage
+	size_t mem = get_mem_estimate(input);
+
+	if( input.simulation_method == EVENT_BASED )
+		printf("Simulation Method:           Event Based\n");
+	else
+		printf("Simulation Method:           History Based\n");
+	printf("Materials:                   12\n");
+	printf("H-M Benchmark Size:          ");
+	if( input.HM == 0 )
+		printf("Small\n");
+	else
+		printf("Large\n");
+	if( input.doppler == 1 )
+		printf("Temperature Dependence:      ON\n");
+	else
+		printf("Temperature Dependence:      OFF\n");
+	printf("Total Nuclides:              %d\n", input.n_nuclides);
+	printf("Avg Poles per Nuclide:       "); fancy_int(input.avg_n_poles);
+	printf("Avg Windows per Nuclide:     "); fancy_int(input.avg_n_windows);
+
+	int lookups = input.lookups;
+	if( input.simulation_method == HISTORY_BASED )
+	{
+		printf("Particles:                   "); fancy_int(input.particles);
+		printf("XS Lookups per Particle:     "); fancy_int(input.lookups);
+		lookups *= input.particles;
+	}
+	printf("Total XS Lookups:            "); fancy_int(lookups);
+	printf("Est. Memory Usage (MB):      %.1lf\n", mem / 1024.0 / 1024.0);
+}
+
+int validate_and_print_results(Input input, double runtime, unsigned long vhash)
+{
+  printf("NOTE: Timings are estimated -- use nvprof/nsys/iprof/rocprof for formal analysis\n");
+	printf("Runtime:               %.3lf seconds\n", runtime);
+	int lookups = 0;
+	if( input.simulation_method == HISTORY_BASED )
+		lookups = input.lookups*input.particles;
+	else
+		lookups = input.lookups;
+	printf("Lookups:               "); fancy_int(lookups);
+	printf("Lookups/s:             "); fancy_int((double) lookups / (runtime));
+
+	int is_invalid = 1;
+
+	unsigned long long large = 0;
+	unsigned long long small = 0;
+	if(input.simulation_method == HISTORY_BASED )
+	{
+		large = 351485;
+		small = 879693;
+	}
+	else if( input.simulation_method == EVENT_BASED )
+	{
+		large = 358389;
+		small = 880018;
+	}
+
+	if( input.HM  == LARGE )
+	{
+		if( vhash == large )
+		{
+			printf("Verification checksum: %lu (Valid)\n", vhash);
+			is_invalid = 0;
+		}
+		else
+			printf("Verification checksum: %lu (WARNING - INAVALID CHECKSUM!)\n", vhash);
+	}
+	else if( input.HM  == SMALL )
+	{
+		if( vhash == small )
+		{
+			printf("Verification checksum: %lu (Valid)\n", vhash);
+			is_invalid = 0;
+		}
+		else
+			printf("Verification checksum: %lu (WARNING - INAVALID CHECKSUM!)\n", vhash);
+	}
+
+	return is_invalid;
+}
diff --git a/hip/main.hip b/hip/main.hip
new file mode 100644
index 0000000..d9d1077
--- /dev/null
+++ b/hip/main.hip
@@ -0,0 +1,87 @@
+#include "rsbench.h"
+
+int main(int argc, char * argv[])
+{
+	// =====================================================================
+	// Initialization & Command Line Read-In
+	// =====================================================================
+
+	int version = 13;
+	double start, stop;
+	
+	// Process CLI Fields
+	Input input = read_CLI( argc, argv );
+	
+	// =====================================================================
+	// Print-out of Input Summary
+	// =====================================================================
+	logo(version);
+	center_print("INPUT SUMMARY", 79);
+	border_print();
+	print_input_summary(input);
+
+	// =====================================================================
+	// Intialize Simulation Data Structures
+	// =====================================================================
+	border_print();
+	center_print("INITIALIZATION", 79);
+	border_print();
+	
+	start = get_time();
+	
+	SimulationData SD = initialize_simulation( input );
+	SimulationData GSD = move_simulation_data_to_device( input, SD );
+
+	stop = get_time();
+
+	printf("Initialization Complete. (%.2lf seconds)\n", stop-start);
+	
+	// =====================================================================
+	// Cross Section (XS) Parallel Lookup Simulation Begins
+	// =====================================================================
+	border_print();
+	center_print("SIMULATION", 79);
+	border_print();
+
+	unsigned long vhash = 0;
+
+	// Run Simulation
+	start = get_time();
+
+	// Run simulation
+	if( input.simulation_method == EVENT_BASED )
+	{
+		if( input.kernel_id == 0 )
+			run_event_based_simulation(input, GSD, &vhash );
+		else
+		{
+			printf("Error: No kernel ID %d found!\n", input.kernel_id);
+			exit(1);
+		}
+	}
+	else if( input.simulation_method == HISTORY_BASED )
+	{
+		printf("History-based simulation not implemented in OpenMP offload code. Instead,\nuse the event-based method with \"-m event\" argument.\n");
+		exit(1);
+	}
+
+	stop = get_time();
+
+	// Final hash step
+	vhash = vhash % 999983;
+
+	printf("Simulation Complete.\n");
+
+	// =====================================================================
+	// Print / Save Results and Exit
+	// =====================================================================
+	border_print();
+	center_print("RESULTS", 79);
+	border_print();
+
+	int is_invalid = validate_and_print_results(input, stop-start, vhash);
+
+	border_print();
+
+	return is_invalid;
+}
diff --git a/hip/material.hip b/hip/material.hip
new file mode 100644
index 0000000..791565b
--- /dev/null
+++ b/hip/material.hip
@@ -0,0 +1,123 @@
+#include "rsbench.h"
+
+// Handles all material creation tasks - returns Material struct
+SimulationData get_materials(Input input, uint64_t * seed)
+{
+	SimulationData SD;
+
+	SD.num_nucs = load_num_nucs(input);
+	SD.length_num_nucs = 12;
+
+	SD.mats = load_mats(input, SD.num_nucs, &SD.max_num_nucs, &SD.length_mats);
+
+	SD.concs = load_concs(SD.num_nucs, seed, SD.max_num_nucs);
+	SD.length_concs = 12 * SD.max_num_nucs; 
+
+	return SD;
+}
+
+// num_nucs represents the number of nuclides that each material contains
+int * load_num_nucs(Input input)
+{
+	int * num_nucs = (int*)malloc(12*sizeof(int));
+	
+	// Material 0 is a special case (fuel). The H-M small reactor uses
+	// 34 nuclides, while H-M larges uses 300.
+	if( input.n_nuclides == 68 )
+		num_nucs[0]  = 34; // HM Small is 34, H-M Large is 321
+	else
+		num_nucs[0]  = 321; // HM Small is 34, H-M Large is 321
+
+	num_nucs[1]  = 5;
+	num_nucs[2]  = 4;
+	num_nucs[3]  = 4;
+	num_nucs[4]  = 27;
+	num_nucs[5]  = 21;
+	num_nucs[6]  = 21;
+	num_nucs[7]  = 21;
+	num_nucs[8]  = 21;
+	num_nucs[9]  = 21;
+	num_nucs[10] = 9;
+	num_nucs[11] = 9;
+
+	return num_nucs;
+}
+
+// Assigns an array of nuclide ID's to each material
+int * load_mats( Input input, int * num_nucs, int * max_num_nucs, unsigned long * length_mats )
+{
+	*max_num_nucs = 0;
+	int num_mats = 12;
+	for( int m = 0; m < num_mats; m++ )
+	{
+		if( num_nucs[m] > *max_num_nucs )
+			*max_num_nucs = num_nucs[m];
+	}
+	int * mats = (int *) malloc( num_mats * (*max_num_nucs) * sizeof(int) );
+	*length_mats = num_mats * (*max_num_nucs);
+
+	// Small H-M has 34 fuel nuclides
+	int mats0_Sml[] =  { 58, 59, 60, 61, 40, 42, 43, 44, 45, 46, 1, 2, 3, 7,
+	                 8, 9, 10, 29, 57, 47, 48, 0, 62, 15, 33, 34, 52, 53, 
+	                 54, 55, 56, 18, 23, 41 }; //fuel
+	// Large H-M has 300 fuel nuclides
+	int mats0_Lrg[321] =  { 58, 59, 60, 61, 40, 42, 43, 44, 45, 46, 1, 2, 3, 7,
+	                 8, 9, 10, 29, 57, 47, 48, 0, 62, 15, 33, 34, 52, 53,
+	                 54, 55, 56, 18, 23, 41 }; //fuel
+	for( int i = 0; i < 321-34; i++ )
+		mats0_Lrg[34+i] = 68 + i; // H-M large adds nuclides to fuel only
+	
+	// These are the non-fuel materials	
+	int mats1[] =  { 63, 64, 65, 66, 67 }; // cladding
+	int mats2[] =  { 24, 41, 4, 5 }; // cold borated water
+	int mats3[] =  { 24, 41, 4, 5 }; // hot borated water
+	int mats4[] =  { 19, 20, 21, 22, 35, 36, 37, 38, 39, 25, 27, 28, 29,
+	                 30, 31, 32, 26, 49, 50, 51, 11, 12, 13, 14, 6, 16,
+	                 17 }; // RPV
+	int mats5[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // lower radial reflector
+	int mats6[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // top reflector / plate
+	int mats7[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // bottom plate
+	int mats8[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // bottom nozzle
+	int mats9[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // top nozzle
+	int mats10[] = { 24, 41, 4, 5, 63, 64, 65, 66, 67 }; // top of FA's
+	int mats11[] = { 24, 41, 4, 5, 63, 64, 65, 66, 67 }; // bottom FA's
+	
+	// H-M large v small dependency
+	if( input.n_nuclides == 68 )
+		memcpy( mats,  mats0_Sml,  num_nucs[0]  * sizeof(int) );	
+	else
+		memcpy( mats,  mats0_Lrg,  num_nucs[0]  * sizeof(int) );
+	
+	// Copy other materials
+	memcpy( mats + *max_num_nucs * 1,  mats1,  num_nucs[1]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 2,  mats2,  num_nucs[2]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 3,  mats3,  num_nucs[3]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 4,  mats4,  num_nucs[4]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 5,  mats5,  num_nucs[5]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 6,  mats6,  num_nucs[6]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 7,  mats7,  num_nucs[7]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 8,  mats8,  num_nucs[8]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 9,  mats9,  num_nucs[9]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 10, mats10, num_nucs[10] * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 11, mats11, num_nucs[11] * sizeof(int) );	
+
+	return mats;
+}
+
+// Creates a randomized array of 'concentrations' of nuclides in each mat
+double * load_concs( int * num_nucs, uint64_t * seed, int max_num_nucs )
+{
+	double * concs = (double *) malloc( 12 * max_num_nucs * sizeof( double ) );
+
+	for( int i = 0; i < 12; i++ )
+		for( int j = 0; j < num_nucs[i]; j++ )
+			concs[i * max_num_nucs + j] = LCG_random_double(seed);
+
+	return concs;
+}
+
diff --git a/hip/rsbench.cuh b/hip/rsbench.cuh
new file mode 100644
index 0000000..efc9688
--- /dev/null
+++ b/hip/rsbench.cuh
@@ -0,0 +1,145 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<sys/time.h>
+#include<math.h>
+#include<string.h>
+#include<stdint.h>
+#include<float.h>
+#include<assert.h>
+#include<cuda.h>
+#include <thrust/reduce.h>
+#include <chrono> 
+
+#define PI 3.14159265359
+
+// typedefs
+typedef enum __hm{SMALL, LARGE, XL, XXL} HM_size;
+
+#define HISTORY_BASED 1
+#define EVENT_BASED 2
+
+#define STARTING_SEED 1070
+#define INITIALIZATION_SEED 42
+
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+	if (code != cudaSuccess) 
+	{
+		fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+		if (abort) exit(code);
+	}
+}
+
+typedef struct{
+	double r;
+	double i;
+} RSComplex;
+
+typedef struct{
+	int nthreads;
+	int n_nuclides;
+	int lookups;
+	HM_size HM;
+	int avg_n_poles;
+	int avg_n_windows;
+	int numL;
+	int doppler;
+	int particles;
+	int simulation_method;
+	int kernel_id;
+} Input;
+
+typedef struct{
+	RSComplex MP_EA;
+	RSComplex MP_RT;
+	RSComplex MP_RA;
+	RSComplex MP_RF;
+	short int l_value;
+} Pole;
+
+typedef struct{
+	double T;
+	double A;
+	double F;
+	int start;
+	int end;
+} Window;
+
+typedef struct{
+	int * n_poles;
+	unsigned long length_n_poles;
+	int * n_windows;
+	unsigned long length_n_windows;
+	Pole * poles;
+	unsigned long length_poles;
+	Window * windows;
+	unsigned long length_windows;
+	double * pseudo_K0RS;
+	unsigned long length_pseudo_K0RS;
+	int * num_nucs;
+	unsigned long length_num_nucs;
+	int * mats;
+	unsigned long length_mats;
+	double * concs;
+	unsigned long length_concs;
+	int max_num_nucs;
+	int max_num_poles;
+	int max_num_windows;
+	double * p_energy_samples;
+	unsigned long length_p_energy_samples;
+	int * mat_samples;
+	unsigned long length_mat_samples;
+	unsigned long  * verification;
+	unsigned long length_verification;
+} SimulationData;
+
+// io.c
+void logo(int version);
+void center_print(const char *s, int width);
+void border_print(void);
+void fancy_int( int a );
+Input read_CLI( int argc, char * argv[] );
+void print_CLI_error(void);
+void print_input_summary(Input input);
+int validate_and_print_results(Input input, double runtime, unsigned long vhash);
+
+// init.c
+SimulationData initialize_simulation( Input input );
+int * generate_n_poles( Input input,  uint64_t * seed );
+int * generate_n_windows( Input input ,  uint64_t * seed);
+Pole * generate_poles( Input input, int * n_poles, uint64_t * seed, int * max_num_poles );
+Window * generate_window_params( Input input, int * n_windows, int * n_poles, uint64_t * seed, int * max_num_windows );
+double * generate_pseudo_K0RS( Input input, uint64_t * seed );
+SimulationData move_simulation_data_to_device( Input in, SimulationData SD );
+
+// material.c
+int * load_num_nucs(Input input);
+int * load_mats( Input input, int * num_nucs, int * max_num_nucs, unsigned long * length_mats );
+double * load_concs( int * num_nucs, uint64_t * seed, int max_num_nucs );
+SimulationData get_materials(Input input, uint64_t * seed);
+
+// utils.c
+size_t get_mem_estimate( Input input );
+double get_time(void);
+
+// simulation.c
+void run_event_based_simulation(Input input, SimulationData data, unsigned long * vhash_result );
+void run_event_based_simulation_optimization_1(Input in, SimulationData GSD, unsigned long * vhash_result);
+__global__ void xs_lookup_kernel_baseline(Input in, SimulationData GSD );
+__device__ void calculate_macro_xs( double * macro_xs, int mat, double E, Input input, int * num_nucs, int * mats, int max_num_nucs, double * concs, int * n_windows, double * pseudo_K0Rs, Window * windows, Pole * poles, int max_num_windows, int max_num_poles );
+__device__ void calculate_micro_xs( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles);
+__device__ void calculate_micro_xs_doppler( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles );
+__device__ int pick_mat( uint64_t * seed );
+__device__ void calculate_sig_T( int nuc, double E, Input input, double * pseudo_K0RS, RSComplex * sigTfactors );
+__device__ RSComplex fast_nuclear_W( RSComplex Z );
+__host__ __device__ double LCG_random_double(uint64_t * seed);
+__host__ __device__ uint64_t LCG_random_int(uint64_t * seed);
+__device__ uint64_t fast_forward_LCG(uint64_t seed, uint64_t n);
+__device__ RSComplex c_add( RSComplex A, RSComplex B);
+__device__ RSComplex c_sub( RSComplex A, RSComplex B);
+__host__ __device__ RSComplex c_mul( RSComplex A, RSComplex B);
+__device__ RSComplex c_div( RSComplex A, RSComplex B);
+__device__ double c_abs( RSComplex A);
+__device__ double fast_exp(double x);
+__device__ RSComplex fast_cexp( RSComplex z );
diff --git a/hip/rsbench.h b/hip/rsbench.h
new file mode 100644
index 0000000..1abca08
--- /dev/null
+++ b/hip/rsbench.h
@@ -0,0 +1,144 @@
+#include "hip/hip_runtime.h"
+#include<stdio.h>
+#include<stdlib.h>
+#include<sys/time.h>
+#include<math.h>
+#include<string.h>
+#include<stdint.h>
+#include<float.h>
+#include<assert.h>
+#include<hip/hip_runtime.h>
+#include <chrono> 
+
+#define PI 3.14159265359
+
+// typedefs
+typedef enum __hm{SMALL, LARGE, XL, XXL} HM_size;
+
+#define HISTORY_BASED 1
+#define EVENT_BASED 2
+
+#define STARTING_SEED 1070
+#define INITIALIZATION_SEED 42
+
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(hipError_t code, const char *file, int line, bool abort=true)
+{
+	if (code != hipSuccess) 
+	{
+		fprintf(stderr,"GPUassert: %s %s %d\n", hipGetErrorString(code), file, line);
+		if (abort) exit(code);
+	}
+}
+
+typedef struct{
+	double r;
+	double i;
+} RSComplex;
+
+typedef struct{
+	int nthreads;
+	int n_nuclides;
+	int lookups;
+	HM_size HM;
+	int avg_n_poles;
+	int avg_n_windows;
+	int numL;
+	int doppler;
+	int particles;
+	int simulation_method;
+	int kernel_id;
+} Input;
+
+typedef struct{
+	RSComplex MP_EA;
+	RSComplex MP_RT;
+	RSComplex MP_RA;
+	RSComplex MP_RF;
+	short int l_value;
+} Pole;
+
+typedef struct{
+	double T;
+	double A;
+	double F;
+	int start;
+	int end;
+} Window;
+
+typedef struct{
+	int * n_poles;
+	unsigned long length_n_poles;
+	int * n_windows;
+	unsigned long length_n_windows;
+	Pole * poles;
+	unsigned long length_poles;
+	Window * windows;
+	unsigned long length_windows;
+	double * pseudo_K0RS;
+	unsigned long length_pseudo_K0RS;
+	int * num_nucs;
+	unsigned long length_num_nucs;
+	int * mats;
+	unsigned long length_mats;
+	double * concs;
+	unsigned long length_concs;
+	int max_num_nucs;
+	int max_num_poles;
+	int max_num_windows;
+	double * p_energy_samples;
+	unsigned long length_p_energy_samples;
+	int * mat_samples;
+	unsigned long length_mat_samples;
+	unsigned long  * verification;
+	unsigned long length_verification;
+} SimulationData;
+
+// io.c
+void logo(int version);
+void center_print(const char *s, int width);
+void border_print(void);
+void fancy_int( int a );
+Input read_CLI( int argc, char * argv[] );
+void print_CLI_error(void);
+void print_input_summary(Input input);
+int validate_and_print_results(Input input, double runtime, unsigned long vhash);
+
+// init.c
+SimulationData initialize_simulation( Input input );
+int * generate_n_poles( Input input,  uint64_t * seed );
+int * generate_n_windows( Input input ,  uint64_t * seed);
+Pole * generate_poles( Input input, int * n_poles, uint64_t * seed, int * max_num_poles );
+Window * generate_window_params( Input input, int * n_windows, int * n_poles, uint64_t * seed, int * max_num_windows );
+double * generate_pseudo_K0RS( Input input, uint64_t * seed );
+SimulationData move_simulation_data_to_device( Input in, SimulationData SD );
+
+// material.c
+int * load_num_nucs(Input input);
+int * load_mats( Input input, int * num_nucs, int * max_num_nucs, unsigned long * length_mats );
+double * load_concs( int * num_nucs, uint64_t * seed, int max_num_nucs );
+SimulationData get_materials(Input input, uint64_t * seed);
+
+// utils.c
+size_t get_mem_estimate( Input input );
+double get_time(void);
+
+// simulation.c
+void run_event_based_simulation(Input input, SimulationData data, unsigned long * vhash_result );
+__global__ void xs_lookup_kernel_baseline(Input in, SimulationData GSD );
+__device__ void calculate_macro_xs( double * macro_xs, int mat, double E, Input input, int * num_nucs, int * mats, int max_num_nucs, double * concs, int * n_windows, double * pseudo_K0Rs, Window * windows, Pole * poles, int max_num_windows, int max_num_poles );
+__device__ void calculate_micro_xs( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles);
+__device__ void calculate_micro_xs_doppler( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles );
+__device__ int pick_mat( uint64_t * seed );
+__device__ void calculate_sig_T( int nuc, double E, Input input, double * pseudo_K0RS, RSComplex * sigTfactors );
+__device__ RSComplex fast_nuclear_W( RSComplex Z );
+__host__ __device__ double LCG_random_double(uint64_t * seed);
+__host__ __device__ uint64_t LCG_random_int(uint64_t * seed);
+__device__ uint64_t fast_forward_LCG(uint64_t seed, uint64_t n);
+__device__ RSComplex c_add( RSComplex A, RSComplex B);
+__device__ RSComplex c_sub( RSComplex A, RSComplex B);
+__host__ __device__ RSComplex c_mul( RSComplex A, RSComplex B);
+__device__ RSComplex c_div( RSComplex A, RSComplex B);
+__device__ double c_abs( RSComplex A);
+__device__ double fast_exp(double x);
+__device__ RSComplex fast_cexp( RSComplex z );
diff --git a/hip/simulation.hip b/hip/simulation.hip
new file mode 100644
index 0000000..3592747
--- /dev/null
+++ b/hip/simulation.hip
@@ -0,0 +1,502 @@
+#include "hip/hip_runtime.h"
+#include "rsbench.h"
+
+////////////////////////////////////////////////////////////////////////////////////
+// BASELINE FUNCTIONS
+////////////////////////////////////////////////////////////////////////////////////
+// All "baseline" code is at the top of this file. The baseline code is a simple
+// implementation of the algorithm, with only minor GPU optimizations in place.
+// Following these functions are a number of optimized variants,
+// which each deploy a different combination of optimizations strategies. By
+// default, RSBench will only run the baseline implementation. Optimized variants
+// must be specifically selected using the "-k <optimized variant ID>" command
+// line argument.
+////////////////////////////////////////////////////////////////////////////////////
+
+void run_event_based_simulation(Input input, SimulationData GSD, unsigned long * vhash_result )
+{
+	////////////////////////////////////////////////////////////////////////////////
+	// Configure & Launch Simulation Kernel
+	////////////////////////////////////////////////////////////////////////////////
+	printf("Running baseline event-based simulation on device...\n");
+
+	int nthreads = 256;
+	int nblocks = ceil( (double) input.lookups / (double) nthreads);
+
+	hipLaunchKernelGGL(xs_lookup_kernel_baseline, dim3(nblocks), dim3(nthreads), 0, 0,  input, GSD );
+	gpuErrchk( hipPeekAtLastError() );
+	gpuErrchk( hipDeviceSynchronize() );
+	
+	////////////////////////////////////////////////////////////////////////////////
+	// Reduce Verification Results
+	////////////////////////////////////////////////////////////////////////////////
+	printf("Reducing verification results...\n");
+
+  size_t sz = input.lookups * sizeof(unsigned long);
+  unsigned long * v = (unsigned long *) malloc(sz);
+  gpuErrchk( hipMemcpy(v, GSD.verification, sz, hipMemcpyDeviceToHost) );
+
+  unsigned long verification_scalar = 0;
+  for( int i =0; i < input.lookups; i++ )
+    verification_scalar += v[i];
+
+	*vhash_result = verification_scalar;
+}
+
+// In this kernel, we perform a single lookup with each thread. Threads within a warp
+// do not really have any relation to each other, and divergence due to high nuclide count fuel
+// material lookups are costly. This kernel constitutes baseline performance.
+__global__ void xs_lookup_kernel_baseline(Input in, SimulationData GSD )
+{
+	// The lookup ID. Used to set the seed, and to store the verification value
+	const int i = blockIdx.x *blockDim.x + threadIdx.x;
+
+	if( i >= in.lookups )
+		return;
+
+	// Set the initial seed value
+	uint64_t seed = STARTING_SEED;	
+
+	// Forward seed to lookup index (we need 2 samples per lookup)
+	seed = fast_forward_LCG(seed, 2*i);
+
+	// Randomly pick an energy and material for the particle
+	double E = LCG_random_double(&seed);
+	int mat  = pick_mat(&seed);
+
+	double macro_xs[4] = {0};
+
+	calculate_macro_xs( macro_xs, mat, E, in, GSD.num_nucs, GSD.mats, GSD.max_num_nucs, GSD.concs, GSD.n_windows, GSD.pseudo_K0RS, GSD.windows, GSD.poles, GSD.max_num_windows, GSD.max_num_poles );
+
+	// For verification, and to prevent the compiler from optimizing
+	// all work out, we interrogate the returned macro_xs_vector array
+	// to find its maximum value index, then increment the verification
+	// value by that index. In this implementation, we write to a global
+	// verification array that will get reduced after this kernel comples.
+	double max = -DBL_MAX;
+	int max_idx = 0;
+	for(int x = 0; x < 4; x++ )
+	{
+		if( macro_xs[x] > max )
+		{
+			max = macro_xs[x];
+			max_idx = x;
+		}
+	}
+	GSD.verification[i] = max_idx+1;
+}
+
+__device__ void calculate_macro_xs( double * macro_xs, int mat, double E, Input input, int * num_nucs, int * mats, int max_num_nucs, double * concs, int * n_windows, double * pseudo_K0Rs, Window * windows, Pole * poles, int max_num_windows, int max_num_poles ) 
+{
+	// zero out macro vector
+	for( int i = 0; i < 4; i++ )
+		macro_xs[i] = 0;
+
+	// for nuclide in mat
+	for( int i = 0; i < num_nucs[mat]; i++ )
+	{
+		double micro_xs[4];
+		int nuc = mats[mat * max_num_nucs + i];
+
+		if( input.doppler == 1 )
+			calculate_micro_xs_doppler( micro_xs, nuc, E, input, n_windows, pseudo_K0Rs, windows, poles, max_num_windows, max_num_poles);
+		else
+			calculate_micro_xs( micro_xs, nuc, E, input, n_windows, pseudo_K0Rs, windows, poles, max_num_windows, max_num_poles);
+
+		for( int j = 0; j < 4; j++ )
+		{
+			macro_xs[j] += micro_xs[j] * concs[mat * max_num_nucs + i];
+		}
+		// Debug
+		/*
+		printf("E = %.2lf, mat = %d, macro_xs[0] = %.2lf, macro_xs[1] = %.2lf, macro_xs[2] = %.2lf, macro_xs[3] = %.2lf\n",
+		E, mat, macro_xs[0], macro_xs[1], macro_xs[2], macro_xs[3] );
+		*/
+	}
+
+	// Debug
+	/*
+	printf("E = %.2lf, mat = %d, macro_xs[0] = %.2lf, macro_xs[1] = %.2lf, macro_xs[2] = %.2lf, macro_xs[3] = %.2lf\n",
+	E, mat, macro_xs[0], macro_xs[1], macro_xs[2], macro_xs[3] );
+	*/
+}
+
+// No Temperature dependence (i.e., 0K evaluation)
+__device__ void calculate_micro_xs( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles)
+{
+	// MicroScopic XS's to Calculate
+	double sigT;
+	double sigA;
+	double sigF;
+	double sigE;
+
+	// Calculate Window Index
+	double spacing = 1.0 / n_windows[nuc];
+	int window = (int) ( E / spacing );
+	if( window == n_windows[nuc] )
+		window--;
+
+	// Calculate sigTfactors
+	RSComplex sigTfactors[4]; // Of length input.numL, which is always 4
+	calculate_sig_T(nuc, E, input, pseudo_K0RS, sigTfactors );
+
+	// Calculate contributions from window "background" (i.e., poles outside window (pre-calculated)
+	Window w = windows[nuc * max_num_windows + window];
+	sigT = E * w.T;
+	sigA = E * w.A;
+	sigF = E * w.F;
+
+	// Loop over Poles within window, add contributions
+	for( int i = w.start; i < w.end; i++ )
+	{
+		RSComplex PSIIKI;
+		RSComplex CDUM;
+		Pole pole = poles[nuc * max_num_poles + i];
+		RSComplex t1 = {0, 1};
+		RSComplex t2 = {sqrt(E), 0 };
+		PSIIKI = c_div( t1 , c_sub(pole.MP_EA,t2) );
+		RSComplex E_c = {E, 0};
+		CDUM = c_div(PSIIKI, E_c);
+		sigT += (c_mul(pole.MP_RT, c_mul(CDUM, sigTfactors[pole.l_value])) ).r;
+		sigA += (c_mul( pole.MP_RA, CDUM)).r;
+		sigF += (c_mul(pole.MP_RF, CDUM)).r;
+	}
+
+	sigE = sigT - sigA;
+
+	micro_xs[0] = sigT;
+	micro_xs[1] = sigA;
+	micro_xs[2] = sigF;
+	micro_xs[3] = sigE;
+}
+
+// Temperature Dependent Variation of Kernel
+// (This involves using the Complex Faddeeva function to
+// Doppler broaden the poles within the window)
+__device__ void calculate_micro_xs_doppler( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles )
+{
+	// MicroScopic XS's to Calculate
+	double sigT;
+	double sigA;
+	double sigF;
+	double sigE;
+
+	// Calculate Window Index
+	double spacing = 1.0 / n_windows[nuc];
+	int window = (int) ( E / spacing );
+	if( window == n_windows[nuc] )
+		window--;
+
+	// Calculate sigTfactors
+	RSComplex sigTfactors[4]; // Of length input.numL, which is always 4
+	calculate_sig_T(nuc, E, input, pseudo_K0RS, sigTfactors );
+
+	// Calculate contributions from window "background" (i.e., poles outside window (pre-calculated)
+	Window w = windows[nuc * max_num_windows + window];
+	sigT = E * w.T;
+	sigA = E * w.A;
+	sigF = E * w.F;
+
+	double dopp = 0.5;
+
+	// Loop over Poles within window, add contributions
+	for( int i = w.start; i < w.end; i++ )
+	{
+		Pole pole = poles[nuc * max_num_poles + i];
+
+		// Prep Z
+		RSComplex E_c = {E, 0};
+		RSComplex dopp_c = {dopp, 0};
+		RSComplex Z = c_mul(c_sub(E_c, pole.MP_EA), dopp_c);
+
+		// Evaluate Fadeeva Function
+		RSComplex faddeeva = fast_nuclear_W( Z );
+
+		// Update W
+		sigT += (c_mul( pole.MP_RT, c_mul(faddeeva, sigTfactors[pole.l_value]) )).r;
+		sigA += (c_mul( pole.MP_RA , faddeeva)).r;
+		sigF += (c_mul( pole.MP_RF , faddeeva)).r;
+	}
+
+	sigE = sigT - sigA;
+
+	micro_xs[0] = sigT;
+	micro_xs[1] = sigA;
+	micro_xs[2] = sigF;
+	micro_xs[3] = sigE;
+}
+
+// picks a material based on a probabilistic distribution
+__device__ int pick_mat( uint64_t * seed )
+{
+	// I have a nice spreadsheet supporting these numbers. They are
+	// the fractions (by volume) of material in the core. Not a 
+	// *perfect* approximation of where XS lookups are going to occur,
+	// but this will do a good job of biasing the system nonetheless.
+
+	double dist[12];
+	dist[0]  = 0.140;	// fuel
+	dist[1]  = 0.052;	// cladding
+	dist[2]  = 0.275;	// cold, borated water
+	dist[3]  = 0.134;	// hot, borated water
+	dist[4]  = 0.154;	// RPV
+	dist[5]  = 0.064;	// Lower, radial reflector
+	dist[6]  = 0.066;	// Upper reflector / top plate
+	dist[7]  = 0.055;	// bottom plate
+	dist[8]  = 0.008;	// bottom nozzle
+	dist[9]  = 0.015;	// top nozzle
+	dist[10] = 0.025;	// top of fuel assemblies
+	dist[11] = 0.013;	// bottom of fuel assemblies
+
+	double roll = LCG_random_double(seed);
+
+	// makes a pick based on the distro
+	for( int i = 0; i < 12; i++ )
+	{
+		double running = 0;
+		for( int j = i; j > 0; j-- )
+			running += dist[j];
+		if( roll < running )
+			return i;
+	}
+
+	return 0;
+}
+
+__device__ void calculate_sig_T( int nuc, double E, Input input, double * pseudo_K0RS, RSComplex * sigTfactors )
+{
+	double phi;
+
+	for( int i = 0; i < 4; i++ )
+	{
+		phi = pseudo_K0RS[nuc * input.numL + i] * sqrt(E);
+
+		if( i == 1 )
+			phi -= - atan( phi );
+		else if( i == 2 )
+			phi -= atan( 3.0 * phi / (3.0 - phi*phi));
+		else if( i == 3 )
+			phi -= atan(phi*(15.0-phi*phi)/(15.0-6.0*phi*phi));
+
+		phi *= 2.0;
+
+		sigTfactors[i].r = cos(phi);
+		sigTfactors[i].i = -sin(phi);
+	}
+}
+
+// This function uses a combination of the Abrarov Approximation
+// and the QUICK_W three term asymptotic expansion.
+// Only expected to use Abrarov ~0.5% of the time.
+__device__ RSComplex fast_nuclear_W( RSComplex Z )
+{
+	// Abrarov 
+	if( c_abs(Z) < 6.0 )
+	{
+		// Precomputed parts for speeding things up
+		// (N = 10, Tm = 12.0)
+		RSComplex prefactor = {0, 8.124330e+01};
+		double an[10] = {
+			2.758402e-01,
+			2.245740e-01,
+			1.594149e-01,
+			9.866577e-02,
+			5.324414e-02,
+			2.505215e-02,
+			1.027747e-02,
+			3.676164e-03,
+			1.146494e-03,
+			3.117570e-04
+		};
+		double neg_1n[10] = {
+			-1.0,
+			1.0,
+			-1.0,
+			1.0,
+			-1.0,
+			1.0,
+			-1.0,
+			1.0,
+			-1.0,
+			1.0
+		};
+
+		double denominator_left[10] = {
+			9.869604e+00,
+			3.947842e+01,
+			8.882644e+01,
+			1.579137e+02,
+			2.467401e+02,
+			3.553058e+02,
+			4.836106e+02,
+			6.316547e+02,
+			7.994380e+02,
+			9.869604e+02
+		};
+
+		RSComplex t1 = {0, 12};
+		RSComplex t2 = {12, 0};
+		RSComplex i = {0,1};
+		RSComplex one = {1, 0};
+		RSComplex W = c_div(c_mul(i, ( c_sub(one, fast_cexp(c_mul(t1, Z))) )) , c_mul(t2, Z));
+		RSComplex sum = {0,0};
+		for( int n = 0; n < 10; n++ )
+		{
+			RSComplex t3 = {neg_1n[n], 0};
+			RSComplex top = c_sub(c_mul(t3, fast_cexp(c_mul(t1, Z))), one);
+			RSComplex t4 = {denominator_left[n], 0};
+			RSComplex t5 = {144, 0};
+			RSComplex bot = c_sub(t4, c_mul(t5,c_mul(Z,Z)));
+			RSComplex t6 = {an[n], 0};
+			sum = c_add(sum, c_mul(t6, c_div(top,bot)));
+		}
+		W = c_add(W, c_mul(prefactor, c_mul(Z, sum)));
+		return W;
+	}
+	else
+	{
+		// QUICK_2 3 Term Asymptotic Expansion (Accurate to O(1e-6)).
+		// Pre-computed parameters
+		RSComplex a = {0.512424224754768462984202823134979415014943561548661637413182,0};
+		RSComplex b = {0.275255128608410950901357962647054304017026259671664935783653, 0};
+		RSComplex c = {0.051765358792987823963876628425793170829107067780337219430904, 0};
+		RSComplex d = {2.724744871391589049098642037352945695982973740328335064216346, 0};
+
+		RSComplex i = {0,1};
+		RSComplex Z2 = c_mul(Z, Z);
+		// Three Term Asymptotic Expansion
+		RSComplex W = c_mul(c_mul(Z,i), (c_add(c_div(a,(c_sub(Z2, b))) , c_div(c,(c_sub(Z2, d))))));
+
+		return W;
+	}
+}
+
+__host__ __device__ double LCG_random_double(uint64_t * seed)
+{
+	const uint64_t m = 9223372036854775808ULL; // 2^63
+	const uint64_t a = 2806196910506780709ULL;
+	const uint64_t c = 1ULL;
+	*seed = (a * (*seed) + c) % m;
+	return (double) (*seed) / (double) m;
+}	
+
+__host__ __device__ uint64_t LCG_random_int(uint64_t * seed)
+{
+	const uint64_t m = 9223372036854775808ULL; // 2^63
+	const uint64_t a = 2806196910506780709ULL;
+	const uint64_t c = 1ULL;
+	*seed = (a * (*seed) + c) % m;
+	return *seed;
+}	
+
+__device__ uint64_t fast_forward_LCG(uint64_t seed, uint64_t n)
+{
+	const uint64_t m = 9223372036854775808ULL; // 2^63
+	uint64_t a = 2806196910506780709ULL;
+	uint64_t c = 1ULL;
+
+	n = n % m;
+
+	uint64_t a_new = 1;
+	uint64_t c_new = 0;
+
+	while(n > 0) 
+	{
+		if(n & 1)
+		{
+			a_new *= a;
+			c_new = c_new * a + c;
+		}
+		c *= (a + 1);
+		a *= a;
+
+		n >>= 1;
+	}
+
+	return (a_new * seed + c_new) % m;
+}
+
+// Complex arithmetic functions
+
+__device__ RSComplex c_add( RSComplex A, RSComplex B)
+{
+	RSComplex C;
+	C.r = A.r + B.r;
+	C.i = A.i + B.i;
+	return C;
+}
+
+__device__ RSComplex c_sub( RSComplex A, RSComplex B)
+{
+	RSComplex C;
+	C.r = A.r - B.r;
+	C.i = A.i - B.i;
+	return C;
+}
+
+__host__ __device__ RSComplex c_mul( RSComplex A, RSComplex B)
+{
+	double a = A.r;
+	double b = A.i;
+	double c = B.r;
+	double d = B.i;
+	RSComplex C;
+	C.r = (a*c) - (b*d);
+	C.i = (a*d) + (b*c);
+	return C;
+}
+
+__device__ RSComplex c_div( RSComplex A, RSComplex B)
+{
+	double a = A.r;
+	double b = A.i;
+	double c = B.r;
+	double d = B.i;
+	RSComplex C;
+	double denom = c*c + d*d;
+	C.r = ( (a*c) + (b*d) ) / denom;
+	C.i = ( (b*c) - (a*d) ) / denom;
+	return C;
+}
+
+__device__ double c_abs( RSComplex A)
+{
+	return sqrt(A.r*A.r + A.i * A.i);
+}
+
+
+// Fast (but inaccurate) exponential function
+// Written By "ACMer":
+// https://codingforspeed.com/using-faster-exponential-approximation/
+// We use our own to avoid small differences in compiler specific
+// exp() intrinsic implementations that make it difficult to verify
+// if the code is working correctly or not.
+__device__ double fast_exp(double x)
+{
+  x = 1.0 + x * 0.000244140625;
+  x *= x; x *= x; x *= x; x *= x;
+  x *= x; x *= x; x *= x; x *= x;
+  x *= x; x *= x; x *= x; x *= x;
+  return x;
+}
+
+// Implementation based on:
+// z = x + iy
+// cexp(z) = e^x * (cos(y) + i * sin(y))
+__device__ RSComplex fast_cexp( RSComplex z )
+{
+	double x = z.r;
+	double y = z.i;
+
+	// For consistency across architectures, we
+	// will use our own exponetial implementation
+	//double t1 = exp(x);
+	double t1 = fast_exp(x);
+	double t2 = cos(y);
+	double t3 = sin(y);
+	RSComplex t4 = {t2, t3};
+	RSComplex t5 = {t1, 0};
+	RSComplex result = c_mul(t5, (t4));
+	return result;
+}	
+
diff --git a/hip/utils.hip b/hip/utils.hip
new file mode 100644
index 0000000..ffd966e
--- /dev/null
+++ b/hip/utils.hip
@@ -0,0 +1,31 @@
+#include "rsbench.h"
+
+size_t get_mem_estimate( Input input )
+{
+	size_t poles = input.n_nuclides * input.avg_n_poles * sizeof(Pole) + input.n_nuclides * sizeof(Pole *);
+	size_t windows = input.n_nuclides * input.avg_n_windows * sizeof(Window) + input.n_nuclides * sizeof(Window *);
+	size_t pseudo_K0RS = input.n_nuclides * input.numL * sizeof( double ) + input.n_nuclides * sizeof(double);
+	size_t other = input.n_nuclides * 2 * sizeof(int);
+
+	size_t total = poles + windows + pseudo_K0RS + other;
+	
+	return total;
+}
+
+double get_time(void)
+{
+
+	// If using C, we can do this:
+	/* 
+	struct timeval timecheck;
+	gettimeofday(&timecheck, NULL);
+	long ms = (long)timecheck.tv_sec * 1000 + (long)timecheck.tv_usec / 1000;
+	double time = (double) ms / 1000.0;
+	return time;
+	*/
+	
+	// If using C++, we can do this:
+	unsigned long us_since_epoch = std::chrono::high_resolution_clock::now().time_since_epoch() / std::chrono::microseconds(1);
+	return (double) us_since_epoch / 1.0e6;
+
+}
diff --git a/opencl/Makefile b/opencl/Makefile
index 59f212a..8db7913 100644
--- a/opencl/Makefile
+++ b/opencl/Makefile
@@ -32,12 +32,16 @@ obj = $(source:.c=.o)
 CFLAGS := -std=gnu99 -Wall
 
 # Linker Flags
-LDFLAGS = -lm
+LDFLAGS = -lm -lOpenCL
 
-# Regular gcc Compiler
+# GNU gcc Compiler
 ifeq ($(COMPILER),gnu)
   CC = gcc
-  CFLAGS += -lOpenCL
+endif
+
+# LLVM Clang compiler
+ifeq ($(COMPILER),llvm)
+  CC = clang
 endif
 
 # Debug Flags
diff --git a/opencl/io.c b/opencl/io.c
index 771e562..520ce79 100644
--- a/opencl/io.c
+++ b/opencl/io.c
@@ -321,6 +321,7 @@ int validate_and_print_results(Input input, double runtime, unsigned long vhash,
     lookups = input.lookups;
   int lookups_per_sec = (int) ((double) lookups / runtime);
   int sim_only_lookups_per_sec = (int) ((double) lookups/ sim_runtime);
+  printf("NOTE: Timings are estimated -- use nvprof/nsys/iprof/rocprof for formal analysis\n");
   printf("Total Time Statistics (OpenCL Init / JIT Compilation + Simulation Kernel)\n");
   printf("Runtime:     %.3lf seconds\n", runtime);
   printf("Lookups:     "); fancy_int(lookups);
diff --git a/opencl/kernel.cl b/opencl/kernel.cl
index e7458d0..fb663c3 100644
--- a/opencl/kernel.cl
+++ b/opencl/kernel.cl
@@ -76,6 +76,9 @@ __kernel void macro_xs_lookup_kernel(		Input in,
 {
 	// Get the index of the current element to be processed
 	int i = get_global_id(0);
+  
+  if( i >= in.lookups )
+    return;
 		
 	// Set the initial seed value
 	unsigned long seed = STARTING_SEED;	
@@ -401,27 +404,27 @@ RSComplex fast_nuclear_W( RSComplex Z )
 
 double LCG_random_double(unsigned long * seed)
 {
-	const unsigned long m = 9223372036854775808ULL; // 2^63
-	const unsigned long a = 2806196910506780709ULL;
-	const unsigned long c = 1ULL;
+	const unsigned long m = 9223372036854775808UL; // 2^63
+	const unsigned long a = 2806196910506780709UL;
+	const unsigned long c = 1UL;
 	*seed = (a * (*seed) + c) % m;
 	return (double) (*seed) / (double) m;
 }	
 
 unsigned long LCG_random_int(unsigned long * seed)
 {
-	const unsigned long m = 9223372036854775808ULL; // 2^63
-	const unsigned long a = 2806196910506780709ULL;
-	const unsigned long c = 1ULL;
+	const unsigned long m = 9223372036854775808UL; // 2^63
+	const unsigned long a = 2806196910506780709UL;
+	const unsigned long c = 1UL;
 	*seed = (a * (*seed) + c) % m;
 	return *seed;
 }	
 
 unsigned long fast_forward_LCG(unsigned long seed, unsigned long n)
 {
-	const unsigned long m = 9223372036854775808ULL; // 2^63
-	unsigned long a = 2806196910506780709ULL;
-	unsigned long c = 1ULL;
+	const unsigned long m = 9223372036854775808UL; // 2^63
+	unsigned long a = 2806196910506780709UL;
+	unsigned long c = 1UL;
 
 	n = n % m;
 
diff --git a/opencl/main.c b/opencl/main.c
index deea25a..84779fe 100644
--- a/opencl/main.c
+++ b/opencl/main.c
@@ -6,7 +6,7 @@ int main(int argc, char * argv[])
 	// Initialization & Command Line Read-In
 	// =====================================================================
 
-	int version = 12;
+	int version = 13;
 	double start, stop;
 	
 	// Process CLI Fields
diff --git a/opencl/simulation.c b/opencl/simulation.c
index c2607c3..80a9ced 100644
--- a/opencl/simulation.c
+++ b/opencl/simulation.c
@@ -155,9 +155,14 @@ unsigned long long run_event_based_simulation(Input in, SimulationData SD, doubl
 
   printf("Running event based simulation...\n");
 
-  // Execute the OpenCL kernel on the list
-  size_t global_item_size = in.lookups; // Process the entire lists
-  size_t local_item_size = 8; // Divide work items into groups of 8
+	// Execute the OpenCL kernel on the list
+	size_t global_item_size = in.lookups; // Process the entire lists
+	size_t local_item_size = 256; // Divide work items into groups
+
+  // Add extra work items if global size not evenly divisible by local size
+  if( in.lookups % local_item_size != 0 && in.lookups > local_item_size )
+    global_item_size = ((in.lookups / local_item_size) + 1) * local_item_size;
+
   ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
   check(ret);
 
diff --git a/openmp-offload/Makefile b/openmp-offload/Makefile
index 1063181..6005dda 100644
--- a/openmp-offload/Makefile
+++ b/openmp-offload/Makefile
@@ -2,7 +2,7 @@
 # User Options
 #===============================================================================
 
-COMPILER  = intel
+COMPILER  = llvm
 OPTIMIZE  = yes
 DEBUG     = no
 PROFILE   = no
@@ -30,43 +30,38 @@ obj = $(source:.c=.o)
 # Standard Flags
 CFLAGS := -std=gnu99 -Wall
 
-# gcc Compiler
-ifeq ($(COMPILER),gnu)
-  CC = gcc
-  CFLAGS += -fopenmp -ffast-math -flto
-endif
+# Linker Flags
+LDFLAGS = -lm
 
 # Intel Compiler
 ifeq ($(COMPILER),intel)
   CC = icx 
   CFLAGS += -fiopenmp -fopenmp-targets=spir64 -D__STRICT_ANSI__ 
-  # Optimization Flags
-  ifeq ($(OPTIMIZE),yes)
-    CFLAGS += -O3
-  endif
 endif
 
-# Clang-ykt compiler Targeting P100
-ifeq ($(COMPILER),clang)
+# LLVM Compiler Targeting A100 -- Change SM Level to Target Other GPUs
+ifeq ($(COMPILER),llvm)
   CC = clang
-  CFLAGS += -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60
-  ifeq ($(OPTIMIZE),yes)
-    #CFLAGS += -O3
-    CFLAGS += -Ofast
-  endif
+  CFLAGS += -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_80
 endif
 
-# IBM XL Compiler Targeting K80
+# IBM XL Compiler
 ifeq ($(COMPILER),ibm)
   CC = xlc_r
   CFLAGS += -qsmp=omp -qoffload
-  ifeq ($(OPTIMIZE),yes)
-    CFLAGS += -O2 # O3 gets the wrong verification answer
-  endif
 endif
 
-# Linker Flags
-LDFLAGS = -lm
+# NVIDIA Compiler Targeting A100 -- Change SM Level to Target Other GPUs
+ifeq ($(COMPILER),nvidia)
+  CC = nvc
+  CFLAGS += -mp=gpu -Minfo=mp -gpu=cc80
+endif
+
+# AOMP Targeting MI100 -- Change march to Target Other GPUs
+ifeq ($(COMPILER),amd)
+  CC = clang
+  CFLAGS += -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908
+endif
 
 # Debug Flags
 ifeq ($(DEBUG),yes)
diff --git a/openmp-offload/io.c b/openmp-offload/io.c
index 1c13494..e15ce16 100644
--- a/openmp-offload/io.c
+++ b/openmp-offload/io.c
@@ -281,6 +281,7 @@ void print_input_summary(Input input)
 
 int validate_and_print_results(Input input, double runtime, unsigned long vhash)
 {
+  printf("NOTE: Timings are estimated -- use nvprof/nsys/iprof/rocprof for formal analysis\n");
 	printf("Runtime:               %.3lf seconds\n", runtime);
 	int lookups = 0;
 	if( input.simulation_method == HISTORY_BASED )
diff --git a/openmp-offload/main.c b/openmp-offload/main.c
index a3cf679..0f88ad8 100644
--- a/openmp-offload/main.c
+++ b/openmp-offload/main.c
@@ -6,7 +6,7 @@ int main(int argc, char * argv[])
 	// Initialization & Command Line Read-In
 	// =====================================================================
 
-	int version = 12;
+	int version = 13;
 	double start, stop;
 	
 	// Process CLI Fields
diff --git a/openmp-offload/simulation.c b/openmp-offload/simulation.c
index e7257e9..c4df586 100644
--- a/openmp-offload/simulation.c
+++ b/openmp-offload/simulation.c
@@ -15,13 +15,12 @@
 void run_event_based_simulation(Input input, SimulationData data, unsigned long * vhash_result )
 {
 	printf("Beginning baseline event based simulation on device...\n");
-	unsigned long verification = 0;
+	unsigned long long * verification = (unsigned long long *) malloc(input.lookups * sizeof(unsigned long long));
 
 	int offloaded_to_device = 0;
 
 	// Main simulation loop over macroscopic cross section lookups
 
-	//#pragma omp parallel for reduction(+:verification)
 	#pragma omp target teams distribute parallel for\
 	map(to:data.n_poles[:data.length_n_poles])\
 	map(to:data.n_windows[:data.length_n_windows])\
@@ -35,7 +34,7 @@ void run_event_based_simulation(Input input, SimulationData data, unsigned long
 	map(to:data.max_num_poles)\
 	map(to:data.max_num_windows)\
 	map(tofrom:offloaded_to_device)\
-	reduction(+:verification)
+  map(from:verification[:input.lookups])
 	for( int i = 0; i < input.lookups; i++ )
 	{
 		// Set the initial seed value
@@ -69,12 +68,17 @@ void run_event_based_simulation(Input input, SimulationData data, unsigned long
 				max_idx = x;
 			}
 		}
-		verification += max_idx+1;
+		verification[i] = max_idx+1;
 
 		// Check if we are currently running on the device or not
 		if( i == 0 )
 			offloaded_to_device = !omp_is_initial_device();
 	}
+  
+  // Reduce validation hash on the host
+  unsigned long long validation_hash = 0;
+	for( int i = 0; i < input.lookups; i++ )
+    validation_hash += verification[i];
 
 	// Print if kernel actually ran on the device
 	if( offloaded_to_device )
@@ -82,7 +86,7 @@ void run_event_based_simulation(Input input, SimulationData data, unsigned long
 	else
 		printf( "NOTE - Kernel ran on the host!\n" );
 
-	*vhash_result = verification;
+	*vhash_result = validation_hash;
 }
 
 void calculate_macro_xs( double * macro_xs, int mat, double E, Input input, int * num_nucs, int * mats, int max_num_nucs, double * concs, int * n_windows, double * pseudo_K0Rs, Window * windows, Pole * poles, int max_num_windows, int max_num_poles ) 
diff --git a/openmp-threading/main.c b/openmp-threading/main.c
index 76f0202..2349852 100644
--- a/openmp-threading/main.c
+++ b/openmp-threading/main.c
@@ -6,7 +6,7 @@ int main(int argc, char * argv[])
 	// Initialization & Command Line Read-In
 	// =====================================================================
 
-	int version = 12;
+	int version = 13;
 	double start, stop;
 	
 	// Process CLI Fields
diff --git a/sycl/Makefile b/sycl/Makefile
index 0ce826b..3b4e6df 100644
--- a/sycl/Makefile
+++ b/sycl/Makefile
@@ -2,10 +2,7 @@
 # User Options
 #===============================================================================
 
-# Compiler can be set below, or via environment variable
-ifeq ($(CC),)
-CC    = clang
-endif
+CC        = llvm
 OPTIMIZE  = yes
 DEBUG     = no
 PROFILE   = no
@@ -30,11 +27,6 @@ obj = $(source:.cpp=.o)
 # Sets Flags
 #===============================================================================
 
-# Make sure we didn't pick up lower case cc
-ifeq (cc,$(CC))
-CC    = clang
-endif
-
 # Standard Flags
 CFLAGS := -std=c++14 -Wall
 
@@ -42,13 +34,13 @@ CFLAGS := -std=c++14 -Wall
 LDFLAGS = -lm
 
 # Codeplay Compiler
-ifneq (,$(findstring codeplay,$(CC)))
+ifeq ($(CC),codeplay)
   CC = compute++
   CFLAGS += -sycl -sycl-driver
   LDFLAGS += -lComputeCpp
 endif
 
-ifneq (,$(findstring clang,$(CC)))
+ifeq ($(CC),llvm)
   CC = clang++
   CFLAGS += -fsycl
   LDFLAGS += -lOpenCL
diff --git a/sycl/io.cpp b/sycl/io.cpp
index 518a836..b11d811 100644
--- a/sycl/io.cpp
+++ b/sycl/io.cpp
@@ -290,6 +290,7 @@ int validate_and_print_results(Input input, double runtime, unsigned long vhash,
 	int lookups_per_sec = (int) ((double) lookups / runtime);
 	int sim_only_lookups_per_sec = (int) ((double) lookups/ (runtime-kernel_init_time));
 
+  printf("NOTE: Timings are estimated -- use nvprof/nsys/iprof/rocprof for formal analysis\n");
 	printf("Total Time Statistics (SYCL+OpenCL Init / JIT Compilation + Simulation Kernel)\n");
 	printf("Runtime:               %.3lf seconds\n", runtime);
 	printf("Lookups:               "); fancy_int(lookups);
diff --git a/sycl/main.cpp b/sycl/main.cpp
index 6b8711a..4945ef9 100644
--- a/sycl/main.cpp
+++ b/sycl/main.cpp
@@ -6,7 +6,7 @@ int main(int argc, char * argv[])
 	// Initialization & Command Line Read-In
 	// =====================================================================
 
-	int version = 12;
+	int version = 13;
 	double start, stop;
 	
 	// Process CLI Fields