From 8e89bca78b373c98be82b2b4d98d75b40c9c831e Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 27 May 2023 13:57:04 +1000 Subject: [PATCH 01/91] Exploit PMULL+EOR fusion capability on Apple M1 --- gf16/gf16_clmul_neon.c | 47 ++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/gf16/gf16_clmul_neon.c b/gf16/gf16_clmul_neon.c index b7645184..6af64ff0 100644 --- a/gf16/gf16_clmul_neon.c +++ b/gf16/gf16_clmul_neon.c @@ -11,9 +11,6 @@ static HEDLEY_ALWAYS_INLINE poly8x16_t veorq_p8(poly8x16_t a, poly8x16_t b) { return vreinterpretq_p8_u8(veorq_u8(vreinterpretq_u8_p8(a), vreinterpretq_u8_p8(b))); } -static HEDLEY_ALWAYS_INLINE poly16x8_t veorq_p16(poly16x8_t a, poly16x8_t b) { - return vreinterpretq_p16_u16(veorq_u16(vreinterpretq_u16_p16(a), vreinterpretq_u16_p16(b))); -} #ifdef __aarch64__ typedef poly8x16_t coeff_t; @@ -48,6 +45,34 @@ typedef poly8x8_t coeff_t; # define pmull_high(x, y) vmull_p8(vget_high_p8(x), y) #endif +#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) && defined(__APPLE__) +// Apple M1 supports fusing PMULL+EOR, so ensure these are paired +static HEDLEY_ALWAYS_INLINE poly16x8_t pmacl_low(poly16x8_t sum, poly8x16_t a, poly8x16_t b) { + poly16x8_t result; + __asm__ ("pmull %0.8h,%1.8b,%2.8b\n" + "eor %0.16b,%0.16b,%3.16b\n" + : "=&w"(result) + : "w"(a), "w"(b), "w"(sum) + : /* No clobbers */); + return result; +} +static HEDLEY_ALWAYS_INLINE poly16x8_t pmacl_high(poly16x8_t sum, poly8x16_t a, poly8x16_t b) { + poly16x8_t result; + __asm__ ("pmull2 %0.8h,%1.16b,%2.16b\n" + "eor %0.16b,%0.16b,%3.16b\n" + : "=&w"(result) + : "w"(a), "w"(b), "w"(sum) + : /* No clobbers */); + return result; +} +#else +static HEDLEY_ALWAYS_INLINE poly16x8_t veorq_p16(poly16x8_t a, poly16x8_t b) { + return vreinterpretq_p16_u16(veorq_u16(vreinterpretq_u16_p16(a), vreinterpretq_u16_p16(b))); +} +# define pmacl_low(sum, a, b) veorq_p16(sum, pmull_low(a, b)) +# define pmacl_high(sum, a, b) 
veorq_p16(sum, pmull_high(a, b)) +#endif + static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round1(const void* src, poly16x8_t* low1, poly16x8_t* low2, poly16x8_t* mid1, poly16x8_t* mid2, poly16x8_t* high1, poly16x8_t* high2, const coeff_t* coeff) { poly8x16x2_t data = vld2q_p8((const poly8_t*)src); *low1 = pmull_low(data.val[0], coeff[0]); @@ -76,14 +101,14 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round1(const void* src, poly16x } static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round(const void* src, poly16x8_t* low1, poly16x8_t* low2, poly16x8_t* mid1, poly16x8_t* mid2, poly16x8_t* high1, poly16x8_t* high2, const coeff_t* coeff) { - poly16x8_t _low1, _low2, _mid1, _mid2, _high1, _high2; - gf16_clmul_neon_round1(src, &_low1, &_low2, &_mid1, &_mid2, &_high1, &_high2, coeff); - *low1 = veorq_p16(*low1, _low1); - *low2 = veorq_p16(*low2, _low2); - *mid1 = veorq_p16(*mid1, _mid1); - *mid2 = veorq_p16(*mid2, _mid2); - *high1 = veorq_p16(*high1, _high1); - *high2 = veorq_p16(*high2, _high2); + poly8x16x2_t data = vld2q_p8((const poly8_t*)src); + *low1 = pmacl_low(*low1, data.val[0], coeff[0]); + *low2 = pmacl_high(*low2, data.val[0], coeff[0]); + poly8x16_t mid = veorq_p8(data.val[0], data.val[1]); + *mid1 = pmacl_low(*mid1, mid, coeff[2]); + *mid2 = pmacl_high(*mid2, mid, coeff[2]); + *high1 = pmacl_low(*high1, data.val[1], coeff[1]); + *high2 = pmacl_high(*high2, data.val[1], coeff[1]); } static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, poly16x8_t low2, poly16x8_t mid1, poly16x8_t mid2, poly16x8_t* high1, poly16x8_t high2) { From 95b5d3ed14c5b49c2fb1a6ad3107960c158ed72e Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 27 May 2023 13:57:36 +1000 Subject: [PATCH 02/91] Set dev version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 021cf5e9..fcb7340d 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@animetosho/parpar", - "version": 
"0.4.1", + "version": "0.4.2-dev", "description": "High performance multi-threaded PAR2 creation library", "keywords": [ "par2", From 81da84166b8de020bfe33d7452b04e8f2b91d587 Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 28 May 2023 13:03:57 +1000 Subject: [PATCH 03/91] Clear OpenCL platform list when unloading --- gf16/controller_ocl.cpp | 5 +---- gf16/controller_ocl.h | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/gf16/controller_ocl.cpp b/gf16/controller_ocl.cpp index bee23f60..f1d1e881 100644 --- a/gf16/controller_ocl.cpp +++ b/gf16/controller_ocl.cpp @@ -6,10 +6,6 @@ std::vector PAR2ProcOCL::platforms; -// buffer for zeroing GPU memory -#define ZERO_MEM_SIZE 65536 -#include "gfmat_coeff.h" - int PAR2ProcOCL::load_runtime() { if(load_opencl()) { return 1; @@ -384,6 +380,7 @@ bool PAR2ProcOCL::fillInput(const void* buffer) { return false; } +#include "gfmat_coeff.h" void PAR2ProcOCL::set_coeffs(PAR2ProcOCLStaging& area, unsigned idx, uint16_t inputNum) { uint16_t inputLog = gfmat_input_log(inputNum); auto& coeffs = area.procCoeffs; diff --git a/gf16/controller_ocl.h b/gf16/controller_ocl.h index 09988958..f0e7f7c2 100644 --- a/gf16/controller_ocl.h +++ b/gf16/controller_ocl.h @@ -165,6 +165,7 @@ class PAR2ProcOCL : public IPAR2ProcBackend { public: static int load_runtime(); static inline int unload_runtime() { + platforms.clear(); return unload_opencl(); } static int defaultPlatformId(); From 12fce934067216788e11fedeebd4069f4edf2786 Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 28 May 2023 13:49:18 +1000 Subject: [PATCH 04/91] Fix parpar_gf_init signature when building as an internal module --- nexe/build.js | 1 + 1 file changed, 1 insertion(+) diff --git a/nexe/build.js b/nexe/build.js index dd7b8b8c..25464dae 100644 --- a/nexe/build.js +++ b/nexe/build.js @@ -198,6 +198,7 @@ nexe.compile({ data = data.contents.toString(); const internalModuleRegister = (parseFloat(nodeVer) >= 12) ? 
'NODE_MODULE_CONTEXT_AWARE_INTERNAL' : 'NODE_BUILTIN_MODULE_CONTEXT_AWARE'; data = data.replace(/NODE_MODULE\(/, '#define NODE_WANT_INTERNALS 1\n#include \n' + internalModuleRegister + '('); + data = data.replace(/Local module,\s*void\* priv/, 'Local module, v8::Local context, void* priv'); await compiler.setFileContentsAsync('deps/parpar/src/gf.cc', data); data = await compiler.readFileAsync('deps/parpar/binding.gyp'); From 32338b52cd38b64d96a9099faee79c5badeef8ab Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 3 Jun 2023 21:22:14 +1000 Subject: [PATCH 05/91] Initial matrix inversion implementation --- gf16/gf16_shuffle.h | 5 ++ gf16/gf16_shuffle_ssse3.c | 23 ++++++++ gf16/gf16mul.cpp | 35 ++++++++++-- gf16/gf16mul.h | 12 +++- gf16/gfmat_coeff.c | 24 ++++++++ gf16/gfmat_inv.cpp | 115 ++++++++++++++++++++++++++++++++++++++ gf16/gfmat_inv.h | 12 ++++ 7 files changed, 219 insertions(+), 7 deletions(-) create mode 100644 gf16/gfmat_inv.cpp create mode 100644 gf16/gfmat_inv.h diff --git a/gf16/gf16_shuffle.h b/gf16/gf16_shuffle.h index 3bff4658..2c12db49 100644 --- a/gf16/gf16_shuffle.h +++ b/gf16/gf16_shuffle.h @@ -121,3 +121,8 @@ void* gf16_shuffle_init_128_sve(int polynomial); void* gf16_shuffle_init_512_sve(int polynomial); int gf16_sve_get_size(); + +uint16_t gf16_shuffle8_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_shuffle16_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_shuffle32_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_shuffle64_replace_word(void* data, size_t index, uint16_t newValue); diff --git a/gf16/gf16_shuffle_ssse3.c b/gf16/gf16_shuffle_ssse3.c index b6b9de4a..ddd8247c 100644 --- a/gf16/gf16_shuffle_ssse3.c +++ b/gf16/gf16_shuffle_ssse3.c @@ -21,6 +21,29 @@ #undef _FNSUFFIX #undef _MM_END +static HEDLEY_ALWAYS_INLINE uint16_t gf16_shuffleX_replace_word(void* data, size_t index, uint16_t newValue, size_t width) { + uint8_t* base = (uint8_t*)data + (index & 
~(width-1)) * 2; + unsigned pos = index & (width-1); + if(width > 16) + pos = (pos & 7) | ((pos & ((width/2)-8)) << 1) | ((pos & (width/2)) ? 8 : 0); // handle awkward positioning due to avoiding cross-lane shuffles + uint16_t oldValue = base[pos + width] | (base[pos] << 8); + base[pos + width] = newValue & 0xff; + base[pos] = newValue >> 8; + return oldValue; +} + +uint16_t gf16_shuffle8_replace_word(void* data, size_t index, uint16_t newValue) { // only used for Affine2x + return gf16_shuffleX_replace_word(data, index, newValue, 8); +} +uint16_t gf16_shuffle16_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_shuffleX_replace_word(data, index, newValue, 16); +} +uint16_t gf16_shuffle32_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_shuffleX_replace_word(data, index, newValue, 32); +} +uint16_t gf16_shuffle64_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_shuffleX_replace_word(data, index, newValue, 64); +} void* gf16_shuffle_init_x86(int polynomial) { diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 1def382b..917de3e0 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -507,6 +507,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_ssse3; copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; + replace_word = &gf16_shuffle16_replace_word; break; case GF16_SHUFFLE_AVX: METHOD_REQUIRES(gf16_shuffle_available_avx && scratch) @@ -526,6 +527,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_avx; copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; + replace_word = &gf16_shuffle16_replace_word; break; case GF16_SHUFFLE_AVX2: METHOD_REQUIRES(gf16_shuffle_available_avx2 && scratch) @@ -545,6 +547,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = 
&gf16_shuffle_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; + replace_word = &gf16_shuffle32_replace_word; break; case GF16_SHUFFLE_AVX512: METHOD_REQUIRES(gf16_shuffle_available_avx512 && scratch) @@ -573,6 +576,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; + replace_word = &gf16_shuffle64_replace_word; break; default: break; // for pedantic compilers } @@ -604,6 +608,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; + replace_word = &gf16_shuffle64_replace_word; break; case GF16_SHUFFLE2X_AVX512: scratch = gf16_shuffle_init_x86(GF16_POLYNOMIAL); @@ -631,6 +636,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle2x_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; + replace_word = &gf16_shuffle32_replace_word; break; case GF16_SHUFFLE2X_AVX2: scratch = gf16_shuffle_init_x86(GF16_POLYNOMIAL); @@ -658,6 +664,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle2x_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; + replace_word = &gf16_shuffle16_replace_word; break; case GF16_SHUFFLE_NEON: @@ -842,6 +849,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; + replace_word = &gf16_shuffle64_replace_word; break; case GF16_AFFINE_AVX2: @@ -871,6 +879,7 @@ void 
Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; + replace_word = &gf16_shuffle32_replace_word; break; case GF16_AFFINE_GFNI: @@ -900,6 +909,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_ssse3; copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; + replace_word = &gf16_shuffle16_replace_word; break; case GF16_AFFINE2X_AVX512: @@ -927,6 +937,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_affine2x_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; + replace_word = &gf16_shuffle32_replace_word; break; case GF16_AFFINE2X_AVX2: @@ -954,6 +965,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_affine2x_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; + replace_word = &gf16_shuffle16_replace_word; break; case GF16_AFFINE2X_GFNI: @@ -981,6 +993,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_affine2x_finish_partial_packsum_gfni; copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; + replace_word = &gf16_shuffle8_replace_word; break; case GF16_XOR_JIT_AVX512: @@ -1018,6 +1031,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_sse2; copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; + replace_word = NULL; break; /* case GF16_XOR_JIT_AVX: @@ -1039,6 +1053,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_avx; copy_cksum = &gf16_cksum_copy_sse2; 
copy_cksum_check = &gf16_cksum_copy_check_sse2; + replace_word = NULL; break; */ case GF16_XOR_JIT_AVX2: @@ -1060,6 +1075,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; + replace_word = NULL; break; case GF16_XOR_JIT_AVX512: METHOD_REQUIRES(gf16_xor_available_avx512) @@ -1082,6 +1098,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; + replace_word = NULL; break; default: break; // for pedantic compilers } @@ -1140,6 +1157,7 @@ Galois16Mul::Galois16Mul(Galois16Methods method) { prepare_packed = &Galois16Mul::_prepare_packed_none; finish = &Galois16Mul::_finish_none; finish_packed = NULL; + replace_word = &Galois16Mul::_replace_word; _mul = NULL; _mul_add_pf = NULL; @@ -1218,9 +1236,11 @@ void Galois16Mul::mutScratch_free(void* mutScratch) const { } } -Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned /*outputs*/) { +Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inputs, unsigned /*outputs*/, bool forInvert) { const CpuCap caps(true); (void)regionSizeHint; + (void)inputs; + (void)forInvert; #ifdef PLATFORM_X86 if(caps.hasGFNI) { @@ -1237,7 +1257,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned /*ou } if(caps.hasAVX2) { # ifdef PLATFORM_AMD64 - if(gf16_xor_available_avx2 && caps.canMemWX && caps.propFastJit && !caps.isEmulated) // TODO: check size hint? + if(gf16_xor_available_avx2 && caps.canMemWX && caps.propFastJit && !caps.isEmulated && !forInvert) // TODO: check size hint? 
return GF16_XOR_JIT_AVX2; # endif if(gf16_shuffle_available_avx2) @@ -1245,7 +1265,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned /*ou } if(gf16_affine_available_gfni && caps.hasGFNI && gf16_shuffle_available_ssse3 && caps.hasSSSE3) return GF16_AFFINE2X_GFNI; // this should beat XOR-JIT; even seems to generally beat Shuffle2x AVX2 - if(!caps.isEmulated && (!regionSizeHint || regionSizeHint > caps.propPrefShuffleThresh)) { + if(!caps.isEmulated && regionSizeHint > caps.propPrefShuffleThresh && !forInvert) { // TODO: if only a few recovery slices being made (e.g. 3), prefer shuffle //if(gf16_xor_available_avx && caps.hasAVX && caps.canMemWX) // return GF16_XOR_JIT_AVX; @@ -1260,12 +1280,15 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned /*ou return GF16_XOR_SSE2; #endif #ifdef PLATFORM_ARM - if(caps.hasSVE2) - return gf16_sve_get_size() >= 64 ? GF16_SHUFFLE_512_SVE2 : GF16_CLMUL_SVE2; + if(caps.hasSVE2) { + if(gf16_sve_get_size() >= 64) + return GF16_SHUFFLE_512_SVE2; + return inputs > 3 ? GF16_CLMUL_SVE2 : GF16_SHUFFLE_128_SVE2; + } if(caps.hasSVE && gf16_sve_get_size() > 16) return GF16_SHUFFLE_128_SVE; if(gf16_available_neon && caps.hasNEON) - return GF16_CLMUL_NEON; + return inputs > 3 ? 
GF16_CLMUL_NEON : GF16_SHUFFLE_NEON; #endif diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index d5baa952..9d3e7a59 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -15,6 +15,9 @@ typedef void(*Galois16MulUntransformPacked) (void *HEDLEY_RESTRICT dst, const vo typedef int(*Galois16MulUntransformPackedCksum) (void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); typedef int(*Galois16MulUntransformPackedCksumPartial) (void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); +typedef uint16_t(*Galois16ReplaceWord) (void* data, size_t index, uint16_t newValue); + + typedef void(*Galois16MulFunc) (const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); typedef void(*Galois16MulPfFunc) (const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); typedef void(*Galois16PowFunc) (const void *HEDLEY_RESTRICT scratch, unsigned outputs, size_t offset, void **HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); @@ -122,6 +125,12 @@ class Galois16Mul { } static void _finish_none(void *HEDLEY_RESTRICT, size_t) {} static void _prepare_packed_none(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); + static uint16_t _replace_word(void* data, size_t index, uint16_t newValue) { + uint16_t* p = (uint16_t*)data + index; + uint16_t oldValue = *p; + *p = newValue; + return oldValue; + } Galois16Methods _method; @@ -136,7 +145,7 @@ class Galois16Mul { #endif public: - static 
Galois16Methods default_method(size_t regionSizeHint = 0, unsigned outputs = 0); + static Galois16Methods default_method(size_t regionSizeHint = 1048576, unsigned inputs = 32768, unsigned outputs = 65535, bool forInvert = false); Galois16Mul(Galois16Methods method = GF16_AUTO); ~Galois16Mul(); @@ -199,6 +208,7 @@ class Galois16Mul { Galois16MulUntransformPacked finish_packed; Galois16MulUntransformPackedCksum finish_packed_cksum; Galois16MulUntransformPackedCksumPartial finish_partial_packsum; + Galois16ReplaceWord replace_word; Galois16AddMultiFunc add_multi; Galois16AddPackedFunc add_multi_packed; Galois16AddPackPfFunc add_multi_packpf; diff --git a/gf16/gfmat_coeff.c b/gf16/gfmat_coeff.c index df31276b..4cd8e8b8 100644 --- a/gf16/gfmat_coeff.c +++ b/gf16/gfmat_coeff.c @@ -3,15 +3,24 @@ static int8_t* input_diff = NULL; // difference between predicted input coefficient and actual (number range is -4...5, so could be compressed to 4 bits, but I don't feel it's worth the savings) static uint16_t* gf_exp = NULL; // pre-calculated exponents in GF(2^16), missing bottom 3 bits, followed by 128-entry polynomial shift table +#ifdef PARPAR_INVERT_SUPPORT +uint16_t* gf16_recip = NULL; // full GF(2^16) reciprocal table +#endif void gfmat_init() { if(input_diff) return; input_diff = (int8_t*)malloc(32768); gf_exp = (uint16_t*)malloc((8192+128)*2); +#ifdef PARPAR_INVERT_SUPPORT + gf16_recip = (uint16_t*)malloc(65536*2); +#endif int exp = 0, n = 1; for (int i = 0; i < 32768; i++) { do { +#ifdef PARPAR_INVERT_SUPPORT + gf16_recip[n] = exp; // essentially construct a log table, then alter it later to get the reciprocal +#endif if((exp & 7) == 0) gf_exp[exp>>3] = n; exp++; // exp will reach 65534 by the end of the loop n <<= 1; @@ -20,6 +29,9 @@ void gfmat_init() { input_diff[i] = exp - i*2; } +#ifdef PARPAR_INVERT_SUPPORT + gf16_recip[n] = exp; +#endif // correction values for handling the missing bottom 3 bits of exp // essentially this is a table to speed up multiplication by 
0...127 by applying the effects of polynomial masking @@ -31,6 +43,14 @@ void gfmat_init() { } gf_exp[8192+i] = n; } + +#ifdef PARPAR_INVERT_SUPPORT + gf16_recip[1] = 65535; + // exponentiate for reciprocals + for (int i = 1; i < 65536; i++) { + gf16_recip[i] = gf16_exp(65535 - gf16_recip[i]); + } +#endif } void gfmat_free() { @@ -38,6 +58,10 @@ void gfmat_free() { free(gf_exp); input_diff = NULL; gf_exp = NULL; +#ifdef PARPAR_INVERT_SUPPORT + free(gf16_recip); + gf16_recip = NULL; +#endif } HEDLEY_CONST uint16_t gf16_exp(uint_fast16_t v) { diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp new file mode 100644 index 00000000..2bc5cd1a --- /dev/null +++ b/gf16/gfmat_inv.cpp @@ -0,0 +1,115 @@ +#include "gfmat_coeff.h" + +#ifdef PARPAR_INVERT_SUPPORT +extern "C" uint16_t* gf16_recip; + +#include +#include "../src/platform.h" // for ALIGN_* +#include "gf16mul.h" + +uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned validCount, std::vector& recovery, unsigned& stride) { + unsigned matWidth = inputValid.size() * sizeof(uint16_t); + Galois16Mul gf(Galois16Mul::default_method(matWidth, inputValid.size(), inputValid.size(), true)); + stride = gf.alignToStride(matWidth); + const auto gfInfo = gf.info(); + void* gfScratch = gf.mutScratch_alloc(); + + unsigned invalidCount = inputValid.size() - validCount; + assert(validCount < inputValid.size()); // i.e. invalidCount > 0 + + uint16_t* mat; + ALIGN_ALLOC(mat, invalidCount * stride, gfInfo.alignment); + + unsigned validCol, missingCol; + unsigned stride16 = stride / sizeof(uint16_t); + assert(stride16 * sizeof(uint16_t) == stride); + + invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? 
+ if(invalidCount > recovery.size()) { // not enough recovery + gf.mutScratch_free(gfScratch); + ALIGN_FREE(mat); + return nullptr; + } + + // generate matrix + validCol = 0; + missingCol = validCount; + for(unsigned input = 0; input < inputValid.size(); input++) { + uint16_t inputLog = gfmat_input_log(input); + unsigned targetCol = inputValid.at(input) ? validCol++ : missingCol++; + for(unsigned rec = 0; rec < invalidCount; rec++) { + mat[rec * stride16 + targetCol] = gfmat_coeff_from_log(inputLog, recovery.at(rec)); + } + } + assert(validCol == validCount); + + // pre-transform + if(gf.needPrepare()) { + for(unsigned rec = 0; rec < invalidCount; rec++) { + uint16_t* row = mat + rec * stride16; + //memset(row + matWidth, 0, stride - matWidth); // not necessary, but do this to avoid uninitialized memory + gf.prepare(row, row, stride); + } + } + + // invert + // TODO: optimise: multi-thread + packed arrangement + // TODO: progress hook + missingCol = validCount; + for(unsigned rec = 0; rec < invalidCount; rec++) { + uint16_t* row = mat + rec * stride16; + // scale down factor + uint16_t baseCoeff = gf.replace_word(row, missingCol, 1); + if(HEDLEY_UNLIKELY(baseCoeff == 0)) { // bad recovery coeff + // ignore this recovery row and try again + recovery.erase(recovery.begin() + rec); + goto invert_loop; + } + baseCoeff = gf16_recip[baseCoeff]; // TODO: consider prefetching this? + if(HEDLEY_LIKELY(baseCoeff != 1)) { + gf.mul(row, row, stride, baseCoeff, gfScratch); + } + + for(unsigned rec2 = 0; rec2 < invalidCount; rec2++) { + if(HEDLEY_UNLIKELY(rec == rec2)) continue; + uint16_t* row2 = mat + rec2 * stride16; + uint16_t coeff = gf.replace_word(row2, missingCol, 0); + if(HEDLEY_LIKELY(coeff != 0)) { + gf.mul_add(row2, row, stride, coeff, gfScratch); + } // TODO: is a coefficient of 0 ever correct? 
+ } + + missingCol++; + } + + // post transform + if(gf.needPrepare()) { + for(unsigned rec = 0; rec < invalidCount; rec++) { + uint16_t* row = mat + rec * stride16; + gf.finish(row, stride); + + /* + // check for zeroes; TODO: does this need to be the full row? + for(unsigned col = validCount; col < inputValid.size(); col++) { + if(HEDLEY_UNLIKELY(row[col] == 0)) { // bad coeff + recovery.erase(recovery.begin() + rec); + goto invert_loop; + } + } + */ + } + } + } + + // remove excess recovery + recovery.resize(invalidCount); + + gf.mutScratch_free(gfScratch); + return mat; +} + +void free_recovery_matrix(uint16_t* mat) { + ALIGN_FREE(mat); +} + +#endif diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h new file mode 100644 index 00000000..c70cacba --- /dev/null +++ b/gf16/gfmat_inv.h @@ -0,0 +1,12 @@ +#ifndef GFMAT_INV_H +#define GFMAT_INV_H + +#include +#include "../src/stdint.h" + +#ifdef PARPAR_INVERT_SUPPORT +uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned validCount, std::vector& recovery, unsigned& stride); +void free_recovery_matrix(uint16_t* mat); +#endif + +#endif From 7accee96259b6fa7dd501734cddd991994310d4f Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 3 Jun 2023 21:25:56 +1000 Subject: [PATCH 06/91] Compiler warning --- gf16/gf_add_x86.h | 55 ++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/gf16/gf_add_x86.h b/gf16/gf_add_x86.h index f44921b3..0024f0df 100644 --- a/gf16/gf_add_x86.h +++ b/gf16/gf_add_x86.h @@ -51,34 +51,35 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf_add_x)( if(vecStride == 16) { // for xor kernels, need to do 4x prefetch - const char* pfBase; - if(doPrefetch) pfBase = _pf+(ptr>>1); - if(doPrefetch == 1) { - _mm_prefetch(pfBase, MM_HINT_WT1); - _mm_prefetch(pfBase+64, MM_HINT_WT1); - if(sizeof(_mword) > 16) { - _mm_prefetch(pfBase+128, MM_HINT_WT1); - _mm_prefetch(pfBase+192, MM_HINT_WT1); + if(doPrefetch) { + const char* pfBase = _pf+(ptr>>1); + 
if(doPrefetch == 1) { + _mm_prefetch(pfBase, MM_HINT_WT1); + _mm_prefetch(pfBase+64, MM_HINT_WT1); + if(sizeof(_mword) > 16) { + _mm_prefetch(pfBase+128, MM_HINT_WT1); + _mm_prefetch(pfBase+192, MM_HINT_WT1); + } + if(sizeof(_mword) > 32) { + _mm_prefetch(pfBase+256, MM_HINT_WT1); + _mm_prefetch(pfBase+320, MM_HINT_WT1); + _mm_prefetch(pfBase+384, MM_HINT_WT1); + _mm_prefetch(pfBase+448, MM_HINT_WT1); + } } - if(sizeof(_mword) > 32) { - _mm_prefetch(pfBase+256, MM_HINT_WT1); - _mm_prefetch(pfBase+320, MM_HINT_WT1); - _mm_prefetch(pfBase+384, MM_HINT_WT1); - _mm_prefetch(pfBase+448, MM_HINT_WT1); - } - } - if(doPrefetch == 2) { - _mm_prefetch(pfBase, _MM_HINT_T1); - _mm_prefetch(pfBase+64, _MM_HINT_T1); - if(sizeof(_mword) > 16) { - _mm_prefetch(pfBase+128, _MM_HINT_T1); - _mm_prefetch(pfBase+192, _MM_HINT_T1); - } - if(sizeof(_mword) > 32) { - _mm_prefetch(pfBase+256, _MM_HINT_T1); - _mm_prefetch(pfBase+320, _MM_HINT_T1); - _mm_prefetch(pfBase+384, _MM_HINT_T1); - _mm_prefetch(pfBase+448, _MM_HINT_T1); + if(doPrefetch == 2) { + _mm_prefetch(pfBase, _MM_HINT_T1); + _mm_prefetch(pfBase+64, _MM_HINT_T1); + if(sizeof(_mword) > 16) { + _mm_prefetch(pfBase+128, _MM_HINT_T1); + _mm_prefetch(pfBase+192, _MM_HINT_T1); + } + if(sizeof(_mword) > 32) { + _mm_prefetch(pfBase+256, _MM_HINT_T1); + _mm_prefetch(pfBase+320, _MM_HINT_T1); + _mm_prefetch(pfBase+384, _MM_HINT_T1); + _mm_prefetch(pfBase+448, _MM_HINT_T1); + } } } } else { From ae7af0fa682c12a882dd7173b7f73f4a2ed77627 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 6 Jun 2023 12:27:19 +1000 Subject: [PATCH 07/91] Enable in-place mul/prepare for use in inversion + replace_word fixes --- gf16/gf16_affine.h | 5 +- gf16/gf16_affine2x_x86.h | 6 +- gf16/gf16_affine_avx2.c | 25 +++- gf16/gf16_affine_avx512.c | 25 +++- gf16/gf16_affine_gfni.c | 24 +++- gf16/gf16_clmul.h | 1 + gf16/gf16_clmul_neon.c | 43 ++++-- gf16/gf16_clmul_sve2.c | 18 +++ gf16/gf16_global.h | 14 +- gf16/gf16_lookup.c | 4 +- gf16/gf16_lookup.h | 6 +- 
gf16/gf16_lookup_sse2.c | 2 +- gf16/gf16_shuffle.h | 20 +-- gf16/gf16_shuffle2x128_sve2.c | 37 ++++++ gf16/gf16_shuffle2x_x86.h | 6 +- gf16/gf16_shuffle512_sve2.c | 24 ++++ gf16/gf16_shuffle_neon.c | 2 +- gf16/gf16_shuffle_ssse3.c | 19 ++- gf16/gf16_shuffle_vbmi.c | 2 +- gf16/gf16_shuffle_x86.h | 2 +- gf16/gf16_shuffle_x86_prepare.h | 4 +- gf16/gf16_xor.h | 10 +- gf16/gf16_xor_avx2.c | 73 +++++++---- gf16/gf16_xor_avx512.c | 16 +-- gf16/gf16_xor_common.h | 25 +++- gf16/gf16_xor_common_funcs.h | 95 ++++++++++---- gf16/gf16_xor_sse2.c | 225 ++++++++++++++++++++++---------- gf16/gf16mul.cpp | 37 ++++-- gf16/gf16mul.h | 49 +++---- 29 files changed, 586 insertions(+), 233 deletions(-) diff --git a/gf16/gf16_affine.h b/gf16/gf16_affine.h index b25f4eb9..254e5334 100644 --- a/gf16/gf16_affine.h +++ b/gf16/gf16_affine.h @@ -2,7 +2,7 @@ #include "../src/hedley.h" #define FUNCS(v) \ - void gf16_affine_mul_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_affine_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine_muladd_prefetch_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); \ void gf16_affine_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ @@ -21,11 +21,12 @@ FUNCS(avx512); #undef FUNCS 
#define FUNCS(v) \ + void gf16_affine2x_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine2x_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine2x_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine2x_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine2x_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); \ - void gf16_affine2x_prepare_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen); \ + void gf16_affine2x_prepare_##v(void* dst, const void* src, size_t srcLen); \ void gf16_affine2x_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_affine2x_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_affine2x_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t 
sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); \ diff --git a/gf16/gf16_affine2x_x86.h b/gf16/gf16_affine2x_x86.h index acf522ce..b6391622 100644 --- a/gf16/gf16_affine2x_x86.h +++ b/gf16/gf16_affine2x_x86.h @@ -4,12 +4,12 @@ #ifdef _AVAILABLE # include "gf16_checksum_x86.h" -static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_prepare_block)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_prepare_block)(void* dst, const void* src) { _mword data = _MMI(loadu)((_mword*)src); data = separate_low_high(data); _MMI(store)((_mword*)dst, data); } -static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_prepare_blocku)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_prepare_blocku)(void* dst, const void* src, size_t remaining) { _mword data = partial_load(src, remaining); data = separate_low_high(data); _MMI(store)((_mword*)dst, data); @@ -62,7 +62,7 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_finish_copy_blocku)(void *HED } #endif -void _FN(gf16_affine2x_prepare)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen) { +void _FN(gf16_affine2x_prepare)(void* dst, const void* src, size_t srcLen) { #ifdef _AVAILABLE gf16_prepare(dst, src, srcLen, sizeof(_mword), &_FN(gf16_affine2x_prepare_block), &_FN(gf16_affine2x_prepare_blocku)); _MM_END diff --git a/gf16/gf16_affine_avx2.c b/gf16/gf16_affine_avx2.c index 80eaea34..1d42e328 100644 --- a/gf16/gf16_affine_avx2.c +++ b/gf16/gf16_affine_avx2.c @@ -53,7 +53,7 @@ static HEDLEY_ALWAYS_INLINE __m256i gf16_affine_load_matrix(const void *HEDLEY_R } #endif -void gf16_affine_mul_avx2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_affine_mul_avx2(const void *HEDLEY_RESTRICT 
scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__GFNI__) && defined(__AVX2__) __m256i depmask = gf16_affine_load_matrix(scratch, coefficient); @@ -336,6 +336,29 @@ static HEDLEY_ALWAYS_INLINE void gf16_affine2x_muladd_x_avx2( } #endif /*defined(__GFNI__) && defined(__AVX2__)*/ +void gf16_affine2x_mul_avx2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#if defined(__GFNI__) && defined(__AVX2__) + __m256i depmask = gf16_affine_load_matrix(scratch, coefficient); + __m256i matNorm = _mm256_inserti128_si256(depmask, _mm256_castsi256_si128(depmask), 1); + __m256i matSwap = _mm256_permute2x128_si256(depmask, depmask, 0x11); + + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m256i)) { + __m256i data = _mm256_load_si256((__m256i*)(_src + ptr)); + __m256i result1 = _mm256_gf2p8affine_epi64_epi8(data, matNorm, 0); + __m256i result2 = _mm256_gf2p8affine_epi64_epi8(data, matSwap, 0); + + result1 = _mm256_xor_si256(result1, _mm256_shuffle_epi32(result2, _MM_SHUFFLE(1,0,3,2))); + _mm256_store_si256((__m256i*)(_dst + ptr), result1); + } +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); +#endif +} + void gf16_affine2x_muladd_avx2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__GFNI__) && defined(__AVX2__) diff --git a/gf16/gf16_affine_avx512.c b/gf16/gf16_affine_avx512.c index a848d0d2..67f11f36 100644 --- a/gf16/gf16_affine_avx512.c +++ b/gf16/gf16_affine_avx512.c @@ -87,7 +87,7 @@ static HEDLEY_ALWAYS_INLINE __m512i gf16_affine_load2_matrix(const void *HEDLEY_ } #endif -void gf16_affine_mul_avx512(const void 
*HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_affine_mul_avx512(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) __m256i depmask = gf16_affine_load_matrix(scratch, coefficient); @@ -465,6 +465,29 @@ static HEDLEY_ALWAYS_INLINE void gf16_affine2x_muladd_x_avx512( #endif /*defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__)*/ +void gf16_affine2x_mul_avx512(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) + __m512i depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficient)); + __m512i matNorm = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + __m512i matSwap = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m512i)) { + __m512i data = _mm512_load_si512((__m512i*)(_src + ptr)); + __m512i result = _mm512_gf2p8affine_epi64_epi8(data, matNorm, 0); + __m512i swapped = _mm512_gf2p8affine_epi64_epi8(data, matSwap, 0); + + result = _mm512_xor_si512(result, _mm512_shuffle_epi32(swapped, _MM_SHUFFLE(1,0,3,2))); + _mm512_store_si512((__m512i*)(_dst + ptr), result); + } +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); +#endif +} + void gf16_affine2x_muladd_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__GFNI__) && 
defined(__AVX512BW__) && defined(__AVX512VL__) diff --git a/gf16/gf16_affine_gfni.c b/gf16/gf16_affine_gfni.c index be1bc9e4..e7668cdb 100644 --- a/gf16/gf16_affine_gfni.c +++ b/gf16/gf16_affine_gfni.c @@ -56,7 +56,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_affine_load_matrix(const void *HEDLEY_REST } #endif -void gf16_affine_mul_gfni(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_affine_mul_gfni(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__GFNI__) && defined(__SSSE3__) __m128i depmask1, depmask2; @@ -364,6 +364,28 @@ static HEDLEY_ALWAYS_INLINE void gf16_affine2x_muladd_x_gfni( } #endif /*defined(__GFNI__) && defined(__SSSE3__)*/ +void gf16_affine2x_mul_gfni(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#if defined(__GFNI__) && defined(__SSSE3__) + __m128i matNorm, matSwap; + gf16_affine_load_matrix(scratch, coefficient, &matNorm, &matSwap); + + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m128i)) { + __m128i data = _mm_load_si128((__m128i*)(_src + ptr)); + __m128i result1 = _mm_gf2p8affine_epi64_epi8(data, matNorm, 0); + __m128i result2 = _mm_gf2p8affine_epi64_epi8(data, matSwap, 0); + + result1 = _mm_xor_si128(result1, _mm_shuffle_epi32(result2, _MM_SHUFFLE(1,0,3,2))); + _mm_store_si128((__m128i*)(_dst + ptr), result1); + } +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); +#endif +} + void gf16_affine2x_muladd_gfni(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) 
{ UNUSED(mutScratch); #if defined(__GFNI__) && defined(__SSSE3__) diff --git a/gf16/gf16_clmul.h b/gf16/gf16_clmul.h index b5be1ff0..696f0dcc 100644 --- a/gf16/gf16_clmul.h +++ b/gf16/gf16_clmul.h @@ -5,6 +5,7 @@ void gf16_clmul_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_clmul_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_clmul_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); \ + void gf16_clmul_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_clmul_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_clmul_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_clmul_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ diff --git a/gf16/gf16_clmul_neon.c b/gf16/gf16_clmul_neon.c index 6af64ff0..7d73a938 100644 --- a/gf16/gf16_clmul_neon.c +++ b/gf16/gf16_clmul_neon.c @@ 
-36,6 +36,7 @@ static HEDLEY_ALWAYS_INLINE poly16x8_t pmull_high(poly8x16_t a, poly8x16_t b) { # define pmull_low(x, y) vmull_p8(vget_low_p8(x), vget_low_p8(y)) # define pmull_high vmull_high_p8 # endif +# define coeff_fn(f1, f2) f1##q_##f2 #else static HEDLEY_ALWAYS_INLINE poly8x8_t veor_p8(poly8x8_t a, poly8x8_t b) { return vreinterpret_p8_u8(veor_u8(vreinterpret_u8_p8(a), vreinterpret_u8_p8(b))); @@ -43,6 +44,7 @@ static HEDLEY_ALWAYS_INLINE poly8x8_t veor_p8(poly8x8_t a, poly8x8_t b) { typedef poly8x8_t coeff_t; # define pmull_low(x, y) vmull_p8(vget_low_p8(x), y) # define pmull_high(x, y) vmull_p8(vget_high_p8(x), y) +# define coeff_fn(f1, f2) f1##_##f2 #endif #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) && defined(__APPLE__) @@ -178,18 +180,11 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_muladd_x_neon( for(int src=0; src> 8; -#ifdef __aarch64__ - coeff[src*CLMUL_COEFF_PER_REGION +0] = vdupq_n_p8(lo); - coeff[src*CLMUL_COEFF_PER_REGION +1] = vdupq_n_p8(hi); - coeff[src*CLMUL_COEFF_PER_REGION +2] = veorq_p8(coeff[src*CLMUL_COEFF_PER_REGION +0], coeff[src*CLMUL_COEFF_PER_REGION +1]); - - // if we want to have one register per region, at the expense of 2 extra instructions per region + coeff[src*CLMUL_COEFF_PER_REGION +0] = coeff_fn(vdup, n_p8)(lo); + coeff[src*CLMUL_COEFF_PER_REGION +1] = coeff_fn(vdup, n_p8)(hi); + coeff[src*CLMUL_COEFF_PER_REGION +2] = coeff_fn(veor, p8)(coeff[src*CLMUL_COEFF_PER_REGION +0], coeff[src*CLMUL_COEFF_PER_REGION +1]); + // if we want to have one register per region (AArch64), at the expense of 2 extra instructions per region //coeff[src] = vcombine_p8(vdup_n_p8(lo), vdup_n_p8(hi)); -#else - coeff[src*CLMUL_COEFF_PER_REGION +0] = vdup_n_p8(lo); - coeff[src*CLMUL_COEFF_PER_REGION +1] = vdup_n_p8(hi); - coeff[src*CLMUL_COEFF_PER_REGION +2] = veor_p8(coeff[src*CLMUL_COEFF_PER_REGION +0], coeff[src*CLMUL_COEFF_PER_REGION +1]); -#endif } poly16x8_t low1, low2, mid1, mid2, high1, high2; @@ -249,6 +244,32 @@ 
static HEDLEY_ALWAYS_INLINE void gf16_clmul_muladd_x_neon( +void gf16_clmul_mul_neon(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); UNUSED(scratch); +#if defined(__ARM_NEON) + + coeff_t coeff[3]; + coeff[0] = coeff_fn(vdup, n_p8)(val & 0xff); + coeff[1] = coeff_fn(vdup, n_p8)(val >> 8); + coeff[2] = coeff_fn(veor, p8)(coeff[0], coeff[1]); + + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + poly16x8_t low1, low2, mid1, mid2, high1, high2; + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(uint8x16_t)*2) { + gf16_clmul_neon_round1(_src+ptr, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff); + gf16_clmul_neon_reduction(&low1, low2, mid1, mid2, &high1, high2); + uint8x16x2_t out; + out.val[0] = vreinterpretq_u8_p16(low1); + out.val[1] = vreinterpretq_u8_p16(high1); + vst2q_u8(_dst+ptr, out); + } +#else + UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + + void gf16_clmul_muladd_neon(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__ARM_NEON) diff --git a/gf16/gf16_clmul_sve2.c b/gf16/gf16_clmul_sve2.c index ad225827..913dc8a9 100644 --- a/gf16/gf16_clmul_sve2.c +++ b/gf16/gf16_clmul_sve2.c @@ -178,6 +178,24 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_muladd_x_sve2( +void gf16_clmul_mul_sve2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); UNUSED(scratch); +#if defined(__ARM_FEATURE_SVE2) + svuint8_t coeff = svreinterpret_u8_u16(svdup_n_u16(val)); + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + + svuint8_t low1, low2, mid1, mid2, high1, high2; + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += svcntb()*2) { + gf16_clmul_sve2_round(_src+ptr, &low1, 
&low2, &mid1, &mid2, &high1, &high2, coeff); + gf16_clmul_sve2_reduction(&low1, low2, mid1, mid2, &high1, high2); + svst2_u8(svptrue_b8(), _dst+ptr, svcreate2_u8(low1, high1)); + } +#else + UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + void gf16_clmul_muladd_sve2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__ARM_FEATURE_SVE2) diff --git a/gf16/gf16_global.h b/gf16/gf16_global.h index ce72593e..4951b544 100644 --- a/gf16/gf16_global.h +++ b/gf16/gf16_global.h @@ -31,13 +31,15 @@ typedef void (CONST_PTR gf16_checksum_exp)(void *HEDLEY_RESTRICT checksum, uint16_t exp); typedef void (CONST_PTR gf16_checksum_block)(const void *HEDLEY_RESTRICT src, void *HEDLEY_RESTRICT checksum, const size_t blockLen, const int aligned); typedef void (CONST_PTR gf16_checksum_blocku)(const void *HEDLEY_RESTRICT src, size_t amount, void *HEDLEY_RESTRICT checksum); -typedef void (CONST_PTR gf16_transform_block)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src); -typedef void (CONST_PTR gf16_transform_blocku)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining); -typedef void (CONST_PTR gf16_prepare_checksum)(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block prepareBlock); +typedef void (CONST_PTR gf16_transform_block)(void* dst, const void* src); +typedef void (CONST_PTR gf16_transform_blocku)(void* dst, const void* src, size_t remaining); +typedef void (CONST_PTR gf16_transform_block_rst)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src); +typedef void (CONST_PTR gf16_transform_blocku_rst)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining); +typedef void (CONST_PTR gf16_prepare_checksum)(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block_rst 
prepareBlock); typedef void (CONST_PTR gf16_finish_block)(void *HEDLEY_RESTRICT dst); #undef CONST_PTR -static HEDLEY_ALWAYS_INLINE void gf16_prepare(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, const size_t blockLen, gf16_transform_block prepareBlock, gf16_transform_blocku prepareBlockU) { +static HEDLEY_ALWAYS_INLINE void gf16_prepare(void* dst, const void* src, size_t srcLen, const size_t blockLen, gf16_transform_block prepareBlock, gf16_transform_blocku prepareBlockU) { size_t remaining = srcLen % blockLen; size_t len = srcLen - remaining; uint8_t* _src = (uint8_t*)src + len; @@ -79,7 +81,7 @@ static HEDLEY_ALWAYS_INLINE void* gf16_checksum_ptr(void* ptr, size_t sliceLen, #include #include "gfmat_coeff.h" static HEDLEY_ALWAYS_INLINE void gf16_prepare_packed( - void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, const size_t blockLen, gf16_transform_block prepareBlock, gf16_transform_blocku prepareBlockU, + void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, const size_t blockLen, gf16_transform_block_rst prepareBlock, gf16_transform_blocku_rst prepareBlockU, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, const unsigned interleaveSize, size_t partOffset, size_t partLen, void *HEDLEY_RESTRICT checksum, gf16_checksum_block checksumBlock, gf16_checksum_blocku checksumBlockU, gf16_checksum_exp checksumExp, gf16_prepare_checksum prepareChecksum @@ -217,7 +219,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_prepare_packed( static HEDLEY_ALWAYS_INLINE int gf16_finish_packed( - void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, const size_t blockLen, gf16_transform_block finishBlock, gf16_transform_blocku finishBlockU, + void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, const size_t blockLen, gf16_transform_block_rst finishBlock, gf16_transform_blocku_rst finishBlockU, unsigned numOutputs, unsigned outputNum, 
size_t chunkLen, const unsigned interleaveSize, size_t partOffset, size_t partLen, gf16_checksum_block checksumBlock, gf16_checksum_blocku checksumBlockU, gf16_checksum_exp checksumExp, gf16_finish_block inlineFinishBlock, diff --git a/gf16/gf16_lookup.c b/gf16/gf16_lookup.c index 0e9b4060..c8ceef7a 100644 --- a/gf16/gf16_lookup.c +++ b/gf16/gf16_lookup.c @@ -182,7 +182,7 @@ static HEDLEY_ALWAYS_INLINE void calc_table(uint16_t coefficient, uint16_t* lhta #endif -void gf16_lookup_mul(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_lookup_mul(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(scratch); UNUSED(mutScratch); uint16_t lhtable[512]; calc_table(coefficient, lhtable); @@ -393,7 +393,7 @@ static HEDLEY_ALWAYS_INLINE void calc_3table(uint16_t coefficient, struct gf16_l } } -void gf16_lookup3_mul(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_lookup3_mul(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(scratch); UNUSED(mutScratch); struct gf16_lookup3_tables lookup; calc_3table(coefficient, &lookup); diff --git a/gf16/gf16_lookup.h b/gf16/gf16_lookup.h index adce8b5d..9eaaf6f1 100644 --- a/gf16/gf16_lookup.h +++ b/gf16/gf16_lookup.h @@ -5,15 +5,15 @@ #include "../src/stdint.h" #include -void gf16_lookup_mul(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); +void gf16_lookup_mul(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); 
void gf16_lookup_muladd(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_lookup_powadd(const void *HEDLEY_RESTRICT scratch, unsigned outputs, size_t offset, void **HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); -void gf16_lookup3_mul(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); +void gf16_lookup3_mul(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_lookup3_muladd(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_lookup3_muladd_multi_packed(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); -void gf16_lookup_mul_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); +void gf16_lookup_mul_sse2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_lookup_muladd_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); diff --git a/gf16/gf16_lookup_sse2.c b/gf16/gf16_lookup_sse2.c index 7831582f..a95d9a7f 100644 --- a/gf16/gf16_lookup_sse2.c +++ b/gf16/gf16_lookup_sse2.c @@ -65,7 +65,7 @@ static HEDLEY_ALWAYS_INLINE void calc_table(uint16_t val, 
uint16_t* lhtable) { } #endif -void gf16_lookup_mul_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_lookup_mul_sse2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(scratch); UNUSED(mutScratch); #ifdef __SSE2__ ALIGN_TO(16, uint16_t lhtable[513]); // +1 for potential misaligned load at end diff --git a/gf16/gf16_shuffle.h b/gf16/gf16_shuffle.h index 2c12db49..0344bcef 100644 --- a/gf16/gf16_shuffle.h +++ b/gf16/gf16_shuffle.h @@ -3,7 +3,7 @@ // basic #define FUNCS(v) \ - void gf16_shuffle_prepare_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen); \ + void gf16_shuffle_prepare_##v(void* dst, const void* src, size_t srcLen); \ void gf16_shuffle_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_shuffle_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_shuffle_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); \ @@ -11,7 +11,7 @@ void gf16_shuffle_finish_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ int gf16_shuffle_finish_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ int gf16_shuffle_finish_partial_packsum_##v(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t 
sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); \ - void gf16_shuffle_mul_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_shuffle_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle_muladd_prefetch_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); \ extern int gf16_shuffle_available_##v @@ -38,7 +38,7 @@ FUNCS(512_sve2); #undef FUNCS -void gf16_shuffle_mul_vbmi(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); +void gf16_shuffle_mul_vbmi(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle_muladd_vbmi(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle_muladd_prefetch_vbmi(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); void gf16_shuffle_prepare_packed_vbmi(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); @@ -47,14 +47,13 @@ 
void gf16_shuffle_prepare_partial_packsum_vbmi(void *HEDLEY_RESTRICT dst, const extern int gf16_shuffle_available_vbmi; #define FUNCS(v) \ - void gf16_shuffle_mul_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_shuffle_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) FUNCS(neon); FUNCS(128_sve); FUNCS(128_sve2); -void gf16_shuffle_muladd_512_sve2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); - +FUNCS(512_sve2); #undef FUNCS @@ -83,7 +82,7 @@ extern int gf16_available_sve2; // shuffle2x #define FUNCS(v) \ - void gf16_shuffle2x_prepare_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen); \ + void gf16_shuffle2x_prepare_##v(void* dst, const void* src, size_t srcLen); \ void gf16_shuffle2x_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_shuffle2x_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_shuffle2x_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); \ @@ -91,7 +90,7 @@ extern int gf16_available_sve2; void gf16_shuffle2x_finish_packed_##v(void *HEDLEY_RESTRICT dst, const 
void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ int gf16_shuffle2x_finish_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ int gf16_shuffle2x_finish_partial_packsum_##v(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); \ - void gf16_shuffle2x_mul_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_shuffle2x_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle2x_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle2x_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle2x_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ @@ -108,6 +107,7 @@ void gf16_shuffle2x_prepare_partial_packsum_sve(void *HEDLEY_RESTRICT dst, const void gf16_shuffle2x_finish_packed_sve(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); int gf16_shuffle2x_finish_packed_cksum_sve(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t 
sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); int gf16_shuffle2x_finish_partial_packsum_sve(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); +void gf16_shuffle2x_mul_128_sve2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle2x_muladd_128_sve2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle2x_muladd_multi_128_sve2(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle2x_muladd_multi_packed_128_sve2(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); @@ -122,7 +122,9 @@ void* gf16_shuffle_init_512_sve(int polynomial); int gf16_sve_get_size(); -uint16_t gf16_shuffle8_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_affine2x_replace_word(void* data, size_t index, uint16_t newValue); uint16_t gf16_shuffle16_replace_word(void* data, size_t index, uint16_t newValue); uint16_t gf16_shuffle32_replace_word(void* data, size_t index, uint16_t newValue); uint16_t gf16_shuffle64_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_shuffle2x16_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_shuffle2x32_replace_word(void* data, size_t index, uint16_t newValue); diff --git a/gf16/gf16_shuffle2x128_sve2.c b/gf16/gf16_shuffle2x128_sve2.c index ac1cda5d..4e358b6d 100644 --- 
a/gf16/gf16_shuffle2x128_sve2.c +++ b/gf16/gf16_shuffle2x128_sve2.c @@ -251,6 +251,43 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle2x_muladd_x_sve2( #endif /*defined(__ARM_FEATURE_SVE2)*/ +void gf16_shuffle2x_mul_128_sve2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); + UNUSED(scratch); +#if defined(__ARM_FEATURE_SVE2) + svuint8_t tbl_ln, tbl_ls, tbl_hn, tbl_hs; + gf16_shuffle2x128_sve2_calc_tables(1, &val, + &tbl_ln, &tbl_ls, &tbl_hn, &tbl_hs, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL + ); + + svuint8_t mask = svreinterpret_u8_u16(svdup_n_u16(0x1000)); + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += svcntb()) { + svuint8_t data = svld1_u8(svptrue_b8(), _src+ptr);; + svuint8_t tmp1 = svbsl_n_u8(data, mask, 0xf); + svuint8_t tmp2 = svsri_n_u8(mask, data, 4); + data = sveor3_u8( + svtbl_u8(tbl_ln, tmp1), + svtbl_u8(tbl_hn, tmp2), + svreinterpret_u8_u16(svxar_n_u16( + svreinterpret_u16_u8(svtbl_u8(tbl_ls, tmp1)), + svreinterpret_u16_u8(svtbl_u8(tbl_hs, tmp2)), + 8 + )) + ); + svst1_u8(svptrue_b8(), _dst+ptr, data); + } +#else + UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + void gf16_shuffle2x_muladd_128_sve2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__ARM_FEATURE_SVE2) diff --git a/gf16/gf16_shuffle2x_x86.h b/gf16/gf16_shuffle2x_x86.h index 718feed0..43bba321 100644 --- a/gf16/gf16_shuffle2x_x86.h +++ b/gf16/gf16_shuffle2x_x86.h @@ -4,7 +4,7 @@ #ifdef _AVAILABLE # include "gf16_checksum_x86.h" -static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle2x_prepare_block)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src) { +static 
HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle2x_prepare_block)(void* dst, const void* src) { _mword data = _MMI(loadu)((_mword*)src); data = separate_low_high(data); @@ -16,7 +16,7 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle2x_prepare_block)(void *HEDLEY_ _MMI(store)((_mword*)dst, data); } -static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle2x_prepare_blocku)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle2x_prepare_blocku)(void* dst, const void* src, size_t remaining) { _mword data = partial_load(src, remaining); data = separate_low_high(data); @@ -91,7 +91,7 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle2x_finish_copy_blocku)(void *HE } #endif -void _FN(gf16_shuffle2x_prepare)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen) { +void _FN(gf16_shuffle2x_prepare)(void* dst, const void* src, size_t srcLen) { #ifdef _AVAILABLE gf16_prepare(dst, src, srcLen, sizeof(_mword), &_FN(gf16_shuffle2x_prepare_block), &_FN(gf16_shuffle2x_prepare_blocku)); _MM_END diff --git a/gf16/gf16_shuffle512_sve2.c b/gf16/gf16_shuffle512_sve2.c index b2a2682c..83881062 100644 --- a/gf16/gf16_shuffle512_sve2.c +++ b/gf16/gf16_shuffle512_sve2.c @@ -310,6 +310,30 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle512_muladd_x_sve2( #endif /*defined(__ARM_FEATURE_SVE2)*/ +void gf16_shuffle_mul_512_sve2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#if defined(__ARM_FEATURE_SVE2) + svuint8_t tbl_l0, tbl_l1, tbl_l2, tbl_h0, tbl_h1, tbl_h2; + gf16_shuffle512_sve2_calc_tables(scratch, 1, &val, + &tbl_l0, &tbl_l1, &tbl_l2, &tbl_h0, &tbl_h1, &tbl_h2, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL + ); + + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + + svuint8_t rl, rh; + for(intptr_t 
ptr = -(intptr_t)len; ptr; ptr += svcntb()*2) { + gf16_shuffle512_sve2_round1(svld2_u8(svptrue_b8(), _src+ptr), &rl, &rh, tbl_l0, tbl_l1, tbl_l2, tbl_h0, tbl_h1, tbl_h2); + svst2_u8(svptrue_b8(), _dst+ptr, svcreate2_u8(rl, rh)); + } +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + void gf16_shuffle_muladd_512_sve2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #ifdef __ARM_FEATURE_SVE2 diff --git a/gf16/gf16_shuffle_neon.c b/gf16/gf16_shuffle_neon.c index c2ef7519..90b643a6 100644 --- a/gf16/gf16_shuffle_neon.c +++ b/gf16/gf16_shuffle_neon.c @@ -274,7 +274,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle_muladd_x_neon( -void gf16_shuffle_mul_neon(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { +void gf16_shuffle_mul_neon(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__ARM_NEON) qtbl_t tbl_h[4], tbl_l[4]; diff --git a/gf16/gf16_shuffle_ssse3.c b/gf16/gf16_shuffle_ssse3.c index ddd8247c..1c8daeec 100644 --- a/gf16/gf16_shuffle_ssse3.c +++ b/gf16/gf16_shuffle_ssse3.c @@ -32,8 +32,17 @@ static HEDLEY_ALWAYS_INLINE uint16_t gf16_shuffleX_replace_word(void* data, size return oldValue; } -uint16_t gf16_shuffle8_replace_word(void* data, size_t index, uint16_t newValue) { // only used for Affine2x - return gf16_shuffleX_replace_word(data, index, newValue, 8); +static HEDLEY_ALWAYS_INLINE uint16_t gf16_shuffle2X_replace_word(void* data, size_t index, uint16_t newValue, size_t width) { + uint8_t* base = (uint8_t*)data + (index & ~(width-1)) * 2; + unsigned pos = index & (width-1); + uint16_t oldValue = base[pos] | (base[pos + width] << 8); + base[pos] = newValue & 0xff; + base[pos + 
width] = newValue >> 8; + return oldValue; +} + +uint16_t gf16_affine2x_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_shuffle2X_replace_word(data, index, newValue, 8); } uint16_t gf16_shuffle16_replace_word(void* data, size_t index, uint16_t newValue) { return gf16_shuffleX_replace_word(data, index, newValue, 16); @@ -44,6 +53,12 @@ uint16_t gf16_shuffle32_replace_word(void* data, size_t index, uint16_t newValue uint16_t gf16_shuffle64_replace_word(void* data, size_t index, uint16_t newValue) { return gf16_shuffleX_replace_word(data, index, newValue, 64); } +uint16_t gf16_shuffle2x16_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_shuffle2X_replace_word(data, index, newValue, 16); +} +uint16_t gf16_shuffle2x32_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_shuffle2X_replace_word(data, index, newValue, 32); +} void* gf16_shuffle_init_x86(int polynomial) { diff --git a/gf16/gf16_shuffle_vbmi.c b/gf16/gf16_shuffle_vbmi.c index e2d1da4b..399d9e5f 100644 --- a/gf16/gf16_shuffle_vbmi.c +++ b/gf16/gf16_shuffle_vbmi.c @@ -382,7 +382,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle_muladd_x_vbmi( } #endif -void gf16_shuffle_mul_vbmi(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { +void gf16_shuffle_mul_vbmi(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__AVX512VBMI__) && defined(__AVX512VL__) __m512i lo0, lo1, lo2, hi0, hi1, hi2; diff --git a/gf16/gf16_shuffle_x86.h b/gf16/gf16_shuffle_x86.h index 1556e2fb..bd8fd9dc 100644 --- a/gf16/gf16_shuffle_x86.h +++ b/gf16/gf16_shuffle_x86.h @@ -10,7 +10,7 @@ int _FN(gf16_shuffle_available) = 1; int _FN(gf16_shuffle_available) = 0; #endif -void _FN(gf16_shuffle_prepare)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, 
size_t srcLen) { +void _FN(gf16_shuffle_prepare)(void* dst, const void* src, size_t srcLen) { #ifdef _AVAILABLE gf16_prepare(dst, src, srcLen, sizeof(_mword)*2, &_FN(gf16_shuffle_prepare_block), &_FN(gf16_shuffle_prepare_blocku)); _MM_END diff --git a/gf16/gf16_shuffle_x86_prepare.h b/gf16/gf16_shuffle_x86_prepare.h index 9754ebc4..e29070fd 100644 --- a/gf16/gf16_shuffle_x86_prepare.h +++ b/gf16/gf16_shuffle_x86_prepare.h @@ -2,7 +2,7 @@ #include "gf16_shuffle_x86_common.h" #include "gf16_checksum_x86.h" -static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle_prepare_block)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle_prepare_block)(void* dst, const void* src) { _mword ta = _MMI(loadu)((_mword*)src); _mword tb = _MMI(loadu)((_mword*)src + 1); @@ -17,7 +17,7 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle_prepare_block)(void *HEDLEY_RE ); } // final block -static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle_prepare_blocku)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_shuffle_prepare_blocku)(void* dst, const void* src, size_t remaining) { _mword ta, tb; if(remaining & sizeof(_mword)) ta = _MMI(loadu)((_mword*)src); diff --git a/gf16/gf16_xor.h b/gf16/gf16_xor.h index bd0b50e6..54e246bd 100644 --- a/gf16/gf16_xor.h +++ b/gf16/gf16_xor.h @@ -4,7 +4,7 @@ #define FUNCS(v) \ void* gf16_xor_jit_init_##v(int polynomial, int jitOptStrat); \ void* gf16_xor_jit_init_mut_##v(); \ - void gf16_xor_prepare_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen); \ + void gf16_xor_prepare_##v(void* dst, const void* src, size_t srcLen); \ void gf16_xor_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_xor_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT 
src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ void gf16_xor_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); \ @@ -12,7 +12,7 @@ void gf16_xor_finish_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ int gf16_xor_finish_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ int gf16_xor_finish_partial_packsum_##v(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); \ - void gf16_xor_jit_mul_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_xor_jit_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_xor_jit_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_xor_jit_muladd_prefetch_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); \ extern int gf16_xor_available_##v @@ -30,7 +30,7 @@ void gf16_xor_jit_uninit(void* scratch); // non-JIT version void* gf16_xor_init_sse2(int polynomial); -void gf16_xor_mul_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t 
coefficient, void *HEDLEY_RESTRICT mutScratch); +void gf16_xor_mul_sse2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_xor_muladd_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); @@ -40,3 +40,7 @@ void gf16_xor_muladd_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_REST #define GF16_XOR_JIT_STRAT_COPY 2 #define GF16_XOR_JIT_STRAT_CLR 3 + +uint16_t gf16_xor16_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_xor32_replace_word(void* data, size_t index, uint16_t newValue); +uint16_t gf16_xor64_replace_word(void* data, size_t index, uint16_t newValue); diff --git a/gf16/gf16_xor_avx2.c b/gf16/gf16_xor_avx2.c index f492f830..a81329bc 100644 --- a/gf16/gf16_xor_avx2.c +++ b/gf16/gf16_xor_avx2.c @@ -182,7 +182,7 @@ static inline int xor_write_avx_main_part(void* jitptr, uint8_t dep1, uint8_t de return xor256_jit_len[dep]; } -static inline void* xor_write_jit_avx(const struct gf16_xor_scratch *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT jitptr, uint16_t val, const int xor, const int prefetch) { +static inline void* xor_write_jit_avx(const struct gf16_xor_scratch *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT jitptr, uint16_t val, const int mode, const int prefetch) { uint_fast32_t bit; __m256i depmask = _mm256_load_si256((__m256i*)scratch->deps + (val & 0xf)*4); @@ -226,7 +226,7 @@ static inline void* xor_write_jit_avx(const struct gf16_xor_scratch *HEDLEY_REST tmp3 = _mm_xor_si128(tmp3, common_elim); tmp4 = _mm_xor_si128(tmp4, common_elim); - if(!xor) { + if(mode != XORDEP_JIT_MODE_MULADD) { lowest = ssse3_tzcnt_epi16(tmp3); _mm_store_si128((__m128i*)dep1_lowest, lowest); tmp3 = _mm_and_si128(tmp3, _mm_add_epi16(tmp3, _mm_set1_epi16(-1))); @@ -292,7 +292,7 @@ static inline void* xor_write_jit_avx(const struct 
gf16_xor_scratch *HEDLEY_REST #define _C_PXOR_R(rD, r2, r1, c) jitptr += _jit_vpxor_r(jitptr, rD, r2, r1) & -(c) /* generate code */ - if(xor) { + if(mode == XORDEP_JIT_MODE_MULADD) { for(bit=0; bit<8; bit++) { int destOffs = (bit<<6)-128; int destOffs2 = destOffs+32; @@ -385,29 +385,40 @@ static inline void* xor_write_jit_avx(const struct gf16_xor_scratch *HEDLEY_REST return jitptr+5; } -static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_mul_avx2_base(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const int add, const int doPrefetch, const void *HEDLEY_RESTRICT prefetch) { +static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_mul_avx2_base(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const int mode, const int doPrefetch, const void *HEDLEY_RESTRICT prefetch) { jit_wx_pair* jit = (jit_wx_pair*)mutScratch; - gf16_xorjit_write_jit(scratch, coefficient, jit, add, doPrefetch, &xor_write_jit_avx); - - gf16_xor256_jit_stub( - (intptr_t)src - 384, - (intptr_t)dst + len - 384, - (intptr_t)dst - 384, - (intptr_t)prefetch - 128, - jit->x - ); + gf16_xorjit_write_jit(scratch, coefficient, jit, mode, doPrefetch, &xor_write_jit_avx); + + if(mode == XORDEP_JIT_MODE_MUL_INSITU) { + ALIGN_TO(32, __m256i spill[3]); + gf16_xor256_jit_stub( + (intptr_t)spill + 128, + (intptr_t)dst + len - 384, + (intptr_t)dst - 384, + (intptr_t)prefetch - 128, + (uint8_t*)jit->x + XORDEP_JIT_SIZE/2 + ); + } else { + gf16_xor256_jit_stub( + (intptr_t)src - 384, + (intptr_t)dst + len - 384, + (intptr_t)dst - 384, + (intptr_t)prefetch - 128, + jit->x + ); + } _mm256_zeroupper(); } #endif /* defined(__AVX2__) && defined(PLATFORM_AMD64) */ -void gf16_xor_jit_mul_avx2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void 
*HEDLEY_RESTRICT mutScratch) { +void gf16_xor_jit_mul_avx2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { #if defined(__AVX2__) && defined(PLATFORM_AMD64) if(coefficient == 0) { memset(dst, 0, len); return; } - gf16_xor_jit_mul_avx2_base(scratch, dst, src, len, coefficient, mutScratch, 0, 0, NULL); + gf16_xor_jit_mul_avx2_base(scratch, dst, src, len, coefficient, mutScratch, dst==src ? XORDEP_JIT_MODE_MUL_INSITU : XORDEP_JIT_MODE_MUL, 0, NULL); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); #endif @@ -416,7 +427,7 @@ void gf16_xor_jit_mul_avx2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RES void gf16_xor_jit_muladd_avx2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { #if defined(__AVX2__) && defined(PLATFORM_AMD64) if(coefficient == 0) return; - gf16_xor_jit_mul_avx2_base(scratch, dst, src, len, coefficient, mutScratch, 1, 0, NULL); + gf16_xor_jit_mul_avx2_base(scratch, dst, src, len, coefficient, mutScratch, XORDEP_JIT_MODE_MULADD, 0, NULL); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); #endif @@ -425,7 +436,7 @@ void gf16_xor_jit_muladd_avx2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_ void gf16_xor_jit_muladd_prefetch_avx2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch) { #if defined(__AVX2__) && defined(PLATFORM_AMD64) if(coefficient == 0) return; - gf16_xor_jit_mul_avx2_base(scratch, dst, src, len, coefficient, mutScratch, 1, _MM_HINT_T1, prefetch); + gf16_xor_jit_mul_avx2_base(scratch, dst, src, len, coefficient, mutScratch, XORDEP_JIT_MODE_MULADD, _MM_HINT_T1, prefetch); #else 
UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); UNUSED(prefetch); #endif @@ -664,16 +675,28 @@ GF_FINISH_PACKED_FUNCS_STUB(gf16_xor, _avx2) #if defined(__AVX2__) && defined(PLATFORM_AMD64) -static size_t xor_write_init_jit(uint8_t *jitCode) { - uint8_t *jitCodeStart = jitCode; - jitCode += _jit_add_i(jitCode, AX, 512); - jitCode += _jit_add_i(jitCode, DX, 512); +static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uint_fast8_t* sizeNorm, uint_fast8_t* sizeInsitu) { + uint8_t *jitCodeStart = jitCodeNorm; + jitCodeNorm += _jit_add_i(jitCodeNorm, AX, 512); + jitCodeNorm += _jit_add_i(jitCodeNorm, DX, 512); /* only 64-bit supported*/ for(int i=3; i<16; i++) { - jitCode += _jit_vmovdqa_load(jitCode, i, AX, lshift32(i-4, 5)); + jitCodeNorm += _jit_vmovdqa_load(jitCodeNorm, i, AX, lshift32(i-4, 5)); + } + if(sizeNorm) *sizeNorm = jitCodeNorm-jitCodeStart; + + + jitCodeStart = jitCodeInsitu; + jitCodeInsitu += _jit_add_i(jitCodeInsitu, DX, 512); + + for(int i=0; i<16; i++) { + jitCodeInsitu += _jit_vmovdqa_load(jitCodeInsitu, i, DX, lshift32(i-4, 5)); + } + for(int i=0; i<3; i++) { + jitCodeInsitu += _jit_vmovdqa_store(jitCodeInsitu, AX, lshift32(i-4, 5), i); } - return jitCode-jitCodeStart; + if(sizeInsitu) *sizeInsitu = jitCodeInsitu-jitCodeStart; } # include "gf16_bitdep_init_avx2.h" @@ -691,7 +714,7 @@ void* gf16_xor_jit_init_avx2(int polynomial, int jitOptStrat) { gf16_xor_create_jit_lut_avx2(); ret->jitOptStrat = jitOptStrat; - ret->codeStart = (uint_fast8_t)xor_write_init_jit(tmpCode); + xor_write_init_jit(tmpCode, tmpCode, &(ret->codeStart), &(ret->codeStartInsitu)); return ret; #else UNUSED(polynomial); UNUSED(jitOptStrat); @@ -703,7 +726,7 @@ void* gf16_xor_jit_init_mut_avx2() { #if defined(__AVX2__) && defined(PLATFORM_AMD64) jit_wx_pair *jitCode = jit_alloc(XORDEP_JIT_SIZE); if(!jitCode) return NULL; - xor_write_init_jit(jitCode->w); + xor_write_init_jit(jitCode->w, jitCode->w + 
XORDEP_JIT_SIZE/2, NULL, NULL); return jitCode; #else return NULL; diff --git a/gf16/gf16_xor_avx512.c b/gf16/gf16_xor_avx512.c index 487612bd..cce0625b 100644 --- a/gf16/gf16_xor_avx512.c +++ b/gf16/gf16_xor_avx512.c @@ -364,7 +364,7 @@ static HEDLEY_ALWAYS_INLINE int xor_avx512_merge_part(uint8_t *HEDLEY_RESTRICT j } -static inline void* xor_write_jit_avx512(const struct gf16_xor_scratch *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT jitptr, uint16_t val, const int xor, const int prefetch) { +static inline void* xor_write_jit_avx512(const struct gf16_xor_scratch *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT jitptr, uint16_t val, const int mode, const int prefetch) { uint_fast32_t bit; __m256i depmask = _mm256_load_si256((__m256i*)scratch->deps + (val & 0xf)*4); @@ -420,7 +420,7 @@ static inline void* xor_write_jit_avx512(const struct gf16_xor_scratch *HEDLEY_R jitptr += _jit_vmovdqa32_load(jitptr, 16, DX, 0); /* generate code */ - if(xor) { + if(mode == XORDEP_JIT_MODE_MULADD) { for(bit=0; bit<8; bit++) { int destOffs = bit<<7; int destOffs2 = destOffs+64; @@ -743,9 +743,9 @@ static void* xor_write_jit_avx512_multi(const struct gf16_xor_scratch *HEDLEY_RE return jitptr; } -static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_mul_avx512_base(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const int add, const int doPrefetch, const void *HEDLEY_RESTRICT prefetch) { +static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_mul_avx512_base(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const int mode, const int doPrefetch, const void *HEDLEY_RESTRICT prefetch) { jit_wx_pair* jit = (jit_wx_pair*)mutScratch; - gf16_xorjit_write_jit(scratch, coefficient, jit, add, doPrefetch, &xor_write_jit_avx512); + gf16_xorjit_write_jit(scratch, coefficient, jit, mode, doPrefetch, 
&xor_write_jit_avx512); gf16_xor512_jit_stub( (intptr_t)dst - 1024, @@ -760,13 +760,13 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_mul_avx512_base(const void *HEDLEY #endif /* defined(__AVX512BW__) && defined(__AVX512VL__) && defined(PLATFORM_AMD64) */ -void gf16_xor_jit_mul_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_xor_jit_mul_avx512(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { #if defined(__AVX512BW__) && defined(__AVX512VL__) && defined(PLATFORM_AMD64) if(coefficient == 0) { memset(dst, 0, len); return; } - gf16_xor_jit_mul_avx512_base(scratch, dst, src, len, coefficient, mutScratch, 0, 0, NULL); + gf16_xor_jit_mul_avx512_base(scratch, dst, src, len, coefficient, mutScratch, XORDEP_JIT_MODE_MUL, 0, NULL); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); #endif @@ -775,7 +775,7 @@ void gf16_xor_jit_mul_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_R void gf16_xor_jit_muladd_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { #if defined(__AVX512BW__) && defined(__AVX512VL__) && defined(PLATFORM_AMD64) if(coefficient == 0) return; - gf16_xor_jit_mul_avx512_base(scratch, dst, src, len, coefficient, mutScratch, 1, 0, NULL); + gf16_xor_jit_mul_avx512_base(scratch, dst, src, len, coefficient, mutScratch, XORDEP_JIT_MODE_MULADD, 0, NULL); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); #endif @@ -784,7 +784,7 @@ void gf16_xor_jit_muladd_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLE void gf16_xor_jit_muladd_prefetch_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, 
const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch) { #if defined(__AVX512BW__) && defined(__AVX512VL__) && defined(PLATFORM_AMD64) if(coefficient == 0) return; - gf16_xor_jit_mul_avx512_base(scratch, dst, src, len, coefficient, mutScratch, 1, _MM_HINT_T1, prefetch); + gf16_xor_jit_mul_avx512_base(scratch, dst, src, len, coefficient, mutScratch, XORDEP_JIT_MODE_MULADD, _MM_HINT_T1, prefetch); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); UNUSED(prefetch); #endif diff --git a/gf16/gf16_xor_common.h b/gf16/gf16_xor_common.h index 730671a7..80877440 100644 --- a/gf16/gf16_xor_common.h +++ b/gf16/gf16_xor_common.h @@ -8,6 +8,9 @@ #define XORDEP_JIT_SIZE 4096 #define XORDEP_JIT_CODE_SIZE 1280 +#define XORDEP_JIT_MODE_MUL 0 +#define XORDEP_JIT_MODE_MULADD 1 +#define XORDEP_JIT_MODE_MUL_INSITU 2 /* we support MSVC and GCC style ASM */ #ifdef PLATFORM_AMD64 @@ -113,18 +116,26 @@ struct gf16_xor_scratch { uint8_t deps[16*16*2*4]; int jitOptStrat; // GF16_XOR_JIT_STRAT_* uint_fast8_t codeStart; + uint_fast8_t codeStartInsitu; }; #ifdef __SSE2__ typedef void*(*gf16_xorjit_write_func)(const struct gf16_xor_scratch *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT jitptr, uint16_t val, const int xor, const int prefetch); -static HEDLEY_ALWAYS_INLINE void gf16_xorjit_write_jit(const void *HEDLEY_RESTRICT scratch, uint16_t coefficient, jit_wx_pair* jit, const int add, const int prefetch, gf16_xorjit_write_func writeFunc) { +static HEDLEY_ALWAYS_INLINE void gf16_xorjit_write_jit(const void *HEDLEY_RESTRICT scratch, uint16_t coefficient, jit_wx_pair* jit, const int mode, const int prefetch, gf16_xorjit_write_func writeFunc) { const struct gf16_xor_scratch *HEDLEY_RESTRICT info = (const struct gf16_xor_scratch*)scratch; - uint8_t* jitptr = (uint8_t*)jit->w + info->codeStart; + uint8_t* jitWPtr = (uint8_t*)jit->w; + uint8_t* jitptr; + 
if(mode == XORDEP_JIT_MODE_MUL_INSITU) { + jitWPtr += XORDEP_JIT_SIZE/2; + jitptr = jitWPtr + info->codeStartInsitu; + } else { + jitptr = jitWPtr + info->codeStart; + } if(info->jitOptStrat == GF16_XOR_JIT_STRAT_COPYNT || info->jitOptStrat == GF16_XOR_JIT_STRAT_COPY) { ALIGN_TO(_GF16_XORJIT_COPY_ALIGN, uint8_t jitTemp[XORDEP_JIT_CODE_SIZE]); - uintptr_t copyOffset = info->codeStart; + uintptr_t copyOffset = (mode == XORDEP_JIT_MODE_MUL_INSITU) ? info->codeStartInsitu : info->codeStart; if((uintptr_t)jitptr & (_GF16_XORJIT_COPY_ALIGN-1)) { // copy unaligned part #if _GF16_XORJIT_COPY_ALIGN == 32 && defined(__AVX2__) @@ -138,13 +149,13 @@ static HEDLEY_ALWAYS_INLINE void gf16_xorjit_write_jit(const void *HEDLEY_RESTRI else jitptr = jitTemp; - jitptr = writeFunc(info, jitptr, coefficient, add, prefetch); + jitptr = writeFunc(info, jitptr, coefficient, mode, prefetch); write32(jitptr, (int32_t)(jitTemp - copyOffset - jitptr -4)); jitptr[4] = 0xC3; /* ret */ jitptr += 5; /* memcpy to destination */ - uint8_t* jitdst = (uint8_t*)jit->w + copyOffset; + uint8_t* jitdst = jitWPtr + copyOffset; if(info->jitOptStrat == GF16_XOR_JIT_STRAT_COPYNT) { // 256-bit NT copies never seem to be better, so just stick to 128-bit for(uint_fast32_t i=0; i<(uint_fast32_t)(jitptr-jitTemp); i+=64) { @@ -185,8 +196,8 @@ static HEDLEY_ALWAYS_INLINE void gf16_xorjit_write_jit(const void *HEDLEY_RESTRI for(int i=0; iw - jitptr -4)); + jitptr = writeFunc(info, jitptr, coefficient, mode, prefetch); + write32(jitptr, (int32_t)(jitWPtr - jitptr -4)); jitptr[4] = 0xC3; /* ret */ } #ifdef GF16_XORJIT_ENABLE_DUAL_MAPPING diff --git a/gf16/gf16_xor_common_funcs.h b/gf16/gf16_xor_common_funcs.h index 435e866e..c607da06 100644 --- a/gf16/gf16_xor_common_funcs.h +++ b/gf16/gf16_xor_common_funcs.h @@ -1,6 +1,7 @@ #include "../src/hedley.h" #include +#include /* type returned by *movemask* function */ #if MWORD_SIZE == 64 @@ -18,16 +19,16 @@ #ifdef _AVAILABLE # include "gf16_checksum_x86.h" -static 
HEDLEY_ALWAYS_INLINE void gf16_xor_prep_write(_mword ta, _mword tb, umask_t* _dst) { +static HEDLEY_ALWAYS_INLINE void gf16_xor_prep_split(_mword ta, _mword tb, _mword* tl, _mword* th) { /* split to high/low parts */ #if MWORD_SIZE == 64 // arrange to hlhl... _mword tmp1 = _mm512_shuffle_epi8(ta, _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200)); _mword tmp2 = _mm512_shuffle_epi8(tb, _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200)); - _mword th = _mm512_permutex2var_epi64(tmp1, _mm512_set_epi64( + *th = _mm512_permutex2var_epi64(tmp1, _mm512_set_epi64( 15, 13, 11, 9, 7, 5, 3, 1 ), tmp2); - _mword tl = _mm512_permutex2var_epi64(tmp1, _mm512_set_epi64( + *tl = _mm512_permutex2var_epi64(tmp1, _mm512_set_epi64( 14, 12, 10, 8, 6, 4, 2, 0 ), tmp2); #elif MWORD_SIZE == 32 @@ -41,48 +42,83 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor_prep_write(_mword ta, _mword tb, umask 0x0e0c0a08, 0x06040200, 0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200, 0x0f0d0b09, 0x07050301 )); - _mword th = _mm256_blend_epi32(tmp1, tmp2, 0x33); - _mword tl = _mm256_blend_epi32(tmp2, tmp1, 0x33); - tl = _mm256_permute4x64_epi64(tl, _MM_SHUFFLE(3,1,2,0)); - th = _mm256_permute4x64_epi64(th, _MM_SHUFFLE(2,0,3,1)); + *th = _mm256_blend_epi32(tmp1, tmp2, 0x33); + *tl = _mm256_blend_epi32(tmp2, tmp1, 0x33); + *tl = _mm256_permute4x64_epi64(*tl, _MM_SHUFFLE(3,1,2,0)); + *th = _mm256_permute4x64_epi64(*th, _MM_SHUFFLE(2,0,3,1)); #else - _mword th = _mm_packus_epi16( + *th = _mm_packus_epi16( _mm_srli_epi16(tb, 8), _mm_srli_epi16(ta, 8) ); - _mword tl = _mm_packus_epi16( + *tl = _mm_packus_epi16( _mm_and_si128(tb, _mm_set1_epi16(0xff)), _mm_and_si128(ta, _mm_set1_epi16(0xff)) ); #endif - - /* save to dest by extracting masks */ - _dst[0] = MOVMASK(th); - for(int i=1; i<8; i++) { - th = _MM(add_epi8)(th, th); - _dst[i*8] = MOVMASK(th); - } - _dst[64] = MOVMASK(tl); +} +static HEDLEY_ALWAYS_INLINE void gf16_xor_prep_write(umask_t* _dst, _mword bytes) { + _dst[0] = 
MOVMASK(bytes); for(int i=1; i<8; i++) { - tl = _MM(add_epi8)(tl, tl); - _dst[64+i*8] = MOVMASK(tl); + bytes = _MM(add_epi8)(bytes, bytes); + _dst[i*8] = MOVMASK(bytes); } } static HEDLEY_ALWAYS_INLINE void _FN(gf16_xor_prepare_block)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src) { uint8_t* _src = (uint8_t*)src; umask_t* _dst = (umask_t*)dst; + _mword tl, th; for(int j=0; j<8; j++) { - gf16_xor_prep_write( - _MMI(loadu)((_mword*)_src), - _MMI(loadu)((_mword*)_src + 1), - _dst - ); + gf16_xor_prep_split(_MMI(loadu)((_mword*)_src), _MMI(loadu)((_mword*)_src + 1), &tl, &th); + + /* save to dest by extracting masks */ + gf16_xor_prep_write(_dst, th); + gf16_xor_prep_write(_dst+64, tl); + _src += sizeof(_mword)*2; _dst++; } } -static HEDLEY_ALWAYS_INLINE void _FN(gf16_xor_prepare_blocku)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_xor_prepare_block_insitu)(void* dst, const void* src) { + assert(dst == src); + _mword* _src = (_mword*)src; + umask_t* _dst = (umask_t*)dst; + + _mword tl0, tl1, tl2, tl3, tl4, tl5, tl6, tl7; + _mword th0, th1, th2, th3, th4, th5, th6, th7; + + // load 8 registers (need to load the first half of the block) + gf16_xor_prep_split(_MMI(loadu)(_src + 0), _MMI(loadu)(_src + 1), &tl0, &th0); + gf16_xor_prep_split(_MMI(loadu)(_src + 2), _MMI(loadu)(_src + 3), &tl1, &th1); + gf16_xor_prep_split(_MMI(loadu)(_src + 4), _MMI(loadu)(_src + 5), &tl2, &th2); + gf16_xor_prep_split(_MMI(loadu)(_src + 6), _MMI(loadu)(_src + 7), &tl3, &th3); + + // free up 4 of them (th* can now be freely written) + gf16_xor_prep_write(_dst+0, th0); + gf16_xor_prep_write(_dst+1, th1); + gf16_xor_prep_write(_dst+2, th2); + gf16_xor_prep_write(_dst+3, th3); + + gf16_xor_prep_split(_MMI(loadu)(_src + 8), _MMI(loadu)(_src + 9), &tl4, &th4); + gf16_xor_prep_write(_dst+4, th4); + gf16_xor_prep_split(_MMI(loadu)(_src + 10), _MMI(loadu)(_src + 11), &tl5, &th5); + gf16_xor_prep_write(_dst+5, 
th5); + gf16_xor_prep_split(_MMI(loadu)(_src + 12), _MMI(loadu)(_src + 13), &tl6, &th6); + gf16_xor_prep_write(_dst+6, th6); + gf16_xor_prep_split(_MMI(loadu)(_src + 14), _MMI(loadu)(_src + 15), &tl7, &th7); + gf16_xor_prep_write(_dst+7, th7); + + gf16_xor_prep_write(_dst+64, tl0); + gf16_xor_prep_write(_dst+65, tl1); + gf16_xor_prep_write(_dst+66, tl2); + gf16_xor_prep_write(_dst+67, tl3); + gf16_xor_prep_write(_dst+68, tl4); + gf16_xor_prep_write(_dst+69, tl5); + gf16_xor_prep_write(_dst+70, tl6); + gf16_xor_prep_write(_dst+71, tl7); +} +static HEDLEY_ALWAYS_INLINE void _FN(gf16_xor_prepare_blocku)(void* dst, const void* src, size_t remaining) { // handle unaligned area with a simple copy and repeat uint8_t tmp[MWORD_SIZE*16] = {0}; memcpy(tmp, src, remaining); @@ -92,9 +128,14 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_xor_prepare_blocku)(void *HEDLEY_RESTR -void _FN(gf16_xor_prepare)(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen) { +void _FN(gf16_xor_prepare)(void* dst, const void* src, size_t srcLen) { #ifdef _AVAILABLE - gf16_prepare(dst, src, srcLen, sizeof(_mword)*16, &_FN(gf16_xor_prepare_block), &_FN(gf16_xor_prepare_blocku)); + if(dst == src) { + // prepare_blocku is unused for in-situ prepare + assert(srcLen % (sizeof(_mword)*16) == 0); + gf16_prepare(dst, src, srcLen, sizeof(_mword)*16, &_FN(gf16_xor_prepare_block_insitu), &_FN(gf16_xor_prepare_blocku)); + } else + gf16_prepare(dst, src, srcLen, sizeof(_mword)*16, &_FN(gf16_xor_prepare_block), &_FN(gf16_xor_prepare_blocku)); _MM_END #else UNUSED(dst); UNUSED(src); UNUSED(srcLen); diff --git a/gf16/gf16_xor_sse2.c b/gf16/gf16_xor_sse2.c index ff4117c0..e4f27287 100644 --- a/gf16/gf16_xor_sse2.c +++ b/gf16/gf16_xor_sse2.c @@ -315,7 +315,7 @@ static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3_nc_noxor(uint8_t* des -static inline void* xor_write_jit_sse(const struct gf16_xor_scratch *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT jitptr, uint16_t val, const int xor, 
const int prefetch) { +static inline void* xor_write_jit_sse(const struct gf16_xor_scratch *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT jitptr, uint16_t val, const int mode, const int prefetch) { uint_fast32_t bit; ALIGN_TO(16, uint32_t lumask[8]); @@ -495,7 +495,7 @@ static inline void* xor_write_jit_sse(const struct gf16_xor_scratch *HEDLEY_REST jitptr += ((c)<<2)+(c) /* generate code */ - if(xor) { + if(mode == XORDEP_JIT_MODE_MULADD) { for(bit=0; bit<8; bit++) { int destOffs = (bit<<5)-128; int destOffs2 = destOffs+16; @@ -669,30 +669,46 @@ static inline void* xor_write_jit_sse(const struct gf16_xor_scratch *HEDLEY_REST return jitptr; } -static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_mul_sse2_base(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const int add, const int doPrefetch, const void *HEDLEY_RESTRICT prefetch) { +static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_mul_sse2_base(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const int mode, const int doPrefetch, const void *HEDLEY_RESTRICT prefetch) { jit_wx_pair* jit = (jit_wx_pair*)mutScratch; - gf16_xorjit_write_jit(scratch, coefficient, jit, add, doPrefetch, &xor_write_jit_sse); + gf16_xorjit_write_jit(scratch, coefficient, jit, mode, doPrefetch, &xor_write_jit_sse); // exec /* adding 128 to the destination pointer allows the register offset to be coded in 1 byte * eg: 'movdqa xmm0, [rdx+0x90]' is 8 bytes, whilst 'movdqa xmm0, [rdx-0x60]' is 5 bytes */ - gf16_xor_jit_stub( - (intptr_t)src - 128, - (intptr_t)dst + len - 128, - (intptr_t)dst - 128, - (intptr_t)prefetch - 128, - jit->x - ); + if(mode == XORDEP_JIT_MODE_MUL_INSITU) { + // need a place to store a copy of the source, that won't fit in registers; these will be used as the memory source +#ifdef PLATFORM_AMD64 + ALIGN_TO(16, __m128i spill[3]); 
+#else + ALIGN_TO(16, __m128i spill[11]); +#endif + gf16_xor_jit_stub( + (intptr_t)spill + 128, + (intptr_t)dst + len - 128, + (intptr_t)dst - 128, + (intptr_t)prefetch - 128, + (uint8_t*)jit->x + XORDEP_JIT_SIZE/2 + ); + } else { + gf16_xor_jit_stub( + (intptr_t)src - 128, + (intptr_t)dst + len - 128, + (intptr_t)dst - 128, + (intptr_t)prefetch - 128, + jit->x + ); + } } #endif /* defined(__SSE2__) */ -void gf16_xor_jit_mul_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { +void gf16_xor_jit_mul_sse2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { #ifdef __SSE2__ if(coefficient == 0) { memset(dst, 0, len); return; } - gf16_xor_jit_mul_sse2_base(scratch, dst, src, len, coefficient, mutScratch, 0, 0, NULL); + gf16_xor_jit_mul_sse2_base(scratch, dst, src, len, coefficient, mutScratch, dst==src ? 
XORDEP_JIT_MODE_MUL_INSITU : XORDEP_JIT_MODE_MUL, 0, NULL); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); #endif @@ -701,7 +717,7 @@ void gf16_xor_jit_mul_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RES void gf16_xor_jit_muladd_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { #ifdef __SSE2__ if(coefficient == 0) return; - gf16_xor_jit_mul_sse2_base(scratch, dst, src, len, coefficient, mutScratch, 1, 0, NULL); + gf16_xor_jit_mul_sse2_base(scratch, dst, src, len, coefficient, mutScratch, XORDEP_JIT_MODE_MULADD, 0, NULL); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); #endif @@ -710,7 +726,7 @@ void gf16_xor_jit_muladd_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_ void gf16_xor_jit_muladd_prefetch_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch) { #ifdef __SSE2__ if(coefficient == 0) return; - gf16_xor_jit_mul_sse2_base(scratch, dst, src, len, coefficient, mutScratch, 1, _MM_HINT_T1, prefetch); + gf16_xor_jit_mul_sse2_base(scratch, dst, src, len, coefficient, mutScratch, XORDEP_JIT_MODE_MULADD, _MM_HINT_T1, prefetch); #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(mutScratch); UNUSED(prefetch); #endif @@ -818,9 +834,53 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor_write_deptable(intptr_t *HEDLEY_RESTRI */ } } + +static HEDLEY_ALWAYS_INLINE void gf16_xor_mul_block_sse2(const uint8_t* inP, uint8_t* outP, uint_fast32_t counts[16], intptr_t deptable[256]) { + /* Note that we assume that all counts are at least 1; I don't think it's possible for that to be false */ + #define STEP(bit, type, typev, typed) { \ + intptr_t* 
deps = deptable + bit*16; \ + typev tmp = _mm_load_ ## type((typed*)(inP + deps[ 0])); \ + HEDLEY_ASSUME(counts[bit] <= 15); \ + switch(counts[bit]) { \ + case 15: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[15])); /* FALLTHRU */ \ + case 14: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[14])); /* FALLTHRU */ \ + case 13: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[13])); /* FALLTHRU */ \ + case 12: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[12])); /* FALLTHRU */ \ + case 11: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[11])); /* FALLTHRU */ \ + case 10: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[10])); /* FALLTHRU */ \ + case 9: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 9])); /* FALLTHRU */ \ + case 8: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 8])); /* FALLTHRU */ \ + case 7: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 7])); /* FALLTHRU */ \ + case 6: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 6])); /* FALLTHRU */ \ + case 5: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 5])); /* FALLTHRU */ \ + case 4: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 4])); /* FALLTHRU */ \ + case 3: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 3])); /* FALLTHRU */ \ + case 2: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 2])); /* FALLTHRU */ \ + case 1: tmp = _mm_xor_ ## type(tmp, *(typev*)(inP + deps[ 1])); /* FALLTHRU */ \ + } \ + _mm_store_ ## type((typed*)outP + bit, tmp); \ + } + STEP( 0, si128, __m128i, __m128i) + STEP( 1, si128, __m128i, __m128i) + STEP( 2, si128, __m128i, __m128i) + STEP( 3, si128, __m128i, __m128i) + STEP( 4, si128, __m128i, __m128i) + STEP( 5, si128, __m128i, __m128i) + STEP( 6, si128, __m128i, __m128i) + STEP( 7, si128, __m128i, __m128i) + STEP( 8, si128, __m128i, __m128i) + STEP( 9, si128, __m128i, __m128i) + STEP(10, si128, __m128i, __m128i) + STEP(11, si128, __m128i, __m128i) + STEP(12, si128, __m128i, __m128i) + STEP(13, si128, __m128i, __m128i) + STEP(14, si128, __m128i, 
__m128i) + STEP(15, si128, __m128i, __m128i) + #undef STEP +} #endif -void gf16_xor_mul_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { +void gf16_xor_mul_sse2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #ifdef __SSE2__ if(val == 0) { @@ -831,51 +891,24 @@ void gf16_xor_mul_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRIC ALIGN_TO(16, intptr_t deptable[256]); uint8_t* _dst = (uint8_t*)dst + len; - gf16_xor_write_deptable(deptable, counts, (uint8_t*)scratch, val, (uintptr_t)src - (uintptr_t)dst); - - for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m128i)*16) { - uint8_t* p = _dst + ptr; - /* Note that we assume that all counts are at least 1; I don't think it's possible for that to be false */ - #define STEP(bit, type, typev, typed) { \ - intptr_t* deps = deptable + bit*16; \ - typev tmp = _mm_load_ ## type((typed*)(p + deps[ 0])); \ - HEDLEY_ASSUME(counts[bit] <= 15); \ - switch(counts[bit]) { \ - case 15: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[15])); /* FALLTHRU */ \ - case 14: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[14])); /* FALLTHRU */ \ - case 13: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[13])); /* FALLTHRU */ \ - case 12: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[12])); /* FALLTHRU */ \ - case 11: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[11])); /* FALLTHRU */ \ - case 10: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[10])); /* FALLTHRU */ \ - case 9: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 9])); /* FALLTHRU */ \ - case 8: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 8])); /* FALLTHRU */ \ - case 7: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 7])); /* FALLTHRU */ \ - case 6: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 6])); /* FALLTHRU */ \ - case 5: tmp = _mm_xor_ ## 
type(tmp, *(typev*)(p + deps[ 5])); /* FALLTHRU */ \ - case 4: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 4])); /* FALLTHRU */ \ - case 3: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 3])); /* FALLTHRU */ \ - case 2: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 2])); /* FALLTHRU */ \ - case 1: tmp = _mm_xor_ ## type(tmp, *(typev*)(p + deps[ 1])); /* FALLTHRU */ \ - } \ - _mm_store_ ## type((typed*)p + bit, tmp); \ + if(dst == src) { + // for in-situ mul, write to a temp block and copy back + ALIGN_TO(16, uint8_t tmp[256]); + __m128i* _tmp = (__m128i*)tmp; + gf16_xor_write_deptable(deptable, counts, (uint8_t*)scratch, val, 0); + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m128i)*16) { + uint8_t* p = _dst + ptr; + gf16_xor_mul_block_sse2(p, tmp, counts, deptable); + for(int i=0; i<16; i++) { + _mm_store_si128((__m128i*)p + i, _mm_load_si128(_tmp+i)); + } + } + } else { + gf16_xor_write_deptable(deptable, counts, (uint8_t*)scratch, val, (uintptr_t)src - (uintptr_t)dst); + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m128i)*16) { + gf16_xor_mul_block_sse2(_dst + ptr, _dst + ptr, counts, deptable); } - STEP( 0, si128, __m128i, __m128i) - STEP( 1, si128, __m128i, __m128i) - STEP( 2, si128, __m128i, __m128i) - STEP( 3, si128, __m128i, __m128i) - STEP( 4, si128, __m128i, __m128i) - STEP( 5, si128, __m128i, __m128i) - STEP( 6, si128, __m128i, __m128i) - STEP( 7, si128, __m128i, __m128i) - STEP( 8, si128, __m128i, __m128i) - STEP( 9, si128, __m128i, __m128i) - STEP(10, si128, __m128i, __m128i) - STEP(11, si128, __m128i, __m128i) - STEP(12, si128, __m128i, __m128i) - STEP(13, si128, __m128i, __m128i) - STEP(14, si128, __m128i, __m128i) - STEP(15, si128, __m128i, __m128i) - #undef STEP } #else UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); @@ -1164,23 +1197,47 @@ GF_FINISH_PACKED_FUNCS_STUB(gf16_xor, _sse2) #include "gf16_bitdep_init_sse2.h" #ifdef PLATFORM_X86 -static size_t xor_write_init_jit(uint8_t *jitCode) { - 
uint8_t *jitCodeStart = jitCode; - jitCode += _jit_add_i(jitCode, AX, 256); - jitCode += _jit_add_i(jitCode, DX, 256); +static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uint_fast8_t* sizeNorm, uint_fast8_t* sizeInsitu) { + uint8_t *jitCodeStart = jitCodeNorm; + jitCodeNorm += _jit_add_i(jitCodeNorm, AX, 256); + jitCodeNorm += _jit_add_i(jitCodeNorm, DX, 256); # ifdef PLATFORM_AMD64 /* preload upper 13 inputs into registers */ for(int i=3; i<16; i++) { - jitCode += _jit_movaps_load(jitCode, i, AX, lshift32(i-8, 4)); + jitCodeNorm += _jit_movaps_load(jitCodeNorm, i, AX, lshift32(i-8, 4)); } # else /* can only fit 5 in 32-bit mode :( */ for(int i=3; i<8; i++) { /* despite appearances, we're actually loading the top 5, not mid 5 */ - jitCode += _jit_movaps_load(jitCode, i, AX, i<<4); + jitCodeNorm += _jit_movaps_load(jitCodeNorm, i, AX, i<<4); } # endif - return jitCode-jitCodeStart; + + if(sizeNorm) *sizeNorm = jitCodeNorm-jitCodeStart; + + // in-situ version + jitCodeStart = jitCodeInsitu; + jitCodeInsitu += _jit_add_i(jitCodeInsitu, DX, 256); + +# ifdef PLATFORM_AMD64 + for(int i=0; i<16; i++) { + jitCodeInsitu += _jit_movaps_load(jitCodeInsitu, i, DX, lshift32(i-8, 4)); + } + for(int i=0; i<3; i++) { + jitCodeInsitu += _jit_movaps_store(jitCodeInsitu, AX, lshift32(i-8, 4), i); + } +# else + for(int i=0; i<11; i++) { + jitCodeInsitu += _jit_movaps_load(jitCodeInsitu, 0, DX, lshift32(i-8, 4)); + jitCodeInsitu += _jit_movaps_store(jitCodeInsitu, AX, lshift32(i-8, 4), 0); + } + for(int i=3; i<8; i++) { /* despite appearances, we're actually loading the top 5, not mid 5 */ + jitCodeInsitu += _jit_movaps_load(jitCodeInsitu, i, DX, i<<4); + } +# endif + + if(sizeInsitu) *sizeInsitu = jitCodeInsitu-jitCodeStart; } #endif @@ -1195,7 +1252,7 @@ void* gf16_xor_jit_init_sse2(int polynomial, int jitOptStrat) { gf16_xor_create_jit_lut_sse2(); ret->jitOptStrat = jitOptStrat; - ret->codeStart = (uint_fast8_t)xor_write_init_jit(tmpCode); + 
xor_write_init_jit(tmpCode, tmpCode, &(ret->codeStart), &(ret->codeStartInsitu)); return ret; #else UNUSED(polynomial); UNUSED(jitOptStrat); @@ -1207,7 +1264,7 @@ void* gf16_xor_jit_init_mut_sse2() { #ifdef PLATFORM_X86 jit_wx_pair *jitCode = jit_alloc(XORDEP_JIT_SIZE); if(!jitCode) return NULL; - xor_write_init_jit(jitCode->w); + xor_write_init_jit(jitCode->w, jitCode->w + XORDEP_JIT_SIZE/2, NULL, NULL); return jitCode; #else return NULL; @@ -1222,6 +1279,36 @@ void gf16_xor_jit_uninit(void* scratch) { #endif } +static HEDLEY_ALWAYS_INLINE uint16_t gf16_xorX_replace_word(void* data, size_t index, uint16_t newValue, size_t width, unsigned byteFlip) { + uint8_t* base = (uint8_t*)data + (index & ~(width*8-1)) * 2; // advance pointer to correct group + base += ((index >> 3) & (width-1)) ^ byteFlip; // advance to correct byte + // TODO: remove byteFlip parameter + + unsigned bitIndex = index&7; + uint16_t oldValue = 0; + unsigned _newValue = newValue << bitIndex; + uint8_t byteMask = 1 << bitIndex; + for(int i=0; i<16; i++) { + uint8_t byte = base[i*width]; + oldValue <<= 1; + _newValue <<= 1; + oldValue |= (byte >> bitIndex) & 1; + + base[i*width] = (byte & ~byteMask) | ((_newValue >> 16) & byteMask); + } + return oldValue; +} +uint16_t gf16_xor16_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_xorX_replace_word(data, index, newValue, 16, 1); +} +uint16_t gf16_xor32_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_xorX_replace_word(data, index, newValue, 32, 0); +} +uint16_t gf16_xor64_replace_word(void* data, size_t index, uint16_t newValue) { + return gf16_xorX_replace_word(data, index, newValue, 64, 0); +} + + void* gf16_xor_init_sse2(int polynomial) { #ifdef __SSE2__ void* ret; diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 917de3e0..690d4a88 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -636,7 +636,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = 
&gf16_shuffle2x_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; - replace_word = &gf16_shuffle32_replace_word; + replace_word = &gf16_shuffle2x32_replace_word; break; case GF16_SHUFFLE2X_AVX2: scratch = gf16_shuffle_init_x86(GF16_POLYNOMIAL); @@ -664,7 +664,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_shuffle2x_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; - replace_word = &gf16_shuffle16_replace_word; + replace_word = &gf16_shuffle2x16_replace_word; break; case GF16_SHUFFLE_NEON: @@ -697,6 +697,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { int available = gf16_clmul_init_arm(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_available_neon && available) + _mul = &gf16_clmul_mul_neon; _mul_add = &gf16_clmul_muladd_neon; _mul_add_multi = &gf16_clmul_muladd_multi_neon; _mul_add_multi_packed = &gf16_clmul_muladd_multi_packed_neon; @@ -762,6 +763,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { case GF16_SHUFFLE2X_128_SVE2: METHOD_REQUIRES(gf16_available_sve2 && gf16_sve_get_size() >= 32) + _mul = &gf16_shuffle2x_mul_128_sve2; _mul_add = &gf16_shuffle2x_muladd_128_sve2; _mul_add_multi = &gf16_shuffle2x_muladd_multi_128_sve2; _mul_add_multi_packed = &gf16_shuffle2x_muladd_multi_packed_128_sve2; @@ -785,6 +787,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { scratch = gf16_shuffle_init_512_sve(GF16_POLYNOMIAL); + _mul = &gf16_shuffle_mul_512_sve2; _mul_add = &gf16_shuffle_muladd_512_sve2; _mul_add_multi = &gf16_shuffle_muladd_multi_512_sve2; _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_512_sve2; @@ -805,6 +808,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { case GF16_CLMUL_SVE2: METHOD_REQUIRES(gf16_available_sve2) + _mul = &gf16_clmul_mul_sve2; _mul_add = &gf16_clmul_muladd_sve2; _mul_add_multi = &gf16_clmul_muladd_multi_sve2; 
_mul_add_multi_packed = &gf16_clmul_muladd_multi_packed_sve2; @@ -915,6 +919,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { case GF16_AFFINE2X_AVX512: scratch = gf16_affine_init_avx512(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_affine_available_avx512 && gf16_shuffle_available_avx512) + _mul = &gf16_affine2x_mul_avx512; _mul_add = &gf16_affine2x_muladd_avx512; _mul_add_multi = &gf16_affine2x_muladd_multi_avx512; _mul_add_multi_packed = &gf16_affine2x_muladd_multi_packed_avx512; @@ -937,12 +942,13 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_affine2x_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; - replace_word = &gf16_shuffle32_replace_word; + replace_word = &gf16_affine2x_replace_word; break; case GF16_AFFINE2X_AVX2: scratch = gf16_affine_init_avx2(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_affine_available_avx2 && gf16_shuffle_available_avx2) + _mul = &gf16_affine2x_mul_avx2; _mul_add = &gf16_affine2x_muladd_avx2; _mul_add_multi = &gf16_affine2x_muladd_multi_avx2; _mul_add_multi_packed = &gf16_affine2x_muladd_multi_packed_avx2; @@ -965,12 +971,13 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_affine2x_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; - replace_word = &gf16_shuffle16_replace_word; + replace_word = &gf16_affine2x_replace_word; break; case GF16_AFFINE2X_GFNI: scratch = gf16_affine_init_gfni(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_affine_available_gfni && gf16_shuffle_available_ssse3) + _mul = &gf16_affine2x_mul_gfni; _mul_add = &gf16_affine2x_muladd_gfni; _mul_add_multi = &gf16_affine2x_muladd_multi_gfni; _mul_add_multi_packed = &gf16_affine2x_muladd_multi_packed_gfni; @@ -993,7 +1000,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_affine2x_finish_partial_packsum_gfni; 
copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; - replace_word = &gf16_shuffle8_replace_word; + replace_word = &gf16_affine2x_replace_word; break; case GF16_XOR_JIT_AVX512: @@ -1031,7 +1038,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_sse2; copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; - replace_word = NULL; + replace_word = gf16_xor16_replace_word; break; /* case GF16_XOR_JIT_AVX: @@ -1053,7 +1060,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_avx; copy_cksum = &gf16_cksum_copy_sse2; copy_cksum_check = &gf16_cksum_copy_check_sse2; - replace_word = NULL; + replace_word = gf16_xor16_replace_word; break; */ case GF16_XOR_JIT_AVX2: @@ -1075,7 +1082,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_avx2; copy_cksum = &gf16_cksum_copy_avx2; copy_cksum_check = &gf16_cksum_copy_check_avx2; - replace_word = NULL; + replace_word = gf16_xor32_replace_word; break; case GF16_XOR_JIT_AVX512: METHOD_REQUIRES(gf16_xor_available_avx512) @@ -1098,7 +1105,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { finish_partial_packsum = &gf16_xor_finish_partial_packsum_avx512; copy_cksum = &gf16_cksum_copy_avx512; copy_cksum_check = &gf16_cksum_copy_check_avx512; - replace_word = NULL; + replace_word = gf16_xor64_replace_word; break; default: break; // for pedantic compilers } @@ -1159,7 +1166,6 @@ Galois16Mul::Galois16Mul(Galois16Methods method) { finish_packed = NULL; replace_word = &Galois16Mul::_replace_word; - _mul = NULL; _mul_add_pf = NULL; add_multi = &gf_add_multi_generic; add_multi_packed = &gf_add_multi_packed_generic; @@ -1208,6 +1214,7 @@ void Galois16Mul::move(Galois16Mul& other) { _pow_add = other._pow_add; copy_cksum = other.copy_cksum; copy_cksum_check = 
other.copy_cksum_check; + replace_word = other.replace_word; } #endif @@ -1283,12 +1290,18 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu if(caps.hasSVE2) { if(gf16_sve_get_size() >= 64) return GF16_SHUFFLE_512_SVE2; - return inputs > 3 ? GF16_CLMUL_SVE2 : GF16_SHUFFLE_128_SVE2; + return inputs > 3 && !forInvert ? GF16_CLMUL_SVE2 : GF16_SHUFFLE_128_SVE2; } if(caps.hasSVE && gf16_sve_get_size() > 16) return GF16_SHUFFLE_128_SVE; if(gf16_available_neon && caps.hasNEON) - return inputs > 3 ? GF16_CLMUL_NEON : GF16_SHUFFLE_NEON; + return +# ifdef __aarch64__ + inputs > 3 +# else + inputs > 1 +# endif + && !forInvert ? GF16_CLMUL_NEON : GF16_SHUFFLE_NEON; #endif diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index 9d3e7a59..8b3223c0 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -7,7 +7,7 @@ #include #include -typedef void(*Galois16MulTransform) (void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen); +typedef void(*Galois16MulTransform) (void* dst, const void* src, size_t srcLen); typedef void(*Galois16MulTransformPacked) (void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); typedef void(*Galois16MulTransformPackedPartial) (void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); typedef void(*Galois16MulUntransform) (void *HEDLEY_RESTRICT dst, size_t len); @@ -18,7 +18,8 @@ typedef int(*Galois16MulUntransformPackedCksumPartial) (void *HEDLEY_RESTRICT ds typedef uint16_t(*Galois16ReplaceWord) (void* data, size_t index, uint16_t newValue); -typedef void(*Galois16MulFunc) (const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); +typedef void(*Galois16MulFunc) (const void 
*HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); +typedef void(*Galois16MulRstFunc) (const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); typedef void(*Galois16MulPfFunc) (const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); typedef void(*Galois16PowFunc) (const void *HEDLEY_RESTRICT scratch, unsigned outputs, size_t offset, void **HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); typedef void(*Galois16MulMultiFunc) (const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); @@ -112,7 +113,7 @@ class Galois16Mul { Galois16MethodInfo _info; Galois16MulFunc _mul; - Galois16MulFunc _mul_add; + Galois16MulRstFunc _mul_add; Galois16MulPfFunc _mul_add_pf; Galois16PowFunc _pow; Galois16PowFunc _pow_add; @@ -120,8 +121,9 @@ class Galois16Mul { Galois16MulPackedFunc _mul_add_multi_packed; Galois16MulPackPfFunc _mul_add_multi_packpf; - static void _prepare_none(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen) { - memcpy(dst, src, srcLen); + static void _prepare_none(void* dst, const void* src, size_t srcLen) { + if(dst != src) + memcpy(dst, src, srcLen); } static void _finish_none(void *HEDLEY_RESTRICT, size_t) {} static void _prepare_packed_none(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); @@ -185,7 +187,7 @@ class Galois16Mul { inline HEDLEY_CONST bool 
isMultipleOfStride(size_t len) const { #if defined(_M_ARM64) || defined(__aarch64__) // SVE can have non-power-of-2 strides - if((_info.stride & (_info.stride-1)) != 0) // ...but most of the time, expect stride to be a power of 2 + if(HEDLEY_UNLIKELY((_info.stride & (_info.stride-1)) != 0)) // ...but most of the time, expect stride to be a power of 2 return (len % _info.stride) == 0; #endif return (len & (_info.stride-1)) == 0; @@ -193,7 +195,7 @@ class Galois16Mul { inline HEDLEY_CONST size_t alignToStride(size_t len) const { size_t alignMask = _info.stride-1; #if defined(_M_ARM64) || defined(__aarch64__) - if((_info.stride & (_info.stride-1)) != 0) { + if(HEDLEY_UNLIKELY((_info.stride & (_info.stride-1)) != 0)) { return ((len + alignMask) / _info.stride) * _info.stride; } #endif @@ -218,32 +220,24 @@ class Galois16Mul { HEDLEY_MALLOC void* mutScratch_alloc() const; void mutScratch_free(void* mutScratch) const; - inline void mul(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) const { - assert(((uintptr_t)dst & (_info.alignment-1)) == 0); - assert(((uintptr_t)src & (_info.alignment-1)) == 0); + inline void mul(void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) const { assert(isMultipleOfStride(len)); assert(len > 0); - if(!(coefficient & 0xfffe)) { + if(HEDLEY_UNLIKELY(!(coefficient & 0xfffe))) { if(coefficient == 0) memset(dst, 0, len); - else + else if(dst != src) memcpy(dst, src, len); } - else if(_mul) + else _mul(scratch, dst, src, len, coefficient, mutScratch); - else { - memset(dst, 0, len); - _mul_add(scratch, dst, src, len, coefficient, mutScratch); - } } inline void mul_add(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) const { - assert(((uintptr_t)dst & (_info.alignment-1)) == 0); - assert(((uintptr_t)src & (_info.alignment-1)) == 0); 
assert(isMultipleOfStride(len)); assert(len > 0); - if(coefficient == 0) return; + if(HEDLEY_UNLIKELY(coefficient == 0)) return; _mul_add(scratch, dst, src, len, coefficient, mutScratch); } @@ -252,7 +246,7 @@ class Galois16Mul { assert(len > 0); assert(outputs > 0); - if(!(coefficient & 0xfffe)) { + if(HEDLEY_UNLIKELY(!(coefficient & 0xfffe))) { if(coefficient == 0) { for(unsigned output = 0; output < outputs; output++) memset((uint8_t*)dst[output] + offset, 0, len); @@ -268,20 +262,11 @@ class Galois16Mul { memset((uint8_t*)dst[output] + offset, 0, len); _pow_add(scratch, outputs, offset, dst, src, len, coefficient, mutScratch); } - else if(_mul) { - void* prev = (uint8_t*)src + offset; - for(unsigned output = 0; output < outputs; output++) { - void* cur = (uint8_t*)dst[output] + offset; - _mul(scratch, cur, prev, len, coefficient, mutScratch); - prev = cur; - } - } else { void* prev = (uint8_t*)src + offset; for(unsigned output = 0; output < outputs; output++) { void* cur = (uint8_t*)dst[output] + offset; - memset(cur, 0, len); - _mul_add(scratch, cur, prev, len, coefficient, mutScratch); + _mul(scratch, cur, prev, len, coefficient, mutScratch); prev = cur; } } @@ -291,7 +276,7 @@ class Galois16Mul { assert(len > 0); assert(outputs > 0); - if(coefficient == 0) return; + if(HEDLEY_UNLIKELY(coefficient == 0)) return; _pow_add(scratch, outputs, offset, dst, src, len, coefficient, mutScratch); } From 9362d265355800cde41268d4e2d8e735aa58540b Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 6 Jun 2023 12:31:35 +1000 Subject: [PATCH 08/91] Make Xor SSE2 memory layout consistent with AVX* implementations --- gf16/gf16_xor_common_funcs.h | 8 ++-- gf16/gf16_xor_sse2.c | 75 ++++++++++++++++++------------------ 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/gf16/gf16_xor_common_funcs.h b/gf16/gf16_xor_common_funcs.h index c607da06..90235423 100644 --- a/gf16/gf16_xor_common_funcs.h +++ b/gf16/gf16_xor_common_funcs.h @@ -48,12 +48,12 @@ static 
HEDLEY_ALWAYS_INLINE void gf16_xor_prep_split(_mword ta, _mword tb, _mwor *th = _mm256_permute4x64_epi64(*th, _MM_SHUFFLE(2,0,3,1)); #else *th = _mm_packus_epi16( - _mm_srli_epi16(tb, 8), - _mm_srli_epi16(ta, 8) + _mm_srli_epi16(ta, 8), + _mm_srli_epi16(tb, 8) ); *tl = _mm_packus_epi16( - _mm_and_si128(tb, _mm_set1_epi16(0xff)), - _mm_and_si128(ta, _mm_set1_epi16(0xff)) + _mm_and_si128(ta, _mm_set1_epi16(0xff)), + _mm_and_si128(tb, _mm_set1_epi16(0xff)) ); #endif } diff --git a/gf16/gf16_xor_sse2.c b/gf16/gf16_xor_sse2.c index e4f27287..723ad74f 100644 --- a/gf16/gf16_xor_sse2.c +++ b/gf16/gf16_xor_sse2.c @@ -1065,14 +1065,14 @@ void gf16_xor_finish_block_sse2(void *HEDLEY_RESTRICT dst) { srcVec = _mm_add_epi8(srcVec, srcVec); \ write16((target)+0, _mm_movemask_epi8(srcVec)); \ } - EXTRACT_BITS_HALF(_dst + 0, dstA, 0, srcDQb) - EXTRACT_BITS_HALF(_dst + 8, dstA, 1, srcDQa) - EXTRACT_BITS_HALF(_dst + 16, dstB, 0, srcDQd) - EXTRACT_BITS_HALF(_dst + 24, dstB, 1, srcDQc) - EXTRACT_BITS_HALF(_dst + 32, dstC, 0, srcDQf) - EXTRACT_BITS_HALF(_dst + 40, dstC, 1, srcDQe) - EXTRACT_BITS_HALF(_dst + 48, dstD, 0, srcDQh) - EXTRACT_BITS_HALF(_dst + 56, dstD, 1, srcDQg) + EXTRACT_BITS_HALF(_dst + 0, dstA, 0, srcDQa) + EXTRACT_BITS_HALF(_dst + 8, dstA, 1, srcDQb) + EXTRACT_BITS_HALF(_dst + 16, dstB, 0, srcDQc) + EXTRACT_BITS_HALF(_dst + 24, dstB, 1, srcDQd) + EXTRACT_BITS_HALF(_dst + 32, dstC, 0, srcDQe) + EXTRACT_BITS_HALF(_dst + 40, dstC, 1, srcDQf) + EXTRACT_BITS_HALF(_dst + 48, dstD, 0, srcDQg) + EXTRACT_BITS_HALF(_dst + 56, dstD, 1, srcDQh) #undef EXTRACT_BITS_HALF @@ -1098,14 +1098,14 @@ void gf16_xor_finish_block_sse2(void *HEDLEY_RESTRICT dst) { // extract & write all bits // TODO: consider saving some to a register to reduce write ops - EXTRACT_BITS(_dst + 64 + 0, srcDQb) - EXTRACT_BITS(_dst + 64 + 8, srcDQa) - EXTRACT_BITS(_dst + 64 + 16, srcDQd) - EXTRACT_BITS(_dst + 64 + 24, srcDQc) - EXTRACT_BITS(_dst + 64 + 32, srcDQf) - EXTRACT_BITS(_dst + 64 + 40, srcDQe) - 
EXTRACT_BITS(_dst + 64 + 48, srcDQh) - EXTRACT_BITS(_dst + 64 + 56, srcDQg) + EXTRACT_BITS(_dst + 64 + 0, srcDQa) + EXTRACT_BITS(_dst + 64 + 8, srcDQb) + EXTRACT_BITS(_dst + 64 + 16, srcDQc) + EXTRACT_BITS(_dst + 64 + 24, srcDQd) + EXTRACT_BITS(_dst + 64 + 32, srcDQe) + EXTRACT_BITS(_dst + 64 + 40, srcDQf) + EXTRACT_BITS(_dst + 64 + 48, srcDQg) + EXTRACT_BITS(_dst + 64 + 56, srcDQh) } void gf16_xor_finish_copy_block_sse2(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src) { uint16_t* _dst = (uint16_t*)dst; @@ -1125,14 +1125,14 @@ void gf16_xor_finish_copy_block_sse2(void *HEDLEY_RESTRICT dst, const void *HEDL UNPACK_VECTS; // write extracted bits - EXTRACT_BITS(_dst + 0, srcDQb) - EXTRACT_BITS(_dst + 8, srcDQa) - EXTRACT_BITS(_dst + 16, srcDQd) - EXTRACT_BITS(_dst + 24, srcDQc) - EXTRACT_BITS(_dst + 32, srcDQf) - EXTRACT_BITS(_dst + 40, srcDQe) - EXTRACT_BITS(_dst + 48, srcDQh) - EXTRACT_BITS(_dst + 56, srcDQg) + EXTRACT_BITS(_dst + 0, srcDQa) + EXTRACT_BITS(_dst + 8, srcDQb) + EXTRACT_BITS(_dst + 16, srcDQc) + EXTRACT_BITS(_dst + 24, srcDQd) + EXTRACT_BITS(_dst + 32, srcDQe) + EXTRACT_BITS(_dst + 40, srcDQf) + EXTRACT_BITS(_dst + 48, srcDQg) + EXTRACT_BITS(_dst + 56, srcDQh) // load second half @@ -1143,14 +1143,14 @@ void gf16_xor_finish_copy_block_sse2(void *HEDLEY_RESTRICT dst, const void *HEDL UNPACK_VECTS; - EXTRACT_BITS(_dst + 64 + 0, srcDQb) - EXTRACT_BITS(_dst + 64 + 8, srcDQa) - EXTRACT_BITS(_dst + 64 + 16, srcDQd) - EXTRACT_BITS(_dst + 64 + 24, srcDQc) - EXTRACT_BITS(_dst + 64 + 32, srcDQf) - EXTRACT_BITS(_dst + 64 + 40, srcDQe) - EXTRACT_BITS(_dst + 64 + 48, srcDQh) - EXTRACT_BITS(_dst + 64 + 56, srcDQg) + EXTRACT_BITS(_dst + 64 + 0, srcDQa) + EXTRACT_BITS(_dst + 64 + 8, srcDQb) + EXTRACT_BITS(_dst + 64 + 16, srcDQc) + EXTRACT_BITS(_dst + 64 + 24, srcDQd) + EXTRACT_BITS(_dst + 64 + 32, srcDQe) + EXTRACT_BITS(_dst + 64 + 40, srcDQf) + EXTRACT_BITS(_dst + 64 + 48, srcDQg) + EXTRACT_BITS(_dst + 64 + 56, srcDQh) } void 
gf16_xor_finish_copy_blocku_sse2(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t bytes) { uint16_t block[128]; @@ -1279,10 +1279,9 @@ void gf16_xor_jit_uninit(void* scratch) { #endif } -static HEDLEY_ALWAYS_INLINE uint16_t gf16_xorX_replace_word(void* data, size_t index, uint16_t newValue, size_t width, unsigned byteFlip) { +static HEDLEY_ALWAYS_INLINE uint16_t gf16_xorX_replace_word(void* data, size_t index, uint16_t newValue, size_t width) { uint8_t* base = (uint8_t*)data + (index & ~(width*8-1)) * 2; // advance pointer to correct group - base += ((index >> 3) & (width-1)) ^ byteFlip; // advance to correct byte - // TODO: remove byteFlip parameter + base += ((index >> 3) & (width-1)); // advance to correct byte unsigned bitIndex = index&7; uint16_t oldValue = 0; @@ -1299,13 +1298,13 @@ static HEDLEY_ALWAYS_INLINE uint16_t gf16_xorX_replace_word(void* data, size_t i return oldValue; } uint16_t gf16_xor16_replace_word(void* data, size_t index, uint16_t newValue) { - return gf16_xorX_replace_word(data, index, newValue, 16, 1); + return gf16_xorX_replace_word(data, index, newValue, 16); } uint16_t gf16_xor32_replace_word(void* data, size_t index, uint16_t newValue) { - return gf16_xorX_replace_word(data, index, newValue, 32, 0); + return gf16_xorX_replace_word(data, index, newValue, 32); } uint16_t gf16_xor64_replace_word(void* data, size_t index, uint16_t newValue) { - return gf16_xorX_replace_word(data, index, newValue, 64, 0); + return gf16_xorX_replace_word(data, index, newValue, 64); } From 0dc711448531ccc96a1c656082d021965322607f Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 6 Jun 2023 18:20:51 +1000 Subject: [PATCH 09/91] Adjust inversion for integration with par2cmdline-turbo --- gf16/gfmat_inv.cpp | 30 +++++++++++++++++++++++------- gf16/gfmat_inv.h | 14 ++++++++++++-- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 2bc5cd1a..a9b512e9 100644 --- a/gf16/gfmat_inv.cpp +++ 
b/gf16/gfmat_inv.cpp @@ -1,4 +1,5 @@ #include "gfmat_coeff.h" +#include "gfmat_inv.h" #ifdef PARPAR_INVERT_SUPPORT extern "C" uint16_t* gf16_recip; @@ -7,7 +8,9 @@ extern "C" uint16_t* gf16_recip; #include "../src/platform.h" // for ALIGN_* #include "gf16mul.h" -uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned validCount, std::vector& recovery, unsigned& stride) { +bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb) { + if(mat) ALIGN_FREE(mat); + unsigned matWidth = inputValid.size() * sizeof(uint16_t); Galois16Mul gf(Galois16Mul::default_method(matWidth, inputValid.size(), inputValid.size(), true)); stride = gf.alignToStride(matWidth); @@ -16,21 +19,27 @@ uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned unsigned invalidCount = inputValid.size() - validCount; assert(validCount < inputValid.size()); // i.e. invalidCount > 0 + assert(inputValid.size() <= 32768); + assert(recovery.size() <= 65535); - uint16_t* mat; ALIGN_ALLOC(mat, invalidCount * stride, gfInfo.alignment); unsigned validCol, missingCol; unsigned stride16 = stride / sizeof(uint16_t); assert(stride16 * sizeof(uint16_t) == stride); + uint16_t totalProgress = invalidCount + (gf.needPrepare() ? 3 : 1); // provision for prepare/finish/init-calc + invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? 
if(invalidCount > recovery.size()) { // not enough recovery gf.mutScratch_free(gfScratch); ALIGN_FREE(mat); - return nullptr; + mat = nullptr; + return false; } + if(progressCb) progressCb(0, totalProgress); + // generate matrix validCol = 0; missingCol = validCount; @@ -44,7 +53,11 @@ uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned assert(validCol == validCount); // pre-transform + uint16_t progressOffset = 1; if(gf.needPrepare()) { + if(progressCb) progressCb(1, totalProgress); + progressOffset = 2; + for(unsigned rec = 0; rec < invalidCount; rec++) { uint16_t* row = mat + rec * stride16; //memset(row + matWidth, 0, stride - matWidth); // not necessary, but do this to avoid uninitialized memory @@ -54,9 +67,10 @@ uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned // invert // TODO: optimise: multi-thread + packed arrangement - // TODO: progress hook missingCol = validCount; for(unsigned rec = 0; rec < invalidCount; rec++) { + if(progressCb) progressCb(rec + progressOffset, totalProgress); + uint16_t* row = mat + rec * stride16; // scale down factor uint16_t baseCoeff = gf.replace_word(row, missingCol, 1); @@ -84,6 +98,8 @@ uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned // post transform if(gf.needPrepare()) { + if(progressCb) progressCb(totalProgress-1, totalProgress); + for(unsigned rec = 0; rec < invalidCount; rec++) { uint16_t* row = mat + rec * stride16; gf.finish(row, stride); @@ -105,11 +121,11 @@ uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned recovery.resize(invalidCount); gf.mutScratch_free(gfScratch); - return mat; + return true; } -void free_recovery_matrix(uint16_t* mat) { - ALIGN_FREE(mat); +Galois16RecMatrix::~Galois16RecMatrix() { + if(mat) ALIGN_FREE(mat); } #endif diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index c70cacba..3aa4aa25 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -2,11 +2,21 @@ #define GFMAT_INV_H #include +#include 
#include "../src/stdint.h" #ifdef PARPAR_INVERT_SUPPORT -uint16_t* compute_recovery_matrix(const std::vector& inputValid, unsigned validCount, std::vector& recovery, unsigned& stride); -void free_recovery_matrix(uint16_t* mat); +class Galois16RecMatrix { + uint16_t* mat; + unsigned stride; +public: + Galois16RecMatrix() : mat(nullptr) {} + ~Galois16RecMatrix(); + bool Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb = nullptr); + inline uint16_t GetFactor(uint16_t inIdx, uint16_t recIdx) const { + return mat[recIdx * stride/sizeof(uint16_t) + inIdx]; + } +}; #endif #endif From 83c9f8b066c6b8b20ebbbc318688c0559a822d21 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 6 Jun 2023 18:21:09 +1000 Subject: [PATCH 10/91] Compile bugfix --- gf16/gf16_xor_avx2.c | 2 +- gf16/gf16_xor_sse2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gf16/gf16_xor_avx2.c b/gf16/gf16_xor_avx2.c index a81329bc..d9ba9578 100644 --- a/gf16/gf16_xor_avx2.c +++ b/gf16/gf16_xor_avx2.c @@ -726,7 +726,7 @@ void* gf16_xor_jit_init_mut_avx2() { #if defined(__AVX2__) && defined(PLATFORM_AMD64) jit_wx_pair *jitCode = jit_alloc(XORDEP_JIT_SIZE); if(!jitCode) return NULL; - xor_write_init_jit(jitCode->w, jitCode->w + XORDEP_JIT_SIZE/2, NULL, NULL); + xor_write_init_jit(jitCode->w, (uint8_t*)jitCode->w + XORDEP_JIT_SIZE/2, NULL, NULL); return jitCode; #else return NULL; diff --git a/gf16/gf16_xor_sse2.c b/gf16/gf16_xor_sse2.c index 723ad74f..1f428c6c 100644 --- a/gf16/gf16_xor_sse2.c +++ b/gf16/gf16_xor_sse2.c @@ -1264,7 +1264,7 @@ void* gf16_xor_jit_init_mut_sse2() { #ifdef PLATFORM_X86 jit_wx_pair *jitCode = jit_alloc(XORDEP_JIT_SIZE); if(!jitCode) return NULL; - xor_write_init_jit(jitCode->w, jitCode->w + XORDEP_JIT_SIZE/2, NULL, NULL); + xor_write_init_jit(jitCode->w, (uint8_t*)jitCode->w + XORDEP_JIT_SIZE/2, NULL, NULL); return jitCode; #else return NULL; From 451422641a8c830855e214b784caaeb441a802d0 Mon Sep 17 
00:00:00 2001 From: animetosho Date: Wed, 7 Jun 2023 20:52:48 +1000 Subject: [PATCH 11/91] Add option to specify exact recovery exponents to use --- README.md | 1 + bin/parpar.js | 13 ++++++++ help.txt | 5 ++++ lib/par2gen.js | 74 +++++++++++++++++++++++++++++++++------------- lib/par2outfile.js | 4 +-- 5 files changed, 73 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 6fbf5a4e..67a5d505 100644 --- a/README.md +++ b/README.md @@ -156,6 +156,7 @@ var par2creator = require('@animetosho/parpar').run( value: 65537 }, recoveryOffset: 0, + recoveryExponents: null, // if an array of numbers is specified, recoveryOffset is ignored, and a single output file is produced regardless of output* options memoryLimit: null, // 0 to specify no limit minChunkSize: 128*1024, processBatchSize: 12, diff --git a/bin/parpar.js b/bin/parpar.js index 42082df9..f03af0b6 100755 --- a/bin/parpar.js +++ b/bin/parpar.js @@ -61,6 +61,10 @@ var opts = { type: 'int', map: 'recoveryOffset' }, + 'recovery-exponents': { + type: 'list', + map: 'recoveryExponents' + }, 'comment': { alias: 'c', type: 'array', @@ -417,6 +421,15 @@ if(argv['md5-method']) { require('../lib/par2.js').set_outhash_method(argv['md5-method']); } +if(argv['recovery-exponents']) { + ['recovery-slices', 'min-recovery-slices', 'max-recovery-slices', 'recovery-offset', 'slice-dist', 'slices-per-file', 'slices-first-file', 'recovery-files'].forEach(function(conflictOpt) { + if(argv[conflictOpt]) + error('`--recovery-exponents` cannot be used with `--' + conflictOpt + '`'); + }); + if(!argv.noindex) + error('`--recovery-exponents` cannot be used with `--noindex`'); +} + var inputFiles = argv._; // copied from Nyuu; TODO: dedupe this somehow? diff --git a/help.txt b/help.txt index 9ed758ea..da6a3a05 100644 --- a/help.txt +++ b/help.txt @@ -88,6 +88,11 @@ PAR2 Options: This default also causes an error to be thrown if the number exceeds the maximum of 65535. -e, --recovery-offset Recovery slice start offset. 
Default 0. + --recovery-exponents Comma-separated list of exact recovery exponents + to use. If specified, `--recovery-offset` and + `recovery-slices` related options cannot be used, + and only a single PAR2 output file will be + produced. -c, --comment Add PAR2 comment. Can be specified multiple times. --packet-redundancy How many copies of critical packets to use in recovery files. This option uses the same syntax diff --git a/lib/par2gen.js b/lib/par2gen.js index 278fdf9d..3545550d 100644 --- a/lib/par2gen.js +++ b/lib/par2gen.js @@ -181,6 +181,7 @@ function PAR2Gen(fileInfo, sliceSize, opts) { value: 65536 }, recoveryOffset: 0, + recoveryExponents: null, // if an array of numbers is specified, recoveryOffset is ignored, and a single output file is produced regardless of output* options memoryLimit: null, // 0 to specify no limit minChunkSize: 128*1024, // 0 to disable chunking processBatchSize: null, // default is typically 12 (may be adjusted based on GF method's preferred multiple) @@ -222,6 +223,30 @@ function PAR2Gen(fileInfo, sliceSize, opts) { }; if(opts) Par2._extend(o, opts); + if(o.recoveryExponents) { + o.outputIndex = false; + o.outputSizeScheme = 'equal'; + o.outputFirstFileSlices = null; + o.outputFileMaxSlices = { + unit: 'slices', + value: 65535 + }; + o.outputFileCount = 0; + o.recoveryOffset = 0; + + var used = {}; + o.recoveryExponents = o.recoveryExponents.map(function(exp) { + exp = exp|0; + if(exp < 0 || exp > 65534) + throw new Error('Invalid recovery exponent ' + exp); + if(exp in used) + throw new Error('Duplicate recovery exponent ' + exp); + used[exp] = 1; + return exp; + }); + o.recoverySlices = o.recoveryExponents.length; + } + if(o.criticalRedundancyScheme === 'pow2') o.criticalRedundancyScheme = {unit: 'log', value: 2}; // backwards compatibility if(!fileInfo || (typeof fileInfo != 'object')) throw new Error('No input files supplied'); var totalSize = 0, dataFiles = 0; @@ -356,21 +381,23 @@ function PAR2Gen(fileInfo, sliceSize, 
opts) { if(o.seqReadSize > MAX_BUFFER_SIZE_MOD2) throw new Error('Read buffer size (' + o.seqReadSize + ') exceeds maximum size supported by this version of Node.js of ' + MAX_BUFFER_SIZE_MOD2 + ' bytes'); - o.recoverySlices = calcNumRecoverySlices(o.recoverySlices, o.sliceSize, this.inputSlices, fileInfo); - // check+apply min/max limits - var minRecSlices = Math.ceil(o.recoverySlices), maxRecSlices = Math.floor(o.recoverySlices); - if(o.minRecoverySlices !== null) - minRecSlices = Math.ceil(calcNumRecoverySlices(o.minRecoverySlices, o.sliceSize, this.inputSlices, fileInfo)); - if(o.maxRecoverySlices !== null) - maxRecSlices = Math.floor(calcNumRecoverySlices(o.maxRecoverySlices, o.sliceSize, this.inputSlices, fileInfo)); - o.recoverySlices = Math.max(o.recoverySlices, minRecSlices); - o.recoverySlices = Math.min(o.recoverySlices, maxRecSlices); - o.recoverySlices = Math.round(o.recoverySlices); - if(o.recoverySlices < minRecSlices || o.recoverySlices > maxRecSlices /*pedant check*/) - throw new Error('Could not satisfy specified min/max recovery slice count constraints'); - - if(o.recoverySlices < 0 || isNaN(o.recoverySlices) || !isFinite(o.recoverySlices)) throw new Error('Invalid number of recovery slices'); - if(o.recoverySlices+o.recoveryOffset > 65535) throw new Error('Cannot generate specified number of recovery slices: ' + (o.recoverySlices+o.recoveryOffset) + ' exceeds maximum of 65535'); + if(!o.recoveryExponents) { + o.recoverySlices = calcNumRecoverySlices(o.recoverySlices, o.sliceSize, this.inputSlices, fileInfo); + // check+apply min/max limits + var minRecSlices = Math.ceil(o.recoverySlices), maxRecSlices = Math.floor(o.recoverySlices); + if(o.minRecoverySlices !== null) + minRecSlices = Math.ceil(calcNumRecoverySlices(o.minRecoverySlices, o.sliceSize, this.inputSlices, fileInfo)); + if(o.maxRecoverySlices !== null) + maxRecSlices = Math.floor(calcNumRecoverySlices(o.maxRecoverySlices, o.sliceSize, this.inputSlices, fileInfo)); + o.recoverySlices = 
Math.max(o.recoverySlices, minRecSlices); + o.recoverySlices = Math.min(o.recoverySlices, maxRecSlices); + o.recoverySlices = Math.round(o.recoverySlices); + if(o.recoverySlices < minRecSlices || o.recoverySlices > maxRecSlices /*pedant check*/) + throw new Error('Could not satisfy specified min/max recovery slice count constraints'); + + if(o.recoverySlices < 0 || isNaN(o.recoverySlices) || !isFinite(o.recoverySlices)) throw new Error('Invalid number of recovery slices'); + if(o.recoverySlices+o.recoveryOffset > 65535) throw new Error('Cannot generate specified number of recovery slices: ' + (o.recoverySlices+o.recoveryOffset) + ' exceeds maximum of 65535'); + } if(this.inputSlices < 1 && o.recoverySlices > 0) throw new Error('Cannot generate recovery from empty input data'); @@ -789,7 +816,10 @@ function PAR2Gen(fileInfo, sliceSize, opts) { throw new Error('Cannot allocate '+o.recoverySlices+' recovery slices to '+o.outputFileCount+' volumes as there aren\'t enough slices'); if(this.totalSize > 0) { - if(o.outputSizeScheme == 'pow2') { + if(o.recoveryExponents) { + this._rfPush(o.recoveryExponents.length, o.recoveryExponents, critPackets, creatorPkt); + } + else if(o.outputSizeScheme == 'pow2') { var slices = o.outputFirstFileSlices || 1, totalSlices = o.recoverySlices + o.recoveryOffset; var getSliceNumsOffsets = function(slices) { var result = []; @@ -917,7 +947,7 @@ PAR2Gen.prototype = { readSize: 0, _buf: null, - _rfPush: function(numSlices, sliceOffset, critPackets, creator) { + _rfPush: function(numSlices, sliceOffsetOrExponents, critPackets, creator) { var packets, recvSize = 0, critTotalSize = sumSize(critPackets); if(numSlices) recvSize = this.par2.packetRecoverySize(); @@ -933,7 +963,8 @@ PAR2Gen.prototype = { packets = Array(numSlices + critNum +1); var pos = 0, critWritten = 0; for(var i=0; i Date: Wed, 7 Jun 2023 22:30:51 +1000 Subject: [PATCH 12/91] Add dot-product optimisation to matrix inversion --- gf16/gfmat_inv.cpp | 138 
++++++++++++++++++++++++++++++++++++--------- gf16/gfmat_inv.h | 4 ++ 2 files changed, 115 insertions(+), 27 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index a9b512e9..eab26279 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -8,6 +8,96 @@ extern "C" uint16_t* gf16_recip; #include "../src/platform.h" // for ALIGN_* #include "gf16mul.h" +template +int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch) { + unsigned missingCol = validCount + rec; + + uint16_t baseCoeff; + uint16_t coeff[rows]; + + void* srcRows[rows]; + srcRows[0] = mat + rec * (stride / sizeof(uint16_t)); + for(unsigned i=1; i= 2) { + // multiply-add to the next row + MULADD_ROW(srcRows[1], 0); + // scale it, and multiply-add back + SCALE_ROW(1); + MULADD_ROW(srcRows[0], 1); + } + if(rows >= 3) { + MULADD_MULTI_ROW(srcRows[2], 0, 2); + SCALE_ROW(2); + if(rows >= 4) { + MULADD_MULTI_ROW(srcRows[3], 0, 2); + MULADD_ROW(srcRows[3], 2); + SCALE_ROW(3); + MULADD_ROW(srcRows[2], 3); + MULADD_MULTI_ROW(srcRows[0], 2, 2); + MULADD_MULTI_ROW(srcRows[1], 2, 2); + } else { + MULADD_ROW(srcRows[0], 2); + MULADD_ROW(srcRows[1], 2); + } + } + if(rows >= 5) { + MULADD_MULTI_ROW(srcRows[4], 0, 4); + SCALE_ROW(4); + if(rows >= 6) { + MULADD_MULTI_ROW(srcRows[5], 0, 4); + MULADD_ROW(srcRows[5], 4); + SCALE_ROW(5); + MULADD_ROW(srcRows[4], 5); + for(unsigned rec2 = 0; rec2 < 4; rec2++) { + MULADD_MULTI_ROW(srcRows[rec2], 4, 2); + } + } else { + for(unsigned rec2 = 0; rec2 < 4; rec2++) { + MULADD_ROW(srcRows[rec2], 4); + } + } + } + + for(unsigned rec2 = 0; rec2 < invalidCount; rec2++) { + if(HEDLEY_UNLIKELY(rec2 >= rec && rec2 < rec+rows)) continue; + uint16_t* row2 = mat + rec2 * (stride / sizeof(uint16_t)); + if(rows > 1) { + MULADD_MULTI_ROW(row2, 0, rows); + } else { + MULADD_ROW(row2, 0); + } + } + + #undef SCALE_ROW + #undef MULADD_ROW + #undef MULADD_MULTI_ROW + + return -1; +} + bool 
Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb) { if(mat) ALIGN_FREE(mat); @@ -67,34 +157,28 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va // invert // TODO: optimise: multi-thread + packed arrangement - missingCol = validCount; - for(unsigned rec = 0; rec < invalidCount; rec++) { - if(progressCb) progressCb(rec + progressOffset, totalProgress); - - uint16_t* row = mat + rec * stride16; - // scale down factor - uint16_t baseCoeff = gf.replace_word(row, missingCol, 1); - if(HEDLEY_UNLIKELY(baseCoeff == 0)) { // bad recovery coeff - // ignore this recovery row and try again - recovery.erase(recovery.begin() + rec); - goto invert_loop; - } - baseCoeff = gf16_recip[baseCoeff]; // TODO: consider prefetching this? - if(HEDLEY_LIKELY(baseCoeff != 1)) { - gf.mul(row, row, stride, baseCoeff, gfScratch); - } - - for(unsigned rec2 = 0; rec2 < invalidCount; rec2++) { - if(HEDLEY_UNLIKELY(rec == rec2)) continue; - uint16_t* row2 = mat + rec2 * stride16; - uint16_t coeff = gf.replace_word(row2, missingCol, 0); - if(HEDLEY_LIKELY(coeff != 0)) { - gf.mul_add(row2, row, stride, coeff, gfScratch); - } // TODO: is a coefficient of 0 ever correct? + unsigned rec = 0; + #define INVERT_GROUP(rows) \ + if(gfInfo.idealInputMultiple >= rows && invalidCount >= rows) { \ + for(; rec <= invalidCount-rows; rec+=rows) { \ + if(progressCb) progressCb(rec + progressOffset, totalProgress); \ + \ + int badRowOffset = processRow(rec, validCount, invalidCount, gf, gfScratch); \ + if(badRowOffset >= 0) { \ + /* ignore this recovery row and try again */ \ + recovery.erase(recovery.begin() + rec + badRowOffset); \ + goto invert_loop; \ + } \ + } \ } - - missingCol++; - } + // max out at 6 groups (registers + cache assoc?) 
+ INVERT_GROUP(6) + INVERT_GROUP(5) + INVERT_GROUP(4) + INVERT_GROUP(3) + INVERT_GROUP(2) + INVERT_GROUP(1) + #undef INVERT_GROUP // post transform if(gf.needPrepare()) { diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index 3aa4aa25..ee29911c 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -6,9 +6,13 @@ #include "../src/stdint.h" #ifdef PARPAR_INVERT_SUPPORT +class Galois16Mul; class Galois16RecMatrix { uint16_t* mat; unsigned stride; + + template + int processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch); public: Galois16RecMatrix() : mat(nullptr) {} ~Galois16RecMatrix(); From d2a4e99b3ca8807aec821e3d5453ae4536291ca5 Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 10 Jun 2023 18:16:21 +1000 Subject: [PATCH 13/91] Add prefetching to matrix inversion --- gf16/gf16_affine.h | 2 + gf16/gf16_clmul.h | 1 + gf16/gf16_muladd_multi.h | 99 ++++++++++++++++++++++++++++++++++++++-- gf16/gf16_shuffle.h | 3 ++ gf16/gf16mul.cpp | 19 ++++++++ gf16/gf16mul.h | 29 ++++++++++++ gf16/gfmat_inv.cpp | 97 +++++++++++++++++++++++++++++++-------- 7 files changed, 226 insertions(+), 24 deletions(-) diff --git a/gf16/gf16_affine.h b/gf16/gf16_affine.h index 254e5334..33c3b074 100644 --- a/gf16/gf16_affine.h +++ b/gf16/gf16_affine.h @@ -6,6 +6,7 @@ void gf16_affine_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine_muladd_prefetch_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); \ void gf16_affine_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT 
mutScratch); \ + void gf16_affine_muladd_multi_stridepf_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch); \ void gf16_affine_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); \ void gf16_affine_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ @@ -24,6 +25,7 @@ FUNCS(avx512); void gf16_affine2x_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine2x_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine2x_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_affine2x_muladd_multi_stridepf_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, 
const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch); \ void gf16_affine2x_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_affine2x_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); \ void gf16_affine2x_prepare_##v(void* dst, const void* src, size_t srcLen); \ diff --git a/gf16/gf16_clmul.h b/gf16/gf16_clmul.h index 696f0dcc..d8f189c2 100644 --- a/gf16/gf16_clmul.h +++ b/gf16/gf16_clmul.h @@ -3,6 +3,7 @@ #define FUNCS(v) \ void gf16_clmul_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_clmul_muladd_multi_stridepf_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch); \ void gf16_clmul_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_clmul_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* 
HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); \ void gf16_clmul_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ diff --git a/gf16/gf16_muladd_multi.h b/gf16/gf16_muladd_multi.h index 41a7e36f..98ccc9d4 100644 --- a/gf16/gf16_muladd_multi.h +++ b/gf16/gf16_muladd_multi.h @@ -29,6 +29,11 @@ void fnpre ## _muladd_multi ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsign gf16_muladd_multi(scratch, &xfn, procRegions, regions, offset, dst, src, len, coefficients); \ finisher; \ } \ +void fnpre ## _muladd_multi_stridepf ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch) { \ + UNUSED(mutScratch); \ + gf16_muladd_multi_stridepf(scratch, &xfn, procRegions, regions, srcStride, dst, src, len, coefficients, pfFactor, prefetch); \ + finisher; \ +} \ void fnpre ## _muladd_multi_packed ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ UNUSED(mutScratch); \ gf16_muladd_multi_packed(scratch, &xfn, procRegions, procRegions, packedRegions, regions, dst, src, len, blocksize, coefficients); \ @@ -45,6 +50,10 @@ void fnpre ## _muladd_multi ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsign UNUSED(mutScratch); \ UNUSED(scratch); UNUSED(regions); UNUSED(offset); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficients); \ } \ +void fnpre ## _muladd_multi_stridepf ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t 
srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch) { \ + UNUSED(mutScratch); \ + UNUSED(scratch); UNUSED(regions); UNUSED(srcStride); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficients); UNUSED(prefetch); \ +} \ void fnpre ## _muladd_multi_packed ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ UNUSED(mutScratch); \ UNUSED(scratch); UNUSED(packedRegions); UNUSED(regions); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficients); \ @@ -102,8 +111,8 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi(const void *HEDLEY_RESTRICT s uint8_t* _dst = (uint8_t*)dst + offset + len; #define _SRC(limit, n) limit > n ? (const uint8_t*)src[region+n] + offset + len : NULL - unsigned region = 0; - if(regions >= interleave) do { + unsigned region; + for(region = 0; region + interleave <= regions; region += interleave) { muladd_pf( scratch, _dst, 1, interleave, (const uint8_t*)src[region] + offset + len, @@ -114,8 +123,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi(const void *HEDLEY_RESTRICT s _SRC(interleave,17), len, coefficients + region, 0, NULL ); - region += interleave; - } while(interleave <= regions - region); + } unsigned remaining = regions - region; HEDLEY_ASSUME(remaining < interleave); // doesn't seem to always work, so we have additional checks in the switch cases switch(remaining) { @@ -140,6 +148,89 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi(const void *HEDLEY_RESTRICT s #undef _SRC } +static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_stridepf(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, 
const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, const unsigned pfFactor, const void* HEDLEY_RESTRICT prefetch) { + uint8_t* _dst = (uint8_t*)dst + len; + uint8_t* srcEnd = (uint8_t*)src + len; + + size_t pfLen = len>>pfFactor; + const char* _pf = (const char*)prefetch + pfLen; + unsigned outputPfRounds = 1<<pfFactor; + + #define _SRC(limit, n) limit > n ? srcEnd + srcStride*n : NULL + unsigned region; + for(region = 0; region + interleave <= regions && outputPfRounds; region += interleave) { + muladd_pf( + scratch, _dst, 1, interleave, + srcEnd, + _SRC(interleave, 1), _SRC(interleave, 2), _SRC(interleave, 3), _SRC(interleave, 4), + _SRC(interleave, 5), _SRC(interleave, 6), _SRC(interleave, 7), _SRC(interleave, 8), + _SRC(interleave, 9), _SRC(interleave, 10), _SRC(interleave, 11), _SRC(interleave, 12), + _SRC(interleave,13), _SRC(interleave, 14), _SRC(interleave, 15), _SRC(interleave, 16), + _SRC(interleave,17), + len, coefficients + region, 1, _pf + ); + srcEnd += srcStride*interleave; + outputPfRounds--; + _pf += pfLen; + } + for(; region + interleave <= regions; region += interleave) { + muladd_pf( + scratch, _dst, 1, interleave, + srcEnd, + _SRC(interleave, 1), _SRC(interleave, 2), _SRC(interleave, 3), _SRC(interleave, 4), + _SRC(interleave, 5), _SRC(interleave, 6), _SRC(interleave, 7), _SRC(interleave, 8), + _SRC(interleave, 9), _SRC(interleave, 10), _SRC(interleave, 11), _SRC(interleave, 12), + _SRC(interleave,13), _SRC(interleave, 14), _SRC(interleave, 15), _SRC(interleave, 16), + _SRC(interleave,17), + len, coefficients + region, 0, NULL + ); + srcEnd += srcStride*interleave; + } + unsigned remaining = regions - region; + HEDLEY_ASSUME(remaining < interleave); // doesn't seem to always work, so we have additional checks in the switch cases + if(outputPfRounds) { + switch(remaining) { + #define CASE(x) \ + case x: \ + HEDLEY_ASSUME(x < interleave); \ + muladd_pf( \ + scratch, _dst, 1, x, \ + srcEnd, \ + _SRC(x, 1), _SRC(x, 2), _SRC(x, 3), _SRC(x, 4), \ + 
_SRC(x, 5), _SRC(x, 6), _SRC(x, 7), _SRC(x, 8), \ + _SRC(x, 9), _SRC(x, 10), _SRC(x, 11), _SRC(x, 12), \ + _SRC(x,13), _SRC(x, 14), _SRC(x, 15), _SRC(x, 16), \ + _SRC(x,17), \ + len, coefficients + region, 1, _pf \ + ); \ + break + REMAINING_CASES; + #undef CASE + default: break; + } + } else { + switch(remaining) { + #define CASE(x) \ + case x: \ + HEDLEY_ASSUME(x < interleave); \ + muladd_pf( \ + scratch, _dst, 1, x, \ + srcEnd, \ + _SRC(x, 1), _SRC(x, 2), _SRC(x, 3), _SRC(x, 4), \ + _SRC(x, 5), _SRC(x, 6), _SRC(x, 7), _SRC(x, 8), \ + _SRC(x, 9), _SRC(x, 10), _SRC(x, 11), _SRC(x, 12), \ + _SRC(x,13), _SRC(x, 14), _SRC(x, 15), _SRC(x, 16), \ + _SRC(x,17), \ + len, coefficients + region, 0, NULL \ + ); \ + break + REMAINING_CASES; + #undef CASE + default: break; + } + } + #undef _SRC +} static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_packed(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regionsPerCall, unsigned inputPackSize, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, size_t blockLen, const uint16_t *HEDLEY_RESTRICT coefficients) { ASSUME(regions <= inputPackSize); diff --git a/gf16/gf16_shuffle.h b/gf16/gf16_shuffle.h index 0344bcef..87f3fe05 100644 --- a/gf16/gf16_shuffle.h +++ b/gf16/gf16_shuffle.h @@ -26,6 +26,7 @@ FUNCS(avx512); // multi-region #define FUNCS(v) \ void gf16_shuffle_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_shuffle_muladd_multi_stridepf_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch); \ void 
gf16_shuffle_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) @@ -93,6 +94,7 @@ extern int gf16_available_sve2; void gf16_shuffle2x_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle2x_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle2x_muladd_multi_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ + void gf16_shuffle2x_muladd_multi_stridepf_##v(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch); \ void gf16_shuffle2x_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_shuffle2x_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, 
unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) @@ -110,6 +112,7 @@ int gf16_shuffle2x_finish_partial_packsum_sve(void *HEDLEY_RESTRICT dst, void *H void gf16_shuffle2x_mul_128_sve2(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle2x_muladd_128_sve2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle2x_muladd_multi_128_sve2(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); +void gf16_shuffle2x_muladd_multi_stridepf_128_sve2(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch); void gf16_shuffle2x_muladd_multi_packed_128_sve2(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle2x_muladd_multi_packpf_128_sve2(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp 
index 690d4a88..292805b3 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -558,6 +558,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { #ifdef PLATFORM_AMD64 // if 32 registers are available, can do multi-region _mul_add_multi = &gf16_shuffle_muladd_multi_avx512; + _mul_add_multi_stridepf = &gf16_shuffle_muladd_multi_stridepf_avx512; _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_avx512; _mul_add_multi_packpf = &gf16_shuffle_muladd_multi_packpf_avx512; add_multi_packed = &gf_add_multi_packed_v2i3_avx512; @@ -590,6 +591,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { add_multi = &gf_add_multi_avx512; #ifdef PLATFORM_AMD64 _mul_add_multi = &gf16_shuffle_muladd_multi_vbmi; + _mul_add_multi_stridepf = &gf16_shuffle_muladd_multi_stridepf_vbmi; _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_vbmi; _mul_add_multi_packpf = &gf16_shuffle_muladd_multi_packpf_vbmi; add_multi_packed = &gf_add_multi_packed_v2i4_avx512; @@ -618,6 +620,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { add_multi = &gf_add_multi_avx512; #ifdef PLATFORM_AMD64 _mul_add_multi = &gf16_shuffle2x_muladd_multi_avx512; + _mul_add_multi_stridepf = &gf16_shuffle2x_muladd_multi_stridepf_avx512; _mul_add_multi_packed = &gf16_shuffle2x_muladd_multi_packed_avx512; _mul_add_multi_packpf = &gf16_shuffle2x_muladd_multi_packpf_avx512; add_multi_packed = &gf_add_multi_packed_v1i6_avx512; @@ -646,6 +649,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { add_multi = &gf_add_multi_avx2; #ifdef PLATFORM_AMD64 _mul_add_multi = &gf16_shuffle2x_muladd_multi_avx2; + _mul_add_multi_stridepf = &gf16_shuffle2x_muladd_multi_stridepf_avx2; _mul_add_multi_packed = &gf16_shuffle2x_muladd_multi_packed_avx2; _mul_add_multi_packpf = &gf16_shuffle2x_muladd_multi_packpf_avx2; add_multi_packed = &gf_add_multi_packed_v1i2_avx2; @@ -677,6 +681,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { #ifdef __aarch64__ // enable only if 32 registers available 
_mul_add_multi = &gf16_shuffle_muladd_multi_neon; + _mul_add_multi_stridepf = &gf16_shuffle_muladd_multi_stridepf_neon; _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_neon; // TODO: on Cortex A53, prefetching seems to be slower, so disabled for now //_mul_add_multi_packpf = &gf16_shuffle_muladd_multi_packpf_neon; @@ -700,6 +705,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_clmul_mul_neon; _mul_add = &gf16_clmul_muladd_neon; _mul_add_multi = &gf16_clmul_muladd_multi_neon; + _mul_add_multi_stridepf = &gf16_clmul_muladd_multi_stridepf_neon; _mul_add_multi_packed = &gf16_clmul_muladd_multi_packed_neon; add_multi = &gf_add_multi_neon; add_multi_packed = &gf_add_multi_packed_clmul_neon; @@ -724,6 +730,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_shuffle_mul_128_sve; _mul_add = &gf16_shuffle_muladd_128_sve; _mul_add_multi = &gf16_shuffle_muladd_multi_128_sve; + _mul_add_multi_stridepf = &gf16_shuffle_muladd_multi_stridepf_128_sve; _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_128_sve; //_mul_add_multi_packpf = &gf16_shuffle_muladd_multi_packpf_128_sve; add_multi = &gf_add_multi_sve; @@ -745,6 +752,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_shuffle_mul_128_sve2; _mul_add = &gf16_shuffle_muladd_128_sve2; _mul_add_multi = &gf16_shuffle_muladd_multi_128_sve2; + _mul_add_multi_stridepf = &gf16_shuffle_muladd_multi_stridepf_128_sve2; _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_128_sve2; //_mul_add_multi_packpf = &gf16_shuffle_muladd_multi_packpf_128_sve2; add_multi = &gf_add_multi_sve2; @@ -766,6 +774,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_shuffle2x_mul_128_sve2; _mul_add = &gf16_shuffle2x_muladd_128_sve2; _mul_add_multi = &gf16_shuffle2x_muladd_multi_128_sve2; + _mul_add_multi_stridepf = &gf16_shuffle2x_muladd_multi_stridepf_128_sve2; _mul_add_multi_packed = &gf16_shuffle2x_muladd_multi_packed_128_sve2; 
//_mul_add_multi_packpf = &gf16_shuffle2x_muladd_multi_packpf_128_sve2; add_multi = &gf_add_multi_sve2; @@ -790,6 +799,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_shuffle_mul_512_sve2; _mul_add = &gf16_shuffle_muladd_512_sve2; _mul_add_multi = &gf16_shuffle_muladd_multi_512_sve2; + _mul_add_multi_stridepf = &gf16_shuffle_muladd_multi_stridepf_512_sve2; _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_512_sve2; //_mul_add_multi_packpf = &gf16_shuffle_muladd_multi_packpf_512_sve2; add_multi = &gf_add_multi_sve2; @@ -811,6 +821,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_clmul_mul_sve2; _mul_add = &gf16_clmul_muladd_sve2; _mul_add_multi = &gf16_clmul_muladd_multi_sve2; + _mul_add_multi_stridepf = &gf16_clmul_muladd_multi_stridepf_sve2; _mul_add_multi_packed = &gf16_clmul_muladd_multi_packed_sve2; //_mul_add_multi_packpf = &gf16_clmul_muladd_multi_packpf_sve2; add_multi = &gf_add_multi_sve2; @@ -835,6 +846,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { add_multi = &gf_add_multi_avx512; #ifdef PLATFORM_AMD64 _mul_add_multi = &gf16_affine_muladd_multi_avx512; + _mul_add_multi_stridepf = &gf16_affine_muladd_multi_stridepf_avx512; _mul_add_multi_packed = &gf16_affine_muladd_multi_packed_avx512; _mul_add_multi_packpf = &gf16_affine_muladd_multi_packpf_avx512; add_multi_packed = &gf_add_multi_packed_v2i6_avx512; @@ -865,6 +877,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { add_multi = &gf_add_multi_avx2; #ifdef PLATFORM_AMD64 _mul_add_multi = &gf16_affine_muladd_multi_avx2; + _mul_add_multi_stridepf = &gf16_affine_muladd_multi_stridepf_avx2; _mul_add_multi_packed = &gf16_affine_muladd_multi_packed_avx2; _mul_add_multi_packpf = &gf16_affine_muladd_multi_packpf_avx2; add_multi_packed = &gf_add_multi_packed_v2i3_avx2; @@ -895,6 +908,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { add_multi = &gf_add_multi_sse2; #ifdef PLATFORM_AMD64 _mul_add_multi = 
&gf16_affine_muladd_multi_gfni; + _mul_add_multi_stridepf = &gf16_affine_muladd_multi_stridepf_gfni; _mul_add_multi_packed = &gf16_affine_muladd_multi_packed_gfni; _mul_add_multi_packpf = &gf16_affine_muladd_multi_packpf_gfni; add_multi_packed = &gf_add_multi_packed_v2i3_sse2; @@ -922,6 +936,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_affine2x_mul_avx512; _mul_add = &gf16_affine2x_muladd_avx512; _mul_add_multi = &gf16_affine2x_muladd_multi_avx512; + _mul_add_multi_stridepf = &gf16_affine2x_muladd_multi_stridepf_avx512; _mul_add_multi_packed = &gf16_affine2x_muladd_multi_packed_avx512; _mul_add_multi_packpf = &gf16_affine2x_muladd_multi_packpf_avx512; add_multi = &gf_add_multi_avx512; @@ -951,6 +966,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_affine2x_mul_avx2; _mul_add = &gf16_affine2x_muladd_avx2; _mul_add_multi = &gf16_affine2x_muladd_multi_avx2; + _mul_add_multi_stridepf = &gf16_affine2x_muladd_multi_stridepf_avx2; _mul_add_multi_packed = &gf16_affine2x_muladd_multi_packed_avx2; _mul_add_multi_packpf = &gf16_affine2x_muladd_multi_packpf_avx2; add_multi = &gf_add_multi_avx2; @@ -980,6 +996,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { _mul = &gf16_affine2x_mul_gfni; _mul_add = &gf16_affine2x_muladd_gfni; _mul_add_multi = &gf16_affine2x_muladd_multi_gfni; + _mul_add_multi_stridepf = &gf16_affine2x_muladd_multi_stridepf_gfni; _mul_add_multi_packed = &gf16_affine2x_muladd_multi_packed_gfni; _mul_add_multi_packpf = &gf16_affine2x_muladd_multi_packpf_gfni; add_multi = &gf_add_multi_sse2; @@ -1171,6 +1188,7 @@ Galois16Mul::Galois16Mul(Galois16Methods method) { add_multi_packed = &gf_add_multi_packed_generic; add_multi_packpf = &gf_add_multi_packpf_generic; _mul_add_multi = NULL; + _mul_add_multi_stridepf = NULL; _mul_add_multi_packed = NULL; _mul_add_multi_packpf = NULL; copy_cksum = &gf16_cksum_copy_generic; @@ -1208,6 +1226,7 @@ void Galois16Mul::move(Galois16Mul& other) { _mul_add = 
other._mul_add; _mul_add_pf = other._mul_add_pf; _mul_add_multi = other._mul_add_multi; + _mul_add_multi_stridepf = other._mul_add_multi_stridepf; _mul_add_multi_packed = other._mul_add_multi_packed; _mul_add_multi_packpf = other._mul_add_multi_packpf; _pow = other._pow; diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index 8b3223c0..53540b26 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -23,6 +23,7 @@ typedef void(*Galois16MulRstFunc) (const void *HEDLEY_RESTRICT scratch, void *HE typedef void(*Galois16MulPfFunc) (const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); typedef void(*Galois16PowFunc) (const void *HEDLEY_RESTRICT scratch, unsigned outputs, size_t offset, void **HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); typedef void(*Galois16MulMultiFunc) (const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); +typedef void(*Galois16MulStridePfFunc) (const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); typedef void(*Galois16MulPackedFunc) (const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); typedef void(*Galois16MulPackPfFunc) (const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const 
uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); typedef void(*Galois16AddFunc) (void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len); @@ -118,6 +119,7 @@ class Galois16Mul { Galois16PowFunc _pow; Galois16PowFunc _pow_add; Galois16MulMultiFunc _mul_add_multi; + Galois16MulStridePfFunc _mul_add_multi_stridepf; Galois16MulPackedFunc _mul_add_multi_packed; Galois16MulPackPfFunc _mul_add_multi_packpf; @@ -241,6 +243,17 @@ class Galois16Mul { _mul_add(scratch, dst, src, len, coefficient, mutScratch); } + inline void mul_add_pf(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch) const { + assert(isMultipleOfStride(len)); + assert(len > 0); + + if(HEDLEY_UNLIKELY(coefficient == 0)) return; + if(_mul_add_pf) + _mul_add_pf(scratch, dst, src, len, coefficient, mutScratch, prefetch); + else + _mul_add(scratch, dst, src, len, coefficient, mutScratch); + } + inline void pow(unsigned outputs, size_t offset, void **HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) const { assert(isMultipleOfStride(len)); assert(len > 0); @@ -294,6 +307,22 @@ class Galois16Mul { } } + inline void mul_add_multi_stridepf(unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch) const { + assert(isMultipleOfStride(len)); + assert(len > 0); + assert(srcStride > 0); + assert(regions > 0); + + if(_mul_add_multi_stridepf) + _mul_add_multi_stridepf(scratch, regions, srcStride, dst, src, len, coefficients, mutScratch, prefetch); + else { + // TODO: _mul_add_pf fallback; _mul_add_multi shouldn't be set (exception: XorJit AVX512) + 
for(unsigned region = 0; region 0); diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index eab26279..ad72e830 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -15,8 +15,10 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in uint16_t baseCoeff; uint16_t coeff[rows]; + #define MAT_ROW(r) (mat + (r) * (stride / sizeof(uint16_t))) + void* srcRows[rows]; - srcRows[0] = mat + rec * (stride / sizeof(uint16_t)); + srcRows[0] = MAT_ROW(rec); for(unsigned i=1; i= 2) { // multiply-add to the next row MULADD_ROW(srcRows[1], 0); // scale it, and multiply-add back SCALE_ROW(1); - MULADD_ROW(srcRows[0], 1); + if(rows > 2) { + MULADD_ROW_PF(srcRows[0], 1, srcRows[2]); + } else MULADD_LASTROW(srcRows[0], 1) + } else { + if(rec2 >= invalidCount) + return -1; } if(rows >= 3) { - MULADD_MULTI_ROW(srcRows[2], 0, 2); - SCALE_ROW(2); if(rows >= 4) { + MULADD_MULTI_ROW_PF(srcRows[2], 0, 2, srcRows[3]); + SCALE_ROW(2); MULADD_MULTI_ROW(srcRows[3], 0, 2); MULADD_ROW(srcRows[3], 2); SCALE_ROW(3); MULADD_ROW(srcRows[2], 3); MULADD_MULTI_ROW(srcRows[0], 2, 2); - MULADD_MULTI_ROW(srcRows[1], 2, 2); + if(rows > 4) { + MULADD_MULTI_ROW_PF(srcRows[1], 2, 2, srcRows[4]); + } else MULADD_MULTI_LASTROW(srcRows[1], 2, 2) } else { + MULADD_MULTI_ROW(srcRows[2], 0, 2); + SCALE_ROW(2); MULADD_ROW(srcRows[0], 2); - MULADD_ROW(srcRows[1], 2); + MULADD_LASTROW(srcRows[1], 2) } } if(rows >= 5) { - MULADD_MULTI_ROW(srcRows[4], 0, 4); - SCALE_ROW(4); if(rows >= 6) { + MULADD_MULTI_ROW_PF(srcRows[4], 0, 4, srcRows[5]); + SCALE_ROW(4); MULADD_MULTI_ROW(srcRows[5], 0, 4); MULADD_ROW(srcRows[5], 4); SCALE_ROW(5); MULADD_ROW(srcRows[4], 5); - for(unsigned rec2 = 0; rec2 < 4; rec2++) { - MULADD_MULTI_ROW(srcRows[rec2], 4, 2); + for(unsigned r = 0; r < 3; r++) { + MULADD_MULTI_ROW(srcRows[r], 4, 2); } + MULADD_MULTI_LASTROW(srcRows[3], 4, 2) } else { - for(unsigned rec2 = 0; rec2 < 4; rec2++) { - MULADD_ROW(srcRows[rec2], 4); + MULADD_MULTI_ROW(srcRows[4], 0, 4); + 
SCALE_ROW(4); + for(unsigned r = 0; r < 3; r++) { + MULADD_ROW(srcRows[r], 4); } + MULADD_LASTROW(srcRows[3], 4) } } - for(unsigned rec2 = 0; rec2 < invalidCount; rec2++) { - if(HEDLEY_UNLIKELY(rec2 >= rec && rec2 < rec+rows)) continue; - uint16_t* row2 = mat + rec2 * (stride / sizeof(uint16_t)); + // do main elimination, using the source group + while(1) { + uint16_t* row2 = MAT_ROW(rec2); + rec2++; + if(HEDLEY_UNLIKELY(rec2 == rec)) + rec2 += rows; if(rows > 1) { - MULADD_MULTI_ROW(row2, 0, rows); + MULADD_MULTI_LASTROW(row2, 0, rows) } else { - MULADD_ROW(row2, 0); + MULADD_LASTROW(row2, 0) } } + #undef MAT_ROW #undef SCALE_ROW #undef MULADD_ROW + #undef MULADD_ROW_PF #undef MULADD_MULTI_ROW - - return -1; + #undef MULADD_MULTI_ROW_PF + #undef MULADD_LASTROW + #undef MULADD_MULTI_LASTROW } bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb) { From 2e73db2d7665dae48f835cdbafeda5d7cd30d1c2 Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 10 Jun 2023 18:55:05 +1000 Subject: [PATCH 14/91] Faster matrix construction for inversion + prefetch fallback for mul_add_multi_stridepf --- gf16/gf16mul.h | 21 +++++++++++++++------ gf16/gfmat_inv.cpp | 45 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 50 insertions(+), 16 deletions(-) diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index 53540b26..b1e91b7b 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -313,13 +313,22 @@ class Galois16Mul { assert(srcStride > 0); assert(regions > 0); - if(_mul_add_multi_stridepf) + if(_mul_add_multi_stridepf) { _mul_add_multi_stridepf(scratch, regions, srcStride, dst, src, len, coefficients, mutScratch, prefetch); - else { - // TODO: _mul_add_pf fallback; _mul_add_multi shouldn't be set (exception: XorJit AVX512) - for(unsigned region = 0; region>_info.prefetchDownscale; + const char* _pf = (const char*)prefetch; + for(unsigned outputPfRounds = 1<<_info.prefetchDownscale; region& inputValid, 
unsigned va unsigned invalidCount = inputValid.size() - validCount; assert(validCount < inputValid.size()); // i.e. invalidCount > 0 - assert(inputValid.size() <= 32768); - assert(recovery.size() <= 65535); + assert(inputValid.size() <= 32768 && inputValid.size() > 0); + assert(recovery.size() <= 65535 && recovery.size() > 0); ALIGN_ALLOC(mat, invalidCount * stride, gfInfo.alignment); @@ -190,11 +190,36 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va // generate matrix validCol = 0; missingCol = validCount; - for(unsigned input = 0; input < inputValid.size(); input++) { - uint16_t inputLog = gfmat_input_log(input); - unsigned targetCol = inputValid.at(input) ? validCol++ : missingCol++; - for(unsigned rec = 0; rec < invalidCount; rec++) { - mat[rec * stride16 + targetCol] = gfmat_coeff_from_log(inputLog, recovery.at(rec)); + unsigned rec, recStart = 0; + if(recovery.at(0) == 0) { // first recovery has exponent 0 is a common case + for(unsigned input = 0; input < inputValid.size(); input++) { + mat[input] = 1; + } + recStart++; + } + { + unsigned input = 0; + const unsigned GROUP_AMOUNT = 4; + for(; input + GROUP_AMOUNT <= inputValid.size(); input+=GROUP_AMOUNT) { + uint16_t inputLog[GROUP_AMOUNT]; + unsigned targetCol[GROUP_AMOUNT]; + for(unsigned i=0; i& inputValid, unsigned va if(progressCb) progressCb(1, totalProgress); progressOffset = 2; - for(unsigned rec = 0; rec < invalidCount; rec++) { + for(rec = 0; rec < invalidCount; rec++) { uint16_t* row = mat + rec * stride16; //memset(row + matWidth, 0, stride - matWidth); // not necessary, but do this to avoid uninitialized memory gf.prepare(row, row, stride); @@ -214,7 +239,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va // invert // TODO: optimise: multi-thread + packed arrangement - unsigned rec = 0; + rec = 0; #define INVERT_GROUP(rows) \ if(gfInfo.idealInputMultiple >= rows && invalidCount >= rows) { \ for(; rec <= invalidCount-rows; rec+=rows) { \ 
@@ -241,7 +266,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va if(gf.needPrepare()) { if(progressCb) progressCb(totalProgress-1, totalProgress); - for(unsigned rec = 0; rec < invalidCount; rec++) { + for(rec = 0; rec < invalidCount; rec++) { uint16_t* row = mat + rec * stride16; gf.finish(row, stride); From cddb4663dbe6b650e551a1e081240034f953206d Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 11 Jun 2023 11:36:17 +1000 Subject: [PATCH 15/91] Construct matrix for inversion via multiplication, if possible Point multiplication via SIMD is often faster than lookups for exponentiation --- gf16/gf16pmul.cpp | 28 +++++++ gf16/gf16pmul.h | 20 +++++ gf16/gf16pmul_clmul_sse.c | 90 ++++++++++++++++++++ gf16/gfmat_inv.cpp | 169 +++++++++++++++++++++++++++----------- gf16/gfmat_inv.h | 1 + 5 files changed, 258 insertions(+), 50 deletions(-) create mode 100644 gf16/gf16pmul.cpp create mode 100644 gf16/gf16pmul.h create mode 100644 gf16/gf16pmul_clmul_sse.c diff --git a/gf16/gf16pmul.cpp b/gf16/gf16pmul.cpp new file mode 100644 index 00000000..b9bb3b24 --- /dev/null +++ b/gf16/gf16pmul.cpp @@ -0,0 +1,28 @@ +#include "gf16pmul.h" +#include "../src/cpuid.h" + +Gf16PMulFunc gf16pmul = nullptr; +size_t gf16pmul_alignment = 1; +size_t gf16pmul_blocklen = 1; + +void setup_pmul() { + gf16pmul = nullptr; + gf16pmul_alignment = 1; + gf16pmul_blocklen = 1; + + // CPU detection +#ifdef PLATFORM_X86 + int cpuInfo[4]; + _cpuid(cpuInfo, 1); + bool hasClMul = ((cpuInfo[2] & 0x80202) == 0x80202); // SSE4.1 + SSSE3 + CLMUL + if(hasClMul && gf16pmul_clmul_sse_available) { + gf16pmul = &gf16pmul_clmul_sse; + gf16pmul_alignment = 16; + gf16pmul_blocklen = 16; + } else + gf16pmul_clmul_sse_available = 0; +#endif + +#ifdef PLATFORM_ARM +#endif +} diff --git a/gf16/gf16pmul.h b/gf16/gf16pmul.h new file mode 100644 index 00000000..a7819721 --- /dev/null +++ b/gf16/gf16pmul.h @@ -0,0 +1,20 @@ +#ifndef __GF16PMUL_H__ +#define __GF16PMUL_H__ + +#include 
"../src/hedley.h" +#include + +// TODO: consider multi-dest +typedef void(*Gf16PMulFunc)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); +extern Gf16PMulFunc gf16pmul; +extern size_t gf16pmul_alignment; +extern size_t gf16pmul_blocklen; + +void setup_pmul(); + +HEDLEY_BEGIN_C_DECLS +void gf16pmul_clmul_sse(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); +extern int gf16pmul_clmul_sse_available; +HEDLEY_END_C_DECLS + +#endif // defined(__GF16PMUL_H__) diff --git a/gf16/gf16pmul_clmul_sse.c b/gf16/gf16pmul_clmul_sse.c new file mode 100644 index 00000000..d7fde702 --- /dev/null +++ b/gf16/gf16pmul_clmul_sse.c @@ -0,0 +1,90 @@ +#include "gf16_global.h" + +#if defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__) +int gf16pmul_clmul_sse_available = 1; + +void gf16pmul_clmul_sse(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + assert(len % sizeof(__m128i) == 0); + + const uint8_t* _src1 = (const uint8_t*)src1 + len; + const uint8_t* _src2 = (const uint8_t*)src2 + len; + uint8_t* _dst = (uint8_t*)dst + len; + + __m128i wordMask = _mm_set1_epi32(0xffff); + __m128i shufLoHi = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100); + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m128i)) { + __m128i data1 = _mm_load_si128((__m128i*)(_src1 + ptr)); + __m128i data2 = _mm_load_si128((__m128i*)(_src2 + ptr)); + + // do multiply + __m128i data1Even = _mm_and_si128(wordMask, data1); + __m128i data1Odd = _mm_andnot_si128(wordMask, data1); + __m128i data2Even = _mm_and_si128(wordMask, data2); + __m128i data2Odd = _mm_andnot_si128(wordMask, data2); + __m128i prod1Even = _mm_clmulepi64_si128(data1Even, data2Even, 0x00); + __m128i prod2Even = _mm_clmulepi64_si128(data1Even, data2Even, 0x11); + __m128i prod1Odd = _mm_clmulepi64_si128(data1Odd, data2Odd, 0x00); + __m128i prod2Odd = _mm_clmulepi64_si128(data1Odd, data2Odd, 0x11); + __m128i prod1 = 
_mm_blend_epi16(prod1Even, prod1Odd, 0xCC); + __m128i prod2 = _mm_blend_epi16(prod2Even, prod2Odd, 0xCC); + + // do reduction + /* obvious Barret reduction strategy, using CLMUL instructions + const __m128i barretConst = _mm_set_epi32(0, 0x1100b, 0, 0x1111a); + + __m128i quot1 = _mm_srli_epi32(prod1, 16); + __m128i quot2 = _mm_srli_epi32(prod2, 16); + __m128i quot11 = _mm_clmulepi64_si128(quot1, barretConst, 0x00); + __m128i quot12 = _mm_clmulepi64_si128(quot1, barretConst, 0x01); + __m128i quot21 = _mm_clmulepi64_si128(quot2, barretConst, 0x00); + __m128i quot22 = _mm_clmulepi64_si128(quot2, barretConst, 0x01); + quot1 = _mm_unpacklo_epi64(quot11, quot12); + quot2 = _mm_unpacklo_epi64(quot21, quot22); + + quot1 = _mm_srli_epi32(quot1, 16); + quot2 = _mm_srli_epi32(quot2, 16); + quot11 = _mm_clmulepi64_si128(quot1, barretConst, 0x10); + quot12 = _mm_clmulepi64_si128(quot1, barretConst, 0x11); + quot21 = _mm_clmulepi64_si128(quot2, barretConst, 0x10); + quot22 = _mm_clmulepi64_si128(quot2, barretConst, 0x11); + quot1 = _mm_unpacklo_epi64(quot11, quot12); + quot2 = _mm_unpacklo_epi64(quot21, quot22); + + quot1 = _mm_xor_si128(quot1, prod1); + quot2 = _mm_xor_si128(quot2, prod2); + + __m128i result = _mm_packus_epi32( + _mm_and_si128(wordMask, quot1), + _mm_and_si128(wordMask, quot2) + ); + */ + + // since there aren't that many bits in the Barret constants, doing manual shift+xor is more efficient + // split low/high 16-bit parts + __m128i tmp1 = _mm_shuffle_epi8(prod1, shufLoHi); + __m128i tmp2 = _mm_shuffle_epi8(prod2, shufLoHi); + __m128i rem = _mm_unpacklo_epi64(tmp1, tmp2); + __m128i quot = _mm_unpackhi_epi64(tmp1, tmp2); + + // multiply by 0x1111a (or rather, 0x11118, since the '2' bit doesn't matter due to the product being at most 31 bits) and retain high half + tmp1 = _mm_xor_si128(quot, _mm_srli_epi16(quot, 4)); + tmp1 = _mm_xor_si128(tmp1, _mm_srli_epi16(tmp1, 8)); + quot = _mm_xor_si128(tmp1, _mm_srli_epi16(quot, 13)); + + // multiply by 0x100b, retain 
low half + tmp1 = _mm_xor_si128(quot, _mm_slli_epi16(quot, 3)); + tmp1 = _mm_xor_si128(tmp1, _mm_add_epi16(quot, quot)); + quot = _mm_xor_si128(tmp1, _mm_slli_epi16(quot, 12)); + + __m128i result = _mm_xor_si128(quot, rem); + + _mm_store_si128((__m128i*)(_dst + ptr), result); + } +} + +#else +int gf16pmul_clmul_sse_available = 0; +void gf16pmul_clmul_sse(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + UNUSED(dst); UNUSED(src1); UNUSED(src2); UNUSED(len); +} +#endif diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 7e8077cc..fc2c97d1 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -1,5 +1,7 @@ #include "gfmat_coeff.h" #include "gfmat_inv.h" +#include "gf16pmul.h" +#include #ifdef PARPAR_INVERT_SUPPORT extern "C" uint16_t* gf16_recip; @@ -155,8 +157,107 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in #undef MULADD_MULTI_LASTROW } + +// construct initial matrix (pre-inversion) +void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery) { + unsigned validCol = 0; + unsigned missingCol = validCount; + unsigned recStart = 0; + unsigned stride16 = stride / sizeof(uint16_t); + unsigned invalidCount = inputValid.size() - validCount; + if(recovery.at(0) == 0) { // first recovery having exponent 0 is a common case + for(unsigned input = 0; input < inputValid.size(); input++) { + mat[input] = 1; + } + recStart++; + } + if(recStart >= recovery.size()) return; + + + unsigned input = 0; + const unsigned GROUP_AMOUNT = 4; + #define CONSTRUCT_VIA_EXP(loopcond) \ + for(; input + GROUP_AMOUNT <= inputValid.size(); input+=GROUP_AMOUNT) { \ + uint16_t inputLog[GROUP_AMOUNT]; \ + unsigned targetCol[GROUP_AMOUNT]; \ + for(unsigned i=0; i recSkips; + recSkips.reserve(invalidCount); + recSkips.push_back(recStart); + unsigned maxSkips = invalidCount/2; // TODO: tune threshold + uint16_t lastExp = 1; + for(unsigned rec = recStart+1; rec < 
invalidCount; rec++) { + uint16_t exp = recovery.at(rec); + if(exp != lastExp+1) { + recSkips.push_back(rec); + if(recSkips.size() >= maxSkips) break; + } + lastExp = exp; + } + + if(recSkips.size() < maxSkips) { + // not many gaps - use the strategy of filling these gaps first... + CONSTRUCT_VIA_EXP(uint16_t rec : recSkips); + + // ...then compute most of the rows via multiplication + lastExp = 1; + uint16_t* src1 = mat + recStart * stride16; + for(unsigned rec = recStart+1; rec < invalidCount; rec++) { + uint16_t exp = recovery.at(rec); + bool skip = (exp != lastExp+1); + lastExp = exp; + if(skip) continue; + + gf16pmul(mat + rec * stride16, src1, mat + (rec-1) * stride16, stride); + } + + return; + } + } + } + + CONSTRUCT_VIA_EXP(unsigned rec = recStart; rec < invalidCount; rec++); + #undef CONSTRUCT_VIA_EXP +} + bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb) { - if(mat) ALIGN_FREE(mat); + unsigned invalidCount = inputValid.size() - validCount; + assert(validCount < inputValid.size()); // i.e. invalidCount > 0 + assert(inputValid.size() <= 32768 && inputValid.size() > 0); + assert(recovery.size() <= 65535 && recovery.size() > 0); + + if(invalidCount > recovery.size()) return false; + unsigned matWidth = inputValid.size() * sizeof(uint16_t); Galois16Mul gf(Galois16Mul::default_method(matWidth, inputValid.size(), inputValid.size(), true)); @@ -164,19 +265,23 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va const auto gfInfo = gf.info(); void* gfScratch = gf.mutScratch_alloc(); - unsigned invalidCount = inputValid.size() - validCount; - assert(validCount < inputValid.size()); // i.e. 
invalidCount > 0 - assert(inputValid.size() <= 32768 && inputValid.size() > 0); - assert(recovery.size() <= 65535 && recovery.size() > 0); - + if(mat) ALIGN_FREE(mat); ALIGN_ALLOC(mat, invalidCount * stride, gfInfo.alignment); - unsigned validCol, missingCol; unsigned stride16 = stride / sizeof(uint16_t); assert(stride16 * sizeof(uint16_t) == stride); uint16_t totalProgress = invalidCount + (gf.needPrepare() ? 3 : 1); // provision for prepare/finish/init-calc + // easier to handle if exponents are in order + std::sort(recovery.begin(), recovery.end()); + + static bool pmulInit = false; + if(!pmulInit) { + pmulInit = true; + setup_pmul(); + } + invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? if(invalidCount > recovery.size()) { // not enough recovery gf.mutScratch_free(gfScratch); @@ -186,43 +291,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } if(progressCb) progressCb(0, totalProgress); - - // generate matrix - validCol = 0; - missingCol = validCount; - unsigned rec, recStart = 0; - if(recovery.at(0) == 0) { // first recovery has exponent 0 is a common case - for(unsigned input = 0; input < inputValid.size(); input++) { - mat[input] = 1; - } - recStart++; - } - { - unsigned input = 0; - const unsigned GROUP_AMOUNT = 4; - for(; input + GROUP_AMOUNT <= inputValid.size(); input+=GROUP_AMOUNT) { - uint16_t inputLog[GROUP_AMOUNT]; - unsigned targetCol[GROUP_AMOUNT]; - for(unsigned i=0; i& inputValid, unsigned va if(progressCb) progressCb(1, totalProgress); progressOffset = 2; - for(rec = 0; rec < invalidCount; rec++) { - uint16_t* row = mat + rec * stride16; + for(unsigned r = 0; r < invalidCount; r++) { + uint16_t* row = mat + r * stride16; //memset(row + matWidth, 0, stride - matWidth); // not necessary, but do this to avoid uninitialized memory gf.prepare(row, row, stride); } @@ -239,7 +308,7 @@ bool Galois16RecMatrix::Compute(const std::vector& 
inputValid, unsigned va // invert // TODO: optimise: multi-thread + packed arrangement - rec = 0; + unsigned rec = 0; #define INVERT_GROUP(rows) \ if(gfInfo.idealInputMultiple >= rows && invalidCount >= rows) { \ for(; rec <= invalidCount-rows; rec+=rows) { \ @@ -266,15 +335,15 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va if(gf.needPrepare()) { if(progressCb) progressCb(totalProgress-1, totalProgress); - for(rec = 0; rec < invalidCount; rec++) { - uint16_t* row = mat + rec * stride16; + for(unsigned r = 0; r < invalidCount; r++) { + uint16_t* row = mat + r * stride16; gf.finish(row, stride); /* // check for zeroes; TODO: does this need to be the full row? for(unsigned col = validCount; col < inputValid.size(); col++) { if(HEDLEY_UNLIKELY(row[col] == 0)) { // bad coeff - recovery.erase(recovery.begin() + rec); + recovery.erase(recovery.begin() + r); goto invert_loop; } } diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index ee29911c..b5290900 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -11,6 +11,7 @@ class Galois16RecMatrix { uint16_t* mat; unsigned stride; + void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); template int processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch); public: From 9686b903498d04dc7cffd8db801ba526fb65254a Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 11 Jun 2023 21:01:46 +1000 Subject: [PATCH 16/91] Add AVX2+VPCLMUL+GFNI variants for PointMultiply --- gf16/gf16pmul.cpp | 43 ++++++- gf16/gf16pmul.h | 12 +- gf16/gf16pmul_clmul_avx2.c | 12 ++ gf16/gf16pmul_clmul_sse.c | 90 -------------- gf16/gf16pmul_clmul_vpclgfni.c | 15 +++ gf16/gf16pmul_clmul_vpclmul.c | 14 +++ gf16/gf16pmul_clmul_x86.h | 213 +++++++++++++++++++++++++++++++++ 7 files changed, 304 insertions(+), 95 deletions(-) create mode 100644 gf16/gf16pmul_clmul_avx2.c delete mode 100644 gf16/gf16pmul_clmul_sse.c create mode 100644 
gf16/gf16pmul_clmul_vpclgfni.c create mode 100644 gf16/gf16pmul_clmul_vpclmul.c create mode 100644 gf16/gf16pmul_clmul_x86.h diff --git a/gf16/gf16pmul.cpp b/gf16/gf16pmul.cpp index b9bb3b24..0e1e99a9 100644 --- a/gf16/gf16pmul.cpp +++ b/gf16/gf16pmul.cpp @@ -13,14 +13,51 @@ void setup_pmul() { // CPU detection #ifdef PLATFORM_X86 int cpuInfo[4]; + int cpuInfoX[4]; _cpuid(cpuInfo, 1); bool hasClMul = ((cpuInfo[2] & 0x80202) == 0x80202); // SSE4.1 + SSSE3 + CLMUL - if(hasClMul && gf16pmul_clmul_sse_available) { + bool hasAVX2 = false, hasVPCLMUL = false, hasGFNI = false; + +#if !defined(_MSC_VER) || _MSC_VER >= 1600 + _cpuidX(cpuInfoX, 7, 0); + if((cpuInfo[2] & 0x1C000000) == 0x1C000000) { // has AVX + OSXSAVE + XSAVE + int xcr = _GET_XCR() & 0xff; + if((xcr & 6) == 6) { // AVX enabled + hasAVX2 = cpuInfoX[1] & 0x20; + hasVPCLMUL = hasAVX2 && (cpuInfoX[2] & 0x400); + } + } + hasGFNI = (cpuInfoX[2] & 0x100) == 0x100; +#endif + + if(!hasGFNI) gf16pmul_clmul_available_vpclgfni = 0; + if(!hasVPCLMUL) { + gf16pmul_clmul_available_vpclmul = 0; + gf16pmul_clmul_available_vpclgfni = 0; + } + if(!hasAVX2) gf16pmul_clmul_available_avx2 = 0; + if(!hasClMul) gf16pmul_clmul_available_sse = 0; + + if(gf16pmul_clmul_available_vpclgfni) { + gf16pmul = &gf16pmul_clmul_vpclgfni; + gf16pmul_alignment = 32; + gf16pmul_blocklen = 64; + } + else if(gf16pmul_clmul_available_vpclmul) { + gf16pmul = &gf16pmul_clmul_vpclmul; + gf16pmul_alignment = 32; + gf16pmul_blocklen = 32; + } + else if(gf16pmul_clmul_available_avx2) { + gf16pmul = &gf16pmul_clmul_avx2; + gf16pmul_alignment = 32; + gf16pmul_blocklen = 32; + } + else if(gf16pmul_clmul_available_sse) { gf16pmul = &gf16pmul_clmul_sse; gf16pmul_alignment = 16; gf16pmul_blocklen = 16; - } else - gf16pmul_clmul_sse_available = 0; + } #endif #ifdef PLATFORM_ARM diff --git a/gf16/gf16pmul.h b/gf16/gf16pmul.h index a7819721..ac472f12 100644 --- a/gf16/gf16pmul.h +++ b/gf16/gf16pmul.h @@ -13,8 +13,16 @@ extern size_t gf16pmul_blocklen; void 
setup_pmul(); HEDLEY_BEGIN_C_DECLS -void gf16pmul_clmul_sse(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); -extern int gf16pmul_clmul_sse_available; +#define _PMUL_DECL(f) \ + void gf16pmul_clmul_##f(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); \ + extern int gf16pmul_clmul_available_##f + +_PMUL_DECL(sse); +_PMUL_DECL(avx2); +_PMUL_DECL(vpclmul); +_PMUL_DECL(vpclgfni); + +#undef _PMUL_DECL HEDLEY_END_C_DECLS #endif // defined(__GF16PMUL_H__) diff --git a/gf16/gf16pmul_clmul_avx2.c b/gf16/gf16pmul_clmul_avx2.c new file mode 100644 index 00000000..ce965f4c --- /dev/null +++ b/gf16/gf16pmul_clmul_avx2.c @@ -0,0 +1,12 @@ +#include "../src/platform.h" + +#define _mword __m256i +#define _MM(f) _mm256_ ## f +#define _MMI(f) _mm256_ ## f ## _si256 +#define MWORD_SIZE 32 +#define _FNSUFFIX _avx2 + +#if defined(__PCLMUL__) && defined(__AVX2__) +# define _AVAILABLE 1 +#endif +#include "gf16pmul_clmul_x86.h" diff --git a/gf16/gf16pmul_clmul_sse.c b/gf16/gf16pmul_clmul_sse.c deleted file mode 100644 index d7fde702..00000000 --- a/gf16/gf16pmul_clmul_sse.c +++ /dev/null @@ -1,90 +0,0 @@ -#include "gf16_global.h" - -#if defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__) -int gf16pmul_clmul_sse_available = 1; - -void gf16pmul_clmul_sse(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { - assert(len % sizeof(__m128i) == 0); - - const uint8_t* _src1 = (const uint8_t*)src1 + len; - const uint8_t* _src2 = (const uint8_t*)src2 + len; - uint8_t* _dst = (uint8_t*)dst + len; - - __m128i wordMask = _mm_set1_epi32(0xffff); - __m128i shufLoHi = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100); - for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m128i)) { - __m128i data1 = _mm_load_si128((__m128i*)(_src1 + ptr)); - __m128i data2 = _mm_load_si128((__m128i*)(_src2 + ptr)); - - // do multiply - __m128i data1Even = _mm_and_si128(wordMask, data1); - __m128i 
data1Odd = _mm_andnot_si128(wordMask, data1); - __m128i data2Even = _mm_and_si128(wordMask, data2); - __m128i data2Odd = _mm_andnot_si128(wordMask, data2); - __m128i prod1Even = _mm_clmulepi64_si128(data1Even, data2Even, 0x00); - __m128i prod2Even = _mm_clmulepi64_si128(data1Even, data2Even, 0x11); - __m128i prod1Odd = _mm_clmulepi64_si128(data1Odd, data2Odd, 0x00); - __m128i prod2Odd = _mm_clmulepi64_si128(data1Odd, data2Odd, 0x11); - __m128i prod1 = _mm_blend_epi16(prod1Even, prod1Odd, 0xCC); - __m128i prod2 = _mm_blend_epi16(prod2Even, prod2Odd, 0xCC); - - // do reduction - /* obvious Barret reduction strategy, using CLMUL instructions - const __m128i barretConst = _mm_set_epi32(0, 0x1100b, 0, 0x1111a); - - __m128i quot1 = _mm_srli_epi32(prod1, 16); - __m128i quot2 = _mm_srli_epi32(prod2, 16); - __m128i quot11 = _mm_clmulepi64_si128(quot1, barretConst, 0x00); - __m128i quot12 = _mm_clmulepi64_si128(quot1, barretConst, 0x01); - __m128i quot21 = _mm_clmulepi64_si128(quot2, barretConst, 0x00); - __m128i quot22 = _mm_clmulepi64_si128(quot2, barretConst, 0x01); - quot1 = _mm_unpacklo_epi64(quot11, quot12); - quot2 = _mm_unpacklo_epi64(quot21, quot22); - - quot1 = _mm_srli_epi32(quot1, 16); - quot2 = _mm_srli_epi32(quot2, 16); - quot11 = _mm_clmulepi64_si128(quot1, barretConst, 0x10); - quot12 = _mm_clmulepi64_si128(quot1, barretConst, 0x11); - quot21 = _mm_clmulepi64_si128(quot2, barretConst, 0x10); - quot22 = _mm_clmulepi64_si128(quot2, barretConst, 0x11); - quot1 = _mm_unpacklo_epi64(quot11, quot12); - quot2 = _mm_unpacklo_epi64(quot21, quot22); - - quot1 = _mm_xor_si128(quot1, prod1); - quot2 = _mm_xor_si128(quot2, prod2); - - __m128i result = _mm_packus_epi32( - _mm_and_si128(wordMask, quot1), - _mm_and_si128(wordMask, quot2) - ); - */ - - // since there aren't that many bits in the Barret constants, doing manual shift+xor is more efficient - // split low/high 16-bit parts - __m128i tmp1 = _mm_shuffle_epi8(prod1, shufLoHi); - __m128i tmp2 = 
_mm_shuffle_epi8(prod2, shufLoHi); - __m128i rem = _mm_unpacklo_epi64(tmp1, tmp2); - __m128i quot = _mm_unpackhi_epi64(tmp1, tmp2); - - // multiply by 0x1111a (or rather, 0x11118, since the '2' bit doesn't matter due to the product being at most 31 bits) and retain high half - tmp1 = _mm_xor_si128(quot, _mm_srli_epi16(quot, 4)); - tmp1 = _mm_xor_si128(tmp1, _mm_srli_epi16(tmp1, 8)); - quot = _mm_xor_si128(tmp1, _mm_srli_epi16(quot, 13)); - - // multiply by 0x100b, retain low half - tmp1 = _mm_xor_si128(quot, _mm_slli_epi16(quot, 3)); - tmp1 = _mm_xor_si128(tmp1, _mm_add_epi16(quot, quot)); - quot = _mm_xor_si128(tmp1, _mm_slli_epi16(quot, 12)); - - __m128i result = _mm_xor_si128(quot, rem); - - _mm_store_si128((__m128i*)(_dst + ptr), result); - } -} - -#else -int gf16pmul_clmul_sse_available = 0; -void gf16pmul_clmul_sse(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { - UNUSED(dst); UNUSED(src1); UNUSED(src2); UNUSED(len); -} -#endif diff --git a/gf16/gf16pmul_clmul_vpclgfni.c b/gf16/gf16pmul_clmul_vpclgfni.c new file mode 100644 index 00000000..26ad0478 --- /dev/null +++ b/gf16/gf16pmul_clmul_vpclgfni.c @@ -0,0 +1,15 @@ +#include "../src/platform.h" + +#define _mword __m256i +#define _MM(f) _mm256_ ## f +#define _MMI(f) _mm256_ ## f ## _si256 +#define MWORD_SIZE 32 +#define _FNSUFFIX _vpclgfni + +#define _USE_VPCLMUL 1 +#define _USE_GFNI 1 + +#if defined(__VPCLMULQDQ__) && defined(__GFNI__) && defined(__AVX2__) +# define _AVAILABLE 1 +#endif +#include "gf16pmul_clmul_x86.h" diff --git a/gf16/gf16pmul_clmul_vpclmul.c b/gf16/gf16pmul_clmul_vpclmul.c new file mode 100644 index 00000000..715544a9 --- /dev/null +++ b/gf16/gf16pmul_clmul_vpclmul.c @@ -0,0 +1,14 @@ +#include "../src/platform.h" + +#define _mword __m256i +#define _MM(f) _mm256_ ## f +#define _MMI(f) _mm256_ ## f ## _si256 +#define MWORD_SIZE 32 +#define _FNSUFFIX _vpclmul + +#define _USE_VPCLMUL 1 + +#if defined(__VPCLMULQDQ__) && defined(__AVX2__) +# define _AVAILABLE 1 
+#endif +#include "gf16pmul_clmul_x86.h" diff --git a/gf16/gf16pmul_clmul_x86.h b/gf16/gf16pmul_clmul_x86.h new file mode 100644 index 00000000..03589049 --- /dev/null +++ b/gf16/gf16pmul_clmul_x86.h @@ -0,0 +1,213 @@ +#include "gf16_global.h" + +#if defined(_AVAILABLE) +int _FN(gf16pmul_clmul_available) = 1; + +static HEDLEY_ALWAYS_INLINE void _FN(gf16pmul_clmul_initmul)(const _mword* src1, const _mword* src2, _mword* prod1, _mword* prod2) { + _mword wordMask = _MM(set1_epi32)(0xffff); + + _mword data1 = _MMI(load)(src1); + _mword data2 = _MMI(load)(src2); + + // do multiply + _mword data1Even = _MMI(and)(wordMask, data1); + _mword data1Odd = _MMI(andnot)(wordMask, data1); + _mword data2Even = _MMI(and)(wordMask, data2); + _mword data2Odd = _MMI(andnot)(wordMask, data2); +#if MWORD_SIZE == 32 && !defined(_USE_VPCLMUL) + __m128i data1EvenA = _mm256_castsi256_si128(data1Even); + __m128i data1EvenB = _mm256_extracti128_si256(data1Even, 1); + __m128i data1OddA = _mm256_castsi256_si128(data1Odd); + __m128i data1OddB = _mm256_extracti128_si256(data1Odd, 1); + __m128i data2EvenA = _mm256_castsi256_si128(data2Even); + __m128i data2EvenB = _mm256_extracti128_si256(data2Even, 1); + __m128i data2OddA = _mm256_castsi256_si128(data2Odd); + __m128i data2OddB = _mm256_extracti128_si256(data2Odd, 1); + + __m128i prod1EvenA = _mm_clmulepi64_si128(data1EvenA, data2EvenA, 0x00); + __m128i prod1EvenB = _mm_clmulepi64_si128(data1EvenB, data2EvenB, 0x00); + __m128i prod2EvenA = _mm_clmulepi64_si128(data1EvenA, data2EvenA, 0x11); + __m128i prod2EvenB = _mm_clmulepi64_si128(data1EvenB, data2EvenB, 0x11); + __m128i prod1OddA = _mm_clmulepi64_si128(data1OddA, data2OddA, 0x00); + __m128i prod1OddB = _mm_clmulepi64_si128(data1OddB, data2OddB, 0x00); + __m128i prod2OddA = _mm_clmulepi64_si128(data1OddA, data2OddA, 0x11); + __m128i prod2OddB = _mm_clmulepi64_si128(data1OddB, data2OddB, 0x11); + + __m128i prod1A = _mm_blend_epi16(prod1EvenA, prod1OddA, 0xCC); + __m128i prod1B = 
_mm_blend_epi16(prod1EvenB, prod1OddB, 0xCC); + __m128i prod2A = _mm_blend_epi16(prod2EvenA, prod2OddA, 0xCC); + __m128i prod2B = _mm_blend_epi16(prod2EvenB, prod2OddB, 0xCC); + *prod1 = _mm256_inserti128_si256(_mm256_castsi128_si256(prod1A), prod1B, 1); + *prod2 = _mm256_inserti128_si256(_mm256_castsi128_si256(prod2A), prod2B, 1); +#else +# if MWORD_SIZE == 16 + _mword prod1Even = _mm_clmulepi64_si128(data1Even, data2Even, 0x00); + _mword prod2Even = _mm_clmulepi64_si128(data1Even, data2Even, 0x11); + _mword prod1Odd = _mm_clmulepi64_si128(data1Odd, data2Odd, 0x00); + _mword prod2Odd = _mm_clmulepi64_si128(data1Odd, data2Odd, 0x11); +# else + _mword prod1Even = _MM(clmulepi64_epi128)(data1Even, data2Even, 0x00); + _mword prod2Even = _MM(clmulepi64_epi128)(data1Even, data2Even, 0x11); + _mword prod1Odd = _MM(clmulepi64_epi128)(data1Odd, data2Odd, 0x00); + _mword prod2Odd = _MM(clmulepi64_epi128)(data1Odd, data2Odd, 0x11); +# endif + *prod1 = _MM(blend_epi16)(prod1Even, prod1Odd, 0xCC); + *prod2 = _MM(blend_epi16)(prod2Even, prod2Odd, 0xCC); +#endif +} + +void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + assert(len % sizeof(_mword) == 0); + + const uint8_t* _src1 = (const uint8_t*)src1 + len; + const uint8_t* _src2 = (const uint8_t*)src2 + len; + uint8_t* _dst = (uint8_t*)dst + len; + + _mword shufLoHi = _MM(set_epi16)( +#if MWORD_SIZE >= 32 + 0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100, +#endif + 0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100 + ); + +#ifdef _USE_GFNI + assert(len % (sizeof(_mword)*2) == 0); + _mword shufBLoHi = _MM(set_epi8)( +# if MWORD_SIZE >= 32 + 15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0, +# endif + 15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0 + ); + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(_mword)*2) { + _mword prod1, prod2, prod3, prod4; + _FN(gf16pmul_clmul_initmul)((_mword*)(_src1 + ptr), (_mword*)(_src2 + ptr), &prod1, &prod2); + 
_FN(gf16pmul_clmul_initmul)((_mword*)(_src1 + ptr) +1, (_mword*)(_src2 + ptr) +1, &prod3, &prod4); + + // split low/high + _mword tmp1 = _MM(shuffle_epi8)(prod1, shufLoHi); + _mword tmp2 = _MM(shuffle_epi8)(prod2, shufLoHi); + _mword rem1 = _MM(unpacklo_epi64)(tmp1, tmp2); + _mword quot1 = _MM(unpackhi_epi64)(tmp1, tmp2); + tmp1 = _MM(shuffle_epi8)(prod3, shufLoHi); + tmp2 = _MM(shuffle_epi8)(prod4, shufLoHi); + _mword rem2 = _MM(unpacklo_epi64)(tmp1, tmp2); + _mword quot2 = _MM(unpackhi_epi64)(tmp1, tmp2); + + // split quot into bytes + tmp1 = _MM(shuffle_epi8)(quot1, shufBLoHi); + tmp2 = _MM(shuffle_epi8)(quot2, shufBLoHi); + quot1 = _MM(unpacklo_epi64)(tmp1, tmp2); + quot2 = _MM(unpackhi_epi64)(tmp1, tmp2); + + // do reduction + tmp2 = _MMI(xor)( + _MM(gf2p8affine_epi64_epi8)(quot2, _MM(set1_epi64x)(0xbb77eedd0b162c58), 0), + _MM(gf2p8affine_epi64_epi8)(quot1, _MM(set1_epi64x)(0xa040800011224488), 0) + ); + tmp1 = _MMI(xor)( + _MM(gf2p8affine_epi64_epi8)(quot2, _MM(set1_epi64x)(0xb1d3a6fdfbf7eedd), 0), + _MM(gf2p8affine_epi64_epi8)(quot1, _MM(set1_epi64x)(0x113366ddba74e8d0), 0) + ); + + /* mappings for above affine matrices: (tmp1 = bottom, tmp2 = top) + * Mul by 0x1111a + * top->top: top ^ top>>4 + * top->bot: top ^ top>>4 ^ top<<4 ^ top>>5 ^ top>>7 + * bot->bot: bot ^ bot>>4 + * Mul by 0x100b + * top->top: top ^ top<<1 ^ top<<3 + * bot->top: bot>>7 ^ bot>>5 ^ bot<<4 + * bot->bot: bot ^ bot<<1 ^ bot<<3 + * Together: + * top->top: + * b = top ^ top<<4 ^ top>>4 ^ top>>5 ^ top>>7 + * top ^= top>>4 + * top ^= top<<1 ^ top<<3 + * top ^= b>>7 ^ b>>5 ^ b<<4 + * top->bot: + * bot = top ^ top<<4 ^ top>>4 ^ top>>5 ^ top>>7 + * bot ^= bot<<1 ^ bot<<3 + * bot->top: + * b = bot ^ bot>>4 + * top = b>>7 ^ b>>5 ^ b<<4 + * bot->bot: + * bot ^= bot>>4 + * bot ^= bot<<1 ^ bot<<3 + */ + + // join together + quot1 = _MM(unpacklo_epi8)(tmp1, tmp2); + quot2 = _MM(unpackhi_epi8)(tmp1, tmp2); + + // xor with rem + quot1 = _MMI(xor)(quot1, rem1); + quot2 = _MMI(xor)(quot2, rem2); + + 
_MMI(store)((_mword*)(_dst + ptr), quot1); + _MMI(store)((_mword*)(_dst + ptr) + 1, quot2); + } +#else + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(_mword)) { + _mword prod1, prod2; + _FN(gf16pmul_clmul_initmul)((_mword*)(_src1 + ptr), (_mword*)(_src2 + ptr), &prod1, &prod2); + + // do reduction + /* obvious Barret reduction strategy, using CLMUL instructions + const __m128i barretConst = _mm_set_epi32(0, 0x1100b, 0, 0x1111a); + + __m128i quot1 = _mm_srli_epi32(prod1, 16); + __m128i quot2 = _mm_srli_epi32(prod2, 16); + __m128i quot11 = _mm_clmulepi64_si128(quot1, barretConst, 0x00); + __m128i quot12 = _mm_clmulepi64_si128(quot1, barretConst, 0x01); + __m128i quot21 = _mm_clmulepi64_si128(quot2, barretConst, 0x00); + __m128i quot22 = _mm_clmulepi64_si128(quot2, barretConst, 0x01); + quot1 = _mm_unpacklo_epi64(quot11, quot12); + quot2 = _mm_unpacklo_epi64(quot21, quot22); + + quot1 = _mm_srli_epi32(quot1, 16); + quot2 = _mm_srli_epi32(quot2, 16); + quot11 = _mm_clmulepi64_si128(quot1, barretConst, 0x10); + quot12 = _mm_clmulepi64_si128(quot1, barretConst, 0x11); + quot21 = _mm_clmulepi64_si128(quot2, barretConst, 0x10); + quot22 = _mm_clmulepi64_si128(quot2, barretConst, 0x11); + quot1 = _mm_unpacklo_epi64(quot11, quot12); + quot2 = _mm_unpacklo_epi64(quot21, quot22); + + quot1 = _mm_xor_si128(quot1, prod1); + quot2 = _mm_xor_si128(quot2, prod2); + + __m128i result = _mm_packus_epi32( + _mm_and_si128(wordMask, quot1), + _mm_and_si128(wordMask, quot2) + ); + */ + + // since there aren't that many bits in the Barret constants, doing manual shift+xor is more efficient + // split low/high 16-bit parts + _mword tmp1 = _MM(shuffle_epi8)(prod1, shufLoHi); + _mword tmp2 = _MM(shuffle_epi8)(prod2, shufLoHi); + _mword rem = _MM(unpacklo_epi64)(tmp1, tmp2); + _mword quot = _MM(unpackhi_epi64)(tmp1, tmp2); + + // multiply by 0x1111a (or rather, 0x11118, since the '2' bit doesn't matter due to the product being at most 31 bits) and retain high half + tmp1 = 
_MMI(xor)(quot, _MM(srli_epi16)(quot, 4)); + tmp1 = _MMI(xor)(tmp1, _MM(srli_epi16)(tmp1, 8)); + quot = _MMI(xor)(tmp1, _MM(srli_epi16)(quot, 13)); + + // multiply by 0x100b, retain low half + tmp1 = _MMI(xor)(quot, _MM(slli_epi16)(quot, 3)); + tmp1 = _MMI(xor)(tmp1, _MM(add_epi16)(quot, quot)); + quot = _MMI(xor)(tmp1, _MM(slli_epi16)(quot, 12)); + + _mword result = _MMI(xor)(quot, rem); + _MMI(store)((_mword*)(_dst + ptr), result); + } +#endif +} + +#else +int _FN(gf16pmul_clmul_available) = 0; +void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + UNUSED(dst); UNUSED(src1); UNUSED(src2); UNUSED(len); +} +#endif From 43b6d41159ac258076c2911b9117add1a1a1481d Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 11 Jun 2023 21:02:27 +1000 Subject: [PATCH 17/91] Missed in last commit --- gf16/gf16pmul_clmul_sse.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 gf16/gf16pmul_clmul_sse.c diff --git a/gf16/gf16pmul_clmul_sse.c b/gf16/gf16pmul_clmul_sse.c new file mode 100644 index 00000000..5338858d --- /dev/null +++ b/gf16/gf16pmul_clmul_sse.c @@ -0,0 +1,12 @@ +#include "../src/platform.h" + +#define _mword __m128i +#define _MM(f) _mm_ ## f +#define _MMI(f) _mm_ ## f ## _si128 +#define MWORD_SIZE 16 +#define _FNSUFFIX _sse + +#if defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__) +# define _AVAILABLE 1 +#endif +#include "gf16pmul_clmul_x86.h" From 84bef8eda29aec56d650d9de2661158c937860f8 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 12 Jun 2023 20:35:07 +1000 Subject: [PATCH 18/91] Make VPCLMUL PointMultiply compatible with AVX-512, should it ever be useful --- gf16/gf16pmul_clmul_x86.h | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/gf16/gf16pmul_clmul_x86.h b/gf16/gf16pmul_clmul_x86.h index 03589049..9dfa801e 100644 --- a/gf16/gf16pmul_clmul_x86.h +++ b/gf16/gf16pmul_clmul_x86.h @@ -51,8 +51,13 @@ static 
HEDLEY_ALWAYS_INLINE void _FN(gf16pmul_clmul_initmul)(const _mword* src1, _mword prod1Odd = _MM(clmulepi64_epi128)(data1Odd, data2Odd, 0x00); _mword prod2Odd = _MM(clmulepi64_epi128)(data1Odd, data2Odd, 0x11); # endif +# if MWORD_SIZE >= 64 + *prod1 = _MM(mask_blend_epi32)(0xAAAA, prod1Even, prod1Odd); + *prod2 = _MM(mask_blend_epi32)(0xAAAA, prod2Even, prod2Odd); +# else *prod1 = _MM(blend_epi16)(prod1Even, prod1Odd, 0xCC); *prod2 = _MM(blend_epi16)(prod2Even, prod2Odd, 0xCC); +# endif #endif } @@ -63,21 +68,29 @@ void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void const uint8_t* _src2 = (const uint8_t*)src2 + len; uint8_t* _dst = (uint8_t*)dst + len; +#if MWORD_SIZE >= 64 + _mword shufLoHi = _MM(set4_epi32)(0x0f0e0b0a, 0x07060302, 0x0d0c0908, 0x05040100); +#else _mword shufLoHi = _MM(set_epi16)( -#if MWORD_SIZE >= 32 +# if MWORD_SIZE >= 32 0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100, -#endif +# endif 0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100 ); +#endif #ifdef _USE_GFNI assert(len % (sizeof(_mword)*2) == 0); +# if MWORD_SIZE >= 64 + _mword shufBLoHi = _MM(set4_epi32)(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200); +# else _mword shufBLoHi = _MM(set_epi8)( -# if MWORD_SIZE >= 32 +# if MWORD_SIZE >= 32 15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0, -# endif +# endif 15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0 ); +# endif for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(_mword)*2) { _mword prod1, prod2, prod3, prod4; _FN(gf16pmul_clmul_initmul)((_mword*)(_src1 + ptr), (_mword*)(_src2 + ptr), &prod1, &prod2); @@ -100,14 +113,20 @@ void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void quot2 = _MM(unpackhi_epi64)(tmp1, tmp2); // do reduction + #if MWORD_SIZE >= 64 + # define SET1_EPI64 _MM(set1_epi64) + #else + # define SET1_EPI64 _MM(set1_epi64x) + #endif tmp2 = _MMI(xor)( - _MM(gf2p8affine_epi64_epi8)(quot2, _MM(set1_epi64x)(0xbb77eedd0b162c58), 0), - 
_MM(gf2p8affine_epi64_epi8)(quot1, _MM(set1_epi64x)(0xa040800011224488), 0) + _MM(gf2p8affine_epi64_epi8)(quot2, SET1_EPI64(0xbb77eedd0b162c58), 0), + _MM(gf2p8affine_epi64_epi8)(quot1, SET1_EPI64(0xa040800011224488), 0) ); tmp1 = _MMI(xor)( - _MM(gf2p8affine_epi64_epi8)(quot2, _MM(set1_epi64x)(0xb1d3a6fdfbf7eedd), 0), - _MM(gf2p8affine_epi64_epi8)(quot1, _MM(set1_epi64x)(0x113366ddba74e8d0), 0) + _MM(gf2p8affine_epi64_epi8)(quot2, SET1_EPI64(0xb1d3a6fdfbf7eedd), 0), + _MM(gf2p8affine_epi64_epi8)(quot1, SET1_EPI64(0x113366ddba74e8d0), 0) ); + #undef SET1_EPI64 /* mappings for above affine matrices: (tmp1 = bottom, tmp2 = top) * Mul by 0x1111a From 19195b904ea27ce40373434b4bb9ff1b5b47f591 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 12 Jun 2023 20:46:36 +1000 Subject: [PATCH 19/91] Prep support for EOR3 in ClMul NEON --- gf16/gf16_clmul_neon.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/gf16/gf16_clmul_neon.c b/gf16/gf16_clmul_neon.c index 7d73a938..1f301a79 100644 --- a/gf16/gf16_clmul_neon.c +++ b/gf16/gf16_clmul_neon.c @@ -3,7 +3,6 @@ #include "gf16_muladd_multi.h" // TODO: for any multiplicand byte that's 0 (e.g. 
for coeff < 256), can shortcut a bunch of stuff, but may not be worth the effort -// can also look at BCAX/EOR3 from SHA3 if bored; SVE2 implementation can also use XAR #if defined(__ARM_NEON) @@ -47,6 +46,7 @@ typedef poly8x8_t coeff_t; # define coeff_fn(f1, f2) f1##_##f2 #endif +// NOTE: we avoid EOR3 in pmacl* - only chip which supports NEON-SHA3 without SVE2, are the Apple chips and Neoverse V1; the former has PMULL+EOR fusion, which is better than EOR3 #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) && defined(__APPLE__) // Apple M1 supports fusing PMULL+EOR, so ensure these are paired static HEDLEY_ALWAYS_INLINE poly16x8_t pmacl_low(poly16x8_t sum, poly8x16_t a, poly8x16_t b) { @@ -113,6 +113,10 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round(const void* src, poly16x8 *high2 = pmacl_high(*high2, data.val[1], coeff[1]); } +static HEDLEY_ALWAYS_INLINE uint8x16_t eor3q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { + return veorq_u8(a, veorq_u8(b, c)); +} + static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, poly16x8_t low2, poly16x8_t mid1, poly16x8_t mid2, poly16x8_t* high1, poly16x8_t high2) { // put data in proper form uint8x16x2_t hibytes = vuzpq_u8(vreinterpretq_u8_p16(*high1), vreinterpretq_u8_p16(high2)); @@ -121,8 +125,8 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, pol // merge mid into high/low uint8x16x2_t midbytes = vuzpq_u8(vreinterpretq_u8_p16(mid1), vreinterpretq_u8_p16(mid2)); uint8x16_t libytes = veorq_u8(hibytes.val[0], lobytes.val[1]); - lobytes.val[1] = veorq_u8(libytes, veorq_u8(lobytes.val[0], midbytes.val[0])); - hibytes.val[0] = veorq_u8(libytes, veorq_u8(hibytes.val[1], midbytes.val[1])); + lobytes.val[1] = eor3q_u8(libytes, lobytes.val[0], midbytes.val[0]); + hibytes.val[0] = eor3q_u8(libytes, hibytes.val[1], midbytes.val[1]); // Barrett reduction @@ -130,7 +134,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, pol 
// multiply hibytes by 0x11100 uint8x16_t highest_nibble = vshrq_n_u8(hibytes.val[1], 4); uint8x16_t th0 = vsriq_n_u8(vshlq_n_u8(hibytes.val[1], 4), hibytes.val[0], 4); - th0 = veorq_u8(th0, veorq_u8(hibytes.val[0], hibytes.val[1])); + th0 = eor3q_u8(th0, hibytes.val[0], hibytes.val[1]); uint8x16_t th1 = veorq_u8(hibytes.val[1], highest_nibble); // subsequent polynomial multiplication doesn't need the low bits of th0 to be correct, so trim these now for a shorter dep chain @@ -154,11 +158,11 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, pol poly8x16_t redL = vdupq_n_p8(0x0b); hibytes.val[1] = veorq_u8(th0_hi3, th0_hi1); hibytes.val[1] = vsliq_n_u8(hibytes.val[1], th0, 4); - lobytes.val[1] = veorq_u8(lobytes.val[1], vreinterpretq_u8_p8(vmulq_p8(vreinterpretq_p8_u8(th1), redL))); + th1 = vreinterpretq_u8_p8(vmulq_p8(vreinterpretq_p8_u8(th1), redL)); hibytes.val[0] = vreinterpretq_u8_p8(vmulq_p8(vreinterpretq_p8_u8(th0), redL)); *low1 = vreinterpretq_p16_u8(veorq_u8(lobytes.val[0], hibytes.val[0])); - *high1 = vreinterpretq_p16_u8(veorq_u8(lobytes.val[1], hibytes.val[1])); + *high1 = vreinterpretq_p16_u8(eor3q_u8(hibytes.val[1], lobytes.val[1], th1)); } #ifdef __aarch64__ From 908b74b0a9ad4dcb3e1ae35b49bd1fbb9835a54e Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 12 Jun 2023 22:08:44 +1000 Subject: [PATCH 20/91] Add NEON+SVE2 versions of PointMultiply --- gf16/gf16_clmul_neon.c | 93 +----------------------------------- gf16/gf16_clmul_neon.h | 97 ++++++++++++++++++++++++++++++++++++++ gf16/gf16_clmul_sve2.c | 49 +------------------ gf16/gf16_clmul_sve2.h | 52 ++++++++++++++++++++ gf16/gf16pmul.cpp | 13 +++++ gf16/gf16pmul.h | 5 ++ gf16/gf16pmul_clmul_neon.c | 39 +++++++++++++++ gf16/gf16pmul_clmul_sve2.c | 44 +++++++++++++++++ 8 files changed, 252 insertions(+), 140 deletions(-) create mode 100644 gf16/gf16_clmul_neon.h create mode 100644 gf16/gf16_clmul_sve2.h create mode 100644 gf16/gf16pmul_clmul_neon.c create mode 100644 
gf16/gf16pmul_clmul_sve2.c diff --git a/gf16/gf16_clmul_neon.c b/gf16/gf16_clmul_neon.c index 1f301a79..95a89cf2 100644 --- a/gf16/gf16_clmul_neon.c +++ b/gf16/gf16_clmul_neon.c @@ -1,51 +1,11 @@ -#include "gf16_neon_common.h" +#include "gf16_clmul_neon.h" #include "gf16_muladd_multi.h" // TODO: for any multiplicand byte that's 0 (e.g. for coeff < 256), can shortcut a bunch of stuff, but may not be worth the effort #if defined(__ARM_NEON) -// `vaddq_p8` and co seems to be missing from some compilers (like GCC), so define our own variant -static HEDLEY_ALWAYS_INLINE poly8x16_t veorq_p8(poly8x16_t a, poly8x16_t b) { - return vreinterpretq_p8_u8(veorq_u8(vreinterpretq_u8_p8(a), vreinterpretq_u8_p8(b))); -} - -#ifdef __aarch64__ -typedef poly8x16_t coeff_t; -# if defined(__GNUC__) || defined(__clang__) -// because GCC/CLang doesn't seem to handle these cases well, explicitly tell them what to do -static HEDLEY_ALWAYS_INLINE poly16x8_t pmull_low(poly8x16_t a, poly8x16_t b) { - poly16x8_t result; - __asm__ ("pmull %0.8h,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} -static HEDLEY_ALWAYS_INLINE poly16x8_t pmull_high(poly8x16_t a, poly8x16_t b) { - poly16x8_t result; - __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} -# else -# define pmull_low(x, y) vmull_p8(vget_low_p8(x), vget_low_p8(y)) -# define pmull_high vmull_high_p8 -# endif -# define coeff_fn(f1, f2) f1##q_##f2 -#else -static HEDLEY_ALWAYS_INLINE poly8x8_t veor_p8(poly8x8_t a, poly8x8_t b) { - return vreinterpret_p8_u8(veor_u8(vreinterpret_u8_p8(a), vreinterpret_u8_p8(b))); -} -typedef poly8x8_t coeff_t; -# define pmull_low(x, y) vmull_p8(vget_low_p8(x), y) -# define pmull_high(x, y) vmull_p8(vget_high_p8(x), y) -# define coeff_fn(f1, f2) f1##_##f2 -#endif - // NOTE: we avoid EOR3 in pmacl* - only chip which supports NEON-SHA3 without SVE2, are the Apple chips and Neoverse V1; the former has 
PMULL+EOR fusion, which is better than EOR3 #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) && defined(__APPLE__) // Apple M1 supports fusing PMULL+EOR, so ensure these are paired @@ -113,57 +73,6 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round(const void* src, poly16x8 *high2 = pmacl_high(*high2, data.val[1], coeff[1]); } -static HEDLEY_ALWAYS_INLINE uint8x16_t eor3q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { - return veorq_u8(a, veorq_u8(b, c)); -} - -static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, poly16x8_t low2, poly16x8_t mid1, poly16x8_t mid2, poly16x8_t* high1, poly16x8_t high2) { - // put data in proper form - uint8x16x2_t hibytes = vuzpq_u8(vreinterpretq_u8_p16(*high1), vreinterpretq_u8_p16(high2)); - uint8x16x2_t lobytes = vuzpq_u8(vreinterpretq_u8_p16(*low1), vreinterpretq_u8_p16(low2)); - - // merge mid into high/low - uint8x16x2_t midbytes = vuzpq_u8(vreinterpretq_u8_p16(mid1), vreinterpretq_u8_p16(mid2)); - uint8x16_t libytes = veorq_u8(hibytes.val[0], lobytes.val[1]); - lobytes.val[1] = eor3q_u8(libytes, lobytes.val[0], midbytes.val[0]); - hibytes.val[0] = eor3q_u8(libytes, hibytes.val[1], midbytes.val[1]); - - - // Barrett reduction - // first reduction coefficient is 0x1111a - // multiply hibytes by 0x11100 - uint8x16_t highest_nibble = vshrq_n_u8(hibytes.val[1], 4); - uint8x16_t th0 = vsriq_n_u8(vshlq_n_u8(hibytes.val[1], 4), hibytes.val[0], 4); - th0 = eor3q_u8(th0, hibytes.val[0], hibytes.val[1]); - uint8x16_t th1 = veorq_u8(hibytes.val[1], highest_nibble); - - // subsequent polynomial multiplication doesn't need the low bits of th0 to be correct, so trim these now for a shorter dep chain - uint8x16_t th0_hi3 = vshrq_n_u8(th0, 5); - uint8x16_t th0_hi1 = vshrq_n_u8(th0_hi3, 2); // or is `vshrq_n_u8(th0, 7)` better? 
- - // mul by 0x1a => we only care about upper byte -#ifdef __aarch64__ - th0 = veorq_u8(th0, vqtbl1q_u8( - vmakeq_u8(0,1,3,2,6,7,5,4,13,12,14,15,11,10,8,9), - highest_nibble - )); -#else - th0 = veorq_u8(th0, vshrq_n_u8(vreinterpretq_u8_p8(vmulq_p8( - vreinterpretq_p8_u8(highest_nibble), - vdupq_n_p8(0x1a) - )), 4)); -#endif - - // multiply by polynomial: 0x100b - poly8x16_t redL = vdupq_n_p8(0x0b); - hibytes.val[1] = veorq_u8(th0_hi3, th0_hi1); - hibytes.val[1] = vsliq_n_u8(hibytes.val[1], th0, 4); - th1 = vreinterpretq_u8_p8(vmulq_p8(vreinterpretq_p8_u8(th1), redL)); - hibytes.val[0] = vreinterpretq_u8_p8(vmulq_p8(vreinterpretq_p8_u8(th0), redL)); - - *low1 = vreinterpretq_p16_u8(veorq_u8(lobytes.val[0], hibytes.val[0])); - *high1 = vreinterpretq_p16_u8(eor3q_u8(hibytes.val[1], lobytes.val[1], th1)); -} #ifdef __aarch64__ # define CLMUL_NUM_REGIONS 8 diff --git a/gf16/gf16_clmul_neon.h b/gf16/gf16_clmul_neon.h new file mode 100644 index 00000000..a638101f --- /dev/null +++ b/gf16/gf16_clmul_neon.h @@ -0,0 +1,97 @@ +#include "gf16_neon_common.h" + +#if defined(__ARM_NEON) + +// `vaddq_p8` and co seems to be missing from some compilers (like GCC), so define our own variant +static HEDLEY_ALWAYS_INLINE poly8x16_t veorq_p8(poly8x16_t a, poly8x16_t b) { + return vreinterpretq_p8_u8(veorq_u8(vreinterpretq_u8_p8(a), vreinterpretq_u8_p8(b))); +} + +#ifdef __aarch64__ +typedef poly8x16_t coeff_t; +# if defined(__GNUC__) || defined(__clang__) +// because GCC/CLang doesn't seem to handle these cases well, explicitly tell them what to do +static HEDLEY_ALWAYS_INLINE poly16x8_t pmull_low(poly8x16_t a, poly8x16_t b) { + poly16x8_t result; + __asm__ ("pmull %0.8h,%1.8b,%2.8b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} +static HEDLEY_ALWAYS_INLINE poly16x8_t pmull_high(poly8x16_t a, poly8x16_t b) { + poly16x8_t result; + __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} +# 
else +# define pmull_low(x, y) vmull_p8(vget_low_p8(x), vget_low_p8(y)) +# define pmull_high vmull_high_p8 +# endif +# define coeff_fn(f1, f2) f1##q_##f2 +#else +static HEDLEY_ALWAYS_INLINE poly8x8_t veor_p8(poly8x8_t a, poly8x8_t b) { + return vreinterpret_p8_u8(veor_u8(vreinterpret_u8_p8(a), vreinterpret_u8_p8(b))); +} +typedef poly8x8_t coeff_t; +# define pmull_low(x, y) vmull_p8(vget_low_p8(x), y) +# define pmull_high(x, y) vmull_p8(vget_high_p8(x), y) +# define coeff_fn(f1, f2) f1##_##f2 +#endif + +static HEDLEY_ALWAYS_INLINE uint8x16_t eor3q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { + return veorq_u8(a, veorq_u8(b, c)); +} + +static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, poly16x8_t low2, poly16x8_t mid1, poly16x8_t mid2, poly16x8_t* high1, poly16x8_t high2) { + // put data in proper form + uint8x16x2_t hibytes = vuzpq_u8(vreinterpretq_u8_p16(*high1), vreinterpretq_u8_p16(high2)); + uint8x16x2_t lobytes = vuzpq_u8(vreinterpretq_u8_p16(*low1), vreinterpretq_u8_p16(low2)); + + // merge mid into high/low + uint8x16x2_t midbytes = vuzpq_u8(vreinterpretq_u8_p16(mid1), vreinterpretq_u8_p16(mid2)); + uint8x16_t libytes = veorq_u8(hibytes.val[0], lobytes.val[1]); + lobytes.val[1] = eor3q_u8(libytes, lobytes.val[0], midbytes.val[0]); + hibytes.val[0] = eor3q_u8(libytes, hibytes.val[1], midbytes.val[1]); + + + // Barrett reduction + // first reduction coefficient is 0x1111a + // multiply hibytes by 0x11100 + uint8x16_t highest_nibble = vshrq_n_u8(hibytes.val[1], 4); + uint8x16_t th0 = vsriq_n_u8(vshlq_n_u8(hibytes.val[1], 4), hibytes.val[0], 4); + th0 = eor3q_u8(th0, hibytes.val[0], hibytes.val[1]); + uint8x16_t th1 = veorq_u8(hibytes.val[1], highest_nibble); + + // subsequent polynomial multiplication doesn't need the low bits of th0 to be correct, so trim these now for a shorter dep chain + uint8x16_t th0_hi3 = vshrq_n_u8(th0, 5); + uint8x16_t th0_hi1 = vshrq_n_u8(th0_hi3, 2); // or is `vshrq_n_u8(th0, 7)` better? 
+ + // mul by 0x1a => we only care about upper byte +#ifdef __aarch64__ + th0 = veorq_u8(th0, vqtbl1q_u8( + vmakeq_u8(0,1,3,2,6,7,5,4,13,12,14,15,11,10,8,9), + highest_nibble + )); +#else + th0 = veorq_u8(th0, vshrq_n_u8(vreinterpretq_u8_p8(vmulq_p8( + vreinterpretq_p8_u8(highest_nibble), + vdupq_n_p8(0x1a) + )), 4)); +#endif + + // multiply by polynomial: 0x100b + poly8x16_t redL = vdupq_n_p8(0x0b); + hibytes.val[1] = veorq_u8(th0_hi3, th0_hi1); + hibytes.val[1] = vsliq_n_u8(hibytes.val[1], th0, 4); + th1 = vreinterpretq_u8_p8(vmulq_p8(vreinterpretq_p8_u8(th1), redL)); + hibytes.val[0] = vreinterpretq_u8_p8(vmulq_p8(vreinterpretq_p8_u8(th0), redL)); + + *low1 = vreinterpretq_p16_u8(veorq_u8(lobytes.val[0], hibytes.val[0])); + *high1 = vreinterpretq_p16_u8(eor3q_u8(hibytes.val[1], lobytes.val[1], th1)); +} + +#endif diff --git a/gf16/gf16_clmul_sve2.c b/gf16/gf16_clmul_sve2.c index 913dc8a9..c038ad47 100644 --- a/gf16/gf16_clmul_sve2.c +++ b/gf16/gf16_clmul_sve2.c @@ -1,5 +1,5 @@ -#include "gf16_sve_common.h" +#include "gf16_clmul_sve2.h" #include "gf16_muladd_multi.h" #if defined(__ARM_FEATURE_SVE2) @@ -42,53 +42,6 @@ static HEDLEY_ALWAYS_INLINE void gf16_clmul_sve2_merge2( *high2 = sveor3_u8(*high2, high2b, high2c); } -static HEDLEY_ALWAYS_INLINE void gf16_clmul_sve2_reduction(svuint8_t* low1, svuint8_t low2, svuint8_t mid1, svuint8_t mid2, svuint8_t* high1, svuint8_t high2) { - // put data in proper form - svuint8_t hibytesL = svtrn1_u8(*high1, high2); - svuint8_t hibytesH = svtrn2_u8(*high1, high2); - svuint8_t lobytesL = svtrn1_u8(*low1, low2); - svuint8_t lobytesH = svtrn2_u8(*low1, low2); - - // merge mid into high/low - svuint8_t midbytesL = svtrn1_u8(mid1, mid2); - svuint8_t midbytesH = svtrn2_u8(mid1, mid2); - svuint8_t libytes = NOMASK(sveor_u8, hibytesL, lobytesH); - lobytesH = sveor3_u8(midbytesL, lobytesL, libytes); - hibytesL = sveor3_u8(midbytesH, hibytesH, libytes); - - // Barrett reduction - // first reduction coefficient is 0x1111a - svuint8_t 
highest_nibble = NOMASK(svlsr_n_u8, hibytesH, 4); - - svuint8_t th0 = svsri_n_u8(NOMASK(svlsl_n_u8, hibytesH, 4), hibytesL, 4); - th0 = sveor3_u8(th0, hibytesH, hibytesL); - svuint8_t th0_hi3 = NOMASK(svlsr_n_u8, th0, 5); - th0 = NOMASK(sveor_u8, th0, NOMASK(svlsr_n_u8, - svpmul_n_u8(highest_nibble, 0x1a), 4 - )); - - // alternative strategy to above, using nibble flipped ops; looks like one less op, but 0xf vector needs to be constructed, so still the same; maybe there's a better way to leverage it? - // svuint8_t th0 = svxar_n_u8(hibytesH, hibytesL, 4); - // th0 = svbcax_n_u8(th0, svpmul_n_u8(highest_nibble, 0x1a), 0xf); - // th0 = svxar_n_u8(th0, svbsl_n_u8(hibytesH, hibytesL, 0xf), 4); - // svuint8_t th0_hi3 = NOMASK(svlsr_n_u8, th0, 5); - - svuint8_t th1 = NOMASK(sveor_u8, hibytesH, highest_nibble); - - - // multiply by polynomial: 0x100b - lobytesH = sveor3_u8( - lobytesH, - svpmul_n_u8(th1, 0x0b), - NOMASK(svlsr_n_u8, th0_hi3, 2) - ); - lobytesH = NOMASK(sveor_u8, lobytesH, svsli_n_u8(th0_hi3, th0, 4)); - lobytesL = NOMASK(sveor_u8, lobytesL, svpmul_n_u8(th0, 0x0b)); - - *low1 = lobytesL; - *high1 = lobytesH; -} - #define CLMUL_NUM_REGIONS 8 static HEDLEY_ALWAYS_INLINE void gf16_clmul_muladd_x_sve2( diff --git a/gf16/gf16_clmul_sve2.h b/gf16/gf16_clmul_sve2.h new file mode 100644 index 00000000..f5fc40f6 --- /dev/null +++ b/gf16/gf16_clmul_sve2.h @@ -0,0 +1,52 @@ +#include "gf16_sve_common.h" + +#if defined(__ARM_FEATURE_SVE2) + +static HEDLEY_ALWAYS_INLINE void gf16_clmul_sve2_reduction(svuint8_t* low1, svuint8_t low2, svuint8_t mid1, svuint8_t mid2, svuint8_t* high1, svuint8_t high2) { + // put data in proper form + svuint8_t hibytesL = svtrn1_u8(*high1, high2); + svuint8_t hibytesH = svtrn2_u8(*high1, high2); + svuint8_t lobytesL = svtrn1_u8(*low1, low2); + svuint8_t lobytesH = svtrn2_u8(*low1, low2); + + // merge mid into high/low + svuint8_t midbytesL = svtrn1_u8(mid1, mid2); + svuint8_t midbytesH = svtrn2_u8(mid1, mid2); + svuint8_t libytes = 
NOMASK(sveor_u8, hibytesL, lobytesH); + lobytesH = sveor3_u8(midbytesL, lobytesL, libytes); + hibytesL = sveor3_u8(midbytesH, hibytesH, libytes); + + // Barrett reduction + // first reduction coefficient is 0x1111a + svuint8_t highest_nibble = NOMASK(svlsr_n_u8, hibytesH, 4); + + svuint8_t th0 = svsri_n_u8(NOMASK(svlsl_n_u8, hibytesH, 4), hibytesL, 4); + th0 = sveor3_u8(th0, hibytesH, hibytesL); + svuint8_t th0_hi3 = NOMASK(svlsr_n_u8, th0, 5); + th0 = NOMASK(sveor_u8, th0, NOMASK(svlsr_n_u8, + svpmul_n_u8(highest_nibble, 0x1a), 4 + )); + + // alternative strategy to above, using nibble flipped ops; looks like one less op, but 0xf vector needs to be constructed, so still the same; maybe there's a better way to leverage it? + // svuint8_t th0 = svxar_n_u8(hibytesH, hibytesL, 4); + // th0 = svbcax_n_u8(th0, svpmul_n_u8(highest_nibble, 0x1a), 0xf); + // th0 = svxar_n_u8(th0, svbsl_n_u8(hibytesH, hibytesL, 0xf), 4); + // svuint8_t th0_hi3 = NOMASK(svlsr_n_u8, th0, 5); + + svuint8_t th1 = NOMASK(sveor_u8, hibytesH, highest_nibble); + + + // multiply by polynomial: 0x100b + lobytesH = sveor3_u8( + lobytesH, + svpmul_n_u8(th1, 0x0b), + NOMASK(svlsr_n_u8, th0_hi3, 2) + ); + lobytesH = NOMASK(sveor_u8, lobytesH, svsli_n_u8(th0_hi3, th0, 4)); + lobytesL = NOMASK(sveor_u8, lobytesL, svpmul_n_u8(th0, 0x0b)); + + *low1 = lobytesL; + *high1 = lobytesH; +} + +#endif diff --git a/gf16/gf16pmul.cpp b/gf16/gf16pmul.cpp index 0e1e99a9..ffc59f35 100644 --- a/gf16/gf16pmul.cpp +++ b/gf16/gf16pmul.cpp @@ -61,5 +61,18 @@ void setup_pmul() { #endif #ifdef PLATFORM_ARM + if(!CPU_HAS_SVE2) gf16pmul_clmul_available_sve2 = 0; + if(!CPU_HAS_NEON) gf16pmul_clmul_available_neon = 0; + + if(gf16pmul_clmul_available_sve2) { + gf16pmul = &gf16pmul_clmul_sve2; + gf16pmul_alignment = gf16pmul_clmul_sve2_width(); + gf16pmul_blocklen = gf16pmul_alignment*2; + } + else if(gf16pmul_clmul_available_neon) { + gf16pmul = &gf16pmul_clmul_neon; + gf16pmul_alignment = 16; + gf16pmul_blocklen = 32; + } #endif } 
diff --git a/gf16/gf16pmul.h b/gf16/gf16pmul.h index ac472f12..180a9025 100644 --- a/gf16/gf16pmul.h +++ b/gf16/gf16pmul.h @@ -21,8 +21,13 @@ _PMUL_DECL(sse); _PMUL_DECL(avx2); _PMUL_DECL(vpclmul); _PMUL_DECL(vpclgfni); +_PMUL_DECL(neon); +_PMUL_DECL(sve2); #undef _PMUL_DECL + +unsigned gf16pmul_clmul_sve2_width(); + HEDLEY_END_C_DECLS #endif // defined(__GF16PMUL_H__) diff --git a/gf16/gf16pmul_clmul_neon.c b/gf16/gf16pmul_clmul_neon.c new file mode 100644 index 00000000..c4d8f76c --- /dev/null +++ b/gf16/gf16pmul_clmul_neon.c @@ -0,0 +1,39 @@ +#include "gf16_global.h" +#include "gf16_clmul_neon.h" + +#ifdef __ARM_NEON +int gf16pmul_clmul_available_neon = 1; + +void gf16pmul_clmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + assert(len % sizeof(uint8x16_t)*2 == 0); + + const poly8_t* _src1 = (const poly8_t*)src1 + len; + const poly8_t* _src2 = (const poly8_t*)src2 + len; + uint8_t* _dst = (uint8_t*)dst + len; + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(uint8x16_t)*2) { + poly8x16x2_t data1 = vld2q_p8(_src1+ptr); + poly8x16x2_t data2 = vld2q_p8(_src2+ptr); + poly16x8_t low1 = pmull_low(data1.val[0], data2.val[0]); + poly16x8_t low2 = pmull_high(data1.val[0], data2.val[0]); + poly8x16_t dataMid1 = veorq_p8(data1.val[0], data1.val[1]); + poly8x16_t dataMid2 = veorq_p8(data2.val[0], data2.val[1]); + poly16x8_t mid1 = pmull_low(dataMid1, dataMid2); + poly16x8_t mid2 = pmull_high(dataMid1, dataMid2); + poly16x8_t high1 = pmull_low(data1.val[1], data2.val[1]); + poly16x8_t high2 = pmull_high(data1.val[1], data2.val[1]); + + gf16_clmul_neon_reduction(&low1, low2, mid1, mid2, &high1, high2); + uint8x16x2_t out; + out.val[0] = vreinterpretq_u8_p16(low1); + out.val[1] = vreinterpretq_u8_p16(high1); + vst2q_u8(_dst+ptr, out); + } +} + +#else // defined(__ARM_NEON) +int gf16pmul_clmul_available_neon = 0; +void gf16pmul_clmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + UNUSED(dst); 
UNUSED(src1); UNUSED(src2); UNUSED(len); +} +#endif diff --git a/gf16/gf16pmul_clmul_sve2.c b/gf16/gf16pmul_clmul_sve2.c new file mode 100644 index 00000000..b88fe2c2 --- /dev/null +++ b/gf16/gf16pmul_clmul_sve2.c @@ -0,0 +1,44 @@ +#include "gf16_global.h" +#include "gf16_clmul_sve2.h" + +#ifdef __ARM_FEATURE_SVE2 +int gf16pmul_clmul_available_sve2 = 1; + +void gf16pmul_clmul_sve2(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + assert(len % svcntb()*2 == 0); + + const uint8_t* _src1 = (const uint8_t*)src1 + len; + const uint8_t* _src2 = (const uint8_t*)src2 + len; + uint8_t* _dst = (uint8_t*)dst + len; + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += svcntb()*2) { + svuint8x2_t data1 = svld2_u8(svptrue_b8(), _src1+ptr); + svuint8x2_t data2 = svld2_u8(svptrue_b8(), _src2+ptr); + svuint8_t low1 = svpmullb_pair_u8(svget2(data1, 0), svget2(data2, 0)); + svuint8_t low2 = svpmullt_pair_u8(svget2(data1, 0), svget2(data2, 0)); + svuint8_t dataMid1 = NOMASK(sveor_u8, svget2(data1, 0), svget2(data1, 1)); + svuint8_t dataMid2 = NOMASK(sveor_u8, svget2(data2, 0), svget2(data2, 1)); + svuint8_t mid1 = svpmullb_pair_u8(dataMid1, dataMid2); + svuint8_t mid2 = svpmullt_pair_u8(dataMid1, dataMid2); + svuint8_t high1 = svpmullb_pair_u8(svget2(data1, 1), svget2(data2, 1)); + svuint8_t high2 = svpmullt_pair_u8(svget2(data1, 1), svget2(data2, 1)); + + gf16_clmul_sve2_reduction(&low1, low2, mid1, mid2, &high1, high2); + svst2_u8(svptrue_b8(), _dst+ptr, svcreate2_u8(low1, high1)); + } +} + +unsigned gf16pmul_clmul_sve2_width() { + return svcntb(); +} + +#else // defined(__ARM_FEATURE_SVE2) +int gf16pmul_clmul_available_sve2 = 0; +void gf16pmul_clmul_sve2(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { + UNUSED(dst); UNUSED(src1); UNUSED(src2); UNUSED(len); +} + +unsigned gf16pmul_clmul_sve2_width() { + return 1; +} +#endif From b4e37d8b85ef9074d759b8b8dff0bfecb17f8538 Mon Sep 17 00:00:00 2001 From: animetosho Date: Fri, 16 Jun 
2023 11:54:16 +1000 Subject: [PATCH 21/91] Initial loop-tiling + striping for matrix inversion --- gf16/gfmat_inv.cpp | 232 ++++++++++++++++++++++++++------------------- gf16/gfmat_inv.h | 11 ++- 2 files changed, 145 insertions(+), 98 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index fc2c97d1..d5874406 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -11,48 +11,61 @@ extern "C" uint16_t* gf16_recip; #include "gf16mul.h" template -int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch) { +int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs) { unsigned missingCol = validCount + rec; uint16_t baseCoeff; uint16_t coeff[rows]; - #define MAT_ROW(r) (mat + (r) * (stride / sizeof(uint16_t))) + unsigned sw16 = stripeWidth / sizeof(uint16_t); + // TODO: consider optimisation for numStripes == 1 ? + + + #define MAT_ROW(s, r) (mat + (((s)*invalidCount) + (r)) * sw16) + #define REPLACE_WORD(r, c, v) gf.replace_word(MAT_ROW((c)/sw16, r), (c)%sw16, v) void* srcRows[rows]; - srcRows[0] = MAT_ROW(rec); + srcRows[0] = MAT_ROW(0, rec); for(unsigned i=1; i= 2) { // multiply-add to the next row - MULADD_ROW(srcRows[1], 0); + MULADD_ROW(rec+1, 0); // scale it, and multiply-add back SCALE_ROW(1); if(rows > 2) { - MULADD_ROW_PF(srcRows[0], 1, srcRows[2]); - } else MULADD_LASTROW(srcRows[0], 1) + MULADD_ROW_PF(rec+0, 1, srcRows[2]); + } else MULADD_LASTROW(rec+0, 1) } else { - if(rec2 >= invalidCount) + if(recFirst >= invalidCount) return -1; } if(rows >= 3) { if(rows >= 4) { - MULADD_MULTI_ROW_PF(srcRows[2], 0, 2, srcRows[3]); + MULADD_MULTI_ROW_PF(rec+2, 0, 2, srcRows[3]); SCALE_ROW(2); - MULADD_MULTI_ROW(srcRows[3], 0, 2); - MULADD_ROW(srcRows[3], 2); + MULADD_MULTI_ROW(rec+3, 0, 2); + MULADD_ROW(rec+3, 2); SCALE_ROW(3); - MULADD_ROW(srcRows[2], 3); - MULADD_MULTI_ROW(srcRows[0], 2, 2); + 
MULADD_ROW(rec+2, 3); + MULADD_MULTI_ROW(rec+0, 2, 2); if(rows > 4) { - MULADD_MULTI_ROW_PF(srcRows[1], 2, 2, srcRows[4]); - } else MULADD_MULTI_LASTROW(srcRows[1], 2, 2) + MULADD_MULTI_ROW_PF(rec+1, 2, 2, srcRows[4]); + } else MULADD_MULTI_LASTROW(rec+1, 2, 2) } else { - MULADD_MULTI_ROW(srcRows[2], 0, 2); + MULADD_MULTI_ROW(rec+2, 0, 2); SCALE_ROW(2); - MULADD_ROW(srcRows[0], 2); - MULADD_LASTROW(srcRows[1], 2) + MULADD_ROW(rec+0, 2); + MULADD_LASTROW(rec+1, 2) } } if(rows >= 5) { if(rows >= 6) { - MULADD_MULTI_ROW_PF(srcRows[4], 0, 4, srcRows[5]); + MULADD_MULTI_ROW_PF(rec+4, 0, 4, srcRows[5]); SCALE_ROW(4); - MULADD_MULTI_ROW(srcRows[5], 0, 4); - MULADD_ROW(srcRows[5], 4); + MULADD_MULTI_ROW(rec+5, 0, 4); + MULADD_ROW(rec+5, 4); SCALE_ROW(5); - MULADD_ROW(srcRows[4], 5); + MULADD_ROW(rec+4, 5); for(unsigned r = 0; r < 3; r++) { - MULADD_MULTI_ROW(srcRows[r], 4, 2); + MULADD_MULTI_ROW(rec+r, 4, 2); } - MULADD_MULTI_LASTROW(srcRows[3], 4, 2) + MULADD_MULTI_LASTROW(rec+3, 4, 2) } else { - MULADD_MULTI_ROW(srcRows[4], 0, 4); + MULADD_MULTI_ROW(rec+4, 0, 4); SCALE_ROW(4); for(unsigned r = 0; r < 3; r++) { - MULADD_ROW(srcRows[r], 4); + MULADD_ROW(rec+r, 4); } - MULADD_LASTROW(srcRows[3], 4) + MULADD_LASTROW(rec+3, 4) } } // do main elimination, using the source group - while(1) { - uint16_t* row2 = MAT_ROW(rec2); - rec2++; - if(HEDLEY_UNLIKELY(rec2 == rec)) - rec2 += rows; - if(rows > 1) { - MULADD_MULTI_LASTROW(row2, 0, rows) + // first, gather all relevant coefficients + for(unsigned r=0; r 1) { + if(HEDLEY_LIKELY(pf)) + gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, rowCoeffs + curRec2*rows, gfScratch, pf); + else + gf.mul_add_multi(rows, stripeWidth*stripe, MAT_ROW(0, curRec2), srcRows, stripeWidth, rowCoeffs + curRec2*rows, gfScratch); + } else { + if(HEDLEY_LIKELY(pf)) + gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, rowCoeffs[curRec2], gfScratch, pf); + else + 
gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, rowCoeffs[curRec2], gfScratch); + } } } + return -1; + #undef MAT_ROW + #undef REPLACE_WORD #undef SCALE_ROW #undef MULADD_ROW #undef MULADD_ROW_PF @@ -163,11 +203,12 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned unsigned validCol = 0; unsigned missingCol = validCount; unsigned recStart = 0; - unsigned stride16 = stride / sizeof(uint16_t); + unsigned sw16 = stripeWidth/sizeof(uint16_t); unsigned invalidCount = inputValid.size() - validCount; if(recovery.at(0) == 0) { // first recovery having exponent 0 is a common case - for(unsigned input = 0; input < inputValid.size(); input++) { - mat[input] = 1; + for(unsigned stripe=0; stripe& inputValid, unsigned for(unsigned i=0; i& inputValid, unsigned CONSTRUCT_VIA_EXP(uint16_t rec : recSkips); // ...then compute most of the rows via multiplication - lastExp = 1; - uint16_t* src1 = mat + recStart * stride16; - for(unsigned rec = recStart+1; rec < invalidCount; rec++) { - uint16_t exp = recovery.at(rec); - bool skip = (exp != lastExp+1); - lastExp = exp; - if(skip) continue; - - gf16pmul(mat + rec * stride16, src1, mat + (rec-1) * stride16, stride); + for(unsigned stripe=0; stripe& inputValid, unsigned #undef CONSTRUCT_VIA_EXP } +#define CEIL_DIV(a, b) (((a) + (b)-1) / (b)) +#define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) + bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb) { unsigned invalidCount = inputValid.size() - validCount; assert(validCount < inputValid.size()); // i.e. 
invalidCount > 0 @@ -261,15 +312,20 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va unsigned matWidth = inputValid.size() * sizeof(uint16_t); Galois16Mul gf(Galois16Mul::default_method(matWidth, inputValid.size(), inputValid.size(), true)); - stride = gf.alignToStride(matWidth); const auto gfInfo = gf.info(); + + // divide the matrix up into evenly sized stripes (for loop tiling optimisation) + numStripes = ROUND_DIV(matWidth, gfInfo.idealChunkSize); + if(numStripes < 1) numStripes = 1; + stripeWidth = gf.alignToStride(CEIL_DIV(matWidth, numStripes)); + numStripes = CEIL_DIV(matWidth, stripeWidth); + assert(numStripes >= 1); + void* gfScratch = gf.mutScratch_alloc(); if(mat) ALIGN_FREE(mat); - ALIGN_ALLOC(mat, invalidCount * stride, gfInfo.alignment); - - unsigned stride16 = stride / sizeof(uint16_t); - assert(stride16 * sizeof(uint16_t) == stride); + unsigned matSize = invalidCount * stripeWidth*numStripes; + ALIGN_ALLOC(mat, matSize, gfInfo.alignment); uint16_t totalProgress = invalidCount + (gf.needPrepare() ? 
3 : 1); // provision for prepare/finish/init-calc @@ -299,22 +355,17 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va if(progressCb) progressCb(1, totalProgress); progressOffset = 2; - for(unsigned r = 0; r < invalidCount; r++) { - uint16_t* row = mat + r * stride16; - //memset(row + matWidth, 0, stride - matWidth); // not necessary, but do this to avoid uninitialized memory - gf.prepare(row, row, stride); - } + gf.prepare(mat, mat, matSize); } // invert - // TODO: optimise: multi-thread + packed arrangement unsigned rec = 0; #define INVERT_GROUP(rows) \ if(gfInfo.idealInputMultiple >= rows && invalidCount >= rows) { \ for(; rec <= invalidCount-rows; rec+=rows) { \ if(progressCb) progressCb(rec + progressOffset, totalProgress); \ \ - int badRowOffset = processRow(rec, validCount, invalidCount, gf, gfScratch); \ + int badRowOffset = processRow(rec, validCount, invalidCount, gf, gfScratch, rowCoeffs); \ if(badRowOffset >= 0) { \ /* ignore this recovery row and try again */ \ recovery.erase(recovery.begin() + rec + badRowOffset); \ @@ -323,37 +374,28 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } \ } // max out at 6 groups (registers + cache assoc?) + uint16_t* rowCoeffs = new uint16_t[invalidCount*6]; INVERT_GROUP(6) INVERT_GROUP(5) INVERT_GROUP(4) INVERT_GROUP(3) INVERT_GROUP(2) INVERT_GROUP(1) + delete[] rowCoeffs; #undef INVERT_GROUP // post transform if(gf.needPrepare()) { if(progressCb) progressCb(totalProgress-1, totalProgress); - for(unsigned r = 0; r < invalidCount; r++) { - uint16_t* row = mat + r * stride16; - gf.finish(row, stride); - - /* - // check for zeroes; TODO: does this need to be the full row? - for(unsigned col = validCount; col < inputValid.size(); col++) { - if(HEDLEY_UNLIKELY(row[col] == 0)) { // bad coeff - recovery.erase(recovery.begin() + r); - goto invert_loop; - } - } - */ - } + gf.finish(mat, matSize); + // TODO: check for zeroes?? 
} } // remove excess recovery recovery.resize(invalidCount); + numRec = invalidCount; gf.mutScratch_free(gfScratch); return true; diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index b5290900..bb0be942 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -9,17 +9,22 @@ class Galois16Mul; class Galois16RecMatrix { uint16_t* mat; - unsigned stride; + unsigned numStripes; + unsigned stripeWidth; + unsigned numRec; void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); template - int processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch); + int processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs); public: Galois16RecMatrix() : mat(nullptr) {} ~Galois16RecMatrix(); bool Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb = nullptr); inline uint16_t GetFactor(uint16_t inIdx, uint16_t recIdx) const { - return mat[recIdx * stride/sizeof(uint16_t) + inIdx]; + // TODO: check if numStripes==1? consider optimising division? 
+ unsigned sw = stripeWidth/sizeof(uint16_t); + unsigned stripe = inIdx / sw; + return mat[stripe * numRec*sw + recIdx * sw + (inIdx % sw)]; } }; #endif From 54ee89f2139151cb196dd85574745e7f6003962c Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 17 Jun 2023 18:29:19 +1000 Subject: [PATCH 22/91] Extract main invert loop to own function --- gf16/gfmat_inv.cpp | 119 ++++++++++++++++++++++++--------------------- gf16/gfmat_inv.h | 5 +- 2 files changed, 67 insertions(+), 57 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index d5874406..bbd15dd4 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -10,8 +10,41 @@ extern "C" uint16_t* gf16_recip; #include "../src/platform.h" // for ALIGN_* #include "gf16mul.h" +#define MAT_ROW(s, r) (mat + (((s)*numRec) + (r)) * (stripeWidth / sizeof(uint16_t))) + template -int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs) { +void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void* srcRows[rows], Galois16Mul& gf, void* gfScratch, const void* nextPf) { + for(unsigned stripe=stripeStart; stripe 1) { + if(HEDLEY_LIKELY(pf)) + gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, rowCoeffs + curRec2*rows, gfScratch, pf); + else + gf.mul_add_multi(rows, stripeWidth*stripe, MAT_ROW(0, curRec2), srcRows, stripeWidth, rowCoeffs + curRec2*rows, gfScratch); + } else { + if(HEDLEY_LIKELY(pf)) + gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, rowCoeffs[curRec2], gfScratch, pf); + else + gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, rowCoeffs[curRec2], gfScratch); + } + } + } +} + +template +int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs) 
{ unsigned missingCol = validCount + rec; uint16_t baseCoeff; @@ -21,7 +54,6 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in // TODO: consider optimisation for numStripes == 1 ? - #define MAT_ROW(s, r) (mat + (((s)*invalidCount) + (r)) * sw16) #define REPLACE_WORD(r, c, v) gf.replace_word(MAT_ROW((c)/sw16, r), (c)%sw16, v) void* srcRows[rows]; @@ -64,7 +96,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in gf.mul_add_multi_stridepf(numRows, stripeWidth, MAT_ROW(stripe, rowDst), MAT_ROW(stripe, rec+srcOffs), stripeWidth, coeff, gfScratch, (uint8_t*)(rowPf) + stripe*stripeWidth) #define MULADD_LASTROW(rowDst, rowSrc) \ - if(HEDLEY_LIKELY(recFirst < invalidCount)) { \ + if(HEDLEY_LIKELY(recFirst < numRec)) { \ MULADD_ROW_PF(rowDst, rowSrc, MAT_ROW(0, recFirst)); \ } else { \ if(nextScaleRow) { \ @@ -75,7 +107,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in return -1; \ } #define MULADD_MULTI_LASTROW(rowDst, srcOffs, numRows) \ - if(HEDLEY_LIKELY(recFirst < invalidCount)) { \ + if(HEDLEY_LIKELY(recFirst < numRec)) { \ MULADD_MULTI_ROW_PF(rowDst, srcOffs, numRows, MAT_ROW(0, recFirst)); \ } else { \ if(nextScaleRow) { \ @@ -88,7 +120,9 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in unsigned recFirst = rec == 0 ? rows : 0; // the next row when `processRow` is called; last action will prefetch this row - uint16_t* nextScaleRow = (rec+rows < invalidCount) ? MAT_ROW(0, rec+rows) : nullptr; + uint16_t* nextScaleRow = (rec+rows < numRec) ? 
MAT_ROW(0, rec+rows) : nullptr; + + // TODO: consider loop tiling this stuff; requires extracting a small matrix (rows*rows), and solving that, which means a scalar multiply is necessary // rescale the row SCALE_ROW(0); @@ -103,7 +137,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in MULADD_ROW_PF(rec+0, 1, srcRows[2]); } else MULADD_LASTROW(rec+0, 1) } else { - if(recFirst >= invalidCount) + if(recFirst >= numRec) return -1; } if(rows >= 3) { @@ -149,7 +183,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in // do main elimination, using the source group // first, gather all relevant coefficients - for(unsigned r=0; r 1) { - if(HEDLEY_LIKELY(pf)) - gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, rowCoeffs + curRec2*rows, gfScratch, pf); - else - gf.mul_add_multi(rows, stripeWidth*stripe, MAT_ROW(0, curRec2), srcRows, stripeWidth, rowCoeffs + curRec2*rows, gfScratch); - } else { - if(HEDLEY_LIKELY(pf)) - gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, rowCoeffs[curRec2], gfScratch, pf); - else - gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, rowCoeffs[curRec2], gfScratch); - } - } - } + invertLoop(0, numStripes, recFirst, numRec, rec, rowCoeffs, srcRows, gf, gfScratch, nextScaleRow); return -1; - #undef MAT_ROW #undef REPLACE_WORD #undef SCALE_ROW #undef MULADD_ROW @@ -196,6 +204,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, unsigned in #undef MULADD_LASTROW #undef MULADD_MULTI_LASTROW } +#undef MAT_ROW // construct initial matrix (pre-inversion) @@ -204,11 +213,10 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned unsigned missingCol = validCount; unsigned recStart = 0; unsigned sw16 = stripeWidth/sizeof(uint16_t); - unsigned invalidCount = inputValid.size() - validCount; if(recovery.at(0) == 0) { // first recovery having exponent 0 
is a common case for(unsigned stripe=0; stripe& inputValid, unsigned for(unsigned i=0; i& inputValid, unsigned for(; input < inputValid.size(); input++) { \ uint16_t inputLog = gfmat_input_log(input); \ unsigned targetCol = inputValid.at(input) ? validCol++ : missingCol++; \ - targetCol = (targetCol/sw16)*sw16*invalidCount + (targetCol%sw16); \ + targetCol = (targetCol/sw16)*sw16*numRec + (targetCol%sw16); \ for(loopcond) { \ mat[rec * sw16 + targetCol] = gfmat_coeff_from_log(inputLog, recovery.at(rec)); \ } \ @@ -257,11 +265,11 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned // there's a good chance that we have a mostly sequential sequence of recovery blocks // check this by looking for gaps in the sequence std::vector recSkips; - recSkips.reserve(invalidCount); + recSkips.reserve(numRec); recSkips.push_back(recStart); - unsigned maxSkips = invalidCount/2; // TODO: tune threshold + unsigned maxSkips = numRec/2; // TODO: tune threshold uint16_t lastExp = 1; - for(unsigned rec = recStart+1; rec < invalidCount; rec++) { + for(unsigned rec = recStart+1; rec < numRec; rec++) { uint16_t exp = recovery.at(rec); if(exp != lastExp+1) { recSkips.push_back(rec); @@ -277,9 +285,9 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned // ...then compute most of the rows via multiplication for(unsigned stripe=0; stripe& inputValid, unsigned } } - CONSTRUCT_VIA_EXP(unsigned rec = recStart; rec < invalidCount; rec++); + CONSTRUCT_VIA_EXP(unsigned rec = recStart; rec < numRec; rec++); #undef CONSTRUCT_VIA_EXP } @@ -302,12 +310,12 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned #define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb) { - unsigned invalidCount = inputValid.size() - validCount; - assert(validCount < inputValid.size()); // i.e. 
invalidCount > 0 + numRec = inputValid.size() - validCount; + assert(validCount < inputValid.size()); // i.e. numRec > 0 assert(inputValid.size() <= 32768 && inputValid.size() > 0); assert(recovery.size() <= 65535 && recovery.size() > 0); - if(invalidCount > recovery.size()) return false; + if(numRec > recovery.size()) return false; unsigned matWidth = inputValid.size() * sizeof(uint16_t); @@ -324,10 +332,10 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va void* gfScratch = gf.mutScratch_alloc(); if(mat) ALIGN_FREE(mat); - unsigned matSize = invalidCount * stripeWidth*numStripes; + unsigned matSize = numRec * stripeWidth*numStripes; ALIGN_ALLOC(mat, matSize, gfInfo.alignment); - uint16_t totalProgress = invalidCount + (gf.needPrepare() ? 3 : 1); // provision for prepare/finish/init-calc + uint16_t totalProgress = numRec + (gf.needPrepare() ? 3 : 1); // provision for prepare/finish/init-calc // easier to handle if exponents are in order std::sort(recovery.begin(), recovery.end()); @@ -339,7 +347,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? 
- if(invalidCount > recovery.size()) { // not enough recovery + if(numRec > recovery.size()) { // not enough recovery gf.mutScratch_free(gfScratch); ALIGN_FREE(mat); mat = nullptr; @@ -361,11 +369,11 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va // invert unsigned rec = 0; #define INVERT_GROUP(rows) \ - if(gfInfo.idealInputMultiple >= rows && invalidCount >= rows) { \ - for(; rec <= invalidCount-rows; rec+=rows) { \ + if(gfInfo.idealInputMultiple >= rows && numRec >= rows) { \ + for(; rec <= numRec-rows; rec+=rows) { \ if(progressCb) progressCb(rec + progressOffset, totalProgress); \ \ - int badRowOffset = processRow(rec, validCount, invalidCount, gf, gfScratch, rowCoeffs); \ + int badRowOffset = processRow(rec, validCount, gf, gfScratch, rowCoeffs); \ if(badRowOffset >= 0) { \ /* ignore this recovery row and try again */ \ recovery.erase(recovery.begin() + rec + badRowOffset); \ @@ -374,7 +382,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } \ } // max out at 6 groups (registers + cache assoc?) 
- uint16_t* rowCoeffs = new uint16_t[invalidCount*6]; + uint16_t* rowCoeffs = new uint16_t[numRec*6]; INVERT_GROUP(6) INVERT_GROUP(5) INVERT_GROUP(4) @@ -394,8 +402,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } // remove excess recovery - recovery.resize(invalidCount); - numRec = invalidCount; + recovery.resize(numRec); gf.mutScratch_free(gfScratch); return true; diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index bb0be942..489f3dc3 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -14,8 +14,11 @@ class Galois16RecMatrix { unsigned numRec; void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); + + template + void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void* srcRows[rows], Galois16Mul& gf, void* gfScratch, const void* nextPf); template - int processRow(unsigned rec, unsigned validCount, unsigned invalidCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs); + int processRow(unsigned rec, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs); public: Galois16RecMatrix() : mat(nullptr) {} ~Galois16RecMatrix(); From 7f98988e17c5283dd249f1ced43cc7a8ec3e8279 Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 18 Jun 2023 16:34:11 +1000 Subject: [PATCH 23/91] Add basic threading support to matrix inversion --- gf16/gfmat_inv.cpp | 164 ++++++++++++++++++++++++++++++++++++++++++--- gf16/gfmat_inv.h | 12 ++-- 2 files changed, 162 insertions(+), 14 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index bbd15dd4..dd876676 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -9,17 +9,64 @@ extern "C" uint16_t* gf16_recip; #include #include "../src/platform.h" // for ALIGN_* #include "gf16mul.h" +#include "threadqueue.h" +#include + +static const unsigned MIN_THREAD_REC = 10; // minimum number of rows to process on a thread + +class 
Galois16RecMatrixWorker { + const Galois16Mul& gf; +public: + MessageThread thread; + void* gfScratch; + + explicit Galois16RecMatrixWorker(const Galois16Mul& _gf) : gf(_gf) { + gfScratch = _gf.mutScratch_alloc(); + } + Galois16RecMatrixWorker(Galois16RecMatrixWorker&& other) noexcept : gf(other.gf) { + thread = std::move(other.thread); + gfScratch = other.gfScratch; + } + ~Galois16RecMatrixWorker() { + thread.end(); + gf.mutScratch_free(gfScratch); + } +}; + +struct Galois16RecMatrixWorkerMessage { + unsigned stripeStart, stripeEnd; + unsigned recFirst, recLast; + unsigned recSrc; uint16_t* rowCoeffs; void** srcRows; Galois16Mul* gf; void* gfScratch; + void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, void**, Galois16Mul&, void*, const void*); + Galois16RecMatrix* parent; + std::atomic* procRefs; + std::promise* done; +}; + +static void invert_worker(ThreadMessageQueue& q) { + Galois16RecMatrixWorkerMessage* req; + while((req = static_cast(q.pop())) != NULL) { + (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->rowCoeffs, req->srcRows, *(req->gf), req->gfScratch, nullptr); + if(req->procRefs->fetch_sub(1, std::memory_order_acq_rel) <= 1) { + req->done->set_value(); + } + delete req; + } +} #define MAT_ROW(s, r) (mat + (((s)*numRec) + (r)) * (stripeWidth / sizeof(uint16_t))) +#define CEIL_DIV(a, b) (((a) + (b)-1) / (b)) +#define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) template -void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void* srcRows[rows], Galois16Mul& gf, void* gfScratch, const void* nextPf) { +void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf) { for(unsigned stripe=stripeStart; stripe -int 
Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs) { +int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers) { unsigned missingCol = validCount + rec; uint16_t baseCoeff; @@ -191,7 +238,85 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul rowCoeffs[r*rows + c] = REPLACE_WORD(r, missingCol+c, 0); } } - invertLoop(0, numStripes, recFirst, numRec, rec, rowCoeffs, srcRows, gf, gfScratch, nextScaleRow); + if(workers.empty()) + // process elimination directly + invertLoop(0, numStripes, recFirst, numRec, rec, rowCoeffs, srcRows, gf, gfScratch, nextScaleRow); + else { + // process using workers + std::atomic procRefs; + std::promise done; + auto makeReq = [&, this]() -> Galois16RecMatrixWorkerMessage* { + auto* req = new Galois16RecMatrixWorkerMessage; + req->recFirst = recFirst; + req->recLast = numRec; + req->recSrc = rec; + req->rowCoeffs = rowCoeffs; + req->srcRows = srcRows; + req->gf = &gf; + req->fn = &Galois16RecMatrix::invertLoop; + req->parent = this; + req->procRefs = &procRefs; + req->done = &done; + return req; + }; + if(numStripes >= workers.size()) { // split full stripes across workers + float stripesPerWorker = (float)numStripes / workers.size(); + float stripe = 0.5; + procRefs.store(workers.size()); + for(auto& worker : workers) { + auto* req = makeReq(); + req->stripeStart = (unsigned)stripe; + req->stripeEnd = (unsigned)(stripe + stripesPerWorker); + req->gfScratch = worker.gfScratch; + worker.thread.send(req); + stripe += stripesPerWorker; + } + } else { // each stripe may need >1 worker + std::vector reqs; + reqs.reserve(workers.size()); + float workersPerStripe = (float)workers.size() / numStripes; + float workerCnt = 0.5; + for(unsigned stripe=0; stripe rec && rowPos <= rec) + // need to send extra to compensate for the gap + sendRows += rows; + 
if(rowPos+sendRows > numRec) + sendRows = numRec - rowPos; + + auto* req = makeReq(); + req->stripeStart = stripe; + req->stripeEnd = stripe+1; + req->recFirst = rowPos; + req->recLast = rowPos+sendRows; + reqs.push_back(req); + + rowPos += sendRows; + if(rowPos == rec) rowPos += rows; + } + + workerCnt += workersPerStripe; + } + assert(reqs.size() <= workers.size()); + procRefs.store(reqs.size()); + + for(unsigned i=0; igfScratch = worker.gfScratch; + worker.thread.send(req); + } + } + + // wait for threads to finish + done.get_future().wait(); + } return -1; @@ -306,9 +431,6 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned #undef CONSTRUCT_VIA_EXP } -#define CEIL_DIV(a, b) (((a) + (b)-1) / (b)) -#define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) - bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb) { numRec = inputValid.size() - validCount; assert(validCount < inputValid.size()); // i.e. 
numRec > 0 @@ -329,7 +451,6 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va numStripes = CEIL_DIV(matWidth, stripeWidth); assert(numStripes >= 1); - void* gfScratch = gf.mutScratch_alloc(); if(mat) ALIGN_FREE(mat); unsigned matSize = numRec * stripeWidth*numStripes; @@ -346,9 +467,24 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va setup_pmul(); } + std::vector workers; + void* gfScratch; + unsigned _numThreads = numThreads; + if(numRec < MIN_THREAD_REC) _numThreads = 1; // don't spawn threads if not enough work + if(_numThreads > 1) { + for(unsigned i=0; i<_numThreads; i++) { + workers.push_back(Galois16RecMatrixWorker(gf)); + workers[i].thread.name = "gauss_worker"; + workers[i].thread.setCallback(invert_worker); + } + gfScratch = nullptr; // ...otherwise MSVC won't be happy + } else + gfScratch = gf.mutScratch_alloc(); + invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? 
if(numRec > recovery.size()) { // not enough recovery - gf.mutScratch_free(gfScratch); + if(_numThreads <= 1) + gf.mutScratch_free(gfScratch); ALIGN_FREE(mat); mat = nullptr; return false; @@ -373,7 +509,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va for(; rec <= numRec-rows; rec+=rows) { \ if(progressCb) progressCb(rec + progressOffset, totalProgress); \ \ - int badRowOffset = processRow(rec, validCount, gf, gfScratch, rowCoeffs); \ + int badRowOffset = processRow(rec, validCount, gf, gfScratch, rowCoeffs, workers); \ if(badRowOffset >= 0) { \ /* ignore this recovery row and try again */ \ recovery.erase(recovery.begin() + rec + badRowOffset); \ @@ -404,10 +540,18 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va // remove excess recovery recovery.resize(numRec); - gf.mutScratch_free(gfScratch); + if(_numThreads <= 1) + gf.mutScratch_free(gfScratch); return true; } +Galois16RecMatrix::Galois16RecMatrix() : mat(nullptr) { + numThreads = hardware_concurrency(); + numRec = 0; + numStripes = 0; + stripeWidth = 0; +} + Galois16RecMatrix::~Galois16RecMatrix() { if(mat) ALIGN_FREE(mat); } diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index 489f3dc3..79e84795 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -7,21 +7,25 @@ #ifdef PARPAR_INVERT_SUPPORT class Galois16Mul; +class Galois16RecMatrixWorker; class Galois16RecMatrix { uint16_t* mat; unsigned numStripes; unsigned stripeWidth; unsigned numRec; - + unsigned numThreads; void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); template - void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void* srcRows[rows], Galois16Mul& gf, void* gfScratch, const void* nextPf); + void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void** srcRows, Galois16Mul& gf, 
void* gfScratch, const void* nextPf); template - int processRow(unsigned rec, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs); + int processRow(unsigned rec, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers); public: - Galois16RecMatrix() : mat(nullptr) {} + Galois16RecMatrix(); ~Galois16RecMatrix(); + void setNumThreads(int threads) { + numThreads = threads; + } bool Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, std::function progressCb = nullptr); inline uint16_t GetFactor(uint16_t inIdx, uint16_t recIdx) const { // TODO: check if numStripes==1? consider optimising division? From a1efeb3ed8d53adcdc21a3ac254042654526e899 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 17 Jul 2023 21:51:36 +1000 Subject: [PATCH 24/91] Fix missing CpuCap on non-x86/ARM platforms Ref animetosho/par2cmdline-turbo#12 --- gf16/gf16mul.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 292805b3..5d6a33c4 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -1263,12 +1263,12 @@ void Galois16Mul::mutScratch_free(void* mutScratch) const { } Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inputs, unsigned /*outputs*/, bool forInvert) { - const CpuCap caps(true); (void)regionSizeHint; (void)inputs; (void)forInvert; #ifdef PLATFORM_X86 + const CpuCap caps(true); if(caps.hasGFNI) { if(gf16_affine_available_avx512 && caps.hasAVX512VLBW) return GF16_AFFINE_AVX512; @@ -1306,6 +1306,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu return GF16_XOR_SSE2; #endif #ifdef PLATFORM_ARM + const CpuCap caps(true); if(caps.hasSVE2) { if(gf16_sve_get_size() >= 64) return GF16_SHUFFLE_512_SVE2; @@ -1335,8 +1336,8 @@ std::vector Galois16Mul::availableMethods(bool checkCpuid) { if(gf16_lookup3_stride()) ret.push_back(GF16_LOOKUP3); - const CpuCap caps(checkCpuid); 
#ifdef PLATFORM_X86 + const CpuCap caps(checkCpuid); if(gf16_shuffle_available_ssse3 && caps.hasSSSE3) ret.push_back(GF16_SHUFFLE_SSSE3); if(gf16_shuffle_available_avx && caps.hasAVX) @@ -1384,6 +1385,7 @@ std::vector Galois16Mul::availableMethods(bool checkCpuid) { } #endif #ifdef PLATFORM_ARM + const CpuCap caps(checkCpuid); if(gf16_available_neon && caps.hasNEON) { ret.push_back(GF16_SHUFFLE_NEON); ret.push_back(GF16_CLMUL_NEON); From 69e3f0c5490e334316facbf915b323450626a4d0 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 17 Jul 2023 22:27:19 +1000 Subject: [PATCH 25/91] Warning suppression for non-x86/ARM build --- hasher/hasher.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hasher/hasher.cpp b/hasher/hasher.cpp index 7caf79d0..cff2ac4a 100644 --- a/hasher/hasher.cpp +++ b/hasher/hasher.cpp @@ -26,6 +26,7 @@ void setup_hasher() { CRC32_Calc = &CRC32_Calc_Slice4; struct _CpuCap CpuCap; + (void)CpuCap; // CPU detection #ifdef PLATFORM_X86 @@ -222,6 +223,7 @@ void set_hasherMD5MultiLevel(MD5MultiLevels level) { case MD5MULT_AVX2: case MD5MULT_SSE: #endif + default: case MD5MULT_SCALAR: break; } #undef SET_LEVEL From d75bfad8ebe7db364818340654830a18790edba7 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 17 Jul 2023 22:44:58 +1000 Subject: [PATCH 26/91] Use non-GNUC compatible inline-asm declaration --- gf16/gf16_xor_common.h | 10 +++++----- gf16/gf16_xor_sse2.c | 2 +- hasher/md5-arm-asm.h | 2 +- hasher/md5-arm64-asm.h | 14 +++++++------- hasher/md5-avx512-asm.h | 8 ++++---- hasher/md5-scalar-base.h | 6 +++--- hasher/md5-x86-asm.h | 8 ++++---- hasher/md5x2-arm-asm.h | 2 +- hasher/md5x2-neon-asm.h | 4 ++-- hasher/md5x2-sse-asm.h | 22 +++++++++++----------- hasher/md5x2-x86-asm.h | 20 ++++++++++---------- 11 files changed, 49 insertions(+), 49 deletions(-) diff --git a/gf16/gf16_xor_common.h b/gf16/gf16_xor_common.h index 80877440..8c987ad3 100644 --- a/gf16/gf16_xor_common.h +++ b/gf16/gf16_xor_common.h @@ -37,7 +37,7 @@ extern void 
gf16_xor256_jit_multi_stub(intptr_t dst, intptr_t dstEnd, const void # endif static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_stub(intptr_t src, intptr_t dEnd, intptr_t dest, intptr_t pf, void* fn) { WRITE_JIT(2048) - asm volatile( + __asm__ volatile( "callq *%q[f]\n" : "+a"(src), "+d"(dest), "+S"(pf) : "c"(dEnd), [f]"r"(fn) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" @@ -46,7 +46,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_stub(intptr_t src, intptr_t dEnd, # ifdef __AVX2__ static HEDLEY_ALWAYS_INLINE void gf16_xor256_jit_stub(intptr_t src, intptr_t dEnd, intptr_t dest, intptr_t pf, void* fn) { WRITE_JIT(2048) - asm volatile( + __asm__ volatile( "callq *%q[f]\n" : "+a"(src), "+d"(dest), "+S"(pf) : "c"(dEnd), [f]"r"(fn) : "memory" // GCC pre 4.9 doesn't accept YMM registers @@ -59,7 +59,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor256_jit_stub(intptr_t src, intptr_t dEn # ifdef __AVX512F__ static HEDLEY_ALWAYS_INLINE void gf16_xor512_jit_stub(intptr_t src, intptr_t dEnd, intptr_t dest, intptr_t pf, void* fn) { WRITE_JIT(2048) - asm volatile( + __asm__ volatile( "callq *%q[f]\n" : "+a"(src), "+d"(dest), "+S"(pf) : "c"(dEnd), [f]"r"(fn) : "%zmm1", "%zmm2", "%zmm3", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory" @@ -69,7 +69,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor512_jit_multi_stub( intptr_t dst, intptr_t dstEnd, const void** src, void* fn ) { WRITE_JIT(8192) - asm volatile( + __asm__ volatile( "movq 8(%%rdx), %%rsi\n" "movq 16(%%rdx), %%rdi\n" "movq 24(%%rdx), %%r8\n" @@ -102,7 +102,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_stub(intptr_t src, intptr_t dEnd, } # else static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_stub(intptr_t src, intptr_t dEnd, intptr_t dest, intptr_t pf, void* fn) { - asm volatile( + __asm__ 
volatile( "calll *%[f]\n" : "+a"(src), "+d"(dest), "+S"(pf) : "c"(dEnd), [f]"r"(fn) : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "memory" diff --git a/gf16/gf16_xor_sse2.c b/gf16/gf16_xor_sse2.c index 1f428c6c..bce368ec 100644 --- a/gf16/gf16_xor_sse2.c +++ b/gf16/gf16_xor_sse2.c @@ -262,7 +262,7 @@ static HEDLEY_ALWAYS_INLINE void STOREU_XMM(void* dest, __m128i xmm) { /* conditional move, because, for whatever reason, no-one thought of making a CMOVcc intrinsic */ #if defined(__GNUC__) || defined(__clang__) - #define CMOV(cond, dst, src) asm( \ + #define CMOV(cond, dst, src) __asm__( \ "test %[c], %[c]\n" \ "cmovnz %[s], %[d]\n" \ : [d]"+r"(dst): [c]"r"(cond), [s]"r"(src)) diff --git a/hasher/md5-arm-asm.h b/hasher/md5-arm-asm.h index 20e53e7b..66996b76 100644 --- a/hasher/md5-arm-asm.h +++ b/hasher/md5-arm-asm.h @@ -142,7 +142,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR D = state[3]; #endif - asm( + __asm__( "ldr " REG(TMP2) ", [%[in]]\n" REV(TMP2) #ifdef ARM_THUMB_LIMIT_REGS diff --git a/hasher/md5-arm64-asm.h b/hasher/md5-arm64-asm.h index 48180964..695a4e00 100644 --- a/hasher/md5-arm64-asm.h +++ b/hasher/md5-arm64-asm.h @@ -106,7 +106,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR #define RF4(i0, i1, i2, i3, i4, i5, kr) \ - asm( \ + __asm__( \ ROUND_F(A, A, B, C, D, "%w[cache0]", "k0", "lsr %[k0], %[k0], #32", 25, "ldp %w[cache2], %w[cache3], " LDP_SRC(2)) \ ROUND_F(D, D, A, B, C, "%w[cache1]", "k0", "", 20, "") \ ROUND_F(C, C, D, A, B, "%w[cache2]", "k1", "lsr %[k1], %[k1], #32", 15, "ldp %w[cache4], %w[cache5], " LDP_SRC(4)) \ @@ -117,7 +117,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR :); #define RG4(i0, i1, i2, i3, kr) \ - asm( \ + __asm__( \ ROUND_G(A, B, C, D, "%w[cache0]", "k0", "lsr %[k0], %[k0], #32", 27) \ ROUND_G(D, A, B, C, "%w[cache1]", "k0", "", 23) \ ROUND_G(C, D, A, B, "%w[cache2]", "k1", "lsr 
%[k1], %[k1], #32", 18) \ @@ -127,7 +127,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR :); #define RH4(i0, i1, i2, i3, kr) \ - asm( \ + __asm__( \ ROUND_H(A, B, C, D, "%w[cache0]", "k0", "lsr %[k0], %[k0], #32", 28) \ ROUND_H(D, A, B, C, "%w[cache1]", "k0", "", 21) \ ROUND_H(C, D, A, B, "%w[cache2]", "k1", "lsr %[k1], %[k1], #32", 16) \ @@ -137,7 +137,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR :); #define RI4(i0, i1, i2, i3, kr) \ - asm( \ + __asm__( \ ROUND_I(A, B, C, D, "%w[cache0]", "k0", "lsr %[k0], %[k0], #32", 26) \ ROUND_I(D, A, B, C, "%w[cache1]", "k0", "", 22) \ ROUND_I(C, D, A, B, "%w[cache2]", "k1", "lsr %[k1], %[k1], #32", 17) \ @@ -146,7 +146,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR : [kM]"r"(md5_constants_aarch64), [cache0]"r"(cache[i0]), [cache1]"r"(cache[i1]), [cache2]"r"(cache[i2]), [cache3]"r"(cache[i3]) \ :); - asm( + __asm__( "ldp %w[cache0], %w[cache1], " LDP_SRC(0) "\n" "ldp %[k0], %[k1], [%[kM]]\n" ROUND_F(IA, A, IB, IC, ID, "%w[cache0]", "k0", "lsr %[k0], %[k0], #32", 25, "ldp %w[cache2], %w[cache3], " LDP_SRC(2)) @@ -162,7 +162,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR RF4( 4, 5, 6, 7, 8, 9, 32) RF4( 8, 9, 10, 11, 12, 13, 48) - asm( + __asm__( ROUND_F(A, A, B, C, D, "%w[cache0]", "k0", "lsr %[k0], %[k0], #32", 25, "ldp %w[cache2], %w[cache3], " LDP_SRC(14)) ROUND_F(D, D, A, B, C, "%w[cache1]", "k0", "", 20, "") ROUND_F(C, C, D, A, B, "%w[cache2]", "k1", "lsr %[k1], %[k1], #32", 15, "") @@ -187,7 +187,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR RI4( 7, 14, 5,12, 208) RI4( 3, 10, 1, 8, 224) RI4( 15, 6, 13, 4, 240) - asm( + __asm__( ROUND_I(A, B, C, D, "%w[cache0]", "k0", "lsr %[k0], %[k0], #32", 26) ROUND_I(D, A, B, C, "%w[cache1]", "k0", "", 22) ROUND_I(C, D, A, B, "%w[cache2]", "k1", "lsr %[k1], %[k1], #32", 17) diff --git 
a/hasher/md5-avx512-asm.h b/hasher/md5-avx512-asm.h index 9a625df5..1eb0e484 100644 --- a/hasher/md5-avx512-asm.h +++ b/hasher/md5-avx512-asm.h @@ -113,7 +113,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_avx512(uint32_t* HEDLEY_RESTR "vprord $" STR(R) ", %[" STR(A) "], %[" STR(A) "]\n" \ "vpaddd %[" STR(B) "], %[" STR(A) "], %[" STR(A) "]\n" - asm( + __asm__( "vmovdqa %[ID], %[TMP2]\n" RF4_FIRST(0) RF4(4) @@ -132,7 +132,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_avx512(uint32_t* HEDLEY_RESTR : [k0]"m"(md5_constants_avx512[n]), [k1]"m"(md5_constants_avx512[n+4]), [k2]"m"(md5_constants_avx512[n+8]), [k3]"m"(md5_constants_avx512[n+12]) \ : - asm( + __asm__( "vpaddd %[k0], %[in0], %[in0]\n" "vpaddd %[k1], %[in4], %[in4]\n" "vpaddd %[k2], %[in8], %[in8]\n" @@ -143,7 +143,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_avx512(uint32_t* HEDLEY_RESTR RG4("%[in12]", "%[in0]", "%[in4]") : ASM_PARAMS(16)); - asm( + __asm__( "vpaddd %[k1], %[in4], %[in4]\n" "vpsrlq $32, %[in4], %[TMP1]\n" @@ -168,7 +168,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_avx512(uint32_t* HEDLEY_RESTR "vmovdqa %[D], %[TMP2]\n" : ASM_PARAMS(32)); - asm( + __asm__( "vpaddd %[k0], %[in0], %[in0]\n" "vpaddd %[k1], %[in4], %[in4]\n" "vpaddd %[k3], %[in12], %[in12]\n" diff --git a/hasher/md5-scalar-base.h b/hasher/md5-scalar-base.h index a05c4fcb..df2823b6 100644 --- a/hasher/md5-scalar-base.h +++ b/hasher/md5-scalar-base.h @@ -38,7 +38,7 @@ */ # if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__) # define ROTATE(a,n) ({ unsigned int ret; \ - asm ( \ + __asm__ ( \ "roll %1,%0" \ : "=r"(ret) \ : "I"(n), "0"((unsigned int)(a)) \ @@ -48,7 +48,7 @@ # elif defined(_ARCH_PPC) || defined(_ARCH_PPC64) || \ defined(__powerpc) || defined(__ppc__) || defined(__powerpc64__) # define ROTATE(a,n) ({ unsigned int ret; \ - asm ( \ + __asm__ ( \ "rlwinm %0,%1,%2,0,31" \ : "=r"(ret) \ : "r"(a), "I"(n)); \ @@ -56,7 +56,7 @@ }) # elif defined(__s390x__) 
# define ROTATE(a,n) ({ unsigned int ret; \ - asm ("rll %0,%1,%2" \ + __asm__ ("rll %0,%1,%2" \ : "=r"(ret) \ : "r"(a), "I"(n)); \ ret; \ diff --git a/hasher/md5-x86-asm.h b/hasher/md5-x86-asm.h index 6aca35ba..17499bdd 100644 --- a/hasher/md5-x86-asm.h +++ b/hasher/md5-x86-asm.h @@ -221,7 +221,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR #endif - asm( + __asm__( #ifdef PLATFORM_AMD64 "movl %[input0], %k[TMP2]\n" "movl %k[ID], %k[TMP1]\n" @@ -249,7 +249,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_scalar(uint32_t* HEDLEY_RESTR ASM_INPUTS :); - asm( + __asm__( RG4( 6, 11, 0, 5, -0x09e1da9e, -0x3fbf4cc0, 0x265e5a51, -0x16493856) RG4(10, 15, 4, 9, -0x29d0efa3, 0x02441453, -0x275e197f, -0x182c0438) RG4(14, 3, 8, 13, 0x21e1cde6, -0x3cc8f82a, -0x0b2af279, 0x455a14ed) @@ -429,7 +429,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_nolea(uint32_t* HEDLEY_RESTRI D = state[3]; - asm( + __asm__( "addl %[input0], %k[A]\n" "movl %k[D], %k[TMP1]\n" RF4(, 1, 2, 3, 4, -0x28955b88, -0x173848aa, 0x242070db, -0x3e423112) @@ -444,7 +444,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_nolea(uint32_t* HEDLEY_RESTRI : ASM_INPUTS :); - asm( + __asm__( RG4( 6, 11, 0, 5, -0x09e1da9e, -0x3fbf4cc0, 0x265e5a51, -0x16493856) RG4(10, 15, 4, 9, -0x29d0efa3, 0x02441453, -0x275e197f, -0x182c0438) RG4(14, 3, 8, 13, 0x21e1cde6, -0x3cc8f82a, -0x0b2af279, 0x455a14ed) diff --git a/hasher/md5x2-arm-asm.h b/hasher/md5x2-arm-asm.h index f931609d..f01e5195 100644 --- a/hasher/md5x2-arm-asm.h +++ b/hasher/md5x2-arm-asm.h @@ -176,7 +176,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_scalar(uint32_t* state, co ROUND_I(C1, D1, A1, B1, C2, D2, A2, B2, "[%[i0], #" STR(i2) "]", "[%[i1], #" STR(i2) "]", k2l, k2h, 17) \ ROUND_I(B1, C1, D1, A1, B2, C2, D2, A2, "[%[i0], #" STR(i3) "]", "[%[i1], #" STR(i3) "]", k3l, k3h, 11) - asm( + __asm__( "ldr " REG(TMP1) ", [%[i0]]\n" "ldr " REG(TMP2) ", [%[i1]]\n" #if __BYTE_ORDER__ == 
__ORDER_BIG_ENDIAN__ diff --git a/hasher/md5x2-neon-asm.h b/hasher/md5x2-neon-asm.h index 1789ab5a..e55e7cad 100644 --- a/hasher/md5x2-neon-asm.h +++ b/hasher/md5x2-neon-asm.h @@ -128,7 +128,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_neon(uint32x2_t* state, co ROUND_I(C, D, A, B, "v18", 17, 15) \ ROUND_I(B, C, D, A, "v19", 11, 21) - asm( + __asm__( "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[i0]]\n" "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[i1]]\n" "zip1 v24.4s, v20.4s, v28.4s\n" @@ -237,7 +237,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_neon(uint32x2_t* state, co ROUND_I(C, D, A, B, "d14", 17, 15) \ ROUND_I(B, C, D, A, "d15", 11, 21) - asm( + __asm__( "vld1.8 {d16-d19}, [%[i0]]\n" "add r4, %[i0], #32\n" "vld1.8 {d24-d27}, [%[i1]]\n" diff --git a/hasher/md5x2-sse-asm.h b/hasher/md5x2-sse-asm.h index d9391698..3516e5dd 100644 --- a/hasher/md5x2-sse-asm.h +++ b/hasher/md5x2-sse-asm.h @@ -152,7 +152,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_sse(__m128i* state, const "psrlq $" STR(R) ", %[" STR(A) "]\n" \ "paddd %[" STR(B) "], %[" STR(A) "]\n" -#define RF4(offs, r1, r2) asm( \ +#define RF4(offs, r1, r2) __asm__( \ READ4 \ ROUND_F(A, B, C, D, "%[TMPI1]", 25) \ "psrlq $32, %[TMPI1]\n" \ @@ -216,21 +216,21 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_sse(__m128i* state, const RF4(2, 4, 5) RF4(3, 6, 7) - asm( + __asm__( RG4(0, 0, 3, 5) RG4(1, 2, 5, 7) RG4(2, 4, 7, 1) RG4(3, 6, 1, 3) : ASM_PARAMS(32)); - asm( + __asm__( RH4(0, ROUND_H_FIRST, 2, 4, 5, 7) RH4(1, ROUND_H, 0, 2, 3, 5) RH4(2, ROUND_H, 6, 0, 1, 3) RH4(3, ROUND_H, 4, 6, 7, 1) : ASM_PARAMS(64)); - asm( + __asm__( "pcmpeqb %[TMPF2], %[TMPF2]\n" RI4(0, 0, 3, 7, 2) RI4(1, 6, 1, 5, 0) @@ -314,7 +314,7 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_avx(__m128i* state, const "vpsrlq $" STR(R) ", %[" STR(A) "], %[" STR(A) "]\n" \ "vpaddd %[" STR(B) "], %[" STR(A) "], %[" STR(A) "]\n" -#define RF4(offs, r1, r2) asm( \ +#define RF4(offs, r1, r2) 
__asm__( \ READ4 \ ROUND_F(A, B, C, D, "%[TMPI1]", 25) \ "vpsrlq $32, %[TMPI1], %[TMPI1]\n" \ @@ -380,21 +380,21 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_avx(__m128i* state, const RF4(2, 4, 5) RF4(3, 6, 7) - asm( + __asm__( RG4(0, 0, 3, 5) RG4(1, 2, 5, 7) RG4(2, 4, 7, 1) RG4(3, 6, 1, 3) : ASM_PARAMS(32)); - asm( + __asm__( RH4(0, ROUND_H, 2, 4, 5, 7) RH4(1, ROUND_H, 0, 2, 3, 5) RH4(2, ROUND_H, 6, 0, 1, 3) RH4(3, ROUND_H, 4, 6, 7, 1) : ASM_PARAMS(64)); - asm( + __asm__( "vpcmpeqb %[TMPF2], %[TMPF2], %[TMPF2]\n" RI4(0, 0, 3, 7, 2) RI4(1, 6, 1, 5, 0) @@ -460,21 +460,21 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_avx512(__m128i* state, con RF4(2, 4, 5) RF4(3, 6, 7) - asm( + __asm__( RG4(0, 0, 3, 5) RG4(1, 2, 5, 7) RG4(2, 4, 7, 1) RG4(3, 6, 1, 3) : ASM_PARAMS(32)); - asm( + __asm__( RH4(0, ROUND_H_FIRST, 2, 4, 5, 7) RH4(1, ROUND_H, 0, 2, 3, 5) RH4(2, ROUND_H, 6, 0, 1, 3) RH4(3, ROUND_H, 4, 6, 7, 1) : ASM_PARAMS(64)); - asm( + __asm__( RI4(0, 0, 3, 7, 2) RI4(1, 6, 1, 5, 0) RI4(2, 4, 7, 3, 6) diff --git a/hasher/md5x2-x86-asm.h b/hasher/md5x2-x86-asm.h index e0f18b96..64322d2f 100644 --- a/hasher/md5x2-x86-asm.h +++ b/hasher/md5x2-x86-asm.h @@ -163,34 +163,34 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_scalar(uint32_t* state, co [i1_0]"m"(_data[1][i0]), [i1_1]"m"(_data[1][i1]) ASM_PARAMS_ONES \ : -#define RF4(i0, i1, i2, i3, k0, k1, k2, k3) asm( \ +#define RF4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ ROUND_F(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", k0, 7) \ ROUND_F(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 12) \ -: ASM_PARAMS(i0, i1)); asm( \ +: ASM_PARAMS(i0, i1)); __asm__( \ ROUND_F(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 17) \ ROUND_F(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 22) \ : ASM_PARAMS(i2, i3)); -#define RG4(i0, i1, i2, i3, k0, k1, k2, k3) asm( \ +#define RG4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ ROUND_G(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", 
"%[i1_0]", k0, 5) \ ROUND_G(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 9) \ -: ASM_PARAMS(i0, i1)); asm( \ +: ASM_PARAMS(i0, i1)); __asm__( \ ROUND_G(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 14) \ ROUND_G(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 20) \ : ASM_PARAMS(i2, i3)); -#define RH4(i0, i1, i2, i3, k0, k1, k2, k3) asm( \ +#define RH4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ ROUND_H(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", k0, 4) \ ROUND_H(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 11) \ -: ASM_PARAMS(i0, i1)); asm( \ +: ASM_PARAMS(i0, i1)); __asm__( \ ROUND_H(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 16) \ ROUND_H(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 23) \ : ASM_PARAMS(i2, i3)); -#define RI4(i0, i1, i2, i3, k0, k1, k2, k3) asm( \ +#define RI4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ ROUND_I(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", k0, 6) \ ROUND_I(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 10) \ -: ASM_PARAMS(i0, i1)); asm( \ +: ASM_PARAMS(i0, i1)); __asm__( \ ROUND_I(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 15) \ ROUND_I(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 21) \ : ASM_PARAMS(i2, i3)); @@ -217,10 +217,10 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_scalar(uint32_t* state, co RI4( 3, 10, 1, 8, 0x655b59c3, -0x70f3336e, -0x00100b83, -0x7a7ba22f) RI4(15, 6, 13, 4, 0x6fa87e4f, -0x01d31920, -0x5cfebcec, 0x4e0811a1) - asm( + __asm__( ROUND_I(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", -0x08ac817e, 6) ROUND_I(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", -0x42c50dcb, 10) - : ASM_PARAMS(11, 2)); asm( + : ASM_PARAMS(11, 2)); __asm__( ROUND_I(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", 0x2ad7d2bb, 15) ROUND_I_LAST(B1, C1, D1, A1, B2, C2, D2, A2, -0x14792c6f, 21) : ASM_PARAMS(9, 0)); From 465d1f7c1e1e90bb352b30a148f6e70094f17c1e Mon Sep 17 00:00:00 
2001 From: animetosho Date: Mon, 17 Jul 2023 22:59:44 +1000 Subject: [PATCH 27/91] Add missing C99 flags --- binding.gyp | 1 + 1 file changed, 1 insertion(+) diff --git a/binding.gyp b/binding.gyp index 8d62640e..fd4af8ec 100644 --- a/binding.gyp +++ b/binding.gyp @@ -21,6 +21,7 @@ ] }] ], + "cflags": ["-std=gnu99", "-D_POSIX_C_SOURCE=200112L"], "cxxflags": ["-std=c++11"], "msvs_settings": {"VCCLCompilerTool": {"Optimization": "MaxSpeed"}}, "configurations": {"Release": { From 5b2c917e2831307189f4e845ec19a662288b836d Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 18 Jul 2023 18:19:00 +1000 Subject: [PATCH 28/91] Try to use MAP_ANONYMOUS flag if available --- gf16/x86_jit.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gf16/x86_jit.h b/gf16/x86_jit.h index 0a8fbe50..2597d9d8 100644 --- a/gf16/x86_jit.h +++ b/gf16/x86_jit.h @@ -738,7 +738,13 @@ static HEDLEY_ALWAYS_INLINE jit_wx_pair* jit_alloc(size_t len) { if(!ret) return NULL; ret->len = len; - void* mem = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0); + void* mem = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | +# ifdef MAP_ANONYMOUS + MAP_ANONYMOUS, +# else + MAP_ANON, +# endif + -1, 0); if(mem) { if((uintptr_t)mem & 63) { // page not cacheline aligned? something's gone wrong... 
munmap(mem, len); From 277c458ff5441a7c542a7e6bb6ceb487285d218f Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 19 Jul 2023 15:13:17 +1000 Subject: [PATCH 29/91] Row grouping in matrix inversion + bug fixes --- gf16/gfmat_inv.cpp | 200 ++++++++++++++++++++++++++++++++------------- gf16/gfmat_inv.h | 9 +- 2 files changed, 151 insertions(+), 58 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index dd876676..84f1fdc7 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -26,10 +26,12 @@ class Galois16RecMatrixWorker { Galois16RecMatrixWorker(Galois16RecMatrixWorker&& other) noexcept : gf(other.gf) { thread = std::move(other.thread); gfScratch = other.gfScratch; + other.gfScratch = nullptr; } ~Galois16RecMatrixWorker() { thread.end(); - gf.mutScratch_free(gfScratch); + if(gfScratch) + gf.mutScratch_free(gfScratch); } }; @@ -37,7 +39,8 @@ struct Galois16RecMatrixWorkerMessage { unsigned stripeStart, stripeEnd; unsigned recFirst, recLast; unsigned recSrc; uint16_t* rowCoeffs; void** srcRows; Galois16Mul* gf; void* gfScratch; - void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, void**, Galois16Mul&, void*, const void*); + unsigned coeffWidth; + void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, unsigned, void**, Galois16Mul&, void*, const void*); Galois16RecMatrix* parent; std::atomic* procRefs; std::promise* done; @@ -46,7 +49,7 @@ struct Galois16RecMatrixWorkerMessage { static void invert_worker(ThreadMessageQueue& q) { Galois16RecMatrixWorkerMessage* req; while((req = static_cast(q.pop())) != NULL) { - (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->rowCoeffs, req->srcRows, *(req->gf), req->gfScratch, nullptr); + (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->rowCoeffs, req->coeffWidth, req->srcRows, *(req->gf), req->gfScratch, nullptr); 
if(req->procRefs->fetch_sub(1, std::memory_order_acq_rel) <= 1) { req->done->set_value(); } @@ -59,7 +62,7 @@ static void invert_worker(ThreadMessageQueue& q) { #define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) template -void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf) { +void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf) { for(unsigned stripe=stripeStart; stripe 1) { if(HEDLEY_LIKELY(pf)) - gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, rowCoeffs + curRec2*rows, gfScratch, pf); + gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, coeffPtr, gfScratch, pf); else - gf.mul_add_multi(rows, stripeWidth*stripe, MAT_ROW(0, curRec2), srcRows, stripeWidth, rowCoeffs + curRec2*rows, gfScratch); + gf.mul_add_multi(rows, stripeWidth*numRec*stripe, MAT_ROW(0, curRec2), srcRows, stripeWidth, coeffPtr, gfScratch); } else { if(HEDLEY_LIKELY(pf)) - gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, rowCoeffs[curRec2], gfScratch, pf); + gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, *coeffPtr, gfScratch, pf); else - gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, rowCoeffs[curRec2], gfScratch); + gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, *coeffPtr, gfScratch); } } } } +#define REPLACE_WORD(r, c, v) gf.replace_word(MAT_ROW((c)/(stripeWidth / sizeof(uint16_t)), r), (c)%(stripeWidth / sizeof(uint16_t)), v) + template -int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, 
Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers) { +int Galois16RecMatrix::initScale(unsigned rec, unsigned validCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch) { + assert(recFirst <= recLast); + unsigned missingCol = validCount + rec; uint16_t baseCoeff; uint16_t coeff[rows]; - unsigned sw16 = stripeWidth / sizeof(uint16_t); - // TODO: consider optimisation for numStripes == 1 ? - - - #define REPLACE_WORD(r, c, v) gf.replace_word(MAT_ROW((c)/sw16, r), (c)%sw16, v) - void* srcRows[rows]; srcRows[0] = MAT_ROW(0, rec); for(unsigned i=1; i 2) { - MULADD_ROW_PF(rec+0, 1, srcRows[2]); + MULADD_ROW_PF(rec+0, 1, MAT_ROW(0, 2)); } else MULADD_LASTROW(rec+0, 1) } else { if(recFirst >= numRec) @@ -189,7 +191,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul } if(rows >= 3) { if(rows >= 4) { - MULADD_MULTI_ROW_PF(rec+2, 0, 2, srcRows[3]); + MULADD_MULTI_ROW_PF(rec+2, 0, 2, MAT_ROW(0, 3)); SCALE_ROW(2); MULADD_MULTI_ROW(rec+3, 0, 2); MULADD_ROW(rec+3, 2); @@ -197,7 +199,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul MULADD_ROW(rec+2, 3); MULADD_MULTI_ROW(rec+0, 2, 2); if(rows > 4) { - MULADD_MULTI_ROW_PF(rec+1, 2, 2, srcRows[4]); + MULADD_MULTI_ROW_PF(rec+1, 2, 2, MAT_ROW(0, 4)); } else MULADD_MULTI_LASTROW(rec+1, 2, 2) } else { MULADD_MULTI_ROW(rec+2, 0, 2); @@ -208,7 +210,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul } if(rows >= 5) { if(rows >= 6) { - MULADD_MULTI_ROW_PF(rec+4, 0, 4, srcRows[5]); + MULADD_MULTI_ROW_PF(rec+4, 0, 4, MAT_ROW(0, 5)); SCALE_ROW(4); MULADD_MULTI_ROW(rec+5, 0, 4); MULADD_ROW(rec+5, 4); @@ -227,20 +229,50 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul MULADD_LASTROW(rec+3, 4) } } - - // do main elimination, using the source group - // first, gather all relevant coefficients - for(unsigned r=0; r +void Galois16RecMatrix::processRow(unsigned 
rec, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, unsigned coeffWidth, std::vector& workers) { + // TODO: consider optimisation for numStripes == 1 ? + + assert(recFirst <= recLast); + + void* srcRows[rows]; + srcRows[0] = MAT_ROW(0, rec); + for(unsigned i=1; i= recLast) return; + + // do main elimination, using the source group if(workers.empty()) // process elimination directly - invertLoop(0, numStripes, recFirst, numRec, rec, rowCoeffs, srcRows, gf, gfScratch, nextScaleRow); + invertLoop(0, numStripes, recFirst, recLast, rec, rowCoeffs, coeffWidth, srcRows, gf, gfScratch, nextScaleRow); else { // process using workers std::atomic procRefs; @@ -248,11 +280,12 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul auto makeReq = [&, this]() -> Galois16RecMatrixWorkerMessage* { auto* req = new Galois16RecMatrixWorkerMessage; req->recFirst = recFirst; - req->recLast = numRec; + req->recLast = recLast; req->recSrc = rec; req->rowCoeffs = rowCoeffs; req->srcRows = srcRows; req->gf = &gf; + req->coeffWidth = coeffWidth; req->fn = &Galois16RecMatrix::invertLoop; req->parent = this; req->procRefs = &procRefs; @@ -278,23 +311,26 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul float workerCnt = 0.5; for(unsigned stripe=0; stripe= recFirst && rec < recLast) numRows -= rows; + numRows = CEIL_DIV(numRows, workerNum); if(numRows < MIN_THREAD_REC) numRows = MIN_THREAD_REC; // ensure workers have a half decent amount of stuff to do unsigned rowPos = recFirst; - while(rowPos < numRec) { + while(rowPos < recLast) { unsigned sendRows = numRows; if(rowPos+sendRows > rec && rowPos <= rec) // need to send extra to compensate for the gap sendRows += rows; - if(rowPos+sendRows > numRec) - sendRows = numRec - rowPos; + if(rowPos+sendRows > recLast) + sendRows = recLast - rowPos; auto* req = makeReq(); req->stripeStart = stripe; req->stripeEnd = stripe+1; req->recFirst = 
rowPos; req->recLast = rowPos+sendRows; + req->rowCoeffs += (rowPos-recFirst) * coeffWidth; reqs.push_back(req); rowPos += sendRows; @@ -305,6 +341,7 @@ int Galois16RecMatrix::processRow(unsigned rec, unsigned validCount, Galois16Mul } assert(reqs.size() <= workers.size()); procRefs.store(reqs.size()); + assert(procRefs > 0); for(unsigned i=0; i +int Galois16RecMatrix::processRows(unsigned& rec, unsigned rowGroupSize, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress) { + unsigned alignedRowGroupSize = (rowGroupSize / rows) * rows; + while(rec <= numRec-rows) { + + unsigned curRowGroupSize = alignedRowGroupSize; + if(numRec-rec < curRowGroupSize) { + curRowGroupSize = numRec-rec; + curRowGroupSize -= curRowGroupSize % rows; + } + assert(curRowGroupSize > 0); + + unsigned recStart = rec; + // for progress indicator, we'll even it out by computing a ratio to advance by + unsigned progressRatio = (curRowGroupSize<<16)/numRec; + unsigned progressBase = recStart + progressOffset; + + // loop through this row group (normalize values) + for(; rec < curRowGroupSize+recStart; rec+=rows) { + if(progressCb) progressCb(progressBase + (((rec-recStart)*progressRatio+32768)>>16), totalProgress); + int badRowOffset = initScale(rec, validCount, recStart, curRowGroupSize+recStart, gf, gfScratch); + if(badRowOffset >= 0) return rec+badRowOffset; + fillCoeffs(rowCoeffs, rows, validCount, recStart, curRowGroupSize+recStart, rec, rows, gf); + processRow(rec, recStart, curRowGroupSize+recStart, gf, gfScratch, rowCoeffs, rows, workers); + } + + + // apply current row group to all other row groups + for(unsigned recGroup=0; recGroup>16), totalProgress); + } + + unsigned curRowGroupSize2 = rowGroupSize; + if(numRec-recGroup < curRowGroupSize2) + curRowGroupSize2 = numRec-recGroup; + if(recGroup < recStart && recGroup+curRowGroupSize2 > recStart) + curRowGroupSize2 = 
recStart-recGroup; // don't let this group cross into the normalized group + fillCoeffs(rowCoeffs, curRowGroupSize, validCount, recGroup, recGroup+curRowGroupSize2, recStart, curRowGroupSize, gf); + for(unsigned rec2=recStart; rec2 < curRowGroupSize+recStart; rec2+=rows) { + processRow(rec2, recGroup, recGroup+curRowGroupSize2, gf, gfScratch, rowCoeffs + (rec2-recStart), curRowGroupSize, workers); + } + recGroup += curRowGroupSize2; + } + } + return -1; +} + + + // construct initial matrix (pre-inversion) void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery) { unsigned validCol = 0; @@ -472,15 +555,23 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va unsigned _numThreads = numThreads; if(numRec < MIN_THREAD_REC) _numThreads = 1; // don't spawn threads if not enough work if(_numThreads > 1) { + workers.reserve(_numThreads); for(unsigned i=0; i<_numThreads; i++) { - workers.push_back(Galois16RecMatrixWorker(gf)); + workers.emplace_back(gf); workers[i].thread.name = "gauss_worker"; workers[i].thread.setCallback(invert_worker); } - gfScratch = nullptr; // ...otherwise MSVC won't be happy + gfScratch = workers[0].gfScratch; } else gfScratch = gf.mutScratch_alloc(); + // target L3 slice? use 1MB target for now; TODO: improve this + unsigned rowGroupSize = (1024*1024 / stripeWidth); + // if it's going to be split amongst cores, increase the number of rows in a group + if(numStripes < _numThreads) rowGroupSize *= _numThreads/numStripes; + if(rowGroupSize < gfInfo.idealInputMultiple*2) rowGroupSize = gfInfo.idealInputMultiple*2; + if(rowGroupSize > numRec) rowGroupSize = numRec; + invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? 
if(numRec > recovery.size()) { // not enough recovery if(_numThreads <= 1) @@ -506,19 +597,15 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va unsigned rec = 0; #define INVERT_GROUP(rows) \ if(gfInfo.idealInputMultiple >= rows && numRec >= rows) { \ - for(; rec <= numRec-rows; rec+=rows) { \ - if(progressCb) progressCb(rec + progressOffset, totalProgress); \ - \ - int badRowOffset = processRow(rec, validCount, gf, gfScratch, rowCoeffs, workers); \ - if(badRowOffset >= 0) { \ - /* ignore this recovery row and try again */ \ - recovery.erase(recovery.begin() + rec + badRowOffset); \ - goto invert_loop; \ - } \ + int badRow = processRows(rec, rowGroupSize, validCount, gf, gfScratch, rowCoeffs, workers, progressCb, progressOffset, totalProgress); \ + if(badRow >= 0) { \ + /* ignore this recovery row and try again */ \ + recovery.erase(recovery.begin() + badRow); \ + goto invert_loop; \ } \ } // max out at 6 groups (registers + cache assoc?) - uint16_t* rowCoeffs = new uint16_t[numRec*6]; + uint16_t* rowCoeffs = new uint16_t[rowGroupSize*rowGroupSize]; INVERT_GROUP(6) INVERT_GROUP(5) INVERT_GROUP(4) @@ -547,6 +634,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va Galois16RecMatrix::Galois16RecMatrix() : mat(nullptr) { numThreads = hardware_concurrency(); + if(numThreads > 4) numThreads = 4; // by default, cap at 4 threads, as scaling doesn't work so well; TODO: tweak this later numRec = 0; numStripes = 0; stripeWidth = 0; diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index 79e84795..477b08a0 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -17,9 +17,14 @@ class Galois16RecMatrix { void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); template - void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf); + void 
invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf); template - int processRow(unsigned rec, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers); + int initScale(unsigned rec, unsigned validCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch); + void fillCoeffs(uint16_t* rowCoeffs, unsigned rows, unsigned validCount, unsigned recFirst, unsigned recLast, unsigned rec, unsigned coeffWidth, Galois16Mul& gf); + template + void processRow(unsigned rec, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, unsigned coeffWidth, std::vector& workers); + template + int processRows(unsigned& rec, unsigned rowGroupSize, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress); public: Galois16RecMatrix(); ~Galois16RecMatrix(); From 070f14a725280144ae02dd750ac1277073369f98 Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 19 Jul 2023 15:13:54 +1000 Subject: [PATCH 30/91] Update CFLAGS --- binding.gyp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/binding.gyp b/binding.gyp index fd4af8ec..70a36159 100644 --- a/binding.gyp +++ b/binding.gyp @@ -21,7 +21,7 @@ ] }] ], - "cflags": ["-std=gnu99", "-D_POSIX_C_SOURCE=200112L"], + "cflags": ["-std=c99", "-D_POSIX_C_SOURCE=200112L", "-D_DARWIN_C_SOURCE", "-D_GNU_SOURCE"], "cxxflags": ["-std=c++11"], "msvs_settings": {"VCCLCompilerTool": {"Optimization": "MaxSpeed"}}, "configurations": {"Release": { From 41a7a4c0b9b23deae9ca78e887382abe837aff2b Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 19 Jul 2023 15:14:46 +1000 Subject: [PATCH 31/91] Clarification on --auto-slice-size --- help.txt | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/help.txt b/help.txt index da6a3a05..66674bbd 100644 --- a/help.txt +++ b/help.txt @@ -33,7 +33,9 @@ PAR2 Options: equals that size, otherwise is 4B -S, --auto-slice-size Automatically scale up input slice size if the number of input slices would exceed the maximum - allowed. This option takes no parameters. + allowed. The chosen slice size will respect + `--slice-size-multiple` when scaling up the slice. + This option takes no parameters. Alias for `--max-input-slices=32768` -r, --recovery-slices Number of recovery slices to generate. You can append a suffix to auto-calculate this, as in the From 2274f71f4abc5e42ac030ca1c658633d61afd504 Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 19 Jul 2023 15:24:50 +1000 Subject: [PATCH 32/91] Fix max length of Windows thread name --- gf16/threadqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gf16/threadqueue.h b/gf16/threadqueue.h index ecb535b4..ee11a8c2 100644 --- a/gf16/threadqueue.h +++ b/gf16/threadqueue.h @@ -282,7 +282,7 @@ class MessageThread { if(fnSetTD) { wchar_t nameUCS2[17]; //assert(strlen(self->name) <= 16); // always hard-coded string, plus Linux limits it to 16 chars, so shouldn't ever overflow - MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, self->name, -1, nameUCS2, 50); + MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, self->name, -1, nameUCS2, sizeof(nameUCS2)/sizeof(wchar_t) -1); fnSetTD(GetCurrentThread(), nameUCS2); } } From f2cf95d3fe5bc4d18ff309271b5b85d182febc56 Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 19 Jul 2023 15:50:43 +1000 Subject: [PATCH 33/91] Fix checksum-prepare function prototype --- gf16/gf16_checksum_arm.h | 2 +- gf16/gf16_checksum_x86.h | 2 +- gf16/gf16_lookup.c | 4 ++-- gf16/gf16_sve_common.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gf16/gf16_checksum_arm.h b/gf16/gf16_checksum_arm.h index b53cbb9a..2b945a31 100644 --- a/gf16/gf16_checksum_arm.h +++ b/gf16/gf16_checksum_arm.h @@ 
-76,7 +76,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_checksum_exp_neon(void *HEDLEY_RESTRICT ch gf16_checksum_store(checksum, res); } -static HEDLEY_ALWAYS_INLINE void gf16_checksum_prepare_neon(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block prepareBlock) { +static HEDLEY_ALWAYS_INLINE void gf16_checksum_prepare_neon(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block_rst prepareBlock) { #define _X(bl) \ ALIGN_TO(16, uint8_t tmp[bl]) = {0}; \ vst1q_u8(tmp, gf16_checksum_load(checksum)); \ diff --git a/gf16/gf16_checksum_x86.h b/gf16/gf16_checksum_x86.h index e7404888..37144352 100644 --- a/gf16/gf16_checksum_x86.h +++ b/gf16/gf16_checksum_x86.h @@ -149,7 +149,7 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_checksum_exp)(void *HEDLEY_RESTRICT ch *(_mword*)checksum = res; } -static HEDLEY_ALWAYS_INLINE void _FN(gf16_checksum_prepare)(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block prepareBlock) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_checksum_prepare)(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block_rst prepareBlock) { // because some compilers don't like `tmp[blockLen]` despite blockLen being constant, just implement every possibility #define _X(bl) \ ALIGN_TO(MWORD_SIZE, uint8_t tmp[bl]) = {0}; \ diff --git a/gf16/gf16_lookup.c b/gf16/gf16_lookup.c index c8ceef7a..dc293de8 100644 --- a/gf16/gf16_lookup.c +++ b/gf16/gf16_lookup.c @@ -579,7 +579,7 @@ HEDLEY_CONST size_t gf16_lookup3_stride() { -static HEDLEY_ALWAYS_INLINE void gf16_lookup_checksum_prepare(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block prepareBlock) { +static HEDLEY_ALWAYS_INLINE void gf16_lookup_checksum_prepare(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block_rst 
prepareBlock) { UNUSED(prepareBlock); memset(dst, 0, blockLen); if(sizeof(uintptr_t) >= 8) @@ -628,7 +628,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_lookup3_prepare_blocku(void *HEDLEY_RESTRI memcpy(&data, src, remaining); gf16_lookup3_prepare_block(dst, &data); } -static HEDLEY_ALWAYS_INLINE void gf16_lookup3_checksum_prepare(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block prepareBlock) { +static HEDLEY_ALWAYS_INLINE void gf16_lookup3_checksum_prepare(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block_rst prepareBlock) { UNUSED(prepareBlock); gf16_lookup3_prepare_block(dst, checksum); memset((char*)dst+gf16_lookup3_stride(), 0, blockLen-gf16_lookup3_stride()); diff --git a/gf16/gf16_sve_common.h b/gf16/gf16_sve_common.h index 0dabf676..2c9b7c6b 100644 --- a/gf16/gf16_sve_common.h +++ b/gf16/gf16_sve_common.h @@ -45,7 +45,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_finish_half_blocku_sve(void *HEDLEY_RESTRI svst1_u8(svwhilelt_b8((uint64_t)0, (uint64_t)remaining), dst, svld1_u8(svptrue_b8(), src)); } -static HEDLEY_ALWAYS_INLINE void gf16_checksum_prepare_sve(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block prepareBlock) { +static HEDLEY_ALWAYS_INLINE void gf16_checksum_prepare_sve(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block_rst prepareBlock) { ALIGN_TO(16, int16_t tmp[blockLen/2]); memset(tmp, 0, blockLen); svst1_s16(svptrue_b16(), tmp, *(svint16_t*)checksum); From f910bfa48de73583e755daeec957199230503050 Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 19 Jul 2023 15:55:47 +1000 Subject: [PATCH 34/91] Fix type of JIT offset, as it's typically >255 --- gf16/gf16_xor_avx2.c | 6 +++--- gf16/gf16_xor_avx512.c | 2 +- gf16/gf16_xor_common.h | 4 ++-- gf16/gf16_xor_sse2.c | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git 
a/gf16/gf16_xor_avx2.c b/gf16/gf16_xor_avx2.c index d9ba9578..219b05ae 100644 --- a/gf16/gf16_xor_avx2.c +++ b/gf16/gf16_xor_avx2.c @@ -675,7 +675,7 @@ GF_FINISH_PACKED_FUNCS_STUB(gf16_xor, _avx2) #if defined(__AVX2__) && defined(PLATFORM_AMD64) -static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uint_fast8_t* sizeNorm, uint_fast8_t* sizeInsitu) { +static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uint_fast16_t* sizeNorm, uint_fast16_t* sizeInsitu) { uint8_t *jitCodeStart = jitCodeNorm; jitCodeNorm += _jit_add_i(jitCodeNorm, AX, 512); jitCodeNorm += _jit_add_i(jitCodeNorm, DX, 512); @@ -684,7 +684,7 @@ static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uin for(int i=3; i<16; i++) { jitCodeNorm += _jit_vmovdqa_load(jitCodeNorm, i, AX, lshift32(i-4, 5)); } - if(sizeNorm) *sizeNorm = jitCodeNorm-jitCodeStart; + if(sizeNorm) *sizeNorm = (uint_fast16_t)(jitCodeNorm-jitCodeStart); jitCodeStart = jitCodeInsitu; @@ -696,7 +696,7 @@ static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uin for(int i=0; i<3; i++) { jitCodeInsitu += _jit_vmovdqa_store(jitCodeInsitu, AX, lshift32(i-4, 5), i); } - if(sizeInsitu) *sizeInsitu = jitCodeInsitu-jitCodeStart; + if(sizeInsitu) *sizeInsitu = (uint_fast16_t)(jitCodeInsitu-jitCodeStart); } # include "gf16_bitdep_init_avx2.h" diff --git a/gf16/gf16_xor_avx512.c b/gf16/gf16_xor_avx512.c index cce0625b..694f5ef5 100644 --- a/gf16/gf16_xor_avx512.c +++ b/gf16/gf16_xor_avx512.c @@ -1206,7 +1206,7 @@ void* gf16_xor_jit_init_avx512(int polynomial, int jitOptStrat) { gf16_bitdep_init256(ret->deps, polynomial, 0); ret->jitOptStrat = jitOptStrat; - ret->codeStart = (uint_fast8_t)xor_write_init_jit(tmpCode); + ret->codeStart = (uint_fast16_t)xor_write_init_jit(tmpCode); return ret; #else UNUSED(polynomial); UNUSED(jitOptStrat); diff --git a/gf16/gf16_xor_common.h b/gf16/gf16_xor_common.h index 8c987ad3..21b7ea80 100644 --- a/gf16/gf16_xor_common.h 
+++ b/gf16/gf16_xor_common.h @@ -115,8 +115,8 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor_jit_stub(intptr_t src, intptr_t dEnd, struct gf16_xor_scratch { uint8_t deps[16*16*2*4]; int jitOptStrat; // GF16_XOR_JIT_STRAT_* - uint_fast8_t codeStart; - uint_fast8_t codeStartInsitu; + uint_fast16_t codeStart; + uint_fast16_t codeStartInsitu; }; diff --git a/gf16/gf16_xor_sse2.c b/gf16/gf16_xor_sse2.c index bce368ec..fe68fbbf 100644 --- a/gf16/gf16_xor_sse2.c +++ b/gf16/gf16_xor_sse2.c @@ -1197,7 +1197,7 @@ GF_FINISH_PACKED_FUNCS_STUB(gf16_xor, _sse2) #include "gf16_bitdep_init_sse2.h" #ifdef PLATFORM_X86 -static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uint_fast8_t* sizeNorm, uint_fast8_t* sizeInsitu) { +static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uint_fast16_t* sizeNorm, uint_fast16_t* sizeInsitu) { uint8_t *jitCodeStart = jitCodeNorm; jitCodeNorm += _jit_add_i(jitCodeNorm, AX, 256); jitCodeNorm += _jit_add_i(jitCodeNorm, DX, 256); @@ -1214,7 +1214,7 @@ static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uin } # endif - if(sizeNorm) *sizeNorm = jitCodeNorm-jitCodeStart; + if(sizeNorm) *sizeNorm = (uint_fast16_t)(jitCodeNorm-jitCodeStart); // in-situ version jitCodeStart = jitCodeInsitu; @@ -1237,7 +1237,7 @@ static void xor_write_init_jit(uint8_t *jitCodeNorm, uint8_t *jitCodeInsitu, uin } # endif - if(sizeInsitu) *sizeInsitu = jitCodeInsitu-jitCodeStart; + if(sizeInsitu) *sizeInsitu = (uint_fast16_t)(jitCodeInsitu-jitCodeStart); } #endif From 731897f32d8a9e3e5ec995a9dd22658ca9706f0e Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 19 Jul 2023 22:37:56 +1000 Subject: [PATCH 35/91] Enable ClMul to be used for inversion, using Shuffle for single region multiplies --- gf16/gf16mul.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 5d6a33c4..ac7765f3 100644 --- a/gf16/gf16mul.cpp +++ 
b/gf16/gf16mul.cpp @@ -702,8 +702,16 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { int available = gf16_clmul_init_arm(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_available_neon && available) - _mul = &gf16_clmul_mul_neon; - _mul_add = &gf16_clmul_muladd_neon; + + // use Shuffle for single region multiplies, because it's faster + scratch = gf16_shuffle_init_arm(GF16_POLYNOMIAL); + if(scratch) { + _mul = &gf16_shuffle_mul_neon; + _mul_add = &gf16_shuffle_muladd_neon; + } else { + _mul = &gf16_clmul_mul_neon; + _mul_add = &gf16_clmul_muladd_neon; + } _mul_add_multi = &gf16_clmul_muladd_multi_neon; _mul_add_multi_stridepf = &gf16_clmul_muladd_multi_stridepf_neon; _mul_add_multi_packed = &gf16_clmul_muladd_multi_packed_neon; @@ -818,8 +826,9 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { case GF16_CLMUL_SVE2: METHOD_REQUIRES(gf16_available_sve2) - _mul = &gf16_clmul_mul_sve2; - _mul_add = &gf16_clmul_muladd_sve2; + // single region multiplies (_mul/add) use Shuffle-128 instead + _mul = &gf16_shuffle_mul_128_sve2; + _mul_add = &gf16_shuffle_muladd_128_sve2; _mul_add_multi = &gf16_clmul_muladd_multi_sve2; _mul_add_multi_stridepf = &gf16_clmul_muladd_multi_stridepf_sve2; _mul_add_multi_packed = &gf16_clmul_muladd_multi_packed_sve2; @@ -1310,7 +1319,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu if(caps.hasSVE2) { if(gf16_sve_get_size() >= 64) return GF16_SHUFFLE_512_SVE2; - return inputs > 3 && !forInvert ? GF16_CLMUL_SVE2 : GF16_SHUFFLE_128_SVE2; + return inputs > 3 ? GF16_CLMUL_SVE2 : GF16_SHUFFLE_128_SVE2; } if(caps.hasSVE && gf16_sve_get_size() > 16) return GF16_SHUFFLE_128_SVE; @@ -1321,7 +1330,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu # else inputs > 1 # endif - && !forInvert ? GF16_CLMUL_NEON : GF16_SHUFFLE_NEON; + ? 
GF16_CLMUL_NEON : GF16_SHUFFLE_NEON; #endif From 8abd213d5c1278cd0de5bd260644a7e2593077d0 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 20 Jul 2023 16:11:55 +1000 Subject: [PATCH 36/91] Nest row group apply loop inside striping loop during inversion Hopefully better cache usage + less waiting on threads --- gf16/gfmat_inv.cpp | 75 ++++++++++++++++++++++++---------------------- gf16/gfmat_inv.h | 4 +-- 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 84f1fdc7..fbf03900 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -38,9 +38,9 @@ class Galois16RecMatrixWorker { struct Galois16RecMatrixWorkerMessage { unsigned stripeStart, stripeEnd; unsigned recFirst, recLast; - unsigned recSrc; uint16_t* rowCoeffs; void** srcRows; Galois16Mul* gf; void* gfScratch; + unsigned recSrc; unsigned recSrcCount; uint16_t* rowCoeffs; void** srcRows; Galois16Mul* gf; void* gfScratch; unsigned coeffWidth; - void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, unsigned, void**, Galois16Mul&, void*, const void*); + void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, unsigned, void**, Galois16Mul&, void*, const void*); Galois16RecMatrix* parent; std::atomic* procRefs; std::promise* done; @@ -49,7 +49,7 @@ struct Galois16RecMatrixWorkerMessage { static void invert_worker(ThreadMessageQueue& q) { Galois16RecMatrixWorkerMessage* req; while((req = static_cast(q.pop())) != NULL) { - (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->rowCoeffs, req->coeffWidth, req->srcRows, *(req->gf), req->gfScratch, nullptr); + (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->recSrcCount, req->rowCoeffs, req->coeffWidth, req->srcRows, *(req->gf), req->gfScratch, nullptr); if(req->procRefs->fetch_sub(1, std::memory_order_acq_rel) <= 1) { 
req->done->set_value(); } @@ -62,33 +62,39 @@ static void invert_worker(ThreadMessageQueue& q) { #define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) template -void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf) { +void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf) { + assert(recSrcCount % rows == 0); for(unsigned stripe=stripeStart; stripe 1) { - if(HEDLEY_LIKELY(pf)) - gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, coeffPtr, gfScratch, pf); - else - gf.mul_add_multi(rows, stripeWidth*numRec*stripe, MAT_ROW(0, curRec2), srcRows, stripeWidth, coeffPtr, gfScratch); - } else { - if(HEDLEY_LIKELY(pf)) - gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, *coeffPtr, gfScratch, pf); - else - gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, recSrc), stripeWidth, *coeffPtr, gfScratch); + for(unsigned recI = 0; recI < recSrcCount; recI += rows) { + unsigned rec = recI+recSrc; + for(unsigned rec2=recFirst; rec2 1) { + if(HEDLEY_LIKELY(pf)) + gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, coeffPtr, gfScratch, pf); + else + gf.mul_add_multi(rows, stripeWidth*(numRec*stripe + recI), MAT_ROW(0, curRec2) - recI*stripeWidth/sizeof(uint16_t), srcRows, stripeWidth, coeffPtr, gfScratch); + } else { + if(HEDLEY_LIKELY(pf)) + gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch, pf); + else + gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch); + } } } } @@ 
-253,7 +259,7 @@ void Galois16RecMatrix::fillCoeffs(uint16_t* rowCoeffs, unsigned rows, unsigned } template -void Galois16RecMatrix::processRow(unsigned rec, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, unsigned coeffWidth, std::vector& workers) { +void Galois16RecMatrix::processRow(unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, unsigned coeffWidth, std::vector& workers) { // TODO: consider optimisation for numStripes == 1 ? assert(recFirst <= recLast); @@ -272,7 +278,7 @@ void Galois16RecMatrix::processRow(unsigned rec, unsigned recFirst, unsigned rec // do main elimination, using the source group if(workers.empty()) // process elimination directly - invertLoop(0, numStripes, recFirst, recLast, rec, rowCoeffs, coeffWidth, srcRows, gf, gfScratch, nextScaleRow); + invertLoop(0, numStripes, recFirst, recLast, rec, recCount, rowCoeffs, coeffWidth, srcRows, gf, gfScratch, nextScaleRow); else { // process using workers std::atomic procRefs; @@ -282,6 +288,7 @@ void Galois16RecMatrix::processRow(unsigned rec, unsigned recFirst, unsigned rec req->recFirst = recFirst; req->recLast = recLast; req->recSrc = rec; + req->recSrcCount = recCount; req->rowCoeffs = rowCoeffs; req->srcRows = srcRows; req->gf = &gf; @@ -382,7 +389,7 @@ int Galois16RecMatrix::processRows(unsigned& rec, unsigned rowGroupSize, unsigne int badRowOffset = initScale(rec, validCount, recStart, curRowGroupSize+recStart, gf, gfScratch); if(badRowOffset >= 0) return rec+badRowOffset; fillCoeffs(rowCoeffs, rows, validCount, recStart, curRowGroupSize+recStart, rec, rows, gf); - processRow(rec, recStart, curRowGroupSize+recStart, gf, gfScratch, rowCoeffs, rows, workers); + processRow(rec, rows, recStart, curRowGroupSize+recStart, gf, gfScratch, rowCoeffs, rows, workers); } @@ -404,9 +411,7 @@ int Galois16RecMatrix::processRows(unsigned& rec, unsigned rowGroupSize, unsigne if(recGroup < 
recStart && recGroup+curRowGroupSize2 > recStart) curRowGroupSize2 = recStart-recGroup; // don't let this group cross into the normalized group fillCoeffs(rowCoeffs, curRowGroupSize, validCount, recGroup, recGroup+curRowGroupSize2, recStart, curRowGroupSize, gf); - for(unsigned rec2=recStart; rec2 < curRowGroupSize+recStart; rec2+=rows) { - processRow(rec2, recGroup, recGroup+curRowGroupSize2, gf, gfScratch, rowCoeffs + (rec2-recStart), curRowGroupSize, workers); - } + processRow(recStart, curRowGroupSize, recGroup, recGroup+curRowGroupSize2, gf, gfScratch, rowCoeffs, curRowGroupSize, workers); recGroup += curRowGroupSize2; } } diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index 477b08a0..7490dcaf 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -17,12 +17,12 @@ class Galois16RecMatrix { void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); template - void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf); + void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf); template int initScale(unsigned rec, unsigned validCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch); void fillCoeffs(uint16_t* rowCoeffs, unsigned rows, unsigned validCount, unsigned recFirst, unsigned recLast, unsigned rec, unsigned coeffWidth, Galois16Mul& gf); template - void processRow(unsigned rec, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, unsigned coeffWidth, std::vector& workers); + void processRow(unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch, uint16_t* 
rowCoeffs, unsigned coeffWidth, std::vector& workers); template int processRows(unsigned& rec, unsigned rowGroupSize, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress); public: From 415729c5e3dbd6340133e423fea17a618f09c791 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 20 Jul 2023 16:51:36 +1000 Subject: [PATCH 37/91] Enable VPCLMULQDQ in MSVC --- src/platform.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/platform.h b/src/platform.h index 40c36fb3..11d7f6d6 100644 --- a/src/platform.h +++ b/src/platform.h @@ -81,6 +81,9 @@ #if defined(__AVX512F__) && _MSC_VER >= 1914 #define __AVX512VBMI__ 1 #endif +#if defined(__AVX2__) && _MSC_VER >= 1915 + #define __VPCLMULQDQ__ 1 +#endif #if defined(__SSE2__) && _MSC_VER >= 1920 #define __GFNI__ 1 #endif From 8b5caa38120d40846f09d79f6bd076b3b9671a82 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 20 Jul 2023 21:30:45 +1000 Subject: [PATCH 38/91] Inversion code tweaks --- gf16/gfmat_inv.cpp | 203 ++++++++++++++++++++++++--------------------- gf16/gfmat_inv.h | 13 +-- 2 files changed, 116 insertions(+), 100 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index fbf03900..921a930c 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -14,6 +14,17 @@ extern "C" uint16_t* gf16_recip; static const unsigned MIN_THREAD_REC = 10; // minimum number of rows to process on a thread +struct Galois16RecMatrixComputeState { + uint16_t* coeff; + Galois16Mul gf; + void* gfScratch; + unsigned validCount; + void* srcRowsBase[PP_INVERT_MAX_MULTI_ROWS]; + std::vector workers; + + Galois16RecMatrixComputeState(Galois16Methods method) : gf(method) {} +}; + class Galois16RecMatrixWorker { const Galois16Mul& gf; public: @@ -38,18 +49,22 @@ class Galois16RecMatrixWorker { struct Galois16RecMatrixWorkerMessage { unsigned stripeStart, stripeEnd; unsigned recFirst, recLast; - unsigned 
recSrc; unsigned recSrcCount; uint16_t* rowCoeffs; void** srcRows; Galois16Mul* gf; void* gfScratch; + unsigned recSrc; unsigned recSrcCount; uint16_t* rowCoeffs; Galois16Mul* gf; void* gfScratch; + void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS]; unsigned coeffWidth; - void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, unsigned, void**, Galois16Mul&, void*, const void*); + void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, unsigned, void*(&)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul&, void*, const void*); Galois16RecMatrix* parent; std::atomic* procRefs; std::promise* done; + + Galois16RecMatrixWorkerMessage(Galois16RecMatrixComputeState& state) + : rowCoeffs(state.coeff), gf(&state.gf), srcRowsBase(state.srcRowsBase) {} }; static void invert_worker(ThreadMessageQueue& q) { Galois16RecMatrixWorkerMessage* req; while((req = static_cast(q.pop())) != NULL) { - (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->recSrcCount, req->rowCoeffs, req->coeffWidth, req->srcRows, *(req->gf), req->gfScratch, nullptr); + (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->recSrcCount, req->rowCoeffs, req->coeffWidth, req->srcRowsBase, *(req->gf), req->gfScratch, nullptr); if(req->procRefs->fetch_sub(1, std::memory_order_acq_rel) <= 1) { req->done->set_value(); } @@ -62,7 +77,7 @@ static void invert_worker(ThreadMessageQueue& q) { #define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) template -void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf) { +void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, 
unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul& gf, void* gfScratch, const void* nextPf) { assert(recSrcCount % rows == 0); for(unsigned stripe=stripeStart; stripe 1) { if(HEDLEY_LIKELY(pf)) gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, coeffPtr, gfScratch, pf); - else - gf.mul_add_multi(rows, stripeWidth*(numRec*stripe + recI), MAT_ROW(0, curRec2) - recI*stripeWidth/sizeof(uint16_t), srcRows, stripeWidth, coeffPtr, gfScratch); + else { + unsigned offset = rec*stripeWidth; + gf.mul_add_multi(rows, stripeWidth*numRec*stripe + offset, MAT_ROW(0, curRec2) - offset/sizeof(uint16_t), srcRowsBase, stripeWidth, coeffPtr, gfScratch); + } } else { if(HEDLEY_LIKELY(pf)) gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch, pf); @@ -100,55 +117,50 @@ void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, uns } } -#define REPLACE_WORD(r, c, v) gf.replace_word(MAT_ROW((c)/(stripeWidth / sizeof(uint16_t)), r), (c)%(stripeWidth / sizeof(uint16_t)), v) +#define REPLACE_WORD(r, c, v) state.gf.replace_word(MAT_ROW((c)/(stripeWidth / sizeof(uint16_t)), r), (c)%(stripeWidth / sizeof(uint16_t)), v) template -int Galois16RecMatrix::initScale(unsigned rec, unsigned validCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch) { +int Galois16RecMatrix::initScale(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recFirst, unsigned recLast) { assert(recFirst <= recLast); + assert(rec != recFirst); - unsigned missingCol = validCount + rec; - - uint16_t baseCoeff; - uint16_t coeff[rows]; - - void* srcRows[rows]; - srcRows[0] = MAT_ROW(0, rec); - for(unsigned i=1; i 6 case not handled"); + return -1; #undef SCALE_ROW #undef MULADD_ROW @@ -245,53 +258,42 @@ int Galois16RecMatrix::initScale(unsigned rec, unsigned validCount, unsigned rec #undef MULADD_MULTI_LASTROW 
} -void Galois16RecMatrix::fillCoeffs(uint16_t* rowCoeffs, unsigned rows, unsigned validCount, unsigned recFirst, unsigned recLast, unsigned rec, unsigned coeffWidth, Galois16Mul& gf) { - unsigned missingCol = validCount + rec; - if(recFirst == rec) recFirst += rows; +void Galois16RecMatrix::fillCoeffs(Galois16RecMatrixComputeState& state, unsigned rows, unsigned recFirst, unsigned recLast, unsigned rec, unsigned coeffWidth) { + assert(rec != recFirst); + unsigned missingCol = state.validCount + rec; for(unsigned r=recFirst; r -void Galois16RecMatrix::processRow(unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, unsigned coeffWidth, std::vector& workers) { +void Galois16RecMatrix::processRow(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, unsigned coeffWidth) { // TODO: consider optimisation for numStripes == 1 ? - assert(recFirst <= recLast); - - void* srcRows[rows]; - srcRows[0] = MAT_ROW(0, rec); - for(unsigned i=1; i= recLast) return; - // do main elimination, using the source group - if(workers.empty()) + if(state.workers.empty()) // process elimination directly - invertLoop(0, numStripes, recFirst, recLast, rec, recCount, rowCoeffs, coeffWidth, srcRows, gf, gfScratch, nextScaleRow); + invertLoop(0, numStripes, recFirst, recLast, rec, recCount, state.coeff, coeffWidth, state.srcRowsBase, state.gf, state.gfScratch, nextScaleRow); else { // process using workers std::atomic procRefs; std::promise done; auto makeReq = [&, this]() -> Galois16RecMatrixWorkerMessage* { - auto* req = new Galois16RecMatrixWorkerMessage; + auto* req = new Galois16RecMatrixWorkerMessage(state); req->recFirst = recFirst; req->recLast = recLast; req->recSrc = rec; req->recSrcCount = recCount; - req->rowCoeffs = rowCoeffs; - req->srcRows = srcRows; - req->gf = &gf; req->coeffWidth = coeffWidth; req->fn = &Galois16RecMatrix::invertLoop; req->parent = this; @@ 
-299,11 +301,11 @@ void Galois16RecMatrix::processRow(unsigned rec, unsigned recCount, unsigned rec req->done = &done; return req; }; - if(numStripes >= workers.size()) { // split full stripes across workers - float stripesPerWorker = (float)numStripes / workers.size(); + if(numStripes >= state.workers.size()) { // split full stripes across workers + float stripesPerWorker = (float)numStripes / state.workers.size(); float stripe = 0.5; - procRefs.store(workers.size()); - for(auto& worker : workers) { + procRefs.store((int)state.workers.size()); + for(auto& worker : state.workers) { auto* req = makeReq(); req->stripeStart = (unsigned)stripe; req->stripeEnd = (unsigned)(stripe + stripesPerWorker); @@ -311,10 +313,11 @@ void Galois16RecMatrix::processRow(unsigned rec, unsigned recCount, unsigned rec worker.thread.send(req); stripe += stripesPerWorker; } + assert((unsigned)stripe == numStripes); } else { // each stripe may need >1 worker std::vector reqs; - reqs.reserve(workers.size()); - float workersPerStripe = (float)workers.size() / numStripes; + reqs.reserve(state.workers.size()); + float workersPerStripe = (float)state.workers.size() / numStripes; float workerCnt = 0.5; for(unsigned stripe=0; stripe 0); for(unsigned i=0; igfScratch = worker.gfScratch; worker.thread.send(req); @@ -367,7 +371,7 @@ void Galois16RecMatrix::processRow(unsigned rec, unsigned recCount, unsigned rec template -int Galois16RecMatrix::processRows(unsigned& rec, unsigned rowGroupSize, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress) { +int Galois16RecMatrix::processRows(Galois16RecMatrixComputeState& state, unsigned& rec, unsigned rowGroupSize, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress) { unsigned alignedRowGroupSize = (rowGroupSize / rows) * rows; while(rec <= numRec-rows) { @@ -386,10 +390,14 @@ int 
Galois16RecMatrix::processRows(unsigned& rec, unsigned rowGroupSize, unsigne // loop through this row group (normalize values) for(; rec < curRowGroupSize+recStart; rec+=rows) { if(progressCb) progressCb(progressBase + (((rec-recStart)*progressRatio+32768)>>16), totalProgress); - int badRowOffset = initScale(rec, validCount, recStart, curRowGroupSize+recStart, gf, gfScratch); + unsigned recFirst = recStart; + if(recFirst == rec) recFirst += rows; + + int badRowOffset = initScale(state, rec, recFirst, curRowGroupSize+recStart); if(badRowOffset >= 0) return rec+badRowOffset; - fillCoeffs(rowCoeffs, rows, validCount, recStart, curRowGroupSize+recStart, rec, rows, gf); - processRow(rec, rows, recStart, curRowGroupSize+recStart, gf, gfScratch, rowCoeffs, rows, workers); + if(recFirst == curRowGroupSize+recStart) continue; + fillCoeffs(state, rows, recFirst, curRowGroupSize+recStart, rec, rows); + processRow(state, rec, rows, recFirst, curRowGroupSize+recStart, rows); } @@ -410,8 +418,9 @@ int Galois16RecMatrix::processRows(unsigned& rec, unsigned rowGroupSize, unsigne curRowGroupSize2 = numRec-recGroup; if(recGroup < recStart && recGroup+curRowGroupSize2 > recStart) curRowGroupSize2 = recStart-recGroup; // don't let this group cross into the normalized group - fillCoeffs(rowCoeffs, curRowGroupSize, validCount, recGroup, recGroup+curRowGroupSize2, recStart, curRowGroupSize, gf); - processRow(recStart, curRowGroupSize, recGroup, recGroup+curRowGroupSize2, gf, gfScratch, rowCoeffs, curRowGroupSize, workers); + assert(curRowGroupSize2 > 0); + fillCoeffs(state, curRowGroupSize, recGroup, recGroup+curRowGroupSize2, recStart, curRowGroupSize); + processRow(state, recStart, curRowGroupSize, recGroup, recGroup+curRowGroupSize2, curRowGroupSize); recGroup += curRowGroupSize2; } } @@ -520,7 +529,7 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned } bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned validCount, std::vector& recovery, 
std::function progressCb) { - numRec = inputValid.size() - validCount; + numRec = (unsigned)inputValid.size() - validCount; assert(validCount < inputValid.size()); // i.e. numRec > 0 assert(inputValid.size() <= 32768 && inputValid.size() > 0); assert(recovery.size() <= 65535 && recovery.size() > 0); @@ -528,14 +537,15 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va if(numRec > recovery.size()) return false; - unsigned matWidth = inputValid.size() * sizeof(uint16_t); - Galois16Mul gf(Galois16Mul::default_method(matWidth, inputValid.size(), inputValid.size(), true)); - const auto gfInfo = gf.info(); + unsigned matWidth = (unsigned)inputValid.size() * sizeof(uint16_t); + Galois16RecMatrixComputeState state(Galois16Mul::default_method(matWidth, (unsigned)inputValid.size(), (unsigned)inputValid.size(), true)); + state.validCount = validCount; + const auto gfInfo = state.gf.info(); // divide the matrix up into evenly sized stripes (for loop tiling optimisation) - numStripes = ROUND_DIV(matWidth, gfInfo.idealChunkSize); + numStripes = ROUND_DIV(matWidth, (unsigned)gfInfo.idealChunkSize); if(numStripes < 1) numStripes = 1; - stripeWidth = gf.alignToStride(CEIL_DIV(matWidth, numStripes)); + stripeWidth = (unsigned)state.gf.alignToStride(CEIL_DIV(matWidth, numStripes)); numStripes = CEIL_DIV(matWidth, stripeWidth); assert(numStripes >= 1); @@ -544,7 +554,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va unsigned matSize = numRec * stripeWidth*numStripes; ALIGN_ALLOC(mat, matSize, gfInfo.alignment); - uint16_t totalProgress = numRec + (gf.needPrepare() ? 3 : 1); // provision for prepare/finish/init-calc + uint16_t totalProgress = numRec + (state.gf.needPrepare() ? 
3 : 1); // provision for prepare/finish/init-calc // easier to handle if exponents are in order std::sort(recovery.begin(), recovery.end()); @@ -555,32 +565,35 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va setup_pmul(); } - std::vector workers; - void* gfScratch; + state.srcRowsBase[0] = mat; + for(unsigned i=1; i 1) { - workers.reserve(_numThreads); + state.workers.reserve(_numThreads); for(unsigned i=0; i<_numThreads; i++) { - workers.emplace_back(gf); - workers[i].thread.name = "gauss_worker"; - workers[i].thread.setCallback(invert_worker); + state.workers.emplace_back(state.gf); + state.workers[i].thread.name = "gauss_worker"; + state.workers[i].thread.setCallback(invert_worker); } - gfScratch = workers[0].gfScratch; + state.gfScratch = state.workers[0].gfScratch; } else - gfScratch = gf.mutScratch_alloc(); + state.gfScratch = state.gf.mutScratch_alloc(); // target L3 slice? use 1MB target for now; TODO: improve this unsigned rowGroupSize = (1024*1024 / stripeWidth); // if it's going to be split amongst cores, increase the number of rows in a group if(numStripes < _numThreads) rowGroupSize *= _numThreads/numStripes; - if(rowGroupSize < gfInfo.idealInputMultiple*2) rowGroupSize = gfInfo.idealInputMultiple*2; + unsigned rowMultiple = (std::min)(gfInfo.idealInputMultiple, PP_INVERT_MAX_MULTI_ROWS); + if(rowGroupSize < rowMultiple*2) rowGroupSize = rowMultiple*2; if(rowGroupSize > numRec) rowGroupSize = numRec; invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? 
if(numRec > recovery.size()) { // not enough recovery if(_numThreads <= 1) - gf.mutScratch_free(gfScratch); + state.gf.mutScratch_free(state.gfScratch); ALIGN_FREE(mat); mat = nullptr; return false; @@ -591,18 +604,18 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va // pre-transform uint16_t progressOffset = 1; - if(gf.needPrepare()) { + if(state.gf.needPrepare()) { if(progressCb) progressCb(1, totalProgress); progressOffset = 2; - gf.prepare(mat, mat, matSize); + state.gf.prepare(mat, mat, matSize); } // invert unsigned rec = 0; #define INVERT_GROUP(rows) \ if(gfInfo.idealInputMultiple >= rows && numRec >= rows) { \ - int badRow = processRows(rec, rowGroupSize, validCount, gf, gfScratch, rowCoeffs, workers, progressCb, progressOffset, totalProgress); \ + int badRow = processRows(state, rec, rowGroupSize, progressCb, progressOffset, totalProgress); \ if(badRow >= 0) { \ /* ignore this recovery row and try again */ \ recovery.erase(recovery.begin() + badRow); \ @@ -610,21 +623,21 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } \ } // max out at 6 groups (registers + cache assoc?) - uint16_t* rowCoeffs = new uint16_t[rowGroupSize*rowGroupSize]; + state.coeff = new uint16_t[rowGroupSize*rowGroupSize]; INVERT_GROUP(6) INVERT_GROUP(5) INVERT_GROUP(4) INVERT_GROUP(3) INVERT_GROUP(2) INVERT_GROUP(1) - delete[] rowCoeffs; + delete[] state.coeff; #undef INVERT_GROUP // post transform - if(gf.needPrepare()) { + if(state.gf.needPrepare()) { if(progressCb) progressCb(totalProgress-1, totalProgress); - gf.finish(mat, matSize); + state.gf.finish(mat, matSize); // TODO: check for zeroes?? 
} } @@ -633,7 +646,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va recovery.resize(numRec); if(_numThreads <= 1) - gf.mutScratch_free(gfScratch); + state.gf.mutScratch_free(state.gfScratch); return true; } diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index 7490dcaf..3ad049c6 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -6,8 +6,11 @@ #include "../src/stdint.h" #ifdef PARPAR_INVERT_SUPPORT +const unsigned PP_INVERT_MAX_MULTI_ROWS = 6; // process up to 6 rows in a multi-mul call + class Galois16Mul; class Galois16RecMatrixWorker; +struct Galois16RecMatrixComputeState; class Galois16RecMatrix { uint16_t* mat; unsigned numStripes; @@ -17,14 +20,14 @@ class Galois16RecMatrix { void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); template - void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void** srcRows, Galois16Mul& gf, void* gfScratch, const void* nextPf); + void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul& gf, void* gfScratch, const void* nextPf); template - int initScale(unsigned rec, unsigned validCount, unsigned recFirst, unsigned recLast, Galois16Mul& gf, void* gfScratch); - void fillCoeffs(uint16_t* rowCoeffs, unsigned rows, unsigned validCount, unsigned recFirst, unsigned recLast, unsigned rec, unsigned coeffWidth, Galois16Mul& gf); + int initScale(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recFirst, unsigned recLast); + void fillCoeffs(Galois16RecMatrixComputeState& state, unsigned rows, unsigned recFirst, unsigned recLast, unsigned rec, unsigned coeffWidth); template - void processRow(unsigned rec, unsigned recCount, unsigned recFirst, unsigned 
recLast, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, unsigned coeffWidth, std::vector& workers); + void processRow(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, unsigned coeffWidth); template - int processRows(unsigned& rec, unsigned rowGroupSize, unsigned validCount, Galois16Mul& gf, void* gfScratch, uint16_t* rowCoeffs, std::vector& workers, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress); + int processRows(Galois16RecMatrixComputeState& state, unsigned& rec, unsigned rowGroupSize, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress); public: Galois16RecMatrix(); ~Galois16RecMatrix(); From 2c0f4005898626612284a502745377acc227ee4b Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 20 Jul 2023 21:31:14 +1000 Subject: [PATCH 39/91] Add missing VZEROUPPER in x86 pmul kernel --- gf16/gf16pmul_clmul_x86.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gf16/gf16pmul_clmul_x86.h b/gf16/gf16pmul_clmul_x86.h index 9dfa801e..654449fa 100644 --- a/gf16/gf16pmul_clmul_x86.h +++ b/gf16/gf16pmul_clmul_x86.h @@ -222,6 +222,9 @@ void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void _MMI(store)((_mword*)(_dst + ptr), result); } #endif +#if MWORD_SIZE >= 32 + _mm256_zeroupper(); +#endif } #else From a57701a4aa757fb3306072acf2e321b82fa82285 Mon Sep 17 00:00:00 2001 From: animetosho Date: Fri, 21 Jul 2023 11:30:54 +1000 Subject: [PATCH 40/91] Tweak prefetching during inversion --- gf16/gf16_muladd_multi.h | 4 +- gf16/gfmat_inv.cpp | 94 +++++++++++++++++++++++++--------------- gf16/gfmat_inv.h | 14 +++--- 3 files changed, 68 insertions(+), 44 deletions(-) diff --git a/gf16/gf16_muladd_multi.h b/gf16/gf16_muladd_multi.h index 98ccc9d4..cb938cb4 100644 --- a/gf16/gf16_muladd_multi.h +++ b/gf16/gf16_muladd_multi.h @@ -167,7 +167,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_stridepf(const void *HEDLEY_R _SRC(interleave, 9), 
_SRC(interleave, 10), _SRC(interleave, 11), _SRC(interleave, 12), _SRC(interleave,13), _SRC(interleave, 14), _SRC(interleave, 15), _SRC(interleave, 16), _SRC(interleave,17), - len, coefficients + region, 1, _pf + len, coefficients + region, 2, _pf ); srcEnd += srcStride*interleave; outputPfRounds--; @@ -201,7 +201,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_stridepf(const void *HEDLEY_R _SRC(x, 9), _SRC(x, 10), _SRC(x, 11), _SRC(x, 12), \ _SRC(x,13), _SRC(x, 14), _SRC(x, 15), _SRC(x, 16), \ _SRC(x,17), \ - len, coefficients + region, 1, _pf \ + len, coefficients + region, 2, _pf \ ); \ break REMAINING_CASES; diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 921a930c..8ba80fbe 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -21,6 +21,7 @@ struct Galois16RecMatrixComputeState { unsigned validCount; void* srcRowsBase[PP_INVERT_MAX_MULTI_ROWS]; std::vector workers; + unsigned pfFactor; Galois16RecMatrixComputeState(Galois16Methods method) : gf(method) {} }; @@ -52,19 +53,20 @@ struct Galois16RecMatrixWorkerMessage { unsigned recSrc; unsigned recSrcCount; uint16_t* rowCoeffs; Galois16Mul* gf; void* gfScratch; void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS]; unsigned coeffWidth; - void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, unsigned, void*(&)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul&, void*, const void*); + void(Galois16RecMatrix::*fn)(unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, uint16_t*, unsigned, void*(&)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul&, void*, const void*, unsigned); + unsigned pfFactor; Galois16RecMatrix* parent; std::atomic* procRefs; std::promise* done; Galois16RecMatrixWorkerMessage(Galois16RecMatrixComputeState& state) - : rowCoeffs(state.coeff), gf(&state.gf), srcRowsBase(state.srcRowsBase) {} + : rowCoeffs(state.coeff), gf(&state.gf), srcRowsBase(state.srcRowsBase), pfFactor(state.pfFactor) {} }; static void invert_worker(ThreadMessageQueue& q) { 
Galois16RecMatrixWorkerMessage* req; while((req = static_cast(q.pop())) != NULL) { - (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->recSrcCount, req->rowCoeffs, req->coeffWidth, req->srcRowsBase, *(req->gf), req->gfScratch, nullptr); + (req->parent->*(req->fn))(req->stripeStart, req->stripeEnd, req->recFirst, req->recLast, req->recSrc, req->recSrcCount, req->rowCoeffs, req->coeffWidth, req->srcRowsBase, *(req->gf), req->gfScratch, nullptr, req->pfFactor); if(req->procRefs->fetch_sub(1, std::memory_order_acq_rel) <= 1) { req->done->set_value(); } @@ -76,9 +78,14 @@ static void invert_worker(ThreadMessageQueue& q) { #define CEIL_DIV(a, b) (((a) + (b)-1) / (b)) #define ROUND_DIV(a, b) (((a) + ((b)>>1)) / (b)) -template -void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul& gf, void* gfScratch, const void* nextPf) { +template +void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul& gf, void* gfScratch, const void* nextPf, unsigned pfFactor) { assert(recSrcCount % rows == 0); + // when to start prefetching the next stripe + unsigned recStartPf = 0; + if(recSrcCount > rows<= recStartPf) { + if(recI == recStartPf && curRec2 == recFirst) { + if(stripe < stripeEnd-1) + // prefetch next stripe + pf = (const uint8_t*)(MAT_ROW(stripe+1, recFirst)); + else + pf = (const uint8_t*)nextPf; + } else if(pf) { + pf += stripeWidth >> pfFactor; + } + // TODO: if numStripes==1, we might want to avoid prefetching the same row group as the first applyRows loop would + } else + pf = nullptr; + uint16_t* target = MAT_ROW(stripe, 
curRec2); uint16_t* coeffPtr = rowCoeffs + (curRec2-recFirst)*coeffWidth + recI; if(rows > 1) { - if(HEDLEY_LIKELY(pf)) - gf.mul_add_multi_stridepf(rows, stripeWidth, MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, coeffPtr, gfScratch, pf); + if(pf) + gf.mul_add_multi_stridepf(rows, stripeWidth, target, MAT_ROW(stripe, rec), stripeWidth, coeffPtr, gfScratch, pf); else { unsigned offset = rec*stripeWidth; gf.mul_add_multi(rows, stripeWidth*numRec*stripe + offset, MAT_ROW(0, curRec2) - offset/sizeof(uint16_t), srcRowsBase, stripeWidth, coeffPtr, gfScratch); } } else { - if(HEDLEY_LIKELY(pf)) - gf.mul_add_pf(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch, pf); + if(pf) + gf.mul_add_pf(target, MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch, pf); else - gf.mul_add(MAT_ROW(stripe, curRec2), MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch); + gf.mul_add(target, MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch); } } } @@ -119,8 +130,8 @@ void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, uns #define REPLACE_WORD(r, c, v) state.gf.replace_word(MAT_ROW((c)/(stripeWidth / sizeof(uint16_t)), r), (c)%(stripeWidth / sizeof(uint16_t)), v) -template -int Galois16RecMatrix::initScale(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recFirst, unsigned recLast) { +template +int Galois16RecMatrix::scaleRows(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recFirst, unsigned recLast) { assert(recFirst <= recLast); assert(rec != recFirst); @@ -185,8 +196,10 @@ int Galois16RecMatrix::initScale(Galois16RecMatrixComputeState& state, unsigned return -1; \ } - // the next row when `processRow` is called; last action will prefetch this row - uint16_t* nextScaleRow = (rec+rows < recLast) ? 
MAT_ROW(0, rec+rows) : nullptr; + // the next row when `applyRows` is called; last action will prefetch this row + uint16_t* nextScaleRow = nullptr; + if(!state.workers.empty() && recFirst < recLast) + nextScaleRow = MAT_ROW(0, recFirst); // only prefetch if we're not sending data to threads // TODO: consider loop tiling this stuff; requires extracting a small matrix (rows*rows), and solving that, which means a scalar multiply is necessary @@ -271,20 +284,19 @@ void Galois16RecMatrix::fillCoeffs(Galois16RecMatrixComputeState& state, unsigne } } -template -void Galois16RecMatrix::processRow(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, unsigned coeffWidth) { +template +void Galois16RecMatrix::applyRows(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, unsigned coeffWidth, int nextRow) { // TODO: consider optimisation for numStripes == 1 ? assert(recFirst < recLast); assert(rec != recFirst); - // the next row when `processRow` is called; last action will prefetch this row - uint16_t* nextScaleRow = (rec+rows < recLast) ? MAT_ROW(0, rec+rows) : nullptr; // do main elimination, using the source group - if(state.workers.empty()) + if(state.workers.empty()) { // process elimination directly - invertLoop(0, numStripes, recFirst, recLast, rec, recCount, state.coeff, coeffWidth, state.srcRowsBase, state.gf, state.gfScratch, nextScaleRow); - else { + uint16_t* nextScaleRow = nextRow >= 0 ? 
MAT_ROW(0, (unsigned)nextRow) : nullptr; + invertLoop(0, numStripes, recFirst, recLast, rec, recCount, state.coeff, coeffWidth, state.srcRowsBase, state.gf, state.gfScratch, nextScaleRow, state.pfFactor); + } else { // process using workers std::atomic procRefs; std::promise done; @@ -370,7 +382,7 @@ void Galois16RecMatrix::processRow(Galois16RecMatrixComputeState& state, unsigne #undef MAT_ROW -template +template int Galois16RecMatrix::processRows(Galois16RecMatrixComputeState& state, unsigned& rec, unsigned rowGroupSize, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress) { unsigned alignedRowGroupSize = (rowGroupSize / rows) * rows; while(rec <= numRec-rows) { @@ -393,11 +405,15 @@ int Galois16RecMatrix::processRows(Galois16RecMatrixComputeState& state, unsigne unsigned recFirst = recStart; if(recFirst == rec) recFirst += rows; - int badRowOffset = initScale(state, rec, recFirst, curRowGroupSize+recStart); + int badRowOffset = scaleRows(state, rec, recFirst, curRowGroupSize+recStart); if(badRowOffset >= 0) return rec+badRowOffset; if(recFirst == curRowGroupSize+recStart) continue; fillCoeffs(state, rows, recFirst, curRowGroupSize+recStart, rec, rows); - processRow(state, rec, rows, recFirst, curRowGroupSize+recStart, rows); + + int nextRow = recStart; + if(rec+rows == curRowGroupSize+recStart) + nextRow = recStart > 0 ? 0 : (numRec>=curRowGroupSize*2 ? 
curRowGroupSize : -1); + applyRows(state, rec, rows, recFirst, curRowGroupSize+recStart, rows, nextRow); } @@ -420,7 +436,14 @@ int Galois16RecMatrix::processRows(Galois16RecMatrixComputeState& state, unsigne curRowGroupSize2 = recStart-recGroup; // don't let this group cross into the normalized group assert(curRowGroupSize2 > 0); fillCoeffs(state, curRowGroupSize, recGroup, recGroup+curRowGroupSize2, recStart, curRowGroupSize); - processRow(state, recStart, curRowGroupSize, recGroup, recGroup+curRowGroupSize2, curRowGroupSize); + + int nextRow = recGroup + curRowGroupSize2; + if((unsigned)nextRow >= numRec) + nextRow = rec+curRowGroupSize2 < numRec ? rec : -1; + else if((unsigned)nextRow+curRowGroupSize2 > numRec) + nextRow = -1; // don't over prefetch; TODO: is there a way to still prefetch something? + applyRows(state, recStart, curRowGroupSize, recGroup, recGroup+curRowGroupSize2, curRowGroupSize, nextRow); + recGroup += curRowGroupSize2; } } @@ -541,6 +564,7 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va Galois16RecMatrixComputeState state(Galois16Mul::default_method(matWidth, (unsigned)inputValid.size(), (unsigned)inputValid.size(), true)); state.validCount = validCount; const auto gfInfo = state.gf.info(); + state.pfFactor = gfInfo.prefetchDownscale; // divide the matrix up into evenly sized stripes (for loop tiling optimisation) numStripes = ROUND_DIV(matWidth, (unsigned)gfInfo.idealChunkSize); diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index 3ad049c6..ce53d7fe 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -19,14 +19,14 @@ class Galois16RecMatrix { unsigned numThreads; void Construct(const std::vector& inputValid, unsigned validCount, const std::vector& recovery); - template - void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul& 
gf, void* gfScratch, const void* nextPf); - template - int initScale(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recFirst, unsigned recLast); + template + void invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul& gf, void* gfScratch, const void* nextPf, unsigned pfFactor); + template + int scaleRows(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recFirst, unsigned recLast); void fillCoeffs(Galois16RecMatrixComputeState& state, unsigned rows, unsigned recFirst, unsigned recLast, unsigned rec, unsigned coeffWidth); - template - void processRow(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, unsigned coeffWidth); - template + template + void applyRows(Galois16RecMatrixComputeState& state, unsigned rec, unsigned recCount, unsigned recFirst, unsigned recLast, unsigned coeffWidth, int nextRow); + template int processRows(Galois16RecMatrixComputeState& state, unsigned& rec, unsigned rowGroupSize, std::function progressCb, uint16_t progressOffset, uint16_t totalProgress); public: Galois16RecMatrix(); From f5497dd4f4497d26dfed3f5b82785ca0a269b10b Mon Sep 17 00:00:00 2001 From: animetosho Date: Fri, 21 Jul 2023 13:26:29 +1000 Subject: [PATCH 41/91] Also prefetch next rowSrc in inversion --- gf16/gfmat_inv.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 8ba80fbe..7f0d91e4 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -82,9 +82,11 @@ template void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, unsigned recFirst, unsigned recLast, unsigned recSrc, unsigned recSrcCount, uint16_t* rowCoeffs, unsigned coeffWidth, void* (&srcRowsBase)[PP_INVERT_MAX_MULTI_ROWS], Galois16Mul& gf, void* 
gfScratch, const void* nextPf, unsigned pfFactor) { assert(recSrcCount % rows == 0); // when to start prefetching the next stripe - unsigned recStartPf = 0; + unsigned recStartPf = 0, recSrcStartPf = recFirst; if(recSrcCount > rows< rows<> pfFactor; } // TODO: if numStripes==1, we might want to avoid prefetching the same row group as the first applyRows loop would - } else - pf = nullptr; + } else { + if(curRec2 == recSrcStartPf) + // prefetch next rowSrc + pf = (const uint8_t*)(MAT_ROW(stripe, rec+rows)); + else if(curRec2 < recSrcStartPf) + pf = nullptr; + } uint16_t* target = MAT_ROW(stripe, curRec2); uint16_t* coeffPtr = rowCoeffs + (curRec2-recFirst)*coeffWidth + recI; @@ -123,6 +128,7 @@ void Galois16RecMatrix::invertLoop(unsigned stripeStart, unsigned stripeEnd, uns else gf.mul_add(target, MAT_ROW(stripe, rec), stripeWidth, *coeffPtr, gfScratch); } + if(pf) pf += stripeWidth >> pfFactor; } } } @@ -676,7 +682,6 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va Galois16RecMatrix::Galois16RecMatrix() : mat(nullptr) { numThreads = hardware_concurrency(); - if(numThreads > 4) numThreads = 4; // by default, cap at 4 threads, as scaling doesn't work so well; TODO: tweak this later numRec = 0; numStripes = 0; stripeWidth = 0; From 6f90be9ad4b095d4a978360b991d03e4426b6790 Mon Sep 17 00:00:00 2001 From: animetosho Date: Fri, 21 Jul 2023 15:33:28 +1000 Subject: [PATCH 42/91] Target 512K cache for inversion + ensure hardware_concurrency() > 0 --- gf16/gfmat_inv.cpp | 5 +++-- gf16/threadqueue.h | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 7f0d91e4..26b4055b 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -612,8 +612,9 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } else state.gfScratch = state.gf.mutScratch_alloc(); - // target L3 slice? 
use 1MB target for now; TODO: improve this - unsigned rowGroupSize = (1024*1024 / stripeWidth); + // target L2 size - 512K seems to be a reasonable guess for now; TODO: improve this + // - targeting larger L2 (e.g. >1MB) seems to perform worse, so a fixed size might end up being better + unsigned rowGroupSize = (512*1024 / stripeWidth); // if it's going to be split amongst cores, increase the number of rows in a group if(numStripes < _numThreads) rowGroupSize *= _numThreads/numStripes; unsigned rowMultiple = (std::min)(gfInfo.idealInputMultiple, PP_INVERT_MAX_MULTI_ROWS); diff --git a/gf16/threadqueue.h b/gf16/threadqueue.h index ee11a8c2..3db53cb3 100644 --- a/gf16/threadqueue.h +++ b/gf16/threadqueue.h @@ -392,8 +392,8 @@ class MessageThread { }; static inline int hardware_concurrency() { -#ifdef USE_LIBUV int threads; +#ifdef USE_LIBUV #if UV_VERSION_HEX >= 0x12c00 // 1.44.0 threads = uv_available_parallelism(); #else @@ -401,10 +401,11 @@ static inline int hardware_concurrency() { uv_cpu_info(&info, &threads); uv_free_cpu_info(info, threads); #endif - return threads; #else - return (int)std::thread::hardware_concurrency(); + threads = (int)std::thread::hardware_concurrency(); #endif + if(threads < 1) threads = 1; + return threads; } From 10739248ebf4ab5abe75d20ac6fc57a0beb93f2a Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 25 Jul 2023 18:26:55 +1000 Subject: [PATCH 43/91] Rename gf16pmul_clmul* -> gf16_pmul* --- gf16/gf16pmul.cpp | 40 +++++++++---------- gf16/gf16pmul.h | 6 +-- ...{gf16pmul_clmul_avx2.c => gf16pmul_avx2.c} | 2 +- ...{gf16pmul_clmul_neon.c => gf16pmul_neon.c} | 8 ++-- gf16/{gf16pmul_clmul_sse.c => gf16pmul_sse.c} | 2 +- ...{gf16pmul_clmul_sve2.c => gf16pmul_sve2.c} | 12 +++--- ...l_clmul_vpclgfni.c => gf16pmul_vpclgfni.c} | 2 +- ...mul_clmul_vpclmul.c => gf16pmul_vpclmul.c} | 2 +- gf16/{gf16pmul_clmul_x86.h => gf16pmul_x86.h} | 16 ++++---- 9 files changed, 45 insertions(+), 45 deletions(-) rename gf16/{gf16pmul_clmul_avx2.c => 
gf16pmul_avx2.c} (88%) rename gf16/{gf16pmul_clmul_neon.c => gf16pmul_neon.c} (80%) rename gf16/{gf16pmul_clmul_sse.c => gf16pmul_sse.c} (88%) rename gf16/{gf16pmul_clmul_sve2.c => gf16pmul_sve2.c} (77%) rename gf16/{gf16pmul_clmul_vpclgfni.c => gf16pmul_vpclgfni.c} (90%) rename gf16/{gf16pmul_clmul_vpclmul.c => gf16pmul_vpclmul.c} (89%) rename gf16/{gf16pmul_clmul_x86.h => gf16pmul_x86.h} (92%) diff --git a/gf16/gf16pmul.cpp b/gf16/gf16pmul.cpp index ffc59f35..5e1f7504 100644 --- a/gf16/gf16pmul.cpp +++ b/gf16/gf16pmul.cpp @@ -30,47 +30,47 @@ void setup_pmul() { hasGFNI = (cpuInfoX[2] & 0x100) == 0x100; #endif - if(!hasGFNI) gf16pmul_clmul_available_vpclgfni = 0; + if(!hasGFNI) gf16pmul_available_vpclgfni = 0; if(!hasVPCLMUL) { - gf16pmul_clmul_available_vpclmul = 0; - gf16pmul_clmul_available_vpclgfni = 0; + gf16pmul_available_vpclmul = 0; + gf16pmul_available_vpclgfni = 0; } - if(!hasAVX2) gf16pmul_clmul_available_avx2 = 0; - if(!hasClMul) gf16pmul_clmul_available_sse = 0; + if(!hasAVX2) gf16pmul_available_avx2 = 0; + if(!hasClMul) gf16pmul_available_sse = 0; - if(gf16pmul_clmul_available_vpclgfni) { - gf16pmul = &gf16pmul_clmul_vpclgfni; + if(gf16pmul_available_vpclgfni) { + gf16pmul = &gf16pmul_vpclgfni; gf16pmul_alignment = 32; gf16pmul_blocklen = 64; } - else if(gf16pmul_clmul_available_vpclmul) { - gf16pmul = &gf16pmul_clmul_vpclmul; + else if(gf16pmul_available_vpclmul) { + gf16pmul = &gf16pmul_vpclmul; gf16pmul_alignment = 32; gf16pmul_blocklen = 32; } - else if(gf16pmul_clmul_available_avx2) { - gf16pmul = &gf16pmul_clmul_avx2; + else if(gf16pmul_available_avx2) { + gf16pmul = &gf16pmul_avx2; gf16pmul_alignment = 32; gf16pmul_blocklen = 32; } - else if(gf16pmul_clmul_available_sse) { - gf16pmul = &gf16pmul_clmul_sse; + else if(gf16pmul_available_sse) { + gf16pmul = &gf16pmul_sse; gf16pmul_alignment = 16; gf16pmul_blocklen = 16; } #endif #ifdef PLATFORM_ARM - if(!CPU_HAS_SVE2) gf16pmul_clmul_available_sve2 = 0; - if(!CPU_HAS_NEON) 
gf16pmul_clmul_available_neon = 0; + if(!CPU_HAS_SVE2) gf16pmul_available_sve2 = 0; + if(!CPU_HAS_NEON) gf16pmul_available_neon = 0; - if(gf16pmul_clmul_available_sve2) { - gf16pmul = &gf16pmul_clmul_sve2; - gf16pmul_alignment = gf16pmul_clmul_sve2_width(); + if(gf16pmul_available_sve2) { + gf16pmul = &gf16pmul_sve2; + gf16pmul_alignment = gf16pmul_sve2_width(); gf16pmul_blocklen = gf16pmul_alignment*2; } - else if(gf16pmul_clmul_available_neon) { - gf16pmul = &gf16pmul_clmul_neon; + else if(gf16pmul_available_neon) { + gf16pmul = &gf16pmul_neon; gf16pmul_alignment = 16; gf16pmul_blocklen = 32; } diff --git a/gf16/gf16pmul.h b/gf16/gf16pmul.h index 180a9025..7ef94ded 100644 --- a/gf16/gf16pmul.h +++ b/gf16/gf16pmul.h @@ -14,8 +14,8 @@ void setup_pmul(); HEDLEY_BEGIN_C_DECLS #define _PMUL_DECL(f) \ - void gf16pmul_clmul_##f(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); \ - extern int gf16pmul_clmul_available_##f + void gf16pmul_##f(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); \ + extern int gf16pmul_available_##f _PMUL_DECL(sse); _PMUL_DECL(avx2); @@ -26,7 +26,7 @@ _PMUL_DECL(sve2); #undef _PMUL_DECL -unsigned gf16pmul_clmul_sve2_width(); +unsigned gf16pmul_sve2_width(); HEDLEY_END_C_DECLS diff --git a/gf16/gf16pmul_clmul_avx2.c b/gf16/gf16pmul_avx2.c similarity index 88% rename from gf16/gf16pmul_clmul_avx2.c rename to gf16/gf16pmul_avx2.c index ce965f4c..1c0cc4a1 100644 --- a/gf16/gf16pmul_clmul_avx2.c +++ b/gf16/gf16pmul_avx2.c @@ -9,4 +9,4 @@ #if defined(__PCLMUL__) && defined(__AVX2__) # define _AVAILABLE 1 #endif -#include "gf16pmul_clmul_x86.h" +#include "gf16pmul_x86.h" diff --git a/gf16/gf16pmul_clmul_neon.c b/gf16/gf16pmul_neon.c similarity index 80% rename from gf16/gf16pmul_clmul_neon.c rename to gf16/gf16pmul_neon.c index c4d8f76c..c23cc3c4 100644 --- a/gf16/gf16pmul_clmul_neon.c +++ b/gf16/gf16pmul_neon.c @@ -2,9 +2,9 @@ #include "gf16_clmul_neon.h" #ifdef __ARM_NEON -int 
gf16pmul_clmul_available_neon = 1; +int gf16pmul_available_neon = 1; -void gf16pmul_clmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { +void gf16pmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { assert(len % sizeof(uint8x16_t)*2 == 0); const poly8_t* _src1 = (const poly8_t*)src1 + len; @@ -32,8 +32,8 @@ void gf16pmul_clmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void } #else // defined(__ARM_NEON) -int gf16pmul_clmul_available_neon = 0; -void gf16pmul_clmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { +int gf16pmul_available_neon = 0; +void gf16pmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { UNUSED(dst); UNUSED(src1); UNUSED(src2); UNUSED(len); } #endif diff --git a/gf16/gf16pmul_clmul_sse.c b/gf16/gf16pmul_sse.c similarity index 88% rename from gf16/gf16pmul_clmul_sse.c rename to gf16/gf16pmul_sse.c index 5338858d..2def2279 100644 --- a/gf16/gf16pmul_clmul_sse.c +++ b/gf16/gf16pmul_sse.c @@ -9,4 +9,4 @@ #if defined(__PCLMUL__) && defined(__SSSE3__) && defined(__SSE4_1__) # define _AVAILABLE 1 #endif -#include "gf16pmul_clmul_x86.h" +#include "gf16pmul_x86.h" diff --git a/gf16/gf16pmul_clmul_sve2.c b/gf16/gf16pmul_sve2.c similarity index 77% rename from gf16/gf16pmul_clmul_sve2.c rename to gf16/gf16pmul_sve2.c index b88fe2c2..43adc210 100644 --- a/gf16/gf16pmul_clmul_sve2.c +++ b/gf16/gf16pmul_sve2.c @@ -2,9 +2,9 @@ #include "gf16_clmul_sve2.h" #ifdef __ARM_FEATURE_SVE2 -int gf16pmul_clmul_available_sve2 = 1; +int gf16pmul_available_sve2 = 1; -void gf16pmul_clmul_sve2(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { +void gf16pmul_sve2(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { assert(len % svcntb()*2 == 0); const uint8_t* _src1 = (const uint8_t*)src1 + len; @@ -28,17 +28,17 @@ void gf16pmul_clmul_sve2(void *HEDLEY_RESTRICT dst, const void* 
src1, const void } } -unsigned gf16pmul_clmul_sve2_width() { +unsigned gf16pmul_sve2_width() { return svcntb(); } #else // defined(__ARM_FEATURE_SVE2) -int gf16pmul_clmul_available_sve2 = 0; -void gf16pmul_clmul_sve2(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { +int gf16pmul_available_sve2 = 0; +void gf16pmul_sve2(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { UNUSED(dst); UNUSED(src1); UNUSED(src2); UNUSED(len); } -unsigned gf16pmul_clmul_sve2_width() { +unsigned gf16pmul_sve2_width() { return 1; } #endif diff --git a/gf16/gf16pmul_clmul_vpclgfni.c b/gf16/gf16pmul_vpclgfni.c similarity index 90% rename from gf16/gf16pmul_clmul_vpclgfni.c rename to gf16/gf16pmul_vpclgfni.c index 26ad0478..4474f6fe 100644 --- a/gf16/gf16pmul_clmul_vpclgfni.c +++ b/gf16/gf16pmul_vpclgfni.c @@ -12,4 +12,4 @@ #if defined(__VPCLMULQDQ__) && defined(__GFNI__) && defined(__AVX2__) # define _AVAILABLE 1 #endif -#include "gf16pmul_clmul_x86.h" +#include "gf16pmul_x86.h" diff --git a/gf16/gf16pmul_clmul_vpclmul.c b/gf16/gf16pmul_vpclmul.c similarity index 89% rename from gf16/gf16pmul_clmul_vpclmul.c rename to gf16/gf16pmul_vpclmul.c index 715544a9..a4140b10 100644 --- a/gf16/gf16pmul_clmul_vpclmul.c +++ b/gf16/gf16pmul_vpclmul.c @@ -11,4 +11,4 @@ #if defined(__VPCLMULQDQ__) && defined(__AVX2__) # define _AVAILABLE 1 #endif -#include "gf16pmul_clmul_x86.h" +#include "gf16pmul_x86.h" diff --git a/gf16/gf16pmul_clmul_x86.h b/gf16/gf16pmul_x86.h similarity index 92% rename from gf16/gf16pmul_clmul_x86.h rename to gf16/gf16pmul_x86.h index 654449fa..b74d24cf 100644 --- a/gf16/gf16pmul_clmul_x86.h +++ b/gf16/gf16pmul_x86.h @@ -1,9 +1,9 @@ #include "gf16_global.h" #if defined(_AVAILABLE) -int _FN(gf16pmul_clmul_available) = 1; +int _FN(gf16pmul_available) = 1; -static HEDLEY_ALWAYS_INLINE void _FN(gf16pmul_clmul_initmul)(const _mword* src1, const _mword* src2, _mword* prod1, _mword* prod2) { +static HEDLEY_ALWAYS_INLINE void 
_FN(gf16pmul_initmul)(const _mword* src1, const _mword* src2, _mword* prod1, _mword* prod2) { _mword wordMask = _MM(set1_epi32)(0xffff); _mword data1 = _MMI(load)(src1); @@ -61,7 +61,7 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16pmul_clmul_initmul)(const _mword* src1, #endif } -void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { +void _FN(gf16pmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { assert(len % sizeof(_mword) == 0); const uint8_t* _src1 = (const uint8_t*)src1 + len; @@ -93,8 +93,8 @@ void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void # endif for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(_mword)*2) { _mword prod1, prod2, prod3, prod4; - _FN(gf16pmul_clmul_initmul)((_mword*)(_src1 + ptr), (_mword*)(_src2 + ptr), &prod1, &prod2); - _FN(gf16pmul_clmul_initmul)((_mword*)(_src1 + ptr) +1, (_mword*)(_src2 + ptr) +1, &prod3, &prod4); + _FN(gf16pmul_initmul)((_mword*)(_src1 + ptr), (_mword*)(_src2 + ptr), &prod1, &prod2); + _FN(gf16pmul_initmul)((_mword*)(_src1 + ptr) +1, (_mword*)(_src2 + ptr) +1, &prod3, &prod4); // split low/high _mword tmp1 = _MM(shuffle_epi8)(prod1, shufLoHi); @@ -168,7 +168,7 @@ void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void #else for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(_mword)) { _mword prod1, prod2; - _FN(gf16pmul_clmul_initmul)((_mword*)(_src1 + ptr), (_mword*)(_src2 + ptr), &prod1, &prod2); + _FN(gf16pmul_initmul)((_mword*)(_src1 + ptr), (_mword*)(_src2 + ptr), &prod1, &prod2); // do reduction /* obvious Barret reduction strategy, using CLMUL instructions @@ -228,8 +228,8 @@ void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void } #else -int _FN(gf16pmul_clmul_available) = 0; -void _FN(gf16pmul_clmul)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { +int _FN(gf16pmul_available) = 0; +void _FN(gf16pmul)(void 
*HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { UNUSED(dst); UNUSED(src1); UNUSED(src2); UNUSED(len); } #endif From e294f6e51683f5312b61d9e2d1d2fa044af5d18d Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 27 Jul 2023 23:12:09 +1000 Subject: [PATCH 44/91] Prepare Affine kernel for AVX10 --- gf16/gf16_affine2x_x86.h | 4 +- gf16/gf16_affine_avx10.h | 438 ++++++++++++++++++++++++++++++++++++++ gf16/gf16_affine_avx2.c | 2 + gf16/gf16_affine_avx512.c | 420 ++---------------------------------- gf16/gf16_affine_gfni.c | 2 + gf16/gf16_muladd_multi.h | 16 +- gf16/gf16mul.cpp | 4 +- 7 files changed, 468 insertions(+), 418 deletions(-) create mode 100644 gf16/gf16_affine_avx10.h diff --git a/gf16/gf16_affine2x_x86.h b/gf16/gf16_affine2x_x86.h index b6391622..c8e494b9 100644 --- a/gf16/gf16_affine2x_x86.h +++ b/gf16/gf16_affine2x_x86.h @@ -73,9 +73,9 @@ void _FN(gf16_affine2x_prepare)(void* dst, const void* src, size_t srcLen) { #ifdef _AVAILABLE # ifdef PLATFORM_AMD64 -GF_PREPARE_PACKED_FUNCS(gf16_affine2x, _FNSUFFIX, sizeof(_mword), _FN(gf16_affine2x_prepare_block), _FN(gf16_affine2x_prepare_blocku), 6 + (MWORD_SIZE==64)*6, _MM_END, _mword checksum = _MMI(setzero)(), _FN(gf16_checksum_block), _FN(gf16_checksum_blocku), _FN(gf16_checksum_exp), _FN(gf16_checksum_prepare), sizeof(_mword)) +GF_PREPARE_PACKED_FUNCS(gf16_affine2x, _FNSUFFIX, sizeof(_mword), _FNPREP(gf16_affine2x_prepare_block), _FNPREP(gf16_affine2x_prepare_blocku), 6 + (MWORD_SIZE==64)*6, _MM_END, _mword checksum = _MMI(setzero)(), _FNPREP(gf16_checksum_block), _FNPREP(gf16_checksum_blocku), _FNPREP(gf16_checksum_exp), _FNPREP(gf16_checksum_prepare), sizeof(_mword)) # else -GF_PREPARE_PACKED_FUNCS(gf16_affine2x, _FNSUFFIX, sizeof(_mword), _FN(gf16_affine2x_prepare_block), _FN(gf16_affine2x_prepare_blocku), 2, _MM_END, _mword checksum = _MMI(setzero)(), _FN(gf16_checksum_block), _FN(gf16_checksum_blocku), _FN(gf16_checksum_exp), _FN(gf16_checksum_prepare), sizeof(_mword)) 
+GF_PREPARE_PACKED_FUNCS(gf16_affine2x, _FNSUFFIX, sizeof(_mword), _FNPREP(gf16_affine2x_prepare_block), _FNPREP(gf16_affine2x_prepare_blocku), 2, _MM_END, _mword checksum = _MMI(setzero)(), _FNPREP(gf16_checksum_block), _FNPREP(gf16_checksum_blocku), _FNPREP(gf16_checksum_exp), _FNPREP(gf16_checksum_prepare), sizeof(_mword)) # endif #else GF_PREPARE_PACKED_FUNCS_STUB(gf16_affine2x, _FNSUFFIX) diff --git a/gf16/gf16_affine_avx10.h b/gf16/gf16_affine_avx10.h new file mode 100644 index 00000000..f2e9de1a --- /dev/null +++ b/gf16/gf16_affine_avx10.h @@ -0,0 +1,438 @@ + +#ifdef _AVAILABLE +int _FN(gf16_affine_available) = 1; +# include "gf16_shuffle_x86_prepare.h" +# include "gf16_checksum_x86.h" +#else +int _FN(gf16_affine_available) = 0; +#endif + +#include "gf16_affine2x_x86.h" +#include "gf16_muladd_multi.h" + + +#ifdef _AVAILABLE +# ifdef PLATFORM_AMD64 +GF_PREPARE_PACKED_FUNCS(gf16_affine, _FNSUFFIX, sizeof(_mword)*2, _FNPREP(gf16_shuffle_prepare_block), _FNPREP(gf16_shuffle_prepare_blocku), 6, _mm256_zeroupper(), _mword checksum = _MMI(setzero)(), _FNPREP(gf16_checksum_block), _FNPREP(gf16_checksum_blocku), _FNPREP(gf16_checksum_exp), _FNPREP(gf16_checksum_prepare), sizeof(_mword)) +# else +GF_PREPARE_PACKED_FUNCS(gf16_affine, _FNSUFFIX, sizeof(_mword)*2, _FNPREP(gf16_shuffle_prepare_block), _FNPREP(gf16_shuffle_prepare_blocku), 1, _mm256_zeroupper(), _mword checksum = _MMI(setzero)(), _FNPREP(gf16_checksum_block), _FNPREP(gf16_checksum_blocku), _FNPREP(gf16_checksum_exp), _FNPREP(gf16_checksum_prepare), sizeof(_mword)) +# endif +#else +GF_PREPARE_PACKED_FUNCS_STUB(gf16_affine, _FNSUFFIX) +#endif + + +#ifdef _AVAILABLE +static HEDLEY_ALWAYS_INLINE __m256i gf16_affine_load_matrix(const void *HEDLEY_RESTRICT scratch, uint16_t coefficient) { + __m256i depmask = _mm256_xor_si256( + _mm256_load_si256((__m256i*)scratch + (coefficient & 0xf)*4), + _mm256_load_si256((__m256i*)((char*)scratch + ((coefficient << 3) & 0x780)) + 1) + ); + depmask = 
_mm256_ternarylogic_epi32( + depmask, + _mm256_load_si256((__m256i*)((char*)scratch + ((coefficient >> 1) & 0x780)) + 2), + _mm256_load_si256((__m256i*)((char*)scratch + ((coefficient >> 5) & 0x780)) + 3), + 0x96 + ); + return depmask; +} +#endif + + +#ifdef _AVAILABLE +static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine_muladd_round)(const _mword* src, _mword* tpl, _mword* tph, _mword mat_ll, _mword mat_hl, _mword mat_lh, _mword mat_hh) { + _mword ta = _MMI(load)(src); + _mword tb = _MMI(load)(src + 1); + + *tpl = _MM(ternarylogic_epi32)( + _MM(gf2p8affine_epi64_epi8)(ta, mat_lh, 0), + _MM(gf2p8affine_epi64_epi8)(tb, mat_ll, 0), + *tpl, + 0x96 + ); + *tph = _MM(ternarylogic_epi32)( + _MM(gf2p8affine_epi64_epi8)(ta, mat_hh, 0), + _MM(gf2p8affine_epi64_epi8)(tb, mat_hl, 0), + *tph, + 0x96 + ); +} +static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine_muladd_x)( + const void *HEDLEY_RESTRICT scratch, + uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, + GF16_MULADD_MULTI_SRCLIST, + size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, const int doPrefetch, const char* _pf +) { + GF16_MULADD_MULTI_SRC_UNUSED(6); + + _mword mat_All, mat_Alh, mat_Ahl, mat_Ahh; + _mword mat_Bll, mat_Blh, mat_Bhl, mat_Bhh; + _mword mat_Cll, mat_Clh, mat_Chl, mat_Chh; + _mword mat_Dll, mat_Dlh, mat_Dhl, mat_Dhh; + _mword mat_Ell, mat_Elh, mat_Ehl, mat_Ehh; + _mword mat_Fll, mat_Flh, mat_Fhl, mat_Fhh; + + _mword depmask1; + #if MWORD_SIZE == 64 + __m256i depmask256; + __m512i depmask2; + #define PERM1(dstVec, srcLL) \ + dstVec##hh = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(3,3,3,3)); \ + dstVec##lh = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(1,1,1,1)); \ + dstVec##ll = _mm512_broadcastq_epi64(srcLL); \ + dstVec##hl = _mm512_broadcastq_epi64(_mm512_castsi512_si128(depmask2)) + #define PERM2(dstVec) \ + depmask2 = _mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(2,3,2,3)); \ + dstVec##hh = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(3,3,3,3)); \ + dstVec##lh = 
_mm512_permutex_epi64(depmask2, _MM_SHUFFLE(1,1,1,1)); \ + dstVec##ll = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(2,2,2,2)); \ + dstVec##hl = _mm512_broadcastq_epi64(_mm512_castsi512_si128(depmask2)) + + if(srcCount == 1) { + depmask256 = gf16_affine_load_matrix(scratch, coefficients[0]); + depmask2 = _mm512_castsi256_si512(depmask256); + depmask2 = _mm512_shuffle_i64x2(depmask2, depmask2, _MM_SHUFFLE(0,1,0,1)); + PERM1(mat_A, _mm256_castsi256_si128(depmask256)); + } else if(srcCount > 1) { + depmask1 = gf16_affine_load2_matrix(scratch, coefficients[0], coefficients[1]); + depmask2 = _mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(0,1,0,1)); + PERM1(mat_A, _mm512_castsi512_si128(depmask1)); + PERM2(mat_B); + } + if(srcCount == 3) { + depmask256 = gf16_affine_load_matrix(scratch, coefficients[2]); + depmask2 = _mm512_castsi256_si512(depmask256); + depmask2 = _mm512_shuffle_i64x2(depmask2, depmask2, _MM_SHUFFLE(0,1,0,1)); + PERM1(mat_C, _mm256_castsi256_si128(depmask256)); + } else if(srcCount > 3) { + depmask1 = gf16_affine_load2_matrix(scratch, coefficients[2], coefficients[3]); + depmask2 = _mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(0,1,0,1)); + PERM1(mat_C, _mm512_castsi512_si128(depmask1)); + PERM2(mat_D); + } + if(srcCount == 5) { + depmask256 = gf16_affine_load_matrix(scratch, coefficients[4]); + depmask2 = _mm512_castsi256_si512(depmask256); + depmask2 = _mm512_shuffle_i64x2(depmask2, depmask2, _MM_SHUFFLE(0,1,0,1)); + PERM1(mat_E, _mm256_castsi256_si128(depmask256)); + } else if(srcCount > 5) { + depmask1 = gf16_affine_load2_matrix(scratch, coefficients[4], coefficients[5]); + depmask2 = _mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(0,1,0,1)); + PERM1(mat_E, _mm512_castsi512_si128(depmask1)); + PERM2(mat_F); + } + #undef PERM2 + #else + #define PERM1(dstVec) \ + dstVec##hh = _mm256_permute4x64_epi64(depmask1, _MM_SHUFFLE(1,1,1,1)); \ + dstVec##lh = _mm256_permute4x64_epi64(depmask1, _MM_SHUFFLE(3,3,3,3)); \ + dstVec##ll = 
_mm256_broadcastq_epi64(_mm256_castsi256_si128(depmask1)); \ + dstVec##hl = _mm256_permute4x64_epi64(depmask1, _MM_SHUFFLE(2,2,2,2)) + #define LOAD_SRC(n, dstVec) \ + if(srcCount > n) { \ + depmask1 = gf16_affine_load_matrix(scratch, coefficients[n]); \ + PERM1(dstVec); \ + } + + LOAD_SRC(0, mat_A) + LOAD_SRC(1, mat_B) + LOAD_SRC(2, mat_C) + LOAD_SRC(3, mat_D) + LOAD_SRC(4, mat_E) + LOAD_SRC(5, mat_F) + #undef LOAD_SRC + #endif + #undef PERM1 + + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(_mword)*2) { + _mword tph = _MMI(load)((_mword*)(_dst + ptr)); + _mword tpl = _MMI(load)((_mword*)(_dst + ptr) + 1); + _FN(gf16_affine_muladd_round)((_mword*)(_src1 + ptr*srcScale), &tpl, &tph, mat_All, mat_Ahl, mat_Alh, mat_Ahh); + if(srcCount >= 2) + _FN(gf16_affine_muladd_round)((_mword*)(_src2 + ptr*srcScale), &tpl, &tph, mat_Bll, mat_Bhl, mat_Blh, mat_Bhh); + if(srcCount >= 3) + _FN(gf16_affine_muladd_round)((_mword*)(_src3 + ptr*srcScale), &tpl, &tph, mat_Cll, mat_Chl, mat_Clh, mat_Chh); + if(srcCount >= 4) + _FN(gf16_affine_muladd_round)((_mword*)(_src4 + ptr*srcScale), &tpl, &tph, mat_Dll, mat_Dhl, mat_Dlh, mat_Dhh); + if(srcCount >= 5) + _FN(gf16_affine_muladd_round)((_mword*)(_src5 + ptr*srcScale), &tpl, &tph, mat_Ell, mat_Ehl, mat_Elh, mat_Ehh); + if(srcCount >= 6) + _FN(gf16_affine_muladd_round)((_mword*)(_src6 + ptr*srcScale), &tpl, &tph, mat_Fll, mat_Fhl, mat_Flh, mat_Fhh); + _MMI(store)((_mword*)(_dst + ptr), tph); + _MMI(store)((_mword*)(_dst + ptr)+1, tpl); + + if(doPrefetch == 1) + _mm_prefetch(_pf+(ptr>>1), MM_HINT_WT1); + if(doPrefetch == 2) + _mm_prefetch(_pf+(ptr>>1), _MM_HINT_T1); + } +} +#endif /*defined(_AVAILABLE)*/ + +void _FN(gf16_affine_muladd)(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#ifdef _AVAILABLE + gf16_muladd_single(scratch, &_FN(gf16_affine_muladd_x), dst, src, len, coefficient); + 
_mm256_zeroupper(); +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); +#endif +} + +void _FN(gf16_affine_muladd_prefetch)(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch) { + UNUSED(mutScratch); +#ifdef _AVAILABLE + gf16_muladd_prefetch_single(scratch, &_FN(gf16_affine_muladd_x), dst, src, len, coefficient, prefetch); + _mm256_zeroupper(); +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(prefetch); +#endif +} + +#if defined(_AVAILABLE) && defined(PLATFORM_AMD64) +GF16_MULADD_MULTI_FUNCS(gf16_affine, _FNSUFFIX, _FN(gf16_affine_muladd_x), 6, sizeof(_mword)*2, 1, _mm256_zeroupper()) +#else +GF16_MULADD_MULTI_FUNCS_STUB(gf16_affine, _FNSUFFIX) +#endif + + + +#ifdef _AVAILABLE +static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_muladd_2round)(const int srcCountOffs, const void* _src1, const void* _src2, _mword* result, _mword* swapped, _mword matNorm1, _mword matSwap1, _mword matNorm2, _mword matSwap2) { + if(srcCountOffs < 0) return; + + _mword data1 = _MMI(load)(_src1); + if(srcCountOffs == 0) { + *result = _MMI(xor)( + *result, + _MM(gf2p8affine_epi64_epi8)(data1, matNorm1, 0) + ); + *swapped = _MMI(xor)( + *swapped, + _MM(gf2p8affine_epi64_epi8)(data1, matSwap1, 0) + ); + } + else { // if(srcCountOffs > 0) + _mword data2 = _MMI(load)(_src2); + *result = _MM(ternarylogic_epi32)( + *result, + _MM(gf2p8affine_epi64_epi8)(data1, matNorm1, 0), + _MM(gf2p8affine_epi64_epi8)(data2, matNorm2, 0), + 0x96 + ); + *swapped = _MM(ternarylogic_epi32)( + *swapped, + _MM(gf2p8affine_epi64_epi8)(data1, matSwap1, 0), + _MM(gf2p8affine_epi64_epi8)(data2, matSwap2, 0), + 0x96 + ); + } +} +static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_muladd_x)( + const void *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, + 
GF16_MULADD_MULTI_SRCLIST, + size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, const int doPrefetch, const char* _pf +) { + GF16_MULADD_MULTI_SRC_UNUSED(13); + + _mword depmask; + _mword matNormA, matSwapA; + _mword matNormB, matSwapB; + _mword matNormC, matSwapC; + _mword matNormD, matSwapD; + _mword matNormE, matSwapE; + _mword matNormF, matSwapF; + _mword matNormG, matSwapG; + _mword matNormH, matSwapH; + _mword matNormI, matSwapI; + _mword matNormJ, matSwapJ; + _mword matNormK, matSwapK; + _mword matNormL, matSwapL; + _mword matNormM, matSwapM; + + // prevent MSVC whining + matNormB = matSwapB = matNormC = matSwapC = matNormD = matSwapD = matNormE = matSwapE = matNormF = matSwapF = matNormG = matSwapG = matNormH = matSwapH = matNormI = matSwapI = matNormJ = matSwapJ = matNormK = matSwapK = matNormL = matSwapL = matNormM = matSwapM = +# if MWORD_SIZE == 64 + _mm512_undefined_epi32(); +# else + _mm256_undefined_si256(); +# endif + +# if MWORD_SIZE == 64 + if(srcCount == 1) { + depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[0])); + matNormA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + } + if(srcCount > 1) { + depmask = gf16_affine_load2_matrix(scratch, coefficients[0], coefficients[1]); + matNormA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + matNormB = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); + matSwapB = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); + } + if(srcCount == 3) { + depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[2])); + matNormC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + } + if(srcCount > 3) { + depmask = gf16_affine_load2_matrix(scratch, coefficients[2], 
coefficients[3]); + matNormC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + matNormD = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); + matSwapD = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); + } + if(srcCount == 5) { + depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[4])); + matNormE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + } + if(srcCount > 5) { + depmask = gf16_affine_load2_matrix(scratch, coefficients[4], coefficients[5]); + matNormE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + matNormF = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); + matSwapF = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); + } + if(srcCount == 7) { + depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[6])); + matNormG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + } + if(srcCount > 7) { + depmask = gf16_affine_load2_matrix(scratch, coefficients[6], coefficients[7]); + matNormG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + matNormH = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); + matSwapH = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); + } + if(srcCount == 9) { + depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[8])); + matNormI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + } + if(srcCount > 9) { + depmask = gf16_affine_load2_matrix(scratch, 
coefficients[8], coefficients[9]); + matNormI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + matNormJ = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); + matSwapJ = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); + } + if(srcCount == 11) { + depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[10])); + matNormK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + } + if(srcCount > 11) { + depmask = gf16_affine_load2_matrix(scratch, coefficients[10], coefficients[11]); + matNormK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + matNormL = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); + matSwapL = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); + } + if(srcCount == 13) { + depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[12])); + matNormM = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); + matSwapM = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); + } +# else + #define LOAD_SRC(n, mat) \ + if(srcCount > n) { \ + depmask = gf16_affine_load_matrix(scratch, coefficients[n]); \ + matNorm##mat = _mm256_inserti128_si256(depmask, _mm256_castsi256_si128(depmask), 1); \ + matSwap##mat = _mm256_permute2x128_si256(depmask, depmask, 0x11); \ + } + LOAD_SRC(0, A) + LOAD_SRC(1, B) + LOAD_SRC(2, C) + LOAD_SRC(3, D) + LOAD_SRC(4, E) + LOAD_SRC(5, F) + LOAD_SRC(6, G) + LOAD_SRC(7, H) + LOAD_SRC(8, I) + LOAD_SRC(9, J) + LOAD_SRC(10, K) + LOAD_SRC(11, L) + LOAD_SRC(12, M) + #undef LOAD_SRC +# endif + + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(_mword)) { + _mword data = _MMI(load)((_mword*)(_src1 + ptr*srcScale)); + _mword result = _MM(gf2p8affine_epi64_epi8)(data, 
matNormA, 0); + _mword swapped = _MM(gf2p8affine_epi64_epi8)(data, matSwapA, 0); + if(srcCount > 1) + data = _MMI(load)((_mword*)(_src2 + ptr*srcScale)); + if(srcCount >= 3) { + _mword data2 = _MMI(load)((_mword*)(_src3 + ptr*srcScale)); + result = _MM(ternarylogic_epi32)( + result, + _MM(gf2p8affine_epi64_epi8)(data, matNormB, 0), + _MM(gf2p8affine_epi64_epi8)(data2, matNormC, 0), + 0x96 + ); + swapped = _MM(ternarylogic_epi32)( + swapped, + _MM(gf2p8affine_epi64_epi8)(data, matSwapB, 0), + _MM(gf2p8affine_epi64_epi8)(data2, matSwapC, 0), + 0x96 + ); + } else if(srcCount == 2) { + result = _MMI(xor)( + result, + _MM(gf2p8affine_epi64_epi8)(data, matNormB, 0) + ); + swapped = _MMI(xor)( + swapped, + _MM(gf2p8affine_epi64_epi8)(data, matSwapB, 0) + ); + } + + _FN(gf16_affine2x_muladd_2round)(srcCount - 4, _src4 + ptr*srcScale, _src5 + ptr*srcScale, &result, &swapped, matNormD, matSwapD, matNormE, matSwapE); + _FN(gf16_affine2x_muladd_2round)(srcCount - 6, _src6 + ptr*srcScale, _src7 + ptr*srcScale, &result, &swapped, matNormF, matSwapF, matNormG, matSwapG); + _FN(gf16_affine2x_muladd_2round)(srcCount - 8, _src8 + ptr*srcScale, _src9 + ptr*srcScale, &result, &swapped, matNormH, matSwapH, matNormI, matSwapI); + _FN(gf16_affine2x_muladd_2round)(srcCount - 10, _src10 + ptr*srcScale, _src11 + ptr*srcScale, &result, &swapped, matNormJ, matSwapJ, matNormK, matSwapK); + _FN(gf16_affine2x_muladd_2round)(srcCount - 12, _src12 + ptr*srcScale, _src13 + ptr*srcScale, &result, &swapped, matNormL, matSwapL, matNormM, matSwapM); + + result = _MM(ternarylogic_epi32)( + result, + _MM(shuffle_epi32)(swapped, _MM_SHUFFLE(1,0,3,2)), + _MMI(load)((_mword*)(_dst + ptr)), + 0x96 + ); + _MMI(store) ((_mword*)(_dst + ptr), result); + + if(doPrefetch == 1) + _mm_prefetch(_pf+ptr, MM_HINT_WT1); + if(doPrefetch == 2) + _mm_prefetch(_pf+ptr, _MM_HINT_T1); + } +} +#endif /*defined(_AVAILABLE)*/ + + +void _FN(gf16_affine2x_muladd)(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, 
const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#ifdef _AVAILABLE + gf16_muladd_single(scratch, &_FN(gf16_affine2x_muladd_x), dst, src, len, coefficient); + _mm256_zeroupper(); +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); +#endif +} + + +#ifdef _AVAILABLE +# ifdef PLATFORM_AMD64 +// TODO: may not want 12 regions for non-packed variant +GF16_MULADD_MULTI_FUNCS(gf16_affine2x, _FNSUFFIX, _FN(gf16_affine2x_muladd_x), 12, sizeof(_mword), 0, _mm256_zeroupper()) +# else +// if only 8 registers available, only allow 2 parallel regions +GF16_MULADD_MULTI_FUNCS(gf16_affine2x, _FNSUFFIX, _FN(gf16_affine2x_muladd_x), 2, sizeof(_mword), 0, _mm256_zeroupper()) +# endif +#else +GF16_MULADD_MULTI_FUNCS_STUB(gf16_affine2x, _FNSUFFIX) +#endif diff --git a/gf16/gf16_affine_avx2.c b/gf16/gf16_affine_avx2.c index 1d42e328..67879d09 100644 --- a/gf16/gf16_affine_avx2.c +++ b/gf16/gf16_affine_avx2.c @@ -7,6 +7,7 @@ #define _MM(f) _mm256_ ## f #define _MMI(f) _mm256_ ## f ## _si256 #define _FNSUFFIX _avx2 +#define _FNPREP(f) f##_avx2 #define _MM_END _mm256_zeroupper(); #if defined(__GFNI__) && defined(__AVX2__) @@ -24,6 +25,7 @@ int gf16_affine_available_avx2 = 0; #endif #undef _MM_END #undef _FNSUFFIX +#undef _FNPREP #undef _MMI #undef _MM #undef _mword diff --git a/gf16/gf16_affine_avx512.c b/gf16/gf16_affine_avx512.c index 67f11f36..279ea3d0 100644 --- a/gf16/gf16_affine_avx512.c +++ b/gf16/gf16_affine_avx512.c @@ -7,55 +7,12 @@ #define _MM(f) _mm512_ ## f #define _MMI(f) _mm512_ ## f ## _si512 #define _FNSUFFIX _avx512 +#define _FNPREP(f) f##_avx512 #define _MM_END _mm256_zeroupper(); #if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) -int gf16_affine_available_avx512 = 1; # define _AVAILABLE 1 -# include "gf16_shuffle_x86_prepare.h" -# include "gf16_checksum_x86.h" -#else -int gf16_affine_available_avx512 = 0; -#endif - -#include 
"gf16_affine2x_x86.h" -#ifdef _AVAILABLE -# undef _AVAILABLE -#endif -#undef _MM_END -#undef _FNSUFFIX -#undef _MMI -#undef _MM -#undef _mword -#undef MWORD_SIZE - -#include "gf16_muladd_multi.h" - -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) -# ifdef PLATFORM_AMD64 -GF_PREPARE_PACKED_FUNCS(gf16_affine, _avx512, sizeof(__m512i)*2, gf16_shuffle_prepare_block_avx512, gf16_shuffle_prepare_blocku_avx512, 6, _mm256_zeroupper(), __m512i checksum = _mm512_setzero_si512(), gf16_checksum_block_avx512, gf16_checksum_blocku_avx512, gf16_checksum_exp_avx512, gf16_checksum_prepare_avx512, sizeof(__m512i)) -# else -GF_PREPARE_PACKED_FUNCS(gf16_affine, _avx512, sizeof(__m512i)*2, gf16_shuffle_prepare_block_avx512, gf16_shuffle_prepare_blocku_avx512, 1, _mm256_zeroupper(), __m512i checksum = _mm512_setzero_si512(), gf16_checksum_block_avx512, gf16_checksum_blocku_avx512, gf16_checksum_exp_avx512, gf16_checksum_prepare_avx512, sizeof(__m512i)) -# endif -#else -GF_PREPARE_PACKED_FUNCS_STUB(gf16_affine, _avx512) -#endif - -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) -static HEDLEY_ALWAYS_INLINE __m256i gf16_affine_load_matrix(const void *HEDLEY_RESTRICT scratch, uint16_t coefficient) { - __m256i depmask = _mm256_xor_si256( - _mm256_load_si256((__m256i*)scratch + (coefficient & 0xf)*4), - _mm256_load_si256((__m256i*)((char*)scratch + ((coefficient << 3) & 0x780)) + 1) - ); - depmask = _mm256_ternarylogic_epi32( - depmask, - _mm256_load_si256((__m256i*)((char*)scratch + ((coefficient >> 1) & 0x780)) + 2), - _mm256_load_si256((__m256i*)((char*)scratch + ((coefficient >> 5) & 0x780)) + 3), - 0x96 - ); - return depmask; -} static HEDLEY_ALWAYS_INLINE __m512i gf16_affine_load2_matrix(const void *HEDLEY_RESTRICT scratch, uint16_t coeff1, uint16_t coeff2) { __m512i depmask = _mm512_xor_si512( _mm512_inserti64x4( @@ -87,6 +44,19 @@ static HEDLEY_ALWAYS_INLINE __m512i gf16_affine_load2_matrix(const void *HEDLEY_ } #endif +#include 
"gf16_affine_avx10.h" +#ifdef _AVAILABLE +# undef _AVAILABLE +#endif +#undef _MM_END +#undef _FNSUFFIX +#undef _FNPREP +#undef _MMI +#undef _MM +#undef _mword +#undef MWORD_SIZE + + void gf16_affine_mul_avx512(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { UNUSED(mutScratch); #if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) @@ -125,345 +95,6 @@ void gf16_affine_mul_avx512(const void *HEDLEY_RESTRICT scratch, void* dst, cons } -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) -static HEDLEY_ALWAYS_INLINE void gf16_affine_muladd_round(const __m512i* src, __m512i* tpl, __m512i* tph, __m512i mat_ll, __m512i mat_hl, __m512i mat_lh, __m512i mat_hh) { - __m512i ta = _mm512_load_si512(src); - __m512i tb = _mm512_load_si512(src + 1); - - *tpl = _mm512_ternarylogic_epi32( - _mm512_gf2p8affine_epi64_epi8(ta, mat_lh, 0), - _mm512_gf2p8affine_epi64_epi8(tb, mat_ll, 0), - *tpl, - 0x96 - ); - *tph = _mm512_ternarylogic_epi32( - _mm512_gf2p8affine_epi64_epi8(ta, mat_hh, 0), - _mm512_gf2p8affine_epi64_epi8(tb, mat_hl, 0), - *tph, - 0x96 - ); -} -static HEDLEY_ALWAYS_INLINE void gf16_affine_muladd_x_avx512( - const void *HEDLEY_RESTRICT scratch, - uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, - GF16_MULADD_MULTI_SRCLIST, - size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, const int doPrefetch, const char* _pf -) { - GF16_MULADD_MULTI_SRC_UNUSED(6); - - __m512i mat_All, mat_Alh, mat_Ahl, mat_Ahh; - __m512i mat_Bll, mat_Blh, mat_Bhl, mat_Bhh; - __m512i mat_Cll, mat_Clh, mat_Chl, mat_Chh; - __m512i mat_Dll, mat_Dlh, mat_Dhl, mat_Dhh; - __m512i mat_Ell, mat_Elh, mat_Ehl, mat_Ehh; - __m512i mat_Fll, mat_Flh, mat_Fhl, mat_Fhh; - - #define PERM1(dstVec, srcLL) \ - dstVec##hh = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(3,3,3,3)); \ - dstVec##lh = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(1,1,1,1)); \ - dstVec##ll = 
_mm512_broadcastq_epi64(srcLL); \ - dstVec##hl = _mm512_broadcastq_epi64(_mm512_castsi512_si128(depmask2)) - #define PERM2(dstVec) \ - depmask2 = _mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(2,3,2,3)); \ - dstVec##hh = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(3,3,3,3)); \ - dstVec##lh = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(1,1,1,1)); \ - dstVec##ll = _mm512_permutex_epi64(depmask2, _MM_SHUFFLE(2,2,2,2)); \ - dstVec##hl = _mm512_broadcastq_epi64(_mm512_castsi512_si128(depmask2)) - - __m256i depmask256; - __m512i depmask1, depmask2; - if(srcCount == 1) { - depmask256 = gf16_affine_load_matrix(scratch, coefficients[0]); - depmask2 = _mm512_castsi256_si512(depmask256); - depmask2 = _mm512_shuffle_i64x2(depmask2, depmask2, _MM_SHUFFLE(0,1,0,1)); - PERM1(mat_A, _mm256_castsi256_si128(depmask256)); - } else if(srcCount > 1) { - depmask1 = gf16_affine_load2_matrix(scratch, coefficients[0], coefficients[1]); - depmask2 = _mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(0,1,0,1)); - PERM1(mat_A, _mm512_castsi512_si128(depmask1)); - PERM2(mat_B); - } - if(srcCount == 3) { - depmask256 = gf16_affine_load_matrix(scratch, coefficients[2]); - depmask2 = _mm512_castsi256_si512(depmask256); - depmask2 = _mm512_shuffle_i64x2(depmask2, depmask2, _MM_SHUFFLE(0,1,0,1)); - PERM1(mat_C, _mm256_castsi256_si128(depmask256)); - } else if(srcCount > 3) { - depmask1 = gf16_affine_load2_matrix(scratch, coefficients[2], coefficients[3]); - depmask2 = _mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(0,1,0,1)); - PERM1(mat_C, _mm512_castsi512_si128(depmask1)); - PERM2(mat_D); - } - if(srcCount == 5) { - depmask256 = gf16_affine_load_matrix(scratch, coefficients[4]); - depmask2 = _mm512_castsi256_si512(depmask256); - depmask2 = _mm512_shuffle_i64x2(depmask2, depmask2, _MM_SHUFFLE(0,1,0,1)); - PERM1(mat_E, _mm256_castsi256_si128(depmask256)); - } else if(srcCount > 5) { - depmask1 = gf16_affine_load2_matrix(scratch, coefficients[4], coefficients[5]); - depmask2 = 
_mm512_shuffle_i64x2(depmask1, depmask1, _MM_SHUFFLE(0,1,0,1)); - PERM1(mat_E, _mm512_castsi512_si128(depmask1)); - PERM2(mat_F); - } - #undef PERM1 - #undef PERM2 - - for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m512i)*2) { - __m512i tph = _mm512_load_si512((__m512i*)(_dst + ptr)); - __m512i tpl = _mm512_load_si512((__m512i*)(_dst + ptr) + 1); - gf16_affine_muladd_round((__m512i*)(_src1 + ptr*srcScale), &tpl, &tph, mat_All, mat_Ahl, mat_Alh, mat_Ahh); - if(srcCount >= 2) - gf16_affine_muladd_round((__m512i*)(_src2 + ptr*srcScale), &tpl, &tph, mat_Bll, mat_Bhl, mat_Blh, mat_Bhh); - if(srcCount >= 3) - gf16_affine_muladd_round((__m512i*)(_src3 + ptr*srcScale), &tpl, &tph, mat_Cll, mat_Chl, mat_Clh, mat_Chh); - if(srcCount >= 4) - gf16_affine_muladd_round((__m512i*)(_src4 + ptr*srcScale), &tpl, &tph, mat_Dll, mat_Dhl, mat_Dlh, mat_Dhh); - if(srcCount >= 5) - gf16_affine_muladd_round((__m512i*)(_src5 + ptr*srcScale), &tpl, &tph, mat_Ell, mat_Ehl, mat_Elh, mat_Ehh); - if(srcCount >= 6) - gf16_affine_muladd_round((__m512i*)(_src6 + ptr*srcScale), &tpl, &tph, mat_Fll, mat_Fhl, mat_Flh, mat_Fhh); - _mm512_store_si512((__m512i*)(_dst + ptr), tph); - _mm512_store_si512((__m512i*)(_dst + ptr)+1, tpl); - - if(doPrefetch == 1) - _mm_prefetch(_pf+(ptr>>1), MM_HINT_WT1); - if(doPrefetch == 2) - _mm_prefetch(_pf+(ptr>>1), _MM_HINT_T1); - } -} -#endif /*defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__)*/ - -void gf16_affine_muladd_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { - UNUSED(mutScratch); -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) - gf16_muladd_single(scratch, &gf16_affine_muladd_x_avx512, dst, src, len, coefficient); - _mm256_zeroupper(); -#else - UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); -#endif -} - -void gf16_affine_muladd_prefetch_avx512(const void 
*HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch) { - UNUSED(mutScratch); -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) - gf16_muladd_prefetch_single(scratch, &gf16_affine_muladd_x_avx512, dst, src, len, coefficient, prefetch); - _mm256_zeroupper(); -#else - UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); UNUSED(prefetch); -#endif -} - -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) && defined(PLATFORM_AMD64) -GF16_MULADD_MULTI_FUNCS(gf16_affine, _avx512, gf16_affine_muladd_x_avx512, 6, sizeof(__m512i)*2, 1, _mm256_zeroupper()) -#else -GF16_MULADD_MULTI_FUNCS_STUB(gf16_affine, _avx512) -#endif - - -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) -# include "gf16_bitdep_init_avx2.h" -#endif -void* gf16_affine_init_avx512(int polynomial) { -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) - __m128i* ret; - ALIGN_ALLOC(ret, sizeof(__m256i)*16*4, 32); - gf16_bitdep_init256(ret, polynomial, 1); - return ret; -#else - UNUSED(polynomial); - return NULL; -#endif -} - - -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) -static HEDLEY_ALWAYS_INLINE void gf16_affine2x_muladd_2round(const int srcCountOffs, const void* _src1, const void* _src2, __m512i* result, __m512i* swapped, __m512i matNorm1, __m512i matSwap1, __m512i matNorm2, __m512i matSwap2) { - if(srcCountOffs < 0) return; - - __m512i data1 = _mm512_load_si512(_src1); - if(srcCountOffs == 0) { - *result = _mm512_xor_si512( - *result, - _mm512_gf2p8affine_epi64_epi8(data1, matNorm1, 0) - ); - *swapped = _mm512_xor_si512( - *swapped, - _mm512_gf2p8affine_epi64_epi8(data1, matSwap1, 0) - ); - } - else { // if(srcCountOffs > 0) - __m512i data2 = _mm512_load_si512(_src2); - *result = _mm512_ternarylogic_epi32( - *result, - 
_mm512_gf2p8affine_epi64_epi8(data1, matNorm1, 0), - _mm512_gf2p8affine_epi64_epi8(data2, matNorm2, 0), - 0x96 - ); - *swapped = _mm512_ternarylogic_epi32( - *swapped, - _mm512_gf2p8affine_epi64_epi8(data1, matSwap1, 0), - _mm512_gf2p8affine_epi64_epi8(data2, matSwap2, 0), - 0x96 - ); - } -} -static HEDLEY_ALWAYS_INLINE void gf16_affine2x_muladd_x_avx512( - const void *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, - GF16_MULADD_MULTI_SRCLIST, - size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, const int doPrefetch, const char* _pf -) { - GF16_MULADD_MULTI_SRC_UNUSED(13); - - __m512i depmask; - __m512i matNormA, matSwapA; - __m512i matNormB, matSwapB; - __m512i matNormC, matSwapC; - __m512i matNormD, matSwapD; - __m512i matNormE, matSwapE; - __m512i matNormF, matSwapF; - __m512i matNormG, matSwapG; - __m512i matNormH, matSwapH; - __m512i matNormI, matSwapI; - __m512i matNormJ, matSwapJ; - __m512i matNormK, matSwapK; - __m512i matNormL, matSwapL; - __m512i matNormM, matSwapM; - - // prevent MSVC whining - matNormB = matSwapB = matNormC = matSwapC = matNormD = matSwapD = matNormE = matSwapE = matNormF = matSwapF = matNormG = matSwapG = matNormH = matSwapH = matNormI = matSwapI = matNormJ = matSwapJ = matNormK = matSwapK = matNormL = matSwapL = matNormM = matSwapM = _mm512_undefined_epi32(); - - if(srcCount == 1) { - depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[0])); - matNormA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - } - if(srcCount > 1) { - depmask = gf16_affine_load2_matrix(scratch, coefficients[0], coefficients[1]); - matNormA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapA = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - matNormB = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); - matSwapB = _mm512_shuffle_i64x2(depmask, depmask, 
_MM_SHUFFLE(3,3,3,3)); - } - if(srcCount == 3) { - depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[2])); - matNormC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - } - if(srcCount > 3) { - depmask = gf16_affine_load2_matrix(scratch, coefficients[2], coefficients[3]); - matNormC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapC = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - matNormD = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); - matSwapD = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); - } - if(srcCount == 5) { - depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[4])); - matNormE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - } - if(srcCount > 5) { - depmask = gf16_affine_load2_matrix(scratch, coefficients[4], coefficients[5]); - matNormE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapE = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - matNormF = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); - matSwapF = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); - } - if(srcCount == 7) { - depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[6])); - matNormG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - } - if(srcCount > 7) { - depmask = gf16_affine_load2_matrix(scratch, coefficients[6], coefficients[7]); - matNormG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapG = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - matNormH = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); - matSwapH = _mm512_shuffle_i64x2(depmask, 
depmask, _MM_SHUFFLE(3,3,3,3)); - } - if(srcCount == 9) { - depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[8])); - matNormI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - } - if(srcCount > 9) { - depmask = gf16_affine_load2_matrix(scratch, coefficients[8], coefficients[9]); - matNormI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapI = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - matNormJ = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); - matSwapJ = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); - } - if(srcCount == 11) { - depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[10])); - matNormK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - } - if(srcCount > 11) { - depmask = gf16_affine_load2_matrix(scratch, coefficients[10], coefficients[11]); - matNormK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapK = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - matNormL = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(2,2,2,2)); - matSwapL = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(3,3,3,3)); - } - if(srcCount == 13) { - depmask = _mm512_castsi256_si512(gf16_affine_load_matrix(scratch, coefficients[12])); - matNormM = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(0,0,0,0)); - matSwapM = _mm512_shuffle_i64x2(depmask, depmask, _MM_SHUFFLE(1,1,1,1)); - } - - - for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m512i)) { - __m512i data = _mm512_load_si512((__m512i*)(_src1 + ptr*srcScale)); - __m512i result = _mm512_gf2p8affine_epi64_epi8(data, matNormA, 0); - __m512i swapped = _mm512_gf2p8affine_epi64_epi8(data, matSwapA, 0); - if(srcCount > 1) - data = _mm512_load_si512((__m512i*)(_src2 + 
ptr*srcScale)); - if(srcCount >= 3) { - __m512i data2 = _mm512_load_si512((__m512i*)(_src3 + ptr*srcScale)); - result = _mm512_ternarylogic_epi32( - result, - _mm512_gf2p8affine_epi64_epi8(data, matNormB, 0), - _mm512_gf2p8affine_epi64_epi8(data2, matNormC, 0), - 0x96 - ); - swapped = _mm512_ternarylogic_epi32( - swapped, - _mm512_gf2p8affine_epi64_epi8(data, matSwapB, 0), - _mm512_gf2p8affine_epi64_epi8(data2, matSwapC, 0), - 0x96 - ); - } else if(srcCount == 2) { - result = _mm512_xor_si512( - result, - _mm512_gf2p8affine_epi64_epi8(data, matNormB, 0) - ); - swapped = _mm512_xor_si512( - swapped, - _mm512_gf2p8affine_epi64_epi8(data, matSwapB, 0) - ); - } - - gf16_affine2x_muladd_2round(srcCount - 4, _src4 + ptr*srcScale, _src5 + ptr*srcScale, &result, &swapped, matNormD, matSwapD, matNormE, matSwapE); - gf16_affine2x_muladd_2round(srcCount - 6, _src6 + ptr*srcScale, _src7 + ptr*srcScale, &result, &swapped, matNormF, matSwapF, matNormG, matSwapG); - gf16_affine2x_muladd_2round(srcCount - 8, _src8 + ptr*srcScale, _src9 + ptr*srcScale, &result, &swapped, matNormH, matSwapH, matNormI, matSwapI); - gf16_affine2x_muladd_2round(srcCount - 10, _src10 + ptr*srcScale, _src11 + ptr*srcScale, &result, &swapped, matNormJ, matSwapJ, matNormK, matSwapK); - gf16_affine2x_muladd_2round(srcCount - 12, _src12 + ptr*srcScale, _src13 + ptr*srcScale, &result, &swapped, matNormL, matSwapL, matNormM, matSwapM); - - result = _mm512_ternarylogic_epi32( - result, - _mm512_shuffle_epi32(swapped, _MM_SHUFFLE(1,0,3,2)), - _mm512_load_si512((__m512i*)(_dst + ptr)), - 0x96 - ); - _mm512_store_si512 ((__m512i*)(_dst + ptr), result); - - if(doPrefetch == 1) - _mm_prefetch(_pf+ptr, MM_HINT_WT1); - if(doPrefetch == 2) - _mm_prefetch(_pf+ptr, _MM_HINT_T1); - } -} -#endif /*defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__)*/ - void gf16_affine2x_mul_avx512(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT 
mutScratch) { UNUSED(mutScratch); @@ -487,26 +118,3 @@ void gf16_affine2x_mul_avx512(const void *HEDLEY_RESTRICT scratch, void* dst, co UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); #endif } - -void gf16_affine2x_muladd_avx512(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) { - UNUSED(mutScratch); -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) - gf16_muladd_single(scratch, &gf16_affine2x_muladd_x_avx512, dst, src, len, coefficient); - _mm256_zeroupper(); -#else - UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficient); -#endif -} - - -#if defined(__GFNI__) && defined(__AVX512BW__) && defined(__AVX512VL__) -# ifdef PLATFORM_AMD64 -// TODO: may not want 12 regions for non-packed variant -GF16_MULADD_MULTI_FUNCS(gf16_affine2x, _avx512, gf16_affine2x_muladd_x_avx512, 12, sizeof(__m512i), 0, _mm256_zeroupper()) -# else -// if only 8 registers available, only allow 2 parallel regions -GF16_MULADD_MULTI_FUNCS(gf16_affine2x, _avx512, gf16_affine2x_muladd_x_avx512, 2, sizeof(__m512i), 0, _mm256_zeroupper()) -# endif -#else -GF16_MULADD_MULTI_FUNCS_STUB(gf16_affine2x, _avx512) -#endif diff --git a/gf16/gf16_affine_gfni.c b/gf16/gf16_affine_gfni.c index e7668cdb..e2539ea3 100644 --- a/gf16/gf16_affine_gfni.c +++ b/gf16/gf16_affine_gfni.c @@ -8,6 +8,7 @@ #define _MM(f) _mm_ ## f #define _MMI(f) _mm_ ## f ## _si128 #define _FNSUFFIX _gfni +#define _FNPREP(f) f##_gfni #define _MM_END #if defined(__GFNI__) && defined(__SSSE3__) @@ -25,6 +26,7 @@ int gf16_affine_available_gfni = 0; #endif #undef _MM_END #undef _FNSUFFIX +#undef _FNPREP #undef _MMI #undef _MM #undef _mword diff --git a/gf16/gf16_muladd_multi.h b/gf16/gf16_muladd_multi.h index cb938cb4..f445dbb3 100644 --- a/gf16/gf16_muladd_multi.h +++ b/gf16/gf16_muladd_multi.h @@ -24,41 +24,41 @@ if(max < 18) UNUSED(_src18) #define 
GF16_MULADD_MULTI_FUNCS(fnpre, fnsuf, xfn, procRegions, blocksize, pfFactor, finisher) \ -void fnpre ## _muladd_multi ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ +void TOKENPASTE3(fnpre, _muladd_multi, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ UNUSED(mutScratch); \ gf16_muladd_multi(scratch, &xfn, procRegions, regions, offset, dst, src, len, coefficients); \ finisher; \ } \ -void fnpre ## _muladd_multi_stridepf ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch) { \ +void TOKENPASTE3(fnpre, _muladd_multi_stridepf, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch) { \ UNUSED(mutScratch); \ gf16_muladd_multi_stridepf(scratch, &xfn, procRegions, regions, srcStride, dst, src, len, coefficients, pfFactor, prefetch); \ finisher; \ } \ -void fnpre ## _muladd_multi_packed ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ +void TOKENPASTE3(fnpre, _muladd_multi_packed, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void 
*HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ UNUSED(mutScratch); \ gf16_muladd_multi_packed(scratch, &xfn, procRegions, procRegions, packedRegions, regions, dst, src, len, blocksize, coefficients); \ finisher; \ } \ -void fnpre ## _muladd_multi_packpf ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { \ +void TOKENPASTE3(fnpre, _muladd_multi_packpf, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { \ UNUSED(mutScratch); \ gf16_muladd_multi_packpf(scratch, &xfn, procRegions, procRegions, packedRegions, regions, dst, src, len, blocksize, coefficients, pfFactor, prefetchIn, prefetchOut); \ finisher; \ } #define GF16_MULADD_MULTI_FUNCS_STUB(fnpre, fnsuf) \ -void fnpre ## _muladd_multi ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ +void TOKENPASTE3(fnpre, _muladd_multi, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ UNUSED(mutScratch); \ UNUSED(scratch); UNUSED(regions); UNUSED(offset); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficients); \ } \ 
-void fnpre ## _muladd_multi_stridepf ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch) { \ +void TOKENPASTE3(fnpre, _muladd_multi_stridepf, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetch) { \ UNUSED(mutScratch); \ UNUSED(scratch); UNUSED(regions); UNUSED(srcStride); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficients); UNUSED(prefetch); \ } \ -void fnpre ## _muladd_multi_packed ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ +void TOKENPASTE3(fnpre, _muladd_multi_packed, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch) { \ UNUSED(mutScratch); \ UNUSED(scratch); UNUSED(packedRegions); UNUSED(regions); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficients); \ } \ -void fnpre ## _muladd_multi_packpf ## fnsuf(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { \ +void TOKENPASTE3(fnpre, _muladd_multi_packpf, fnsuf)(const void *HEDLEY_RESTRICT scratch, unsigned packedRegions, unsigned 
regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { \ UNUSED(mutScratch); \ UNUSED(scratch); UNUSED(packedRegions); UNUSED(regions); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(coefficients); UNUSED(prefetchIn); UNUSED(prefetchOut); \ } diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index ac7765f3..0d48bf73 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -847,7 +847,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { break; case GF16_AFFINE_AVX512: - scratch = gf16_affine_init_avx512(GF16_POLYNOMIAL); + scratch = gf16_affine_init_avx2(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_affine_available_avx512 && gf16_shuffle_available_avx512) _mul = &gf16_affine_mul_avx512; _mul_add = &gf16_affine_muladd_avx512; @@ -940,7 +940,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { break; case GF16_AFFINE2X_AVX512: - scratch = gf16_affine_init_avx512(GF16_POLYNOMIAL); + scratch = gf16_affine_init_avx2(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_affine_available_avx512 && gf16_shuffle_available_avx512) _mul = &gf16_affine2x_mul_avx512; _mul_add = &gf16_affine2x_muladd_avx512; From b5858b29c46971f938ba27d86cb35c17046345bb Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 31 Jul 2023 23:22:01 +1000 Subject: [PATCH 45/91] Remove references to deprecated String.prototype.substr + update arg_parser.js --- bin/parpar.js | 20 ++++++++++---------- lib/arg_parser.js | 42 ++++++++++++++++++++++++++++-------------- lib/par2gen.js | 2 +- test/par-compare.js | 2 +- 4 files changed, 40 insertions(+), 26 deletions(-) diff --git a/bin/parpar.js b/bin/parpar.js index f03af0b6..b123948a 100755 --- a/bin/parpar.js +++ b/bin/parpar.js @@ -393,7 +393,7 @@ if(argv.json) print_json('progress', data); }; else if(argv.progress == 'stdout' || argv.progress == 'stderr') { - var decimalPoint 
= (1.1).toLocaleString().substr(1, 1); + var decimalPoint = (1.1).toLocaleString().substring(1, 2); // TODO: display slices processed, pass# if verbose progress requested writeProgress = function(data) { // add formatting for aesthetics @@ -457,7 +457,7 @@ var inputFiles = argv._; stdInUsed = true; stream = process.stdin; } else { - stream = fs.createReadStream(null, {fd: fl[0].substr(5)|0}); + stream = fs.createReadStream(null, {fd: fl[0].substring(5)|0}); } // read from stream var data = ''; @@ -469,7 +469,7 @@ var inputFiles = argv._; }); stream.once('error', cb); } else if(/^proc:\/\//i.test(fl[0])) { - require('child_process').exec(fl[0].substr(7), {maxBuffer: 1048576*32, encoding: inlistEnc}, function(err, stdout, stderr) { + require('child_process').exec(fl[0].substring(7), {maxBuffer: 1048576*32, encoding: inlistEnc}, function(err, stdout, stderr) { cb(err, [fl[1], stdout]); }); } else { @@ -502,7 +502,7 @@ var inputFiles = argv._; creator: creator }; if(argv.out.match(/\.par2$/i)) - ppo.outputBase = argv.out.substr(0, argv.out.length-5); + ppo.outputBase = argv.out.substring(0, argv.out.length-5); for(var k in opts) { if(opts[k].map && (k in argv)) @@ -521,7 +521,7 @@ var inputFiles = argv._; var parseSizeOrNum = function(arg, input, multiple) { var m; - var isRec = (arg.substr(-15) == 'recovery-slices' || arg == 'slices-per-file' || arg == 'slices-first-file' || arg == 'packet-redundancy'); + var isRec = (arg.slice(-15) == 'recovery-slices' || arg == 'slices-per-file' || arg == 'slices-first-file' || arg == 'packet-redundancy'); input = input || argv[arg]; if(typeof input == 'number' || /^-?\d+$/.test(input)) { input = input|0; @@ -540,7 +540,7 @@ var inputFiles = argv._; error('Invalid value specified for `'+arg+'`'); var scale = 1; if(m[2].length > 2) { - scale = +(m[2].substr(2)); + scale = +(m[2].substring(2)); if(isNaN(scale) || !isFinite(scale)) error('Invalid value specified for `'+arg+'`'); if(m[2][1] == '/') { scale = 1/scale; @@ -588,7 +588,7 @@ 
var inputFiles = argv._; if(/^slices-/.test(k[0]) && (val[0] == '<' || val[0] == '>')) { // TODO: also do this for packet-redundancy? ppo[k[1]+'Rounding'] = (val[0] == '<' ? 'floor' : 'ceil'); - val = val.substr(1); + val = val.substring(1); } var expr = val.replace(/^[\-+]/, function(x) { if(x == '-') return '0-'; // hack to get initial negative term to work @@ -626,7 +626,7 @@ var inputFiles = argv._; var ret = {}; if(data.process) { ret.ratio = parseFloat(data.process); - if(data.process.substr(-1) == '%') + if(data.process.slice(-1) == '%') ret.ratio /= 100; } if(data.device) { @@ -675,8 +675,8 @@ var inputFiles = argv._; }; var openclOpts = {}; for(var k in argv) - if(k.substr(0, 7) == 'opencl-') - openclOpts[k.substr(7)] = argv[k]; + if(k.substring(0, 7) == 'opencl-') + openclOpts[k.substring(7)] = argv[k]; openclOpts = openclMap(openclOpts); if(argv.opencl) { ppo.openclDevices = argv.opencl.map(function(spec) { diff --git a/lib/arg_parser.js b/lib/arg_parser.js index 3502e682..0234802b 100644 --- a/lib/arg_parser.js +++ b/lib/arg_parser.js @@ -52,11 +52,12 @@ module.exports = function(argv, opts) { aliasMap[opts[k].alias] = k; } + var applyFn = {}; var setKey = function(key, val, explicit) { var o = opts[key]; if(o === undefined) throw new Error('Unknown option `' + key + '`'); - var isMultiple = (['list','array','map'].indexOf(o.type) !== -1); + var isMultiple = (['list','array','map','map2'].indexOf(o.type) !== -1); if((key in ret) && !isMultiple) throw new Error('Option `' + key + '` specified more than once'); @@ -104,7 +105,7 @@ module.exports = function(argv, opts) { } if(!(key in ret)) - ret[key] = (o.type == 'map') ? {} : []; + ret[key] = (o.type == 'map' || o.type == 'map2') ? 
{} : []; else if(!ret[key]) { // option set to a special scalar value if(ret[key] === null) throw new Error('No value specified for `' + key + '`'); @@ -122,13 +123,17 @@ module.exports = function(argv, opts) { ret[key].push(val); break; case 'map': + case 'map2': var m; if(m = val.match(/^(.+?)[=:](.*)$/)) ret[key][m[1].trim()] = m[2].trim(); + else if(o.type == 'map2') + ret[key][val.trim()] = undefined; else throw new Error('Invalid format for `' + key + '`'); break; } + if(o.fn) applyFn[key] = 1; } else { if(val === undefined || (val === '' && !explicit)) { if(o.ifSetDefault !== undefined) @@ -169,8 +174,8 @@ module.exports = function(argv, opts) { default: // string ret[key] = val; } + if(o.fn) ret[key] = o.fn(ret[key]); } - if(o.fn) ret[key] = o.fn(ret[key]); }; for(var i=0; i parse to object ret[k] = {}; v.forEach(function(s) { + if(typeof s !== 'string') + throw new Error('Invalid format for `' + k + '`'); var m; - if(typeof s === 'string' && (m = s.match(/^(.+?)[=:](.*)$/))) + if(m = s.match(/^(.+?)[=:](.*)$/)) ret[k][m[1].trim()] = m[2].trim(); + else if(opt.type == 'map2') + ret[k][s.trim()] = undefined; else throw new Error('Invalid format for `' + k + '`'); }); diff --git a/lib/par2gen.js b/lib/par2gen.js index 3545550d..b5bcf8df 100644 --- a/lib/par2gen.js +++ b/lib/par2gen.js @@ -777,7 +777,7 @@ function PAR2Gen(fileInfo, sliceSize, opts) { var stripLen = common_root.join(path.sep).length + 1; fileInfo.forEach(function(file) { if(!('displayName' in file) && ('name' in file)) - file.displayName = pathToPar2(file._fullPath.substr(stripLen)); + file.displayName = pathToPar2(file._fullPath.substring(stripLen)); delete file._fullPath; }); } diff --git a/test/par-compare.js b/test/par-compare.js index 3cd91049..87bf7854 100644 --- a/test/par-compare.js +++ b/test/par-compare.js @@ -183,7 +183,7 @@ function compare_files(file1, file2) { for(var k in file1) { // ignore Creator packet + unicode filename // TODO: consider comparing unicode filename packets - 
if(k == 'creator' || k.substr(0, 5) == 'unifn') continue; + if(k == 'creator' || k.substring(0, 5) == 'unifn') continue; if(!packet_eq(file1[k], file2[k])) { //console.log('Packet mismatch for ' + k, file1[k], file2[k]); From 40d25684bdcac663db3000543d1be4050d27be11 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 1 Aug 2023 14:07:56 +1000 Subject: [PATCH 46/91] Address deprecated NodeJS functions --- benchmarks/bench.js | 5 +++-- bin/parpar.js | 7 ++++--- lib/arg_parser.js | 4 ++-- lib/filechunkreader.js | 3 ++- lib/par2.js | 21 +++++++++++---------- lib/par2gen.js | 9 +++++---- test/par-compare.js | 17 +++++++++-------- 7 files changed, 36 insertions(+), 30 deletions(-) diff --git a/benchmarks/bench.js b/benchmarks/bench.js index 074c3ceb..23a1b8ba 100644 --- a/benchmarks/bench.js +++ b/benchmarks/bench.js @@ -211,6 +211,7 @@ for(var i in benchmarks) { delete benchmarks[i]; } +var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; var fsWriteSync = function(fd, data) { fs.writeSync(fd, data, 0, data.length, null); }; @@ -259,13 +260,13 @@ async.eachSeries(Object.keys(benchmarks), function getVersion(prog, cb) { if(fs.statSync(tmpDir + name).size == size) return; } var fd = fs.openSync(tmpDir + name, 'w'); - var rand = require('crypto').createCipher('rc4', 'my_incredibly_strong_password' + name); + var rand = require('crypto').createCipheriv('rc4', 'my_incredibly_strong_password' + name, ''); rand.setAutoPadding(false); var nullBuf = new Buffer(1024*16); nullBuf.fill(0); var written = 0; while(written < size) { - var b = rand.update(nullBuf).slice(0, size-written); + var b = bufferSlice.call(rand.update(nullBuf), 0, size-written); fsWriteSync(fd, b); written += b.length; } diff --git a/bin/parpar.js b/bin/parpar.js index b123948a..de4a57c0 100755 --- a/bin/parpar.js +++ b/bin/parpar.js @@ -309,12 +309,13 @@ var fs = require('fs'); /*{{!include_in_executable! 
if(!argv['skip-self-check']) { // if this is a compiled EXE, do a self MD5 check to detect corruption + var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; var executable = fs.readFileSync(process.execPath); - var md5loc = executable.slice(-1024, -16).indexOf('\0='); + var md5loc = bufferSlice.call(executable, -1024, -16).indexOf('\0='); if(md5loc < 0) error('Could not find self-check hash - this executable may be truncated or corrupt. If you are certain this is not a problem, you may use the `--skip-self-check` flag to bypass this check.'); - var expectedMd5 = executable.slice(-1024 + md5loc + 16, (-1024 + md5loc + 32) || undefined).toString('hex'); - var actualMd5 = require('crypto').createHash('md5').update(executable.slice(0, -1024 + md5loc)).digest('hex'); + var expectedMd5 = bufferSlice.call(executable, -1024 + md5loc + 16, (-1024 + md5loc + 32) || undefined).toString('hex'); + var actualMd5 = require('crypto').createHash('md5').update(bufferSlice.call(executable, 0, -1024 + md5loc)).digest('hex'); if(expectedMd5 != actualMd5) error('Self-check failed - this executable may be corrupt. 
If you are certain this is not a problem, you may use the `--skip-self-check` flag to bypass this check.'); } diff --git a/lib/arg_parser.js b/lib/arg_parser.js index 0234802b..a4c77b71 100644 --- a/lib/arg_parser.js +++ b/lib/arg_parser.js @@ -3,7 +3,7 @@ var RE_DIGITS = /^\d+$/; var parseSize = function(s) { - if(typeof s == 'number' || RE_DIGITS.test(s)) return Math.max(0, Math.floor(s)); + if(typeof s == 'number' || (''+s).search(RE_DIGITS) >= 0) return Math.max(0, Math.floor(s)); var parts = (''+s).toUpperCase().match(/^([0-9.]+)([BKMGTPE])$/); if(parts) { var num = +(parts[1]); @@ -22,7 +22,7 @@ var parseSize = function(s) { return false; }; var parseTime = function(s) { - if(typeof s == 'number' || RE_DIGITS.test(s)) return Math.max(0, Math.floor(s*1000)); + if(typeof s == 'number' || (''+s).search(RE_DIGITS) >= 0) return Math.max(0, Math.floor(s*1000)); var parts = (''+s).toLowerCase().match(/^([0-9.]+)(m?s|[mhdw])$/); if(parts) { var num = +(parts[1]); diff --git a/lib/filechunkreader.js b/lib/filechunkreader.js index 77d283d9..788bcff9 100644 --- a/lib/filechunkreader.js +++ b/lib/filechunkreader.js @@ -3,6 +3,7 @@ var fs = require('fs'); var async = require('async'); var ProcQueue = require('./procqueue'); +var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; function FileChunkReader(files, sliceSize, chunkSize, chunkOffset, bufPool, concurrency, cbChunk, cb) { var readQ = new ProcQueue(concurrency); @@ -31,7 +32,7 @@ function FileChunkReader(files, sliceSize, chunkSize, chunkOffset, bufPool, conc if(readErr) return cb(readErr); fs.read(fd, buffer, 0, chunkSize, filePos, function(err, bytesRead) { if(err) readErr = err; - else cbChunk(file, buffer.slice(0, bytesRead), sliceNum, bufPool.put.bind(bufPool, buffer)); + else cbChunk(file, bufferSlice.call(buffer, 0, bytesRead), sliceNum, bufPool.put.bind(bufPool, buffer)); if(--chunksLeft == 0) { // all chunks read from this file, so close it diff --git a/lib/par2.js b/lib/par2.js index 
f9eca434..d9e98119 100644 --- a/lib/par2.js +++ b/lib/par2.js @@ -6,6 +6,7 @@ var async = require('async'); var allocBuffer = (Buffer.allocUnsafe || Buffer); var toBuffer = (Buffer.alloc ? Buffer.from : Buffer); +var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; var SAFE_INT = 0xffffffff; // JS only does 32-bit bit operations @@ -142,7 +143,7 @@ var GFWrapper = { self.finish(item.data, item.cb); break; } - else if(self.gf.add(item.num, item.data.slice(0, item.len), function() { + else if(self.gf.add(item.num, bufferSlice.call(item.data, 0, item.len), function() { //this.cb(this.num, this.data); this.cb(); }.bind(item))) { @@ -152,7 +153,7 @@ var GFWrapper = { }); } - if(this.gf.add(sliceNum, dataSlice.slice(0, len), function() { + if(this.gf.add(sliceNum, bufferSlice.call(dataSlice, 0, len), function() { //cb(sliceNum, dataSlice); cb(); })) @@ -310,7 +311,7 @@ var GFWrapper = { for(var i=0; i= self.recData.length) baseBufIdx -= self.recData.length; - bufs[i] = self.recData[baseBufIdx + i].slice(0, self.chunkSize); + bufs[i] = bufferSlice.call(self.recData[baseBufIdx + i], 0, self.chunkSize); } self.recDataHashers[hasherIdx].update(bufs, function() { @@ -344,12 +345,12 @@ var GFWrapper = { setImmediate(cb.bind( null, this.recDataPtr, - new PAR2OutputData(this.recDataPtr, this.recData[this.recDataPtr % this.recData.length].slice(0, this.chunkSize), this) + new PAR2OutputData(this.recDataPtr, bufferSlice.call(this.recData[this.recDataPtr % this.recData.length], 0, this.chunkSize), this) )); } else { var self = this; this.recDataFetchCb[this.recDataPtr] = function(idx, buffer) { - cb(idx, new PAR2OutputData(idx, buffer.slice(0, self.chunkSize), self)); + cb(idx, new PAR2OutputData(idx, bufferSlice.call(buffer, 0, self.chunkSize), self)); }; } this.recDataPtr++; @@ -382,7 +383,7 @@ var GFWrapper = { // return requested MD5 var offset = 16*(idx % self._gfOpts.hashBatchSize); - cb(self.recDataMD5.slice(offset, offset+16)); + 
cb(bufferSlice.call(self.recDataMD5, offset, offset+16)); }); }, _isRecoveryProcessed: function() { @@ -440,7 +441,7 @@ function PAR2(files, sliceSize, opts) { offs += 16; }); - this.setID = crypto.createHash('md5').update(this.pktMain.slice(64)).digest(); + this.setID = crypto.createHash('md5').update(bufferSlice.call(this.pktMain, 64)).digest(); // lastly, header this._writePktHeader(this.pktMain, 'PAR 2.0\0Main\0\0\0\0'); @@ -488,7 +489,7 @@ PAR2.prototype = { // put in packet hash if(!skipMD5) { crypto.createHash('md5') - .update(buf.slice(offset+32, offset+pktLen)) + .update(bufferSlice.call(buf, offset+32, offset+pktLen)) .digest() .copy(buf, offset+16); } @@ -521,7 +522,7 @@ PAR2.prototype = { makeRecoveryHeader: function(chunks, num) { if(!Array.isArray(chunks)) chunks = [chunks]; - var md5 = crypto.createHash('md5').update(pkt.slice(32)); + var md5 = crypto.createHash('md5').update(bufferSlice.call(pkt, 32)); var len = this.sliceSize; chunks.forEach(function(chunk) { @@ -679,7 +680,7 @@ function PAR2File(par2, file) { if(file.size == 0) { this.md5 = toBuffer('d41d8cd98f00b204e9800998ecf8427e', 'hex'); } else { - this._md5ctx = new binding.HasherInput(par2.sliceSize, this.pktCheck.slice(64 + 16)); + this._md5ctx = new binding.HasherInput(par2.sliceSize, bufferSlice.call(this.pktCheck, 64 + 16)); } } diff --git a/lib/par2gen.js b/lib/par2gen.js index b5bcf8df..38e631c6 100644 --- a/lib/par2gen.js +++ b/lib/par2gen.js @@ -9,6 +9,7 @@ var FileSeqReader = require('./fileseqreader'); var FileChunkReader = require('./filechunkreader'); var BufferPool = require('./bufferpool'); var PAR2OutFile = require('./par2outfile'); +var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; var MAX_BUFFER_SIZE = (require('buffer').kMaxLength || (1024*1024*1024-1)) - 1024-68; // the '-1024-68' is padding to deal with alignment issues (XorJit512 can have 1KB block) + 68-byte header @@ -1117,7 +1118,7 @@ PAR2Gen.prototype = { if(this._chunker) { async.parallel([ 
file.processHash.bind(file, buf), - this._chunker.processData.bind(this._chunker, file, buf.slice(0, this._chunkSize)) + this._chunker.processData.bind(this._chunker, file, bufferSlice.call(buf, 0, this._chunkSize)) ], cb); } else { file.process(buf, cb); @@ -1288,7 +1289,7 @@ PAR2Gen.prototype = { if(cbProgress) cbProgress('processing_slice', data.file, slicePos); self._chunker.processData( data.file.sliceOffset + slicePos, - data.buffer.slice(chunk, Math.min(chunk+chunkSize, data.file.size)), + bufferSlice.call(data.buffer, chunk, Math.min(chunk+chunkSize, data.file.size)), cb ); slicePos++; @@ -1306,7 +1307,7 @@ PAR2Gen.prototype = { async.times(numSlices, function(sliceOffNum, cb) { if(cbProgress) cbProgress('processing_slice', data.file, slicePos + sliceOffNum); var bp = sliceOffNum * self.opts.sliceSize; - data.file.processData(data.buffer.slice(bp, Math.min(data.buffer.length, bp+self.opts.sliceSize)), cb); + data.file.processData(bufferSlice.call(data.buffer, bp, Math.min(data.buffer.length, bp+self.opts.sliceSize)), cb); }, data.release.bind(data)); } }, cb); @@ -1516,7 +1517,7 @@ module.exports = { fs.read(fd, buf, 0, 16384, null, cb); }, function(bytesRead, buffer, cb) { - info.md5_16k = crypto.createHash('md5').update(buffer.slice(0, bytesRead)).digest(); + info.md5_16k = crypto.createHash('md5').update(bufferSlice.call(buffer, 0, bytesRead)).digest(); if(info.size < 16384) info.md5 = info.md5_16k; fs.close(fd, cb); } diff --git a/test/par-compare.js b/test/par-compare.js index 87bf7854..ad0963be 100644 --- a/test/par-compare.js +++ b/test/par-compare.js @@ -26,6 +26,7 @@ var fsRead = function(fd, len) { return buf; }; +var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; var BufferCompare; if(Buffer.compare) BufferCompare = Buffer.compare; else BufferCompare = function(a, b) { @@ -57,29 +58,29 @@ function parse_file(file) { while(pos != stat.size) { // != ensures that size should exactly match expected var header = fsRead(fd, 64); - 
if(header.slice(0, 8).toString() != 'PAR2\0PKT') + if(bufferSlice.call(header, 0, 8).toString() != 'PAR2\0PKT') throw new Error('Invalid packet signature @' + pos); var pkt = { len: header.readUInt32LE(8) + header.readUInt32LE(12) * 4294967296, offset: pos, - md5: header.slice(16, 32), - type: header.slice(48, 64).toString().replace(/\0+$/, '') + md5: bufferSlice.call(header, 16, 32), + type: bufferSlice.call(header, 48, 64).toString().replace(/\0+$/, '') }; try { if(pkt.len % 4 || pkt.len < 64) throw new Error('Invalid packet length specified'); if(ret.rsId) { - if(BufferCompare(ret.rsId, header.slice(32, 48))) + if(BufferCompare(ret.rsId, bufferSlice.call(header, 32, 48))) throw new Error('Mismatching recovery set ID'); } else { ret.rsId = new Buffer(16); - header.slice(32, 48).copy(ret.rsId); + bufferSlice.call(header, 32, 48).copy(ret.rsId); } var md5 = crypto.createHash('md5'); - md5.update(header.slice(32)); + md5.update(bufferSlice.call(header, 32)); var pktPos = 64; var idLen = 0; @@ -294,13 +295,13 @@ console.log('Creating random input files...'); function writeRndFile(name, size) { if(skipFileCreate && fs.existsSync(tmpDir + name)) return; var fd = fs.openSync(tmpDir + name, 'w'); - var rand = require('crypto').createCipher('rc4', 'my_incredibly_strong_password' + name); + var rand = require('crypto').createCipheriv('rc4', 'my_incredibly_strong_password' + name, ''); rand.setAutoPadding(false); var nullBuf = new Buffer(1024*16); nullBuf.fill(0); var written = 0; while(written < size) { - var b = rand.update(nullBuf).slice(0, Math.min(1024*16, size-written)); + var b = bufferSlice.call(rand.update(nullBuf), 0, Math.min(1024*16, size-written)); fsWriteSync(fd, b); written += b.length; } From 26f1a914d9e7466030a3272b9a0138045f6f558e Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 1 Aug 2023 16:17:08 +1000 Subject: [PATCH 47/91] Handle Buffer constructor deprecation in extra scripts --- benchmarks/bench.js | 5 +++-- test/par-compare.js | 11 ++++++----- 2 
files changed, 9 insertions(+), 7 deletions(-) diff --git a/benchmarks/bench.js b/benchmarks/bench.js index 23a1b8ba..ba6e7d29 100644 --- a/benchmarks/bench.js +++ b/benchmarks/bench.js @@ -223,9 +223,10 @@ var findFile = function(dir, re) { return ret; }; +var allocBuffer = (Buffer.allocUnsafe || Buffer); var async = require('async'); var fs = require('fs'); -var nullBuf = new Buffer(1024*16); +var nullBuf = allocBuffer(1024*16); nullBuf.fill(0); var results = {}; var testFiles = []; @@ -262,7 +263,7 @@ async.eachSeries(Object.keys(benchmarks), function getVersion(prog, cb) { var fd = fs.openSync(tmpDir + name, 'w'); var rand = require('crypto').createCipheriv('rc4', 'my_incredibly_strong_password' + name, ''); rand.setAutoPadding(false); - var nullBuf = new Buffer(1024*16); + var nullBuf = allocBuffer(1024*16); nullBuf.fill(0); var written = 0; while(written < size) { diff --git a/test/par-compare.js b/test/par-compare.js index ad0963be..a8eff33d 100644 --- a/test/par-compare.js +++ b/test/par-compare.js @@ -18,15 +18,16 @@ var skipFileCreate = true; // skip creating test files if they already exist (sp var fs = require('fs'); var crypto = require('crypto'); +var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; +var allocBuffer = (Buffer.allocUnsafe || Buffer); var fsRead = function(fd, len) { - var buf = new Buffer(len); + var buf = allocBuffer(len); var readLen = fs.readSync(fd, buf, 0, len, null); if(readLen != len) throw new Error("Couldn't read requested data: got " + readLen + " bytes instead of " + len); return buf; }; -var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; var BufferCompare; if(Buffer.compare) BufferCompare = Buffer.compare; else BufferCompare = function(a, b) { @@ -75,7 +76,7 @@ function parse_file(file) { if(BufferCompare(ret.rsId, bufferSlice.call(header, 32, 48))) throw new Error('Mismatching recovery set ID'); } else { - ret.rsId = new Buffer(16); + ret.rsId = allocBuffer(16); 
bufferSlice.call(header, 32, 48).copy(ret.rsId); } @@ -297,7 +298,7 @@ function writeRndFile(name, size) { var fd = fs.openSync(tmpDir + name, 'w'); var rand = require('crypto').createCipheriv('rc4', 'my_incredibly_strong_password' + name, ''); rand.setAutoPadding(false); - var nullBuf = new Buffer(1024*16); + var nullBuf = allocBuffer(1024*16); nullBuf.fill(0); var written = 0; while(written < size) { @@ -588,7 +589,7 @@ async.timesSeries(allTests.length, function(testNum, cb) { for(var k in f) { ret[k] = { type: f[k].type, - md5: new Buffer(f[k].md5, 'hex'), + md5: (Buffer.alloc ? Buffer.from : Buffer)(f[k].md5, 'hex'), len: f[k].len }; } From bbd2728f863b4b55d780e8f644acb99a6aa23517 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 3 Aug 2023 11:08:02 +1000 Subject: [PATCH 48/91] Buffer.subarray is not identical to Buffer.slice on Node 4.x --- benchmarks/bench.js | 2 +- bin/parpar.js | 2 +- lib/filechunkreader.js | 2 +- lib/par2.js | 2 +- lib/par2gen.js | 2 +- test/par-compare.js | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/bench.js b/benchmarks/bench.js index ba6e7d29..759bf57d 100644 --- a/benchmarks/bench.js +++ b/benchmarks/bench.js @@ -211,7 +211,7 @@ for(var i in benchmarks) { delete benchmarks[i]; } -var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; +var bufferSlice = Buffer.prototype.readBigInt64BE ? Buffer.prototype.subarray : Buffer.prototype.slice; var fsWriteSync = function(fd, data) { fs.writeSync(fd, data, 0, data.length, null); }; diff --git a/bin/parpar.js b/bin/parpar.js index de4a57c0..a2d3c18f 100755 --- a/bin/parpar.js +++ b/bin/parpar.js @@ -309,7 +309,7 @@ var fs = require('fs'); /*{{!include_in_executable! if(!argv['skip-self-check']) { // if this is a compiled EXE, do a self MD5 check to detect corruption - var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; + var bufferSlice = Buffer.prototype.readBigInt64BE ? 
Buffer.prototype.subarray : Buffer.prototype.slice; var executable = fs.readFileSync(process.execPath); var md5loc = bufferSlice.call(executable, -1024, -16).indexOf('\0='); if(md5loc < 0) diff --git a/lib/filechunkreader.js b/lib/filechunkreader.js index 788bcff9..716ba811 100644 --- a/lib/filechunkreader.js +++ b/lib/filechunkreader.js @@ -3,7 +3,7 @@ var fs = require('fs'); var async = require('async'); var ProcQueue = require('./procqueue'); -var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; +var bufferSlice = Buffer.prototype.readBigInt64BE ? Buffer.prototype.subarray : Buffer.prototype.slice; function FileChunkReader(files, sliceSize, chunkSize, chunkOffset, bufPool, concurrency, cbChunk, cb) { var readQ = new ProcQueue(concurrency); diff --git a/lib/par2.js b/lib/par2.js index d9e98119..5fa7203c 100644 --- a/lib/par2.js +++ b/lib/par2.js @@ -6,7 +6,7 @@ var async = require('async'); var allocBuffer = (Buffer.allocUnsafe || Buffer); var toBuffer = (Buffer.alloc ? Buffer.from : Buffer); -var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; +var bufferSlice = Buffer.prototype.readBigInt64BE ? Buffer.prototype.subarray : Buffer.prototype.slice; var SAFE_INT = 0xffffffff; // JS only does 32-bit bit operations diff --git a/lib/par2gen.js b/lib/par2gen.js index 38e631c6..012cbe6e 100644 --- a/lib/par2gen.js +++ b/lib/par2gen.js @@ -9,7 +9,7 @@ var FileSeqReader = require('./fileseqreader'); var FileChunkReader = require('./filechunkreader'); var BufferPool = require('./bufferpool'); var PAR2OutFile = require('./par2outfile'); -var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; +var bufferSlice = Buffer.prototype.readBigInt64BE ? 
Buffer.prototype.subarray : Buffer.prototype.slice; var MAX_BUFFER_SIZE = (require('buffer').kMaxLength || (1024*1024*1024-1)) - 1024-68; // the '-1024-68' is padding to deal with alignment issues (XorJit512 can have 1KB block) + 68-byte header diff --git a/test/par-compare.js b/test/par-compare.js index a8eff33d..60832de8 100644 --- a/test/par-compare.js +++ b/test/par-compare.js @@ -18,7 +18,7 @@ var skipFileCreate = true; // skip creating test files if they already exist (sp var fs = require('fs'); var crypto = require('crypto'); -var bufferSlice = Buffer.prototype.subarray || Buffer.prototype.slice; +var bufferSlice = Buffer.prototype.readBigInt64BE ? Buffer.prototype.subarray : Buffer.prototype.slice; var allocBuffer = (Buffer.allocUnsafe || Buffer); var fsRead = function(fd, len) { var buf = allocBuffer(len); From 1af436ec6067925971f159d087ce89d5c5cac08b Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 12 Aug 2023 22:11:12 +1000 Subject: [PATCH 49/91] Add RVV implementation of Shuffle128 --- binding.gyp | 53 +++++++ gf16/gf16_checksum_rvv.h | 71 ++++++++++ gf16/gf16_cksum.h | 1 + gf16/gf16_cksum_rvv.c | 19 +++ gf16/gf16_rvv_common.h | 37 +++++ gf16/gf16_shuffle.h | 46 +++--- gf16/gf16_shuffle128_rvv.c | 281 +++++++++++++++++++++++++++++++++++++ gf16/gf16mul.cpp | 53 ++++++- gf16/gf16mul.h | 2 + gf16/gf_add.h | 3 + gf16/gf_add_rvv.c | 83 +++++++++++ help.txt | 2 + lib/par2.js | 1 + src/cpuid.h | 28 ++++ 14 files changed, 661 insertions(+), 19 deletions(-) create mode 100644 gf16/gf16_checksum_rvv.h create mode 100644 gf16/gf16_cksum_rvv.c create mode 100644 gf16/gf16_rvv_common.h create mode 100644 gf16/gf16_shuffle128_rvv.c create mode 100644 gf16/gf_add_rvv.c diff --git a/binding.gyp b/binding.gyp index 70a36159..42d1e9f1 100644 --- a/binding.gyp +++ b/binding.gyp @@ -867,6 +867,59 @@ ] }] ] + }, + { + "target_name": "gf16_rvv", + "type": "static_library", + "defines": ["NDEBUG"], + "sources": [ + "gf16/gf16_shuffle128_rvv.c", + "gf16/gf_add_rvv.c", + 
"gf16/gf16_cksum_rvv.c" + ], + "cflags": ["-Wno-unused-function", "-std=c99"], + "xcode_settings": { + "OTHER_CFLAGS": ["-Wno-unused-function"], + "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"] + }, + "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], + "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}}, + "conditions": [ + ['target_arch=="riscv64" and OS!="win"', { + "variables": {"supports_rvv%": "/dev/null || true)"}, + "conditions": [ + ['supports_rvv!=""', { + "cflags!": ["-march=native"], + "cxxflags!": ["-march=native"], + "cflags": ["-march=rv64gcv"], + "cxxflags": ["-march=rv64gcv"], + "xcode_settings": { + "OTHER_CFLAGS!": ["-march=native"], + "OTHER_CXXFLAGS!": ["-march=native"], + "OTHER_CFLAGS": ["-march=rv64gcv"], + "OTHER_CXXFLAGS": ["-march=rv64gcv"], + } + }] + ] + }], + ['target_arch=="riscv32" and OS!="win"', { + "variables": {"supports_rvv%": "/dev/null || true)"}, + "conditions": [ + ['supports_rvv!=""', { + "cflags!": ["-march=native"], + "cxxflags!": ["-march=native"], + "cflags": ["-march=rv32gcv"], + "cxxflags": ["-march=rv32gcv"], + "xcode_settings": { + "OTHER_CFLAGS!": ["-march=native"], + "OTHER_CXXFLAGS!": ["-march=native"], + "OTHER_CFLAGS": ["-march=rv32gcv"], + "OTHER_CXXFLAGS": ["-march=rv32gcv"], + } + }] + ] + }] + ] } ] } diff --git a/gf16/gf16_checksum_rvv.h b/gf16/gf16_checksum_rvv.h new file mode 100644 index 00000000..615109b9 --- /dev/null +++ b/gf16/gf16_checksum_rvv.h @@ -0,0 +1,71 @@ +#ifndef __GF16_CHECKSUM_H +#define __GF16_CHECKSUM_H + +#include "gf16_rvv_common.h" + +#ifdef __RVV_LE +static HEDLEY_ALWAYS_INLINE void gf16_checksum_block_rvv(const void *HEDLEY_RESTRICT src, void *HEDLEY_RESTRICT checksum, const size_t blockLen, const int aligned) { + size_t vl = RV(vsetvlmax_e8m1)(); + const unsigned words = blockLen/vl; + + vint16m1_t v = *(vint16m1_t*)checksum; + v = gf16_vec_mul2_rvv(v); + if(aligned) { + vl = 
RV(vsetvlmax_e16m1)(); + int16_t* _src = (int16_t*)src; + for(unsigned i=0; i +# if defined(__clang__) && __clang_major__ < 16 +# define RV(f) f +# else +# define RV(f) __riscv_##f +# endif + + +// TODO: evaluate endian requirements +# if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ +# define __RVV_LE +# endif + +static HEDLEY_ALWAYS_INLINE vint16m1_t gf16_vec_mul2_rvv(vint16m1_t v) { + size_t vl = RV(vsetvlmax_e16m1)(); + vbool16_t maskPoly = RV(vmslt_vx_i16m1_b16)(v, 0, vl); + v = RV(vadd_vv_i16m1)(v, v, vl); + return RV(vxor_vx_i16m1_m)( + maskPoly, + v, v, + GF16_POLYNOMIAL & 0xffff, + vl + ); +} + + +#endif + +#endif \ No newline at end of file diff --git a/gf16/gf16_shuffle.h b/gf16/gf16_shuffle.h index 87f3fe05..d4ebbbb1 100644 --- a/gf16/gf16_shuffle.h +++ b/gf16/gf16_shuffle.h @@ -36,15 +36,13 @@ FUNCS(neon); FUNCS(128_sve); FUNCS(128_sve2); FUNCS(512_sve2); +FUNCS(128_rvv); #undef FUNCS void gf16_shuffle_mul_vbmi(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle_muladd_vbmi(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); void gf16_shuffle_muladd_prefetch_vbmi(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch, const void *HEDLEY_RESTRICT prefetch); -void gf16_shuffle_prepare_packed_vbmi(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_packed_cksum_vbmi(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_partial_packsum_vbmi(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT 
src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); extern int gf16_shuffle_available_vbmi; #define FUNCS(v) \ @@ -55,32 +53,42 @@ FUNCS(neon); FUNCS(128_sve); FUNCS(128_sve2); FUNCS(512_sve2); +FUNCS(128_rvv); #undef FUNCS -void gf16_shuffle_prepare_packed_neon(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_packed_cksum_neon(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_partial_packsum_neon(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); -void gf16_shuffle_finish_packed_neon(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); -int gf16_shuffle_finish_packed_cksum_neon(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); -int gf16_shuffle_finish_partial_packsum_neon(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); +#define FUNCS(v) \ + void gf16_shuffle_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ + void gf16_shuffle_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ + void gf16_shuffle_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void 
*HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen) -void gf16_shuffle_prepare_packed_sve(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_packed_cksum_sve(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_partial_packsum_sve(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); -void gf16_shuffle_finish_packed_sve(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); -int gf16_shuffle_finish_packed_cksum_sve(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); -int gf16_shuffle_finish_partial_packsum_sve(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); +FUNCS(vbmi); +FUNCS(neon); +FUNCS(sve); +FUNCS(512_sve2); +FUNCS(rvv); + +#undef FUNCS + +#define FUNCS(v) \ + void gf16_shuffle_finish_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ + int gf16_shuffle_finish_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); \ + int gf16_shuffle_finish_partial_packsum_##v(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, 
size_t partLen) + +FUNCS(neon); +FUNCS(sve); +FUNCS(rvv); + +#undef FUNCS -void gf16_shuffle_prepare_packed_512_sve2(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_packed_cksum_512_sve2(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); -void gf16_shuffle_prepare_partial_packsum_512_sve2(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen); // also used for clmul, but declared here for convenience extern int gf16_available_neon; extern int gf16_available_sve; extern int gf16_available_sve2; +extern int gf16_available_rvv; + // shuffle2x #define FUNCS(v) \ void gf16_shuffle2x_prepare_##v(void* dst, const void* src, size_t srcLen); \ @@ -122,8 +130,10 @@ void* gf16_shuffle_init_vbmi(int polynomial); void* gf16_shuffle_init_arm(int polynomial); void* gf16_shuffle_init_128_sve(int polynomial); void* gf16_shuffle_init_512_sve(int polynomial); +void* gf16_shuffle_init_128_rvv(int polynomial); int gf16_sve_get_size(); +int gf16_rvv_get_size(); uint16_t gf16_affine2x_replace_word(void* data, size_t index, uint16_t newValue); uint16_t gf16_shuffle16_replace_word(void* data, size_t index, uint16_t newValue); diff --git a/gf16/gf16_shuffle128_rvv.c b/gf16/gf16_shuffle128_rvv.c new file mode 100644 index 00000000..ba23a1a9 --- /dev/null +++ b/gf16/gf16_shuffle128_rvv.c @@ -0,0 +1,281 @@ +#include "gf16_rvv_common.h" + +#if defined(__RVV_LE) +int gf16_available_rvv = 1; +#else +int gf16_available_rvv = 0; +#endif + +#include "gf16_muladd_multi.h" + +#if defined(__RVV_LE) +// TODO: detect intrinsics version +# if 1 +// intrinsics v0.11.x (up to at least GCC 13 / Clang 16) +# define _vlseg2e8 RV(vlseg2e8_v_u8m1) +# 
define _vsseg2e8 RV(vsseg2e8_v_u8m1) +# else +// intrinsics v0.12.x +static HEDLEY_ALWAYS_INLINE void _vlseg2e8(vuint8m1_t* v0, vuint8m1_t* v1, const uint8_t* src, size_t vl) { + vuint8m1x2_t d = RV(vlseg2e8_v_u8m1x2)(src, vl); + *v0 = RV(vget_v_u8m1x2_u8m1)(vd, 0); + *v1 = RV(vget_v_u8m1x2_u8m1)(vd, 1); +} +static HEDLEY_ALWAYS_INLINE void _vsseg2e8(uint8_t* dst, vuint8m1_t v0, vuint8m1_t v1, size_t vl) { + vuint8m1x2_t d; + d = RV(vset_v_u8m1_u8m1x2)(d, 0, v0); + d = RV(vset_v_u8m1_u8m1x2)(d, 1, v1); + RV(vsseg2e8_v_u8m1x2)(dst, d, vl); +} +# endif + +static HEDLEY_ALWAYS_INLINE void gf16_shuffle_128_rvv_calc_table(vuint8m1_t poly_l, uint16_t val, + vuint8m1_t* tbl_l0, vuint8m1_t* tbl_l1, vuint8m1_t* tbl_l2, vuint8m1_t* tbl_l3, + vuint8m1_t* tbl_h0, vuint8m1_t* tbl_h1, vuint8m1_t* tbl_h2, vuint8m1_t* tbl_h3 +) { + uint16_t val2 = GF16_MULTBY_TWO(val); + uint16_t val4 = GF16_MULTBY_TWO(val2); + uint16_t val8 = GF16_MULTBY_TWO(val4); + + vuint16m1_t tmp0 = RV(vmv_v_x_u16m1)(val ^ val2, 8); + tmp0 = RV(vslide1up_vx_u16m1)(tmp0, val2, 8); + tmp0 = RV(vslide1up_vx_u16m1)(tmp0, val, 8); + tmp0 = RV(vslide1up_vx_u16m1)(tmp0, 0, 8); + + vuint16m1_t tmp4 = RV(vxor_vv_u16m1)(RV(vmv_v_x_u16m1)(val4, 8), tmp0, 8); + tmp0 = RV(vslideup_vx_u16m1)(tmp0, tmp4, 4, 8); + + vuint16m1_t tmp8 = RV(vxor_vv_u16m1)(tmp0, RV(vmv_v_x_u16m1)(val8, 8), 8); + + vuint8mf2_t tmpL0, tmpL1, tmpH0, tmpH1; + tmpL0 = RV(vnsrl_wx_u8mf2)(tmp0, 0, 8); + tmpL1 = RV(vnsrl_wx_u8mf2)(tmp8, 0, 8); + tmpH0 = RV(vnsrl_wx_u8mf2)(tmp0, 8, 8); + tmpH1 = RV(vnsrl_wx_u8mf2)(tmp8, 8, 8); + + *tbl_l0 = RV(vslideup_vx_u8m1)(RV(vlmul_ext_v_u8mf2_u8m1)(tmpL0), RV(vlmul_ext_v_u8mf2_u8m1)(tmpL1), 8, 16); + *tbl_h0 = RV(vslideup_vx_u8m1)(RV(vlmul_ext_v_u8mf2_u8m1)(tmpH0), RV(vlmul_ext_v_u8mf2_u8m1)(tmpH1), 8, 16); + + vuint8m1_t ri, rh, rl; + + // could replace the sll+or with a macc, but probably not worth it + #define MUL16(p, c) \ + ri = RV(vsrl_vx_u8m1)(*tbl_h##p, 4, 16); \ + rl = RV(vsll_vx_u8m1)(*tbl_l##p, 4, 16); 
\ + rh = RV(vxor_vv_u8m1)(*tbl_h##p, ri, 16); \ + *tbl_l##c = RV(vxor_vv_u8m1)(rl, RV(vrgather_vv_u8m1)(poly_l, ri, 16), 16); \ + *tbl_h##c = RV(vor_vv_u8m1)( \ + RV(vsll_vx_u8m1)(rh, 4, 16), \ + RV(vsrl_vx_u8m1)(*tbl_l##p, 4, 16), \ + 16 \ + ) + + MUL16(0, 1); + MUL16(1, 2); + MUL16(2, 3); + #undef MUL16 +} + + +static HEDLEY_ALWAYS_INLINE void gf16_shuffle_128_rvv_round(size_t vl, vuint8m1_t src0, vuint8m1_t src1, vuint8m1_t* rl, vuint8m1_t* rh, + vuint8m1_t tbl_l0, vuint8m1_t tbl_l1, vuint8m1_t tbl_l2, vuint8m1_t tbl_l3, + vuint8m1_t tbl_h0, vuint8m1_t tbl_h1, vuint8m1_t tbl_h2, vuint8m1_t tbl_h3 +) { + vuint8m1_t tmp = RV(vand_vx_u8m1)(src0, 0xf, vl); + *rl = RV(vxor_vv_u8m1)(*rl, RV(vrgather_vv_u8m1)(tbl_l0, tmp, vl), vl); + *rh = RV(vxor_vv_u8m1)(*rh, RV(vrgather_vv_u8m1)(tbl_h0, tmp, vl), vl); + + tmp = RV(vand_vx_u8m1)(src1, 0xf, vl); + *rl = RV(vxor_vv_u8m1)(*rl, RV(vrgather_vv_u8m1)(tbl_l2, tmp, vl), vl); + *rh = RV(vxor_vv_u8m1)(*rh, RV(vrgather_vv_u8m1)(tbl_h2, tmp, vl), vl); + + tmp = RV(vsrl_vx_u8m1)(src0, 4, vl); + *rl = RV(vxor_vv_u8m1)(*rl, RV(vrgather_vv_u8m1)(tbl_l1, tmp, vl), vl); + *rh = RV(vxor_vv_u8m1)(*rh, RV(vrgather_vv_u8m1)(tbl_h1, tmp, vl), vl); + + tmp = RV(vsrl_vx_u8m1)(src1, 4, vl); + *rl = RV(vxor_vv_u8m1)(*rl, RV(vrgather_vv_u8m1)(tbl_l3, tmp, vl), vl); + *rh = RV(vxor_vv_u8m1)(*rh, RV(vrgather_vv_u8m1)(tbl_h3, tmp, vl), vl); +} + + +static HEDLEY_ALWAYS_INLINE void gf16_shuffle_muladd_x_128_rvv( + const void *HEDLEY_RESTRICT scratch, + uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, GF16_MULADD_MULTI_SRCLIST, size_t len, + const uint16_t *HEDLEY_RESTRICT coefficients, const int doPrefetch, const char* _pf +) { + GF16_MULADD_MULTI_SRC_UNUSED(3); + + vuint8m1_t poly_l = RV(vle8_v_u8m1)((const uint8_t*)scratch, 16); + + vuint8m1_t tbl_Ah0, tbl_Ah1, tbl_Ah2, tbl_Ah3, tbl_Al0, tbl_Al1, tbl_Al2, tbl_Al3; + vuint8m1_t tbl_Bh0, tbl_Bh1, tbl_Bh2, tbl_Bh3, tbl_Bl0, tbl_Bl1, tbl_Bl2, tbl_Bl3; + vuint8m1_t tbl_Ch0, tbl_Ch1, tbl_Ch2, 
tbl_Ch3, tbl_Cl0, tbl_Cl1, tbl_Cl2, tbl_Cl3; + // TODO: support calcing multiple tables together + #define CALC_TABLE(n, t) \ + if(srcCount >= n) \ + gf16_shuffle_128_rvv_calc_table( \ + poly_l, coefficients[n], \ + &tbl_##t##l0, &tbl_##t##l1, &tbl_##t##l2, &tbl_##t##l3, &tbl_##t##h0, &tbl_##t##h1, &tbl_##t##h2, &tbl_##t##h3 \ + ) + CALC_TABLE(0, A); + CALC_TABLE(1, B); + CALC_TABLE(2, C); + #undef CALC_TABLE + + size_t vl = RV(vsetvlmax_e8m1)(); + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += vl*2) { + // TODO: does RISC-V have prefetch instructions? + UNUSED(doPrefetch); UNUSED(_pf); + + vuint8m1_t rl, rh; + _vlseg2e8(&rl, &rh, _dst+ptr, vl*2); + + vuint8m1_t in0, in1; + _vlseg2e8(&in0, &in1, _src1+ptr*srcScale, vl*2); + + gf16_shuffle_128_rvv_round(vl, in0, in1, &rl, &rh, tbl_Al0, tbl_Al1, tbl_Al2, tbl_Al3, tbl_Ah0, tbl_Ah1, tbl_Ah2, tbl_Ah3); + if(srcCount > 1) { + _vlseg2e8(&in0, &in1, _src2+ptr*srcScale, vl*2); + gf16_shuffle_128_rvv_round(vl, in0, in1, &rl, &rh, tbl_Bl0, tbl_Bl1, tbl_Bl2, tbl_Bl3, tbl_Bh0, tbl_Bh1, tbl_Bh2, tbl_Bh3); + } + if(srcCount > 2) { + _vlseg2e8(&in0, &in1, _src3+ptr*srcScale, vl*2); + gf16_shuffle_128_rvv_round(vl, in0, in1, &rl, &rh, tbl_Cl0, tbl_Cl1, tbl_Cl2, tbl_Cl3, tbl_Ch0, tbl_Ch1, tbl_Ch2, tbl_Ch3); + } + + _vsseg2e8(_dst+ptr, rl, rh, vl*2); + } +} + +#endif /*defined(__RVV_LE)*/ + + + + +void gf16_shuffle_mul_128_rvv(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#if defined(__RVV_LE) + vuint8m1_t poly_l = RV(vle8_v_u8m1)((const uint8_t*)scratch, 16); + vuint8m1_t tbl_h0, tbl_h1, tbl_h2, tbl_h3, tbl_l0, tbl_l1, tbl_l2, tbl_l3; + gf16_shuffle_128_rvv_calc_table(poly_l, val, &tbl_l0, &tbl_l1, &tbl_l2, &tbl_l3, &tbl_h0, &tbl_h1, &tbl_h2, &tbl_h3); + + + const uint8_t* _src = (const uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + size_t vl = RV(vsetvlmax_e8m1)(); + + for(intptr_t 
ptr = -(intptr_t)len; ptr; ptr += vl*2) { + vuint8m1_t in0, in1; + _vlseg2e8(&in0, &in1, _src+ptr, vl*2); + + vuint8m1_t tmp = RV(vand_vx_u8m1)(in0, 0xf, vl); + vuint8m1_t rl = RV(vrgather_vv_u8m1)(tbl_l0, tmp, vl); + vuint8m1_t rh = RV(vrgather_vv_u8m1)(tbl_h0, tmp, vl); + + tmp = RV(vand_vx_u8m1)(in1, 0xf, vl); + rl = RV(vxor_vv_u8m1)(rl, RV(vrgather_vv_u8m1)(tbl_l2, tmp, vl), vl); + rh = RV(vxor_vv_u8m1)(rh, RV(vrgather_vv_u8m1)(tbl_h2, tmp, vl), vl); + + tmp = RV(vsrl_vx_u8m1)(in0, 4, vl); + rl = RV(vxor_vv_u8m1)(rl, RV(vrgather_vv_u8m1)(tbl_l1, tmp, vl), vl); + rh = RV(vxor_vv_u8m1)(rh, RV(vrgather_vv_u8m1)(tbl_h1, tmp, vl), vl); + + tmp = RV(vsrl_vx_u8m1)(in1, 4, vl); + rl = RV(vxor_vv_u8m1)(rl, RV(vrgather_vv_u8m1)(tbl_l3, tmp, vl), vl); + rh = RV(vxor_vv_u8m1)(rh, RV(vrgather_vv_u8m1)(tbl_h3, tmp, vl), vl); + + _vsseg2e8(_dst+ptr, rl, rh, vl*2); + } +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + +void gf16_shuffle_muladd_128_rvv(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#if defined(__RVV_LE) + gf16_muladd_single(scratch, gf16_shuffle_muladd_x_128_rvv, dst, src, len, val); +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + + +#if defined(__RVV_LE) +GF16_MULADD_MULTI_FUNCS(gf16_shuffle, _128_rvv, gf16_shuffle_muladd_x_128_rvv, 3, RV(vsetvlmax_e8m1)()*2, 0, (void)0) +#else +GF16_MULADD_MULTI_FUNCS_STUB(gf16_shuffle, _128_rvv) +#endif + + + +#ifdef __RVV_LE +static HEDLEY_ALWAYS_INLINE void gf16_prepare_block_rvv(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src) { + size_t vl = RV(vsetvlmax_e8m2)(); + RV(vse8_v_u8m2)((uint8_t*)dst, RV(vle8_v_u8m2)((const uint8_t*)src, vl), vl); +} +// final block +static HEDLEY_ALWAYS_INLINE void gf16_prepare_blocku_rvv(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { 
+ // current intrinsics don't seem to support tail-undisturbed policy, so zero explicitly for now + size_t vl = RV(vsetvlmax_e8m2)(); + RV(vse8_v_u8m2)((uint8_t*)dst, RV(vmv_v_x_u8m2)(0, vl), vl); + vl = RV(vsetvl_e8m2)(remaining); + RV(vse8_v_u8m2)((uint8_t*)dst, RV(vle8_v_u8m2)((const uint8_t*)src, vl), vl); +} +static HEDLEY_ALWAYS_INLINE void gf16_finish_blocku_rvv(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { + size_t vl = RV(vsetvl_e8m2)(remaining); + RV(vse8_v_u8m2)((uint8_t*)dst, RV(vle8_v_u8m2)((const uint8_t*)src, vl), vl); +} + +static HEDLEY_ALWAYS_INLINE void gf16_checksum_prepare_rvv(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT checksum, const size_t blockLen, gf16_transform_block_rst prepareBlock) { + int16_t tmp[blockLen/2]; + memset(tmp, 0, blockLen); + RV(vse16_v_i16m1)(tmp, *(vint16m1_t*)checksum, RV(vsetvlmax_e16m1)()); + + prepareBlock(dst, tmp); +} + +#include "gf16_checksum_rvv.h" + +// TODO: should align be width of the vector, instead of 16? 
+GF_PREPARE_PACKED_FUNCS(gf16_shuffle, _rvv, RV(vsetvlmax_e8m1)()*2, gf16_prepare_block_rvv, gf16_prepare_blocku_rvv, 3, (void)0, vuint16m1_t checksum = RV(vmv_v_x_u16m1)(0, RV(vsetvlmax_e16m1)()), gf16_checksum_block_rvv, gf16_checksum_blocku_rvv, gf16_checksum_exp_rvv, gf16_checksum_prepare_rvv, 16) +GF_FINISH_PACKED_FUNCS(gf16_shuffle, _rvv, RV(vsetvlmax_e8m1)()*2, gf16_prepare_block_rvv, gf16_finish_blocku_rvv, 1, (void)0, gf16_checksum_block_rvv, gf16_checksum_blocku_rvv, gf16_checksum_exp_rvv, NULL, 16) +#else +GF_PREPARE_PACKED_FUNCS_STUB(gf16_shuffle, _rvv) +GF_FINISH_PACKED_FUNCS_STUB(gf16_shuffle, _rvv) +#endif + + + + + +int gf16_rvv_get_size() { +#ifdef __RVV_LE + return RV(vsetvlmax_e8m1)(); +#else + return 0; +#endif +} + +void* gf16_shuffle_init_128_rvv(int polynomial) { +#ifdef __RVV_LE + uint8_t* ret; + if((polynomial | 0x1f) != 0x1101f) return NULL; + ALIGN_ALLOC(ret, 16, 16); + for(int i=0; i<16; i++) { + int p = 0; + if(i & 8) p ^= polynomial << 3; + if(i & 4) p ^= polynomial << 2; + if(i & 2) p ^= polynomial << 1; + if(i & 1) p ^= polynomial << 0; + + ret[i] = p & 0xff; + } + return ret; +#else + UNUSED(polynomial); + return NULL; +#endif +} + diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 0d48bf73..1094ced4 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -210,6 +210,17 @@ struct CpuCap { } }; #endif +#ifdef __riscv +# include "../src/cpuid.h" + +struct CpuCap { + bool hasVector; + CpuCap(bool detect) : hasVector(true) { + if(!detect) return; + hasVector = CPU_HAS_VECTOR; + } +}; +#endif Galois16MethodInfo Galois16Mul::info(Galois16Methods _method) { @@ -309,6 +320,13 @@ Galois16MethodInfo Galois16Mul::info(Galois16Methods _method) { _info.idealInputMultiple = 4; break; + case GF16_SHUFFLE_128_RVV: + _info.alignment = 16; // I guess this is good enough... 
+ _info.cksumSize = gf16_rvv_get_size(); + _info.stride = _info.cksumSize*2; + _info.idealInputMultiple = 3; + break; + case GF16_CLMUL_SVE2: _info.alignment = 16; _info.cksumSize = gf16_sve_get_size(); @@ -436,6 +454,7 @@ Galois16MethodInfo Galois16Mul::info(Galois16Methods _method) { case GF16_SHUFFLE_NEON: case GF16_SHUFFLE_128_SVE: // may need smaller chunks for larger vector size case GF16_SHUFFLE_128_SVE2: + case GF16_SHUFFLE_128_RVV: _info.idealChunkSize = 16*1024; break; case GF16_SHUFFLE_AVX2: @@ -846,6 +865,29 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { copy_cksum_check = &gf16_cksum_copy_check_sve; break; + case GF16_SHUFFLE_128_RVV: + scratch = gf16_shuffle_init_128_rvv(GF16_POLYNOMIAL); + METHOD_REQUIRES(gf16_available_rvv) + + _mul = &gf16_shuffle_mul_128_rvv; + _mul_add = &gf16_shuffle_muladd_128_rvv; + _mul_add_multi = &gf16_shuffle_muladd_multi_128_rvv; + _mul_add_multi_stridepf = &gf16_shuffle_muladd_multi_stridepf_128_rvv; + _mul_add_multi_packed = &gf16_shuffle_muladd_multi_packed_128_rvv; + //_mul_add_multi_packpf = &gf16_shuffle_muladd_multi_packpf_128_rvv; + add_multi = &gf_add_multi_rvv; + add_multi_packed = &gf_add_multi_packed_v2i3_rvv; + add_multi_packpf = &gf_add_multi_packpf_v2i3_rvv; + prepare_packed = &gf16_shuffle_prepare_packed_rvv; + prepare_packed_cksum = &gf16_shuffle_prepare_packed_cksum_rvv; + prepare_partial_packsum = &gf16_shuffle_prepare_partial_packsum_rvv; + finish_packed = &gf16_shuffle_finish_packed_rvv; + finish_packed_cksum = &gf16_shuffle_finish_packed_cksum_rvv; + finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_rvv; + copy_cksum = &gf16_cksum_copy_rvv; + copy_cksum_check = &gf16_cksum_copy_check_rvv; + break; + case GF16_AFFINE_AVX512: scratch = gf16_affine_init_avx2(GF16_POLYNOMIAL); METHOD_REQUIRES(gf16_affine_available_avx512 && gf16_shuffle_available_avx512) @@ -1332,7 +1374,11 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu # endif ? 
GF16_CLMUL_NEON : GF16_SHUFFLE_NEON; #endif - +#ifdef __riscv_ + const CpuCap caps(true); + if(caps.hasVector && gf16_available_rvv && gf16_rvv_get_size() >= 16) + return GF16_SHUFFLE_128_RVV; +#endif // lookup vs lookup3: latter seems to be slightly faster than former in most cases (SKX, Silvermont, Zen1, Rpi3 (arm64; arm32 faster muladd, slower mul)), sometimes slightly slower (Haswell, IvB?, Piledriver) // but test w/ multi-region lh-lookup & fat table before preferring it @@ -1410,6 +1456,11 @@ std::vector Galois16Mul::availableMethods(bool checkCpuid) { ret.push_back(GF16_SHUFFLE_512_SVE2); } #endif +#ifdef __riscv + const CpuCap caps(checkCpuid); + if(gf16_available_rvv && caps.hasVector && gf16_rvv_get_size() >= 16) + ret.push_back(GF16_SHUFFLE_128_RVV); +#endif return ret; } diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index b1e91b7b..8d47a8ff 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -45,6 +45,7 @@ enum Galois16Methods { GF16_SHUFFLE_128_SVE2, GF16_SHUFFLE2X_128_SVE2, GF16_SHUFFLE_512_SVE2, + GF16_SHUFFLE_128_RVV, GF16_SHUFFLE_SSSE3, GF16_SHUFFLE_AVX, GF16_SHUFFLE_AVX2, @@ -76,6 +77,7 @@ static const char* Galois16MethodsText[] = { "Shuffle-128 (SVE2)", "Shuffle2x-128 (SVE2)", "Shuffle-512 (SVE2)", + "Shuffle-128 (RVV)", "Shuffle (SSSE3)", "Shuffle (AVX)", "Shuffle (AVX2)", diff --git a/gf16/gf_add.h b/gf16/gf_add.h index ee964fb8..8a746f8a 100644 --- a/gf16/gf_add.h +++ b/gf16/gf_add.h @@ -8,6 +8,7 @@ void gf_add_multi_avx512(unsigned regions, size_t offset, void *HEDLEY_RESTRICT void gf_add_multi_neon(unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len); void gf_add_multi_sve(unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len); void gf_add_multi_sve2(unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len); +void gf_add_multi_rvv(unsigned regions, size_t offset, void 
*HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len); void gf_add_multi_packed_v1i2_sse2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len); @@ -39,6 +40,7 @@ void gf_add_multi_packed_v1i6_sve2(unsigned packRegions, unsigned regions, void void gf_add_multi_packed_v2i3_sve2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len); void gf_add_multi_packed_v2i4_sve2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len); void gf_add_multi_packed_v2i8_sve2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len); +void gf_add_multi_packed_v2i3_rvv(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len); void gf_add_multi_packpf_v1i2_sse2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); void gf_add_multi_packpf_v1i6_sse2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); @@ -69,6 +71,7 @@ void gf_add_multi_packpf_v1i6_sve2(unsigned packRegions, unsigned regions, void void gf_add_multi_packpf_v2i3_sve2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); void gf_add_multi_packpf_v2i4_sve2(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); void gf_add_multi_packpf_v2i8_sve2(unsigned packRegions, unsigned 
regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); +void gf_add_multi_packpf_v2i3_rvv(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); void gf_add_multi_packed_generic(unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len); diff --git a/gf16/gf_add_rvv.c b/gf16/gf_add_rvv.c new file mode 100644 index 00000000..347de417 --- /dev/null +++ b/gf16/gf_add_rvv.c @@ -0,0 +1,83 @@ +#include "gf16_rvv_common.h" +#include "gf16_muladd_multi.h" + +#ifdef __riscv_vector + +static HEDLEY_ALWAYS_INLINE void gf_add_x_rvv( + const void *HEDLEY_RESTRICT scratch, uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, + GF16_MULADD_MULTI_SRCLIST, size_t len, + const uint16_t *HEDLEY_RESTRICT coefficients, + const int doPrefetch, const char* _pf +) { + ASSUME(len > 0); + + GF16_MULADD_MULTI_SRC_UNUSED(18); + UNUSED(coefficients); + + unsigned vecStride = (unsigned)((uintptr_t)scratch); // abuse this otherwise unused variable + + if(vecStride == 2) { // only support a vecStride of 2 for now (may eventually support 1 for CLMul) + size_t vl = RV(vsetvlmax_e8m2)(); + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += vl) { + vuint8m2_t data = RV(vle8_v_u8m2)(_dst+ptr, vl); + + #define XOR_LOAD(n) \ + if(srcCount >= n) \ + data = RV(vxor_vv_u8m2)(data, RV(vle8_v_u8m2)(_src##n+ptr*srcScale, vl), vl) + XOR_LOAD(1); + XOR_LOAD(2); + XOR_LOAD(3); + XOR_LOAD(4); + XOR_LOAD(5); + XOR_LOAD(6); + XOR_LOAD(7); + XOR_LOAD(8); + XOR_LOAD(9); + XOR_LOAD(10); + XOR_LOAD(11); + XOR_LOAD(12); + XOR_LOAD(13); + XOR_LOAD(14); + XOR_LOAD(15); + XOR_LOAD(16); + XOR_LOAD(17); + XOR_LOAD(18); + #undef XOR_LOAD + + RV(vse8_v_u8m2)(_dst+ptr, data, vl); + + UNUSED(doPrefetch); UNUSED(_pf); + } + } +} +#endif 
+ +void gf_add_multi_rvv(unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len) { +#ifdef __riscv_vector + gf16_muladd_multi((void*)2, &gf_add_x_rvv, 4, regions, offset, dst, src, len, NULL); +#else + UNUSED(regions); UNUSED(offset); UNUSED(dst); UNUSED(src); UNUSED(len); +#endif +} + +#ifdef __riscv_vector +# define PACKED_FUNC(vs, il, it) \ +void gf_add_multi_packed_v##vs##i##il##_rvv(unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len) { \ + gf16_muladd_multi_packed((void*)vs, &gf_add_x_rvv, il, it, packedRegions, regions, dst, src, len, RV(vsetvlmax_e8m1)()*vs, NULL); \ +} \ +void gf_add_multi_packpf_v##vs##i##il##_rvv(unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { \ + gf16_muladd_multi_packpf((void*)vs, &gf_add_x_rvv, il, it, packedRegions, regions, dst, src, len, RV(vsetvlmax_e8m1)()*vs, NULL, vs>1, prefetchIn, prefetchOut); \ +} +#else +# define PACKED_FUNC(vs, il, it) \ +void gf_add_multi_packed_v##vs##i##il##_rvv(unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len) { \ + UNUSED(packedRegions); UNUSED(regions); UNUSED(dst); UNUSED(src); UNUSED(len); \ +}\ +void gf_add_multi_packpf_v##vs##i##il##_rvv(unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { \ + UNUSED(packedRegions); UNUSED(regions); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(prefetchIn); UNUSED(prefetchOut); \ +} +#endif + +PACKED_FUNC(2, 3, 12) + +#undef PACKED_FUNC diff --git a/help.txt b/help.txt index 66674bbd..6e0d28b9 100644 --- a/help.txt +++ b/help.txt @@ -295,6 +295,8 @@ CPU Tuning Options: shuffle2x128-sve2: half width 
variant of shuffle-neon (requires SVE width >= 256 bits) shuffle512-sve2: SVE2 variant of shuffle-vbmi (requires SVE width >= 512 bits) clmul-sve2: SVE2 variant of clmul-neon + RISC-V only choices: + shuffle128-rvv: RISC-V Vector variant of shuffle128-sve2 Default is auto-detected. --loop-tile-size Target size used for loop tiling optimisation. Default is 0 (auto-detected) diff --git a/lib/par2.js b/lib/par2.js index 5fa7203c..c45f3ecf 100644 --- a/lib/par2.js +++ b/lib/par2.js @@ -884,6 +884,7 @@ PAR2Chunked.prototype = { var GF_METHODS = [ '' /*default*/, 'lookup', 'lookup-sse', '3p_lookup', 'shuffle-neon', 'shuffle128-sve', 'shuffle128-sve2', 'shuffle2x128-sve2', 'shuffle512-sve2', + 'shuffle128-rvv', 'shuffle-sse', 'shuffle-avx', 'shuffle-avx2', 'shuffle-avx512', 'shuffle-vbmi', 'shuffle2x-avx2', 'shuffle2x-avx512', 'xor-sse', 'xorjit-sse', 'xorjit-avx2', 'xorjit-avx512', diff --git a/src/cpuid.h b/src/cpuid.h index af6b58df..05201e37 100644 --- a/src/cpuid.h +++ b/src/cpuid.h @@ -125,4 +125,32 @@ static unsigned long getauxval(unsigned long cap) { #endif +#ifdef __riscv +# if defined(__has_include) +# if __has_include() +# include +# ifdef __FreeBSD__ +static unsigned long getauxval(unsigned long cap) { + unsigned long ret; + elf_aux_info(cap, &ret, sizeof(ret)); + return ret; +} +# endif +# if __has_include() +# include +# endif +# endif +# endif + +# ifndef CPU_HAS_VECTOR +# define CPU_HAS_VECTOR false + +# if defined(AT_HWCAP) +# undef CPU_HAS_VECTOR +# define CPU_HAS_VECTOR (getauxval(AT_HWCAP) & (1 << ('V'-'A'))) +# endif +# endif + +#endif + #endif /* PP_CPUID_H */ From 07ead19ca15036a6d52f621b07091f9884081fc7 Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 13 Aug 2023 20:35:42 +1000 Subject: [PATCH 50/91] Fixes for last commit --- binding.gyp | 2 +- gf16/gf_add_rvv.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/binding.gyp b/binding.gyp index 42d1e9f1..89065506 100644 --- a/binding.gyp +++ b/binding.gyp @@ -37,7 +37,7 @@ { 
"target_name": "parpar_gf", "dependencies": [ - "parpar_gf_c", "gf16", "gf16_generic", "gf16_sse2", "gf16_ssse3", "gf16_avx", "gf16_avx2", "gf16_avx512", "gf16_vbmi", "gf16_gfni", "gf16_gfni_avx2", "gf16_gfni_avx512", "gf16_neon", "gf16_sve", "gf16_sve2", + "parpar_gf_c", "gf16", "gf16_generic", "gf16_sse2", "gf16_ssse3", "gf16_avx", "gf16_avx2", "gf16_avx512", "gf16_vbmi", "gf16_gfni", "gf16_gfni_avx2", "gf16_gfni_avx512", "gf16_neon", "gf16_sve", "gf16_sve2", "gf16_rvv", "hasher", "hasher_sse2", "hasher_clmul", "hasher_xop", "hasher_bmi1", "hasher_avx2", "hasher_avx512", "hasher_avx512vl", "hasher_armcrc", "hasher_neon", "hasher_neoncrc", "hasher_sve2" ], "sources": ["src/gf.cc", "gf16/controller.cpp", "gf16/controller_cpu.cpp", "gf16/controller_ocl.cpp", "gf16/controller_ocl_init.cpp"], diff --git a/gf16/gf_add_rvv.c b/gf16/gf_add_rvv.c index 347de417..f52a237e 100644 --- a/gf16/gf_add_rvv.c +++ b/gf16/gf_add_rvv.c @@ -72,7 +72,7 @@ void gf_add_multi_packpf_v##vs##i##il##_rvv(unsigned packedRegions, unsigned reg # define PACKED_FUNC(vs, il, it) \ void gf_add_multi_packed_v##vs##i##il##_rvv(unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len) { \ UNUSED(packedRegions); UNUSED(regions); UNUSED(dst); UNUSED(src); UNUSED(len); \ -}\ +} \ void gf_add_multi_packpf_v##vs##i##il##_rvv(unsigned packedRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { \ UNUSED(packedRegions); UNUSED(regions); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(prefetchIn); UNUSED(prefetchOut); \ } From e7d471a3176f2e7a1ac4d58786d853dbc01002ba Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 13 Aug 2023 20:46:41 +1000 Subject: [PATCH 51/91] Display chosen kernel for whenever a kernel is selected --- bin/parpar.js | 8 ++- gf16/gf16pmul.cpp | 22 +++++++ gf16/gf16pmul.h | 11 ++++ gf16/gfmat_inv.cpp | 12 
+++- gf16/gfmat_inv.h | 4 ++ hasher/hasher.cpp | 139 +++++++++++++++++++++++++++++++++++-------- hasher/hasher.h | 6 ++ hasher/hasher_impl.h | 11 ++++ lib/par2.js | 6 ++ lib/par2gen.js | 3 + src/gf.cc | 2 + 11 files changed, 197 insertions(+), 27 deletions(-) diff --git a/bin/parpar.js b/bin/parpar.js index a2d3c18f..f134bf5b 100755 --- a/bin/parpar.js +++ b/bin/parpar.js @@ -732,6 +732,7 @@ var inputFiles = argv._; var sizeDisp = function(val) { return cliFormat('1', friendlySize(val)); }; + var hash_methods = g.hash_methods(); if(argv.json) { print_json('processing_info', { input_size: g.totalSize, @@ -753,7 +754,9 @@ var inputFiles = argv._; recovery_offset: rf.recoveryOffset, size: rf.totalSize }; - }) + }), + hash_input_method: hash_methods[0], + hash_recovery_method: hash_methods[1] }); } else { if(g.opts.sliceSize > 1024*1048576) { @@ -771,6 +774,9 @@ var inputFiles = argv._; process.stderr.write('Input pass(es) : ' + cliFormat('1', g.chunks * g.passes) + ', processing ' + pluralDisp(g.slicesPerPass, '* ' + sizeDisp(g._chunkSize) + ' chunk') + ' per pass\n'); } process.stderr.write('Read buffer size : ' + sizeDisp(g.readSize) + ' * max ' + pluralDisp(g.opts.readBuffers, 'buffer') + '\n'); + process.stderr.write('Hash method : ' + cliFormat('1', hash_methods[0]) + ' (input)' + (g.opts.recoverySlices ? 
+ ', ' + cliFormat('1', hash_methods[1]) + ' (recovery)' + : '') + '\n'); } } if(argv.progress != 'none') { diff --git a/gf16/gf16pmul.cpp b/gf16/gf16pmul.cpp index 5e1f7504..22e31381 100644 --- a/gf16/gf16pmul.cpp +++ b/gf16/gf16pmul.cpp @@ -2,11 +2,13 @@ #include "../src/cpuid.h" Gf16PMulFunc gf16pmul = nullptr; +Galois16PointMulMethods gf16pmul_method = GF16PMUL_NONE; size_t gf16pmul_alignment = 1; size_t gf16pmul_blocklen = 1; void setup_pmul() { gf16pmul = nullptr; + gf16pmul_method = GF16PMUL_NONE; gf16pmul_alignment = 1; gf16pmul_blocklen = 1; @@ -40,21 +42,25 @@ void setup_pmul() { if(gf16pmul_available_vpclgfni) { gf16pmul = &gf16pmul_vpclgfni; + gf16pmul_method = GF16PMUL_VPCLMUL_GFNI; gf16pmul_alignment = 32; gf16pmul_blocklen = 64; } else if(gf16pmul_available_vpclmul) { gf16pmul = &gf16pmul_vpclmul; + gf16pmul_method = GF16PMUL_VPCLMUL; gf16pmul_alignment = 32; gf16pmul_blocklen = 32; } else if(gf16pmul_available_avx2) { gf16pmul = &gf16pmul_avx2; + gf16pmul_method = GF16PMUL_AVX2; gf16pmul_alignment = 32; gf16pmul_blocklen = 32; } else if(gf16pmul_available_sse) { gf16pmul = &gf16pmul_sse; + gf16pmul_method = GF16PMUL_PCLMUL; gf16pmul_alignment = 16; gf16pmul_blocklen = 16; } @@ -66,13 +72,29 @@ void setup_pmul() { if(gf16pmul_available_sve2) { gf16pmul = &gf16pmul_sve2; + gf16pmul_method = GF16PMUL_SVE2; gf16pmul_alignment = gf16pmul_sve2_width(); gf16pmul_blocklen = gf16pmul_alignment*2; } else if(gf16pmul_available_neon) { gf16pmul = &gf16pmul_neon; + gf16pmul_method = GF16PMUL_NEON; gf16pmul_alignment = 16; gf16pmul_blocklen = 32; } #endif } + +const char* gf16pmul_methodName() { + const char* names[] = { + "None (exponentiate)", + "PCLMUL", + "AVX2", + "VPCLMUL", + "VPCLMUL+GFNI", + "NEON", + "SVE2" + }; + + return names[(int)gf16pmul_method]; +} diff --git a/gf16/gf16pmul.h b/gf16/gf16pmul.h index 7ef94ded..c740bc03 100644 --- a/gf16/gf16pmul.h +++ b/gf16/gf16pmul.h @@ -4,11 +4,22 @@ #include "../src/hedley.h" #include +enum 
Galois16PointMulMethods { + GF16PMUL_NONE, + GF16PMUL_PCLMUL, + GF16PMUL_AVX2, + GF16PMUL_VPCLMUL, + GF16PMUL_VPCLMUL_GFNI, + GF16PMUL_NEON, + GF16PMUL_SVE2 +}; + // TODO: consider multi-dest typedef void(*Gf16PMulFunc)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); extern Gf16PMulFunc gf16pmul; extern size_t gf16pmul_alignment; extern size_t gf16pmul_blocklen; +const char* gf16pmul_methodName(); void setup_pmul(); diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 26b4055b..0697b788 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -567,7 +567,11 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va unsigned matWidth = (unsigned)inputValid.size() * sizeof(uint16_t); - Galois16RecMatrixComputeState state(Galois16Mul::default_method(matWidth, (unsigned)inputValid.size(), (unsigned)inputValid.size(), true)); + if(regionMethod == GF16_AUTO) { + regionMethod = Galois16Mul::default_method(matWidth, numRec, numRec, true); + } + + Galois16RecMatrixComputeState state((Galois16Methods)regionMethod); state.validCount = validCount; const auto gfInfo = state.gf.info(); state.pfFactor = gfInfo.prefetchDownscale; @@ -681,11 +685,17 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va return true; } +const char* Galois16RecMatrix::getPointMulMethodName() const { + return gf16pmul_methodName(); +} + Galois16RecMatrix::Galois16RecMatrix() : mat(nullptr) { numThreads = hardware_concurrency(); numRec = 0; numStripes = 0; stripeWidth = 0; + + regionMethod = (int)GF16_AUTO; } Galois16RecMatrix::~Galois16RecMatrix() { diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index ce53d7fe..0e18a096 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -41,6 +41,10 @@ class Galois16RecMatrix { unsigned stripe = inIdx / sw; return mat[stripe * numRec*sw + recIdx * sw + (inIdx % sw)]; } + + // these should only be queried after Compute has started (i.e. 
from the progressCb, or after it returns) + /*Galois16Methods*/ int regionMethod; + const char* getPointMulMethodName() const; }; #endif diff --git a/hasher/hasher.cpp b/hasher/hasher.cpp index cff2ac4a..85e727f0 100644 --- a/hasher/hasher.cpp +++ b/hasher/hasher.cpp @@ -3,8 +3,11 @@ #include IHasherInput*(*HasherInput_Create)() = NULL; +HasherInputMethods HasherInput_Method = INHASH_SCALAR; uint32_t(*MD5CRC_Calc)(const void*, size_t, size_t, void*) = NULL; +MD5CRCMethods MD5CRC_Method = MD5CRCMETH_SCALAR; uint32_t(*CRC32_Calc)(const void*, size_t) = NULL; +MD5CRCMethods CRC32_Method = MD5CRCMETH_SCALAR; struct _CpuCap { #ifdef PLATFORM_X86 bool hasSSE2, hasXOP, hasBMI1, hasAVX2, hasAVX512F, hasAVX512VLBW; @@ -22,8 +25,11 @@ void setup_hasher() { if(HasherInput_Create) return; HasherInput_Create = &HasherInput_Scalar::create; + HasherInput_Method = INHASH_SCALAR; MD5CRC_Calc = &MD5CRC_Calc_Scalar; + MD5CRC_Method = MD5CRCMETH_SCALAR; CRC32_Calc = &CRC32_Calc_Slice4; + CRC32_Method = MD5CRCMETH_SCALAR; struct _CpuCap CpuCap; (void)CpuCap; @@ -73,45 +79,67 @@ void setup_hasher() { _cpuid(cpuInfo, 0x80000001); CpuCap.hasXOP = hasAVX && (cpuInfo[2] & 0x800); - if(CpuCap.hasAVX512VLBW && hasClMul && !isVecRotSlow && HasherInput_AVX512::isAvailable) + if(CpuCap.hasAVX512VLBW && hasClMul && !isVecRotSlow && HasherInput_AVX512::isAvailable) { HasherInput_Create = &HasherInput_AVX512::create; + HasherInput_Method = INHASH_AVX512; + } // SSE seems to be faster than scalar on Zen1/2, not Zen3; BMI > SSE on Zen1, unknown on Zen2 else if(hasClMul && !isSmallCore && HasherInput_ClMulScalar::isAvailable) { // Gracemont: SSE > scalar, but SSE ~= BMI - if(CpuCap.hasBMI1 && HasherInput_BMI1::isAvailable) + if(CpuCap.hasBMI1 && HasherInput_BMI1::isAvailable) { HasherInput_Create = &HasherInput_BMI1::create; - else + HasherInput_Method = INHASH_BMI1; + } else { HasherInput_Create = &HasherInput_ClMulScalar::create; - } else if(hasClMul && isSmallCore && 
HasherInput_ClMulSSE::isAvailable) + HasherInput_Method = INHASH_CRC; + } + } else if(hasClMul && isSmallCore && HasherInput_ClMulSSE::isAvailable) { HasherInput_Create = &HasherInput_ClMulSSE::create; - else if(CpuCap.hasSSE2 && isSmallCore && HasherInput_SSE::isAvailable) // TODO: CPU w/o ClMul might all be small enough + HasherInput_Method = INHASH_SIMD_CRC; + } + else if(CpuCap.hasSSE2 && isSmallCore && HasherInput_SSE::isAvailable) { // TODO: CPU w/o ClMul might all be small enough HasherInput_Create = &HasherInput_SSE::create; + HasherInput_Method = INHASH_SIMD; + } if(CpuCap.hasAVX512VLBW && !isVecRotSlow && MD5Single_isAvailable_AVX512) { MD5Single::_update = &MD5Single_update_AVX512; MD5Single::_updateZero = &MD5Single_updateZero_AVX512; + MD5Single::method = MD5CRCMETH_AVX512; } else if(isLEASlow && hasClMul && MD5Single_isAvailable_NoLEA) { MD5Single::_update = &MD5Single_update_NoLEA; MD5Single::_updateZero = &MD5Single_updateZero_NoLEA; + MD5Single::method = MD5CRCMETH_NOLEA; } // for some reason, single MD5 BMI1 seems to be slower on most cores, except Jaguar... 
unsure why else if(CpuCap.hasBMI1 && isSmallCore && MD5Single_isAvailable_BMI1) { MD5Single::_update = &MD5Single_update_BMI1; MD5Single::_updateZero = &MD5Single_updateZero_BMI1; + MD5Single::method = MD5CRCMETH_BMI1; } - if(CpuCap.hasAVX512VLBW && hasClMul && !isVecRotSlow && MD5CRC_isAvailable_AVX512) + if(CpuCap.hasAVX512VLBW && hasClMul && !isVecRotSlow && MD5CRC_isAvailable_AVX512) { MD5CRC_Calc = &MD5CRC_Calc_AVX512; - else if(isLEASlow && hasClMul && MD5CRC_isAvailable_NoLEA) + MD5CRC_Method = MD5CRCMETH_AVX512; + } + else if(isLEASlow && hasClMul && MD5CRC_isAvailable_NoLEA) { MD5CRC_Calc = &MD5CRC_Calc_NoLEA; - else if(CpuCap.hasBMI1 && hasClMul && isSmallCore && MD5CRC_isAvailable_BMI1) + MD5CRC_Method = MD5CRCMETH_NOLEA; + } + else if(CpuCap.hasBMI1 && hasClMul && isSmallCore && MD5CRC_isAvailable_BMI1) { MD5CRC_Calc = &MD5CRC_Calc_BMI1; - else if(hasClMul && MD5CRC_isAvailable_ClMul) + MD5CRC_Method = MD5CRCMETH_BMI1; + } + else if(hasClMul && MD5CRC_isAvailable_ClMul) { MD5CRC_Calc = &MD5CRC_Calc_ClMul; + MD5CRC_Method = MD5CRCMETH_PCLMUL; + } - if(hasClMul && CRC32_isAvailable_ClMul) + if(hasClMul && CRC32_isAvailable_ClMul) { CRC32_Calc = &CRC32_Calc_ClMul; + CRC32_Method = MD5CRCMETH_PCLMUL; + } #endif #ifdef PLATFORM_ARM @@ -120,19 +148,28 @@ void setup_hasher() { CpuCap.hasNEON = CPU_HAS_NEON; CpuCap.hasSVE2 = CPU_HAS_SVE2; - if(hasCRC && HasherInput_ARMCRC::isAvailable) // TODO: fast core only + if(hasCRC && HasherInput_ARMCRC::isAvailable) { // TODO: fast core only HasherInput_Create = &HasherInput_ARMCRC::create; + HasherInput_Method = INHASH_CRC; + } else if(CpuCap.hasNEON) { // TODO: slow core only - if(hasCRC && HasherInput_NEONCRC::isAvailable) + if(hasCRC && HasherInput_NEONCRC::isAvailable) { HasherInput_Create = &HasherInput_NEONCRC::create; - else if(HasherInput_NEON::isAvailable) + HasherInput_Method = INHASH_SIMD_CRC; + } else if(HasherInput_NEON::isAvailable) { HasherInput_Create = &HasherInput_NEON::create; + HasherInput_Method = 
INHASH_SIMD; + } } - if(hasCRC && MD5CRC_isAvailable_ARMCRC) + if(hasCRC && MD5CRC_isAvailable_ARMCRC) { MD5CRC_Calc = &MD5CRC_Calc_ARMCRC; - if(hasCRC && CRC32_isAvailable_ARMCRC) + MD5CRC_Method = MD5CRCMETH_ARMCRC; + } + if(hasCRC && CRC32_isAvailable_ARMCRC) { CRC32_Calc = &CRC32_Calc_ARMCRC; + CRC32_Method = MD5CRCMETH_ARMCRC; + } #endif @@ -155,24 +192,25 @@ void setup_hasher() { } bool set_hasherInput(HasherInputMethods method) { -#define SET_HASHER(x) { \ +#define SET_HASHER(h, x) if(method == h) { \ if(!x::isAvailable) return false; \ HasherInput_Create = &x::create; \ + HasherInput_Method = h; \ return true; \ } - if(method == INHASH_SCALAR) SET_HASHER(HasherInput_Scalar) + SET_HASHER(INHASH_SCALAR, HasherInput_Scalar) #ifdef PLATFORM_X86 - if(method == INHASH_SIMD) SET_HASHER(HasherInput_SSE) - if(method == INHASH_CRC) SET_HASHER(HasherInput_ClMulScalar) - if(method == INHASH_SIMD_CRC) SET_HASHER(HasherInput_ClMulSSE) - if(method == INHASH_BMI1) SET_HASHER(HasherInput_BMI1) - if(method == INHASH_AVX512) SET_HASHER(HasherInput_AVX512) + SET_HASHER(INHASH_SIMD, HasherInput_SSE) + SET_HASHER(INHASH_CRC, HasherInput_ClMulScalar) + SET_HASHER(INHASH_SIMD_CRC, HasherInput_ClMulSSE) + SET_HASHER(INHASH_BMI1, HasherInput_BMI1) + SET_HASHER(INHASH_AVX512, HasherInput_AVX512) #endif #ifdef PLATFORM_ARM - if(method == INHASH_SIMD) SET_HASHER(HasherInput_NEON) - if(method == INHASH_CRC) SET_HASHER(HasherInput_ARMCRC) - if(method == INHASH_SIMD_CRC) SET_HASHER(HasherInput_NEONCRC) + SET_HASHER(INHASH_SIMD, HasherInput_NEON) + SET_HASHER(INHASH_CRC, HasherInput_ARMCRC) + SET_HASHER(INHASH_SIMD_CRC, HasherInput_NEONCRC) #endif #undef SET_HASHER return false; @@ -367,6 +405,7 @@ void MD5Multi::get(void* md5s) { void(*MD5Single::_update)(uint32_t*, const void*, size_t) = &MD5Single_update_Scalar; void(*MD5Single::_updateZero)(uint32_t*, size_t) = &MD5Single_updateZero_Scalar; +MD5CRCMethods MD5Single::method = MD5CRCMETH_SCALAR; const size_t MD5_BLOCKSIZE = 64; void 
MD5Single::update(const void* data, size_t len) { uint_fast8_t buffered = dataLen & (MD5_BLOCKSIZE-1); @@ -415,3 +454,53 @@ void MD5Single::end(void* md5) { md5_final_block(md5State, tmp, dataLen, 0); memcpy(md5, md5State, 16); } + + +const char* hasherInput_methodName() { + const char* names[] = { + "Scalar + Slice4", +#ifdef PLATFORM_X86 + "SSE2 + Slice4", + "Scalar + PCLMUL", + "SSE2 + PCLMUL", +#elif defined(PLATFORM_ARM) + "NEON + Slice4", + "Scalar + ARMv8-CRC32", + "NEON + ARMv8-CRC32", +#else + "SIMD + Slice4", + "Scalar + CRC", + "SIMD + CRC", +#endif + "BMI1 + PCLMUL", + "AVX512" + }; + + return names[(int)HasherInput_Method]; +} +const char* hasherMD5Multi_methodName() { + const char* names[] = { + "Scalar", + "SSE2", + "AVX2", + "XOP", + "AVX512F", + "AVX512VL", + "NEON", + "SVE2" + }; + + return names[(int)HasherMD5Multi_level]; +} +const char* md5crc_methodName(MD5CRCMethods m) { + const char* names[] = { + "Scalar", + "BMI1", + "NoLEA", + "AVX512", + "ARMv8-CRC32", + "PCLMUL" + }; + + return names[(int)m]; +} diff --git a/hasher/hasher.h b/hasher/hasher.h index 801cee1c..a52c0bc7 100644 --- a/hasher/hasher.h +++ b/hasher/hasher.h @@ -30,6 +30,9 @@ bool set_hasherInput(HasherInputMethods method); void set_hasherMD5MultiLevel(MD5MultiLevels level); extern IHasherInput*(*HasherInput_Create)(); +const char* hasherInput_methodName(); +const char* hasherMD5Multi_methodName(); + class MD5Multi { std::vector ctx; std::vector lastCtxData; @@ -58,6 +61,9 @@ class MD5Multi { // single hash instances extern uint32_t(*CRC32_Calc)(const void*, size_t); +extern MD5CRCMethods CRC32_Method; extern uint32_t(*MD5CRC_Calc)(const void*, size_t, size_t, void*); +extern MD5CRCMethods MD5CRC_Method; +const char* md5crc_methodName(MD5CRCMethods m); #endif /* __HASHER_H */ diff --git a/hasher/hasher_impl.h b/hasher/hasher_impl.h index 75236e22..d908f4fb 100644 --- a/hasher/hasher_impl.h +++ b/hasher/hasher_impl.h @@ -5,6 +5,16 @@ #include "../src/platform.h" #include +enum 
MD5CRCMethods { + MD5CRCMETH_SCALAR, + // MD5 + MD5CRCMETH_BMI1, + MD5CRCMETH_NOLEA, + MD5CRCMETH_AVX512, + // CRC32 + MD5CRCMETH_ARMCRC, + MD5CRCMETH_PCLMUL +}; class MD5Single { public: @@ -16,6 +26,7 @@ class MD5Single { // private, set by setup_hasher static void(*_update)(uint32_t*, const void*, size_t); static void(*_updateZero)(uint32_t*, size_t); + static MD5CRCMethods method; // public, read-only // public interface void reset() { diff --git a/lib/par2.js b/lib/par2.js index c45f3ecf..5a26b685 100644 --- a/lib/par2.js +++ b/lib/par2.js @@ -946,6 +946,12 @@ module.exports = { set_outhash_method: function(method) { return binding.set_HasherOutput(getMethodNum(OUTHASH_METHODS, method)); }, + get_inhash_methodDesc: function() { + return binding.hasherInput_method; + }, + get_outhash_methodDesc: function() { + return binding.hasherOutput_method; + }, _extend: Object.assign || function(to) { for(var i=1; i Date: Mon, 14 Aug 2023 10:32:20 +1000 Subject: [PATCH 52/91] Remove support for non power-of-two SVE widths ARM has removed support for such vector sizes --- gf16/gf16mul.cpp | 7 +++++++ gf16/gf16mul.h | 10 ---------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 1094ced4..72500339 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -207,6 +207,13 @@ struct CpuCap { hasNEON = CPU_HAS_NEON; hasSVE = CPU_HAS_SVE; hasSVE2 = CPU_HAS_SVE2; + if(hasSVE) { + size_t sz = gf16_sve_get_size(); + if(sz & (sz-1)) { // we don't support non-pow2 vector widths + hasSVE = false; + hasSVE2 = false; + } + } } }; #endif diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index 8d47a8ff..5a3910b8 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -189,20 +189,10 @@ class Galois16Mul { static Galois16MethodInfo info(Galois16Methods _method); inline HEDLEY_CONST bool isMultipleOfStride(size_t len) const { -#if defined(_M_ARM64) || defined(__aarch64__) - // SVE can have non-power-of-2 strides - if(HEDLEY_UNLIKELY((_info.stride 
& (_info.stride-1)) != 0)) // ...but most of the time, expect stride to be a power of 2 - return (len % _info.stride) == 0; -#endif return (len & (_info.stride-1)) == 0; } inline HEDLEY_CONST size_t alignToStride(size_t len) const { size_t alignMask = _info.stride-1; -#if defined(_M_ARM64) || defined(__aarch64__) - if(HEDLEY_UNLIKELY((_info.stride & (_info.stride-1)) != 0)) { - return ((len + alignMask) / _info.stride) * _info.stride; - } -#endif return (len + alignMask) & ~alignMask; } From 9c7db32591558f885ec4ed320c2d722ca0b4b62f Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 14 Aug 2023 11:22:01 +1000 Subject: [PATCH 53/91] Detect RVV intrinsic version --- gf16/gf16_checksum_rvv.h | 14 +++++++++++--- gf16/gf16_rvv_common.h | 7 ++++++- gf16/gf16_shuffle128_rvv.c | 25 +++++++++++++++---------- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/gf16/gf16_checksum_rvv.h b/gf16/gf16_checksum_rvv.h index 615109b9..7ea94ba2 100644 --- a/gf16/gf16_checksum_rvv.h +++ b/gf16/gf16_checksum_rvv.h @@ -37,11 +37,14 @@ static HEDLEY_ALWAYS_INLINE void gf16_checksum_blocku_rvv(const void *HEDLEY_RES while(amount) { size_t vl = RV(vsetvl_e8m1)(amount); - // intrinsics lack tail-undisturbed, so emulate it +#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000 + v8 = RV(vxor_vv_i8m1_tu)(v8, v8, RV(vle8_v_i8m1)(_src, vl), vl); +#else + // emulate tail-undisturbed vint8m1_t tmp = RV(vmv_v_x_i8m1)(0, vlmax); memcpy(&tmp, _src, vl); v8 = RV(vxor_vv_i8m1)(v8, tmp, vlmax); - //v8 = RV(vxor_vv_i8m1)(v8, RV(vle8_v_i8m1)(_src, vl), vl); +#endif amount -= vl; _src += vl; } @@ -58,7 +61,12 @@ static HEDLEY_ALWAYS_INLINE void gf16_checksum_exp_rvv(void *HEDLEY_RESTRICT che for(int i=0; i<15; i++) { res = gf16_vec_mul2_rvv(res); coeff = RV(vadd_vv_i16m1)(coeff, coeff, vl); - res = RV(vxor_vv_i16m1_m)( +#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000 + res = RV(vxor_vv_i16m1_mu) +#else + res = RV(vxor_vv_i16m1_m) +#endif + ( 
RV(vmslt_vx_i16m1_b16)(coeff, 0, vl), res, res, _checksum, vl diff --git a/gf16/gf16_rvv_common.h b/gf16/gf16_rvv_common.h index 7bc43769..da608806 100644 --- a/gf16/gf16_rvv_common.h +++ b/gf16/gf16_rvv_common.h @@ -23,7 +23,12 @@ static HEDLEY_ALWAYS_INLINE vint16m1_t gf16_vec_mul2_rvv(vint16m1_t v) { size_t vl = RV(vsetvlmax_e16m1)(); vbool16_t maskPoly = RV(vmslt_vx_i16m1_b16)(v, 0, vl); v = RV(vadd_vv_i16m1)(v, v, vl); - return RV(vxor_vx_i16m1_m)( +#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000 + return RV(vxor_vx_i16m1_mu) +#else + return RV(vxor_vx_i16m1_m) +#endif + ( maskPoly, v, v, GF16_POLYNOMIAL & 0xffff, diff --git a/gf16/gf16_shuffle128_rvv.c b/gf16/gf16_shuffle128_rvv.c index ba23a1a9..494fc4d0 100644 --- a/gf16/gf16_shuffle128_rvv.c +++ b/gf16/gf16_shuffle128_rvv.c @@ -9,12 +9,7 @@ int gf16_available_rvv = 0; #include "gf16_muladd_multi.h" #if defined(__RVV_LE) -// TODO: detect intrinsics version -# if 1 -// intrinsics v0.11.x (up to at least GCC 13 / Clang 16) -# define _vlseg2e8 RV(vlseg2e8_v_u8m1) -# define _vsseg2e8 RV(vsseg2e8_v_u8m1) -# else +# if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000 // intrinsics v0.12.x static HEDLEY_ALWAYS_INLINE void _vlseg2e8(vuint8m1_t* v0, vuint8m1_t* v1, const uint8_t* src, size_t vl) { vuint8m1x2_t d = RV(vlseg2e8_v_u8m1x2)(src, vl); @@ -27,6 +22,10 @@ static HEDLEY_ALWAYS_INLINE void _vsseg2e8(uint8_t* dst, vuint8m1_t v0, vuint8m1 d = RV(vset_v_u8m1_u8m1x2)(d, 1, v1); RV(vsseg2e8_v_u8m1x2)(dst, d, vl); } +# else +// intrinsics v0.11.x (up to at least GCC 13 / Clang 16) +# define _vlseg2e8 RV(vlseg2e8_v_u8m1) +# define _vsseg2e8 RV(vsseg2e8_v_u8m1) # endif static HEDLEY_ALWAYS_INLINE void gf16_shuffle_128_rvv_calc_table(vuint8m1_t poly_l, uint16_t val, @@ -217,11 +216,17 @@ static HEDLEY_ALWAYS_INLINE void gf16_prepare_block_rvv(void *HEDLEY_RESTRICT ds } // final block static HEDLEY_ALWAYS_INLINE void gf16_prepare_blocku_rvv(void *HEDLEY_RESTRICT dst, const void 
*HEDLEY_RESTRICT src, size_t remaining) { - // current intrinsics don't seem to support tail-undisturbed policy, so zero explicitly for now - size_t vl = RV(vsetvlmax_e8m2)(); - RV(vse8_v_u8m2)((uint8_t*)dst, RV(vmv_v_x_u8m2)(0, vl), vl); - vl = RV(vsetvl_e8m2)(remaining); + size_t vlmax = RV(vsetvlmax_e8m2)(); + vuint8m1_t v = RV(vmv_v_x_u8m2)(0, vlmax); + size_t vl = RV(vsetvl_e8m2)(remaining); +#if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000 + v = RV(vle8_v_u8m2_tu)(v, (const uint8_t*)src, vl); + RV(vse8_v_u8m2)((uint8_t*)dst, v, vlmax); +#else + // tail-undisturbed not supported, so zero explicitly as a workaround + RV(vse8_v_u8m2)((uint8_t*)dst, v, vlmax); RV(vse8_v_u8m2)((uint8_t*)dst, RV(vle8_v_u8m2)((const uint8_t*)src, vl), vl); +#endif } static HEDLEY_ALWAYS_INLINE void gf16_finish_blocku_rvv(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { size_t vl = RV(vsetvl_e8m2)(remaining); From 6d0d71f3af4104b6193da70e176ba2978aa567b5 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 14 Aug 2023 11:50:46 +1000 Subject: [PATCH 54/91] Suppress GCC 12's warnings for some AVX512 intrinsics --- hasher/hasher_avx512.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/hasher/hasher_avx512.cpp b/hasher/hasher_avx512.cpp index 9b66747d..8c6678f1 100644 --- a/hasher/hasher_avx512.cpp +++ b/hasher/hasher_avx512.cpp @@ -1,5 +1,14 @@ +// suppress warning spam in GCC 12.0-12.2 (caused by some AVX512 intrinsics) +#include "../src/hedley.h" +#if HEDLEY_GCC_VERSION_CHECK(12,0,0) && !HEDLEY_GCC_VERSION_CHECK(12,3,0) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wuninitialized" +# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + #include "../src/platform.h" + #define MD5Multi MD5Multi_AVX512 #define _FNMD5mb(f) f##_avx512 #define _FNMD5mb2(f) f##_avx512 From 60090b950038769466f521221586ed9b09299253 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 14 Aug 2023 15:12:00 +1000 Subject: 
[PATCH 55/91] Create NEON-SHA3 variant of CLMul and move M1 optimisation there Also add ability to bypass getauxval checks, for testing --- binding.gyp | 36 ++++++- gf16/gf16_clmul.h | 17 +++- gf16/gf16_clmul_neon.c | 193 ++---------------------------------- gf16/gf16_clmul_neon.h | 4 +- gf16/gf16_clmul_neon_base.h | 172 ++++++++++++++++++++++++++++++++ gf16/gf16_clmul_sha3.c | 47 +++++++++ gf16/gf16mul.cpp | 40 ++++++++ gf16/gf16mul.h | 2 + gf16/gf16pmul_neon.c | 6 ++ help.txt | 1 + lib/par2.js | 2 +- src/cpuid.h | 105 ++++++++++++-------- 12 files changed, 388 insertions(+), 237 deletions(-) create mode 100644 gf16/gf16_clmul_neon_base.h create mode 100644 gf16/gf16_clmul_sha3.c diff --git a/binding.gyp b/binding.gyp index 89065506..86757b38 100644 --- a/binding.gyp +++ b/binding.gyp @@ -37,7 +37,7 @@ { "target_name": "parpar_gf", "dependencies": [ - "parpar_gf_c", "gf16", "gf16_generic", "gf16_sse2", "gf16_ssse3", "gf16_avx", "gf16_avx2", "gf16_avx512", "gf16_vbmi", "gf16_gfni", "gf16_gfni_avx2", "gf16_gfni_avx512", "gf16_neon", "gf16_sve", "gf16_sve2", "gf16_rvv", + "parpar_gf_c", "gf16", "gf16_generic", "gf16_sse2", "gf16_ssse3", "gf16_avx", "gf16_avx2", "gf16_avx512", "gf16_vbmi", "gf16_gfni", "gf16_gfni_avx2", "gf16_gfni_avx512", "gf16_neon", "gf16_sha3", "gf16_sve", "gf16_sve2", "gf16_rvv", "hasher", "hasher_sse2", "hasher_clmul", "hasher_xop", "hasher_bmi1", "hasher_avx2", "hasher_avx512", "hasher_avx512vl", "hasher_armcrc", "hasher_neon", "hasher_neoncrc", "hasher_sve2" ], "sources": ["src/gf.cc", "gf16/controller.cpp", "gf16/controller_cpu.cpp", "gf16/controller_ocl.cpp", "gf16/controller_ocl_init.cpp"], @@ -794,6 +794,40 @@ }] ] }, + { + "target_name": "gf16_sha3", + "type": "static_library", + "defines": ["NDEBUG"], + "sources": [ + "gf16/gf16_clmul_sha3.c" + ], + "cflags": ["-Wno-unused-function", "-std=c99"], + "xcode_settings": { + "OTHER_CFLAGS": ["-Wno-unused-function"], + "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", 
"-fno-strict-aliasing"] + }, + "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], + "msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}}, + "conditions": [ + ['target_arch=="arm64" and OS!="win"', { + "variables": {"supports_sha3%": "/dev/null || true)"}, + "conditions": [ + ['supports_sha3!=""', { + "cflags!": ["-march=native"], + "cxxflags!": ["-march=native"], + "cflags": ["-march=armv8.2-a+sha3"], + "cxxflags": ["-march=armv8.2-a+sha3"], + "xcode_settings": { + "OTHER_CFLAGS!": ["-march=native"], + "OTHER_CXXFLAGS!": ["-march=native"], + "OTHER_CFLAGS": ["-march=armv8.2-a+sha3"], + "OTHER_CXXFLAGS": ["-march=armv8.2-a+sha3"], + } + }] + ] + }] + ] + }, { "target_name": "gf16_sve", "type": "static_library", diff --git a/gf16/gf16_clmul.h b/gf16/gf16_clmul.h index d8f189c2..23845a6c 100644 --- a/gf16/gf16_clmul.h +++ b/gf16/gf16_clmul.h @@ -7,18 +7,27 @@ void gf16_clmul_muladd_multi_packed_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch); \ void gf16_clmul_muladd_multi_packpf_##v(const void *HEDLEY_RESTRICT scratch, unsigned packRegions, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, void *HEDLEY_RESTRICT mutScratch, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut); \ void gf16_clmul_mul_##v(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ - void gf16_clmul_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch); \ - void gf16_clmul_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, 
size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ - void gf16_clmul_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ - void gf16_clmul_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen) + void gf16_clmul_muladd_##v(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t coefficient, void *HEDLEY_RESTRICT mutScratch) // this is the same as the shuffle version, so re-use that //int gf16_clmul_finish_packed_cksum_neon(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen); //int gf16_clmul_finish_partial_packsum_neon(void *HEDLEY_RESTRICT dst, void *HEDLEY_RESTRICT src, size_t sliceLen, unsigned numOutputs, unsigned outputNum, size_t chunkLen, size_t partOffset, size_t partLen); +FUNCS(neon); +FUNCS(sha3); +FUNCS(sve2); + +#undef FUNCS + +#define FUNCS(v) \ + void gf16_clmul_prepare_packed_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ + void gf16_clmul_prepare_packed_cksum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); \ + void gf16_clmul_prepare_partial_packsum_##v(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen, size_t partOffset, size_t partLen) + FUNCS(neon); FUNCS(sve2); #undef FUNCS int gf16_clmul_init_arm(int polynomial); +extern int gf16_available_neon_sha3; diff 
--git a/gf16/gf16_clmul_neon.c b/gf16/gf16_clmul_neon.c index 95a89cf2..5784a715 100644 --- a/gf16/gf16_clmul_neon.c +++ b/gf16/gf16_clmul_neon.c @@ -1,203 +1,24 @@ -#include "gf16_clmul_neon.h" -#include "gf16_muladd_multi.h" +#include "gf16_neon_common.h" // TODO: for any multiplicand byte that's 0 (e.g. for coeff < 256), can shortcut a bunch of stuff, but may not be worth the effort #if defined(__ARM_NEON) -// NOTE: we avoid EOR3 in pmacl* - only chip which supports NEON-SHA3 without SVE2, are the Apple chips and Neoverse V1; the former has PMULL+EOR fusion, which is better than EOR3 -#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) && defined(__APPLE__) -// Apple M1 supports fusing PMULL+EOR, so ensure these are paired -static HEDLEY_ALWAYS_INLINE poly16x8_t pmacl_low(poly16x8_t sum, poly8x16_t a, poly8x16_t b) { - poly16x8_t result; - __asm__ ("pmull %0.8h,%1.8b,%2.8b\n" - "eor %0.16b,%0.16b,%3.16b\n" - : "=&w"(result) - : "w"(a), "w"(b), "w"(sum) - : /* No clobbers */); - return result; -} -static HEDLEY_ALWAYS_INLINE poly16x8_t pmacl_high(poly16x8_t sum, poly8x16_t a, poly8x16_t b) { - poly16x8_t result; - __asm__ ("pmull2 %0.8h,%1.16b,%2.16b\n" - "eor %0.16b,%0.16b,%3.16b\n" - : "=&w"(result) - : "w"(a), "w"(b), "w"(sum) - : /* No clobbers */); - return result; -} -#else static HEDLEY_ALWAYS_INLINE poly16x8_t veorq_p16(poly16x8_t a, poly16x8_t b) { return vreinterpretq_p16_u16(veorq_u16(vreinterpretq_u16_p16(a), vreinterpretq_u16_p16(b))); } -# define pmacl_low(sum, a, b) veorq_p16(sum, pmull_low(a, b)) -# define pmacl_high(sum, a, b) veorq_p16(sum, pmull_high(a, b)) -#endif - -static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round1(const void* src, poly16x8_t* low1, poly16x8_t* low2, poly16x8_t* mid1, poly16x8_t* mid2, poly16x8_t* high1, poly16x8_t* high2, const coeff_t* coeff) { - poly8x16x2_t data = vld2q_p8((const poly8_t*)src); - *low1 = pmull_low(data.val[0], coeff[0]); - *low2 = pmull_high(data.val[0], coeff[0]); - poly8x16_t mid 
= veorq_p8(data.val[0], data.val[1]); - *mid1 = pmull_low(mid, coeff[2]); - *mid2 = pmull_high(mid, coeff[2]); - *high1 = pmull_low(data.val[1], coeff[1]); - *high2 = pmull_high(data.val[1], coeff[1]); - - // TODO: try idea of forcing an EOR via asm volatile - -/* Alternative approach for AArch64, which only needs one register per region at the expense of 2 additional instructions; unfortunately compilers won't heed our aim - // the `midCoeff` approach can also work with AArch32 - coeff_t swapCoeff = vextq_p8(coeff[0], coeff[0], 8); - coeff_t midCoeff = veorq_p8(coeff[0], swapCoeff); - - *low1 = pmull_low(data.val[0], coeff[0]); - *low2 = pmull_high(data.val[0], swapCoeff); - poly8x16_t mid = veorq_p8(data.val[0], data.val[1]); - *mid1 = pmull_low(mid, midCoeff); - *mid2 = pmull_high(mid, midCoeff); - *high1 = pmull_low(data.val[1], swapCoeff); - *high2 = pmull_high(data.val[1], coeff[0]); -*/ -} - -static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round(const void* src, poly16x8_t* low1, poly16x8_t* low2, poly16x8_t* mid1, poly16x8_t* mid2, poly16x8_t* high1, poly16x8_t* high2, const coeff_t* coeff) { - poly8x16x2_t data = vld2q_p8((const poly8_t*)src); - *low1 = pmacl_low(*low1, data.val[0], coeff[0]); - *low2 = pmacl_high(*low2, data.val[0], coeff[0]); - poly8x16_t mid = veorq_p8(data.val[0], data.val[1]); - *mid1 = pmacl_low(*mid1, mid, coeff[2]); - *mid2 = pmacl_high(*mid2, mid, coeff[2]); - *high1 = pmacl_low(*high1, data.val[1], coeff[1]); - *high2 = pmacl_high(*high2, data.val[1], coeff[1]); -} +#define pmacl_low(sum, a, b) veorq_p16(sum, pmull_low(a, b)) +#define pmacl_high(sum, a, b) veorq_p16(sum, pmull_high(a, b)) +#define _AVAILABLE 1 -#ifdef __aarch64__ -# define CLMUL_NUM_REGIONS 8 -#else -# define CLMUL_NUM_REGIONS 3 -#endif -#define CLMUL_COEFF_PER_REGION 3 - -static HEDLEY_ALWAYS_INLINE void gf16_clmul_muladd_x_neon( - const void *HEDLEY_RESTRICT scratch, - uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, GF16_MULADD_MULTI_SRCLIST, size_t 
len, - const uint16_t *HEDLEY_RESTRICT coefficients, const int doPrefetch, const char* _pf -) { - GF16_MULADD_MULTI_SRC_UNUSED(CLMUL_NUM_REGIONS); - UNUSED(scratch); - - coeff_t coeff[CLMUL_COEFF_PER_REGION*CLMUL_NUM_REGIONS]; - for(int src=0; src> 8; - coeff[src*CLMUL_COEFF_PER_REGION +0] = coeff_fn(vdup, n_p8)(lo); - coeff[src*CLMUL_COEFF_PER_REGION +1] = coeff_fn(vdup, n_p8)(hi); - coeff[src*CLMUL_COEFF_PER_REGION +2] = coeff_fn(veor, p8)(coeff[src*CLMUL_COEFF_PER_REGION +0], coeff[src*CLMUL_COEFF_PER_REGION +1]); - // if we want to have one register per region (AArch64), at the expense of 2 extra instructions per region - //coeff[src] = vcombine_p8(vdup_n_p8(lo), vdup_n_p8(hi)); - } - - poly16x8_t low1, low2, mid1, mid2, high1, high2; - #define DO_PROCESS \ - gf16_clmul_neon_round1(_src1+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + 0); \ - if(srcCount > 1) \ - gf16_clmul_neon_round(_src2+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*1); \ - if(srcCount > 2) \ - gf16_clmul_neon_round(_src3+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*2); \ - if(srcCount > 3) \ - gf16_clmul_neon_round(_src4+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*3); \ - if(srcCount > 4) \ - gf16_clmul_neon_round(_src5+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*4); \ - if(srcCount > 5) \ - gf16_clmul_neon_round(_src6+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*5); \ - if(srcCount > 6) \ - gf16_clmul_neon_round(_src7+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*6); \ - if(srcCount > 7) \ - gf16_clmul_neon_round(_src8+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*7); \ - \ - gf16_clmul_neon_reduction(&low1, low2, mid1, mid2, &high1, high2); \ - \ - uint8x16x2_t vb = 
vld2q_u8(_dst+ptr); \ - vb.val[0] = veorq_u8(vreinterpretq_u8_p16(low1), vb.val[0]); \ - vb.val[1] = veorq_u8(vreinterpretq_u8_p16(high1), vb.val[1]); \ - vst2q_u8(_dst+ptr, vb) - - if(doPrefetch) { - intptr_t ptr = -(intptr_t)len; - if(doPrefetch == 1) - PREFETCH_MEM(_pf+ptr, 1); - if(doPrefetch == 2) - PREFETCH_MEM(_pf+ptr, 0); - while(ptr & (CACHELINE_SIZE-1)) { - DO_PROCESS; - ptr += sizeof(uint8x16_t)*2; - } - while(ptr) { - if(doPrefetch == 1) - PREFETCH_MEM(_pf+ptr, 1); - if(doPrefetch == 2) - PREFETCH_MEM(_pf+ptr, 0); - - for(size_t iter=0; iter<(CACHELINE_SIZE/(sizeof(uint8x16_t)*2)); iter++) { - DO_PROCESS; - ptr += sizeof(uint8x16_t)*2; - } - } - } else { - for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(uint8x16_t)*2) { - DO_PROCESS; - } - } - #undef DO_PROCESS -} #endif /*defined(__ARM_NEON)*/ - -void gf16_clmul_mul_neon(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { - UNUSED(mutScratch); UNUSED(scratch); -#if defined(__ARM_NEON) - - coeff_t coeff[3]; - coeff[0] = coeff_fn(vdup, n_p8)(val & 0xff); - coeff[1] = coeff_fn(vdup, n_p8)(val >> 8); - coeff[2] = coeff_fn(veor, p8)(coeff[0], coeff[1]); - - uint8_t* _src = (uint8_t*)src + len; - uint8_t* _dst = (uint8_t*)dst + len; - poly16x8_t low1, low2, mid1, mid2, high1, high2; - for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(uint8x16_t)*2) { - gf16_clmul_neon_round1(_src+ptr, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff); - gf16_clmul_neon_reduction(&low1, low2, mid1, mid2, &high1, high2); - uint8x16x2_t out; - out.val[0] = vreinterpretq_u8_p16(low1); - out.val[1] = vreinterpretq_u8_p16(high1); - vst2q_u8(_dst+ptr, out); - } -#else - UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); -#endif -} - - -void gf16_clmul_muladd_neon(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { - UNUSED(mutScratch); -#if 
defined(__ARM_NEON) - gf16_muladd_single(scratch, &gf16_clmul_muladd_x_neon, dst, src, len, val); -#else - UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); -#endif -} - - -#if defined(__ARM_NEON) -GF16_MULADD_MULTI_FUNCS(gf16_clmul, _neon, gf16_clmul_muladd_x_neon, CLMUL_NUM_REGIONS, sizeof(uint8x16_t)*2, 0, (void)0) -#else -GF16_MULADD_MULTI_FUNCS_STUB(gf16_clmul, _neon) -#endif +#define _FNSUFFIX _neon +#include "gf16_clmul_neon_base.h" +#undef _FNSUFFIX #if defined(__ARM_NEON) diff --git a/gf16/gf16_clmul_neon.h b/gf16/gf16_clmul_neon.h index a638101f..a0fa0dc2 100644 --- a/gf16/gf16_clmul_neon.h +++ b/gf16/gf16_clmul_neon.h @@ -1,6 +1,6 @@ #include "gf16_neon_common.h" -#if defined(__ARM_NEON) +#if defined(_AVAILABLE) // `vaddq_p8` and co seems to be missing from some compilers (like GCC), so define our own variant static HEDLEY_ALWAYS_INLINE poly8x16_t veorq_p8(poly8x16_t a, poly8x16_t b) { @@ -42,9 +42,11 @@ typedef poly8x8_t coeff_t; # define coeff_fn(f1, f2) f1##_##f2 #endif +#ifndef eor3q_u8 static HEDLEY_ALWAYS_INLINE uint8x16_t eor3q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return veorq_u8(a, veorq_u8(b, c)); } +#endif static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_reduction(poly16x8_t* low1, poly16x8_t low2, poly16x8_t mid1, poly16x8_t mid2, poly16x8_t* high1, poly16x8_t high2) { // put data in proper form diff --git a/gf16/gf16_clmul_neon_base.h b/gf16/gf16_clmul_neon_base.h new file mode 100644 index 00000000..59613c71 --- /dev/null +++ b/gf16/gf16_clmul_neon_base.h @@ -0,0 +1,172 @@ + +#include "gf16_clmul_neon.h" +#include "gf16_muladd_multi.h" + +// TODO: for any multiplicand byte that's 0 (e.g. 
for coeff < 256), can shortcut a bunch of stuff, but may not be worth the effort + +#if defined(_AVAILABLE) + + +static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round1(const void* src, poly16x8_t* low1, poly16x8_t* low2, poly16x8_t* mid1, poly16x8_t* mid2, poly16x8_t* high1, poly16x8_t* high2, const coeff_t* coeff) { + poly8x16x2_t data = vld2q_p8((const poly8_t*)src); + *low1 = pmull_low(data.val[0], coeff[0]); + *low2 = pmull_high(data.val[0], coeff[0]); + poly8x16_t mid = veorq_p8(data.val[0], data.val[1]); + *mid1 = pmull_low(mid, coeff[2]); + *mid2 = pmull_high(mid, coeff[2]); + *high1 = pmull_low(data.val[1], coeff[1]); + *high2 = pmull_high(data.val[1], coeff[1]); + + // TODO: try idea of forcing an EOR via asm volatile + +/* Alternative approach for AArch64, which only needs one register per region at the expense of 2 additional instructions; unfortunately compilers won't heed our aim + // the `midCoeff` approach can also work with AArch32 + coeff_t swapCoeff = vextq_p8(coeff[0], coeff[0], 8); + coeff_t midCoeff = veorq_p8(coeff[0], swapCoeff); + + *low1 = pmull_low(data.val[0], coeff[0]); + *low2 = pmull_high(data.val[0], swapCoeff); + poly8x16_t mid = veorq_p8(data.val[0], data.val[1]); + *mid1 = pmull_low(mid, midCoeff); + *mid2 = pmull_high(mid, midCoeff); + *high1 = pmull_low(data.val[1], swapCoeff); + *high2 = pmull_high(data.val[1], coeff[0]); +*/ +} + +static HEDLEY_ALWAYS_INLINE void gf16_clmul_neon_round(const void* src, poly16x8_t* low1, poly16x8_t* low2, poly16x8_t* mid1, poly16x8_t* mid2, poly16x8_t* high1, poly16x8_t* high2, const coeff_t* coeff) { + poly8x16x2_t data = vld2q_p8((const poly8_t*)src); + *low1 = pmacl_low(*low1, data.val[0], coeff[0]); + *low2 = pmacl_high(*low2, data.val[0], coeff[0]); + poly8x16_t mid = veorq_p8(data.val[0], data.val[1]); + *mid1 = pmacl_low(*mid1, mid, coeff[2]); + *mid2 = pmacl_high(*mid2, mid, coeff[2]); + *high1 = pmacl_low(*high1, data.val[1], coeff[1]); + *high2 = pmacl_high(*high2, data.val[1], 
coeff[1]); +} + + +#ifdef __aarch64__ +# define CLMUL_NUM_REGIONS 8 +#else +# define CLMUL_NUM_REGIONS 3 +#endif +#define CLMUL_COEFF_PER_REGION 3 + +static HEDLEY_ALWAYS_INLINE void _FN(gf16_clmul_muladd_x)( + const void *HEDLEY_RESTRICT scratch, + uint8_t *HEDLEY_RESTRICT _dst, const unsigned srcScale, GF16_MULADD_MULTI_SRCLIST, size_t len, + const uint16_t *HEDLEY_RESTRICT coefficients, const int doPrefetch, const char* _pf +) { + GF16_MULADD_MULTI_SRC_UNUSED(CLMUL_NUM_REGIONS); + UNUSED(scratch); + + coeff_t coeff[CLMUL_COEFF_PER_REGION*CLMUL_NUM_REGIONS]; + for(int src=0; src> 8; + coeff[src*CLMUL_COEFF_PER_REGION +0] = coeff_fn(vdup, n_p8)(lo); + coeff[src*CLMUL_COEFF_PER_REGION +1] = coeff_fn(vdup, n_p8)(hi); + coeff[src*CLMUL_COEFF_PER_REGION +2] = coeff_fn(veor, p8)(coeff[src*CLMUL_COEFF_PER_REGION +0], coeff[src*CLMUL_COEFF_PER_REGION +1]); + // if we want to have one register per region (AArch64), at the expense of 2 extra instructions per region + //coeff[src] = vcombine_p8(vdup_n_p8(lo), vdup_n_p8(hi)); + } + + poly16x8_t low1, low2, mid1, mid2, high1, high2; + #define DO_PROCESS \ + gf16_clmul_neon_round1(_src1+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + 0); \ + if(srcCount > 1) \ + gf16_clmul_neon_round(_src2+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*1); \ + if(srcCount > 2) \ + gf16_clmul_neon_round(_src3+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*2); \ + if(srcCount > 3) \ + gf16_clmul_neon_round(_src4+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*3); \ + if(srcCount > 4) \ + gf16_clmul_neon_round(_src5+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*4); \ + if(srcCount > 5) \ + gf16_clmul_neon_round(_src6+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*5); \ + if(srcCount > 6) \ + 
gf16_clmul_neon_round(_src7+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*6); \ + if(srcCount > 7) \ + gf16_clmul_neon_round(_src8+ptr*srcScale, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff + CLMUL_COEFF_PER_REGION*7); \ + \ + gf16_clmul_neon_reduction(&low1, low2, mid1, mid2, &high1, high2); \ + \ + uint8x16x2_t vb = vld2q_u8(_dst+ptr); \ + vb.val[0] = veorq_u8(vreinterpretq_u8_p16(low1), vb.val[0]); \ + vb.val[1] = veorq_u8(vreinterpretq_u8_p16(high1), vb.val[1]); \ + vst2q_u8(_dst+ptr, vb) + + if(doPrefetch) { + intptr_t ptr = -(intptr_t)len; + if(doPrefetch == 1) + PREFETCH_MEM(_pf+ptr, 1); + if(doPrefetch == 2) + PREFETCH_MEM(_pf+ptr, 0); + while(ptr & (CACHELINE_SIZE-1)) { + DO_PROCESS; + ptr += sizeof(uint8x16_t)*2; + } + while(ptr) { + if(doPrefetch == 1) + PREFETCH_MEM(_pf+ptr, 1); + if(doPrefetch == 2) + PREFETCH_MEM(_pf+ptr, 0); + + for(size_t iter=0; iter<(CACHELINE_SIZE/(sizeof(uint8x16_t)*2)); iter++) { + DO_PROCESS; + ptr += sizeof(uint8x16_t)*2; + } + } + } else { + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(uint8x16_t)*2) { + DO_PROCESS; + } + } + #undef DO_PROCESS +} +#endif /*defined(_AVAILABLE)*/ + + + +void _FN(gf16_clmul_mul)(const void *HEDLEY_RESTRICT scratch, void* dst, const void* src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); UNUSED(scratch); +#if defined(_AVAILABLE) + + coeff_t coeff[3]; + coeff[0] = coeff_fn(vdup, n_p8)(val & 0xff); + coeff[1] = coeff_fn(vdup, n_p8)(val >> 8); + coeff[2] = coeff_fn(veor, p8)(coeff[0], coeff[1]); + + uint8_t* _src = (uint8_t*)src + len; + uint8_t* _dst = (uint8_t*)dst + len; + poly16x8_t low1, low2, mid1, mid2, high1, high2; + for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(uint8x16_t)*2) { + gf16_clmul_neon_round1(_src+ptr, &low1, &low2, &mid1, &mid2, &high1, &high2, coeff); + gf16_clmul_neon_reduction(&low1, low2, mid1, mid2, &high1, high2); + uint8x16x2_t out; + out.val[0] = 
vreinterpretq_u8_p16(low1); + out.val[1] = vreinterpretq_u8_p16(high1); + vst2q_u8(_dst+ptr, out); + } +#else + UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + + +void _FN(gf16_clmul_muladd)(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, uint16_t val, void *HEDLEY_RESTRICT mutScratch) { + UNUSED(mutScratch); +#if defined(_AVAILABLE) + gf16_muladd_single(scratch, &_FN(gf16_clmul_muladd_x), dst, src, len, val); +#else + UNUSED(scratch); UNUSED(dst); UNUSED(src); UNUSED(len); UNUSED(val); +#endif +} + + +#if defined(_AVAILABLE) +GF16_MULADD_MULTI_FUNCS(gf16_clmul, _FNSUFFIX, _FN(gf16_clmul_muladd_x), CLMUL_NUM_REGIONS, sizeof(uint8x16_t)*2, 0, (void)0) +#else +GF16_MULADD_MULTI_FUNCS_STUB(gf16_clmul, _FNSUFFIX) +#endif diff --git a/gf16/gf16_clmul_sha3.c b/gf16/gf16_clmul_sha3.c new file mode 100644 index 00000000..d9424a3f --- /dev/null +++ b/gf16/gf16_clmul_sha3.c @@ -0,0 +1,47 @@ + +// this CLMul variant is optimised for Apple M1 + +#include "gf16_neon_common.h" + +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA3) +int gf16_available_neon_sha3 = 1; + +// NOTE: we avoid EOR3 in pmacl* - only chip which supports NEON-SHA3 without SVE2, are the Apple chips and Neoverse V1; the former has PMULL+EOR fusion, which is better than EOR3 +#if defined(__GNUC__) || defined(__clang__) +// Apple M1 supports fusing PMULL+EOR, so ensure these are paired +static HEDLEY_ALWAYS_INLINE poly16x8_t pmacl_low(poly16x8_t sum, poly8x16_t a, poly8x16_t b) { + poly16x8_t result; + __asm__ ("pmull %0.8h,%1.8b,%2.8b\n" + "eor %0.16b,%0.16b,%3.16b\n" + : "=&w"(result) + : "w"(a), "w"(b), "w"(sum) + : /* No clobbers */); + return result; +} +static HEDLEY_ALWAYS_INLINE poly16x8_t pmacl_high(poly16x8_t sum, poly8x16_t a, poly8x16_t b) { + poly16x8_t result; + __asm__ ("pmull2 %0.8h,%1.16b,%2.16b\n" + "eor %0.16b,%0.16b,%3.16b\n" + : "=&w"(result) + : "w"(a), "w"(b), "w"(sum) + : /* No clobbers */); + return 
result; +} +#else +static HEDLEY_ALWAYS_INLINE poly16x8_t veorq_p16(poly16x8_t a, poly16x8_t b) { + return vreinterpretq_p16_u16(veorq_u16(vreinterpretq_u16_p16(a), vreinterpretq_u16_p16(b))); +} +# define pmacl_low(sum, a, b) veorq_p16(sum, pmull_low(a, b)) +# define pmacl_high(sum, a, b) veorq_p16(sum, pmull_high(a, b)) +#endif + +#define _AVAILABLE 1 +#define eor3q_u8 veor3q_u8 + +#else +int gf16_available_neon_sha3 = 0; +#endif /*defined(__ARM_NEON)*/ + +#define _FNSUFFIX _sha3 +#include "gf16_clmul_neon_base.h" +#undef _FNSUFFIX diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 72500339..1aa04723 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -200,11 +200,13 @@ struct CpuCap { struct CpuCap { bool hasNEON; + bool hasSHA3; bool hasSVE; bool hasSVE2; CpuCap(bool detect) : hasNEON(true), hasSVE(true), hasSVE2(true) { if(!detect) return; hasNEON = CPU_HAS_NEON; + hasSHA3 = CPU_HAS_NEON_SHA3; hasSVE = CPU_HAS_SVE; hasSVE2 = CPU_HAS_SVE2; if(hasSVE) { @@ -295,6 +297,7 @@ Galois16MethodInfo Galois16Mul::info(Galois16Methods _method) { break; case GF16_CLMUL_NEON: + case GF16_CLMUL_SHA3: _info.alignment = 32; // presumably double-loads work best when aligned to 32 instead of 16? 
_info.stride = 32; _info.cksumSize = 16; @@ -483,6 +486,7 @@ Galois16MethodInfo Galois16Mul::info(Galois16Methods _method) { _info.idealChunkSize = 4*1024; break; case GF16_CLMUL_NEON: // faster init than Shuffle, and usually faster + case GF16_CLMUL_SHA3: case GF16_CLMUL_SVE2: // may want smaller chunk size for wider vectors case GF16_AFFINE_GFNI: case GF16_AFFINE2X_GFNI: @@ -756,6 +760,35 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { copy_cksum_check = &gf16_cksum_copy_check_neon; } break; + case GF16_CLMUL_SHA3: { + int available = gf16_clmul_init_arm(GF16_POLYNOMIAL); + METHOD_REQUIRES(gf16_available_neon_sha3 && available) + + scratch = gf16_shuffle_init_arm(GF16_POLYNOMIAL); + if(scratch) { + _mul = &gf16_shuffle_mul_neon; + _mul_add = &gf16_shuffle_muladd_neon; + } else { + _mul = &gf16_clmul_mul_sha3; + _mul_add = &gf16_clmul_muladd_sha3; + } + _mul_add_multi = &gf16_clmul_muladd_multi_sha3; + _mul_add_multi_stridepf = &gf16_clmul_muladd_multi_stridepf_sha3; + _mul_add_multi_packed = &gf16_clmul_muladd_multi_packed_sha3; + add_multi = &gf_add_multi_neon; + add_multi_packed = &gf_add_multi_packed_clmul_neon; + add_multi_packpf = &gf_add_multi_packpf_clmul_neon; + _mul_add_multi_packpf = &gf16_clmul_muladd_multi_packpf_sha3; + prepare_packed = &gf16_clmul_prepare_packed_neon; + prepare_packed_cksum = &gf16_clmul_prepare_packed_cksum_neon; + prepare_partial_packsum = &gf16_clmul_prepare_partial_packsum_neon; + finish_packed = &gf16_shuffle_finish_packed_neon; + finish_packed_cksum = &gf16_shuffle_finish_packed_cksum_neon; // re-use shuffle routine + finish_partial_packsum = &gf16_shuffle_finish_partial_packsum_neon; + copy_cksum = &gf16_cksum_copy_neon; + copy_cksum_check = &gf16_cksum_copy_check_neon; + } break; + case GF16_SHUFFLE_128_SVE: METHOD_REQUIRES(gf16_available_sve) @@ -1372,6 +1405,10 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu } if(caps.hasSVE && gf16_sve_get_size() > 16) return 
GF16_SHUFFLE_128_SVE; +# ifdef __aarch64__ + if(gf16_available_neon_sha3 && caps.hasSHA3) + return inputs > 3 ? GF16_CLMUL_SHA3 : GF16_SHUFFLE_NEON; +# endif if(gf16_available_neon && caps.hasNEON) return # ifdef __aarch64__ @@ -1452,6 +1489,9 @@ std::vector Galois16Mul::availableMethods(bool checkCpuid) { ret.push_back(GF16_SHUFFLE_NEON); ret.push_back(GF16_CLMUL_NEON); } + if(gf16_available_neon_sha3 && caps.hasSHA3) { + ret.push_back(GF16_CLMUL_SHA3); + } if(gf16_available_sve && caps.hasSVE) ret.push_back(GF16_SHUFFLE_128_SVE); if(gf16_available_sve2 && caps.hasSVE2) { diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index 5a3910b8..5979d93a 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -64,6 +64,7 @@ enum Galois16Methods { GF16_AFFINE2X_AVX2, GF16_AFFINE2X_AVX512, GF16_CLMUL_NEON, + GF16_CLMUL_SHA3, GF16_CLMUL_SVE2 // TODO: consider non-transforming shuffle/affine }; @@ -96,6 +97,7 @@ static const char* Galois16MethodsText[] = { "Affine2x (GFNI+AVX2)", "Affine2x (GFNI+AVX512)", "CLMul (NEON)", + "CLMul (SHA3)", "CLMul (SVE2)" }; diff --git a/gf16/gf16pmul_neon.c b/gf16/gf16pmul_neon.c index c23cc3c4..52c68655 100644 --- a/gf16/gf16pmul_neon.c +++ b/gf16/gf16pmul_neon.c @@ -1,7 +1,13 @@ #include "gf16_global.h" + +#ifdef __ARM_NEON +# define _AVAILABLE +#endif #include "gf16_clmul_neon.h" #ifdef __ARM_NEON +# undef _AVAILABLE + int gf16pmul_available_neon = 1; void gf16pmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len) { diff --git a/help.txt b/help.txt index 6e0d28b9..82ddb81e 100644 --- a/help.txt +++ b/help.txt @@ -294,6 +294,7 @@ CPU Tuning Options: shuffle128-sve2: SVE2 variant of shuffle-neon shuffle2x128-sve2: half width variant of shuffle-neon (requires SVE width >= 256 bits) shuffle512-sve2: SVE2 variant of shuffle-vbmi (requires SVE width >= 512 bits) + clmul-sha3: NEON-SHA3 variant of clmul-neon clmul-sve2: SVE2 variant of clmul-neon RISC-V only choices: shuffle128-rvv: RISC-V Vector variant of shuffle128-sve2 diff 
--git a/lib/par2.js b/lib/par2.js index 5a26b685..20ef302f 100644 --- a/lib/par2.js +++ b/lib/par2.js @@ -890,7 +890,7 @@ var GF_METHODS = [ 'xor-sse', 'xorjit-sse', 'xorjit-avx2', 'xorjit-avx512', 'affine-sse', 'affine-avx2', 'affine-avx512', 'affine2x-sse', 'affine2x-avx2', 'affine2x-avx512', - 'clmul-neon', 'clmul-sve2' + 'clmul-neon', 'clmul-sha3', 'clmul-sve2' ]; var GFOCL_METHODS = [ '' /*default*/, 'lookup', 'lookup_half', 'lookup_nc', 'lookup_half_nc', diff --git a/src/cpuid.h b/src/cpuid.h index 05201e37..56c95761 100644 --- a/src/cpuid.h +++ b/src/cpuid.h @@ -67,53 +67,67 @@ static unsigned long getauxval(unsigned long cap) { # endif # endif -# define CPU_HAS_NEON false -# define CPU_HAS_ARMCRC false -# define CPU_HAS_SVE false -# define CPU_HAS_SVE2 false - -# if defined(AT_HWCAP) -# undef CPU_HAS_NEON -# ifdef __aarch64__ -# define CPU_HAS_NEON (getauxval(AT_HWCAP) & HWCAP_ASIMD) -# if defined(HWCAP_SVE) -# undef CPU_HAS_SVE -# define CPU_HAS_SVE (getauxval(AT_HWCAP) & HWCAP_SVE) + +# ifdef PARPAR_SKIP_AUX_CHECK +# define CPU_HAS_NEON true +# define CPU_HAS_ARMCRC true +# define CPU_HAS_NEON_SHA3 true +# define CPU_HAS_SVE true +# define CPU_HAS_SVE2 true +# else +# define CPU_HAS_NEON false +# define CPU_HAS_ARMCRC false +# define CPU_HAS_NEON_SHA3 false +# define CPU_HAS_SVE false +# define CPU_HAS_SVE2 false + +# if defined(AT_HWCAP) +# undef CPU_HAS_NEON +# ifdef __aarch64__ +# define CPU_HAS_NEON (getauxval(AT_HWCAP) & HWCAP_ASIMD) +# if defined(HWCAP_SHA3) +# undef CPU_HAS_NEON_SHA3 +# define CPU_HAS_NEON_SHA3 (getauxval(AT_HWCAP) & HWCAP_SHA3) +# endif +# if defined(HWCAP_SVE) +# undef CPU_HAS_SVE +# define CPU_HAS_SVE (getauxval(AT_HWCAP) & HWCAP_SVE) +# endif +# if defined(AT_HWCAP2) && defined(HWCAP2_SVE2) +# undef CPU_HAS_SVE2 +# define CPU_HAS_SVE2 (getauxval(AT_HWCAP2) & HWCAP2_SVE2) +# endif +# else +# define CPU_HAS_NEON (getauxval(AT_HWCAP) & HWCAP_NEON) # endif -# if defined(AT_HWCAP2) && defined(HWCAP2_SVE2) -# undef CPU_HAS_SVE2 -# 
define CPU_HAS_SVE2 (getauxval(AT_HWCAP2) & HWCAP2_SVE2) +# if defined(AT_HWCAP2) && defined(HWCAP2_CRC32) +# undef CPU_HAS_ARMCRC +# define CPU_HAS_ARMCRC (getauxval(AT_HWCAP2) & HWCAP2_CRC32) +# elif defined(HWCAP_CRC32) +# undef CPU_HAS_ARMCRC +# define CPU_HAS_ARMCRC (getauxval(AT_HWCAP) & HWCAP_CRC32) # endif -# else -# define CPU_HAS_NEON (getauxval(AT_HWCAP) & HWCAP_NEON) -# endif -# if defined(AT_HWCAP2) && defined(HWCAP2_CRC32) +# elif defined(ANDROID_CPU_FAMILY_ARM) +# undef CPU_HAS_NEON # undef CPU_HAS_ARMCRC -# define CPU_HAS_ARMCRC (getauxval(AT_HWCAP2) & HWCAP2_CRC32) -# elif defined(HWCAP_CRC32) +# ifdef __aarch64__ +# define CPU_HAS_NEON (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) +# define CPU_HAS_ARMCRC (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32) +# else +# define CPU_HAS_NEON (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) +# define CPU_HAS_ARMCRC (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_CRC32) +# endif +# elif defined(_WIN32) +# undef CPU_HAS_NEON # undef CPU_HAS_ARMCRC -# define CPU_HAS_ARMCRC (getauxval(AT_HWCAP) & HWCAP_CRC32) -# endif -# elif defined(ANDROID_CPU_FAMILY_ARM) -# undef CPU_HAS_NEON -# undef CPU_HAS_ARMCRC -# ifdef __aarch64__ -# define CPU_HAS_NEON (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) -# define CPU_HAS_ARMCRC (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32) -# else -# define CPU_HAS_NEON (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) -# define CPU_HAS_ARMCRC (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_CRC32) -# endif -# elif defined(_WIN32) -# undef CPU_HAS_NEON -# undef CPU_HAS_ARMCRC -# define CPU_HAS_NEON (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) -# define CPU_HAS_ARMCRC (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) -# elif defined(__APPLE__) -# undef CPU_HAS_NEON -# undef CPU_HAS_ARMCRC -# define CPU_HAS_NEON (cpuHasFeature("hw.optional.neon")) -# define CPU_HAS_ARMCRC 
(cpuHasFeature("hw.optional.armv8_crc32")) +# define CPU_HAS_NEON (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) +# define CPU_HAS_ARMCRC (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) +# elif defined(__APPLE__) +# undef CPU_HAS_NEON +# undef CPU_HAS_ARMCRC +# define CPU_HAS_NEON (cpuHasFeature("hw.optional.neon")) +# define CPU_HAS_ARMCRC (cpuHasFeature("hw.optional.armv8_crc32")) +# define CPU_HAS_NEON_SHA3 (cpuHasFeature("hw.optional.armv8_2_sha3")) static inline bool cpuHasFeature(const char* feature) { int supported = 0; size_t len = sizeof(supported); @@ -121,6 +135,7 @@ static unsigned long getauxval(unsigned long cap) { return (bool)supported; return false; } +# endif # endif #endif @@ -142,7 +157,9 @@ static unsigned long getauxval(unsigned long cap) { # endif # endif -# ifndef CPU_HAS_VECTOR +# ifdef PARPAR_SKIP_AUX_CHECK +# define CPU_HAS_VECTOR true +# else # define CPU_HAS_VECTOR false # if defined(AT_HWCAP) From fb6863c8dff5974b6fc43126c677d826417e481e Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 14 Aug 2023 15:25:32 +1000 Subject: [PATCH 56/91] RVV bugfix --- gf16/gf16_shuffle128_rvv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gf16/gf16_shuffle128_rvv.c b/gf16/gf16_shuffle128_rvv.c index 494fc4d0..433b2faa 100644 --- a/gf16/gf16_shuffle128_rvv.c +++ b/gf16/gf16_shuffle128_rvv.c @@ -217,7 +217,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_prepare_block_rvv(void *HEDLEY_RESTRICT ds // final block static HEDLEY_ALWAYS_INLINE void gf16_prepare_blocku_rvv(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t remaining) { size_t vlmax = RV(vsetvlmax_e8m2)(); - vuint8m1_t v = RV(vmv_v_x_u8m2)(0, vlmax); + vuint8m2_t v = RV(vmv_v_x_u8m2)(0, vlmax); size_t vl = RV(vsetvl_e8m2)(remaining); #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 13000 v = RV(vle8_v_u8m2_tu)(v, (const uint8_t*)src, vl); From 867e173ef7d9e31c4c336d91e1632935bb15cd97 Mon Sep 17 00:00:00 2001 From: 
animetosho Date: Wed, 16 Aug 2023 10:32:50 +1000 Subject: [PATCH 57/91] Add ability to list available hasher methods + ability to override single MD5/CRC kernel --- gf16/gf16pmul.cpp | 6 +- gf16/gf16pmul.h | 6 +- hasher/hasher.cpp | 426 ++++++++++++++++++++++++++++------------------ hasher/hasher.h | 35 +++- 4 files changed, 296 insertions(+), 177 deletions(-) diff --git a/gf16/gf16pmul.cpp b/gf16/gf16pmul.cpp index 22e31381..b5962993 100644 --- a/gf16/gf16pmul.cpp +++ b/gf16/gf16pmul.cpp @@ -85,9 +85,9 @@ void setup_pmul() { #endif } -const char* gf16pmul_methodName() { +const char* gf16pmul_methodName(Galois16PointMulMethods method) { const char* names[] = { - "None (exponentiate)", + "None", "PCLMUL", "AVX2", "VPCLMUL", @@ -96,5 +96,5 @@ const char* gf16pmul_methodName() { "SVE2" }; - return names[(int)gf16pmul_method]; + return names[(int)method]; } diff --git a/gf16/gf16pmul.h b/gf16/gf16pmul.h index c740bc03..d88460fb 100644 --- a/gf16/gf16pmul.h +++ b/gf16/gf16pmul.h @@ -17,9 +17,13 @@ enum Galois16PointMulMethods { // TODO: consider multi-dest typedef void(*Gf16PMulFunc)(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2, size_t len); extern Gf16PMulFunc gf16pmul; +extern Galois16PointMulMethods gf16pmul_method; extern size_t gf16pmul_alignment; extern size_t gf16pmul_blocklen; -const char* gf16pmul_methodName(); +const char* gf16pmul_methodName(Galois16PointMulMethods method); +inline const char* gf16pmul_methodName() { + return gf16pmul_methodName(gf16pmul_method); +} void setup_pmul(); diff --git a/hasher/hasher.cpp b/hasher/hasher.cpp index 85e727f0..ced8b76e 100644 --- a/hasher/hasher.cpp +++ b/hasher/hasher.cpp @@ -8,14 +8,67 @@ uint32_t(*MD5CRC_Calc)(const void*, size_t, size_t, void*) = NULL; MD5CRCMethods MD5CRC_Method = MD5CRCMETH_SCALAR; uint32_t(*CRC32_Calc)(const void*, size_t) = NULL; MD5CRCMethods CRC32_Method = MD5CRCMETH_SCALAR; -struct _CpuCap { +struct CpuCap { #ifdef PLATFORM_X86 - bool hasSSE2, hasXOP, hasBMI1, hasAVX2, 
hasAVX512F, hasAVX512VLBW; - _CpuCap() : hasSSE2(false), hasXOP(false), hasBMI1(false), hasAVX2(false), hasAVX512F(false), hasAVX512VLBW(false) {} + bool hasSSE2, hasClMul, hasXOP, hasBMI1, hasAVX2, hasAVX512F, hasAVX512VLBW; + bool isSmallCore, isLEASlow, isVecRotSlow; + CpuCap(bool detect) : + hasSSE2(true), hasClMul(true), hasXOP(true), hasBMI1(true), hasAVX2(true), hasAVX512F(true), hasAVX512VLBW(true), + isSmallCore(false), isLEASlow(false), isVecRotSlow(false) + { + if(!detect) return; + + bool hasAVX = false; + + int cpuInfo[4]; + int cpuInfoX[4]; + int family, model; + _cpuid(cpuInfo, 1); + hasSSE2 = (cpuInfo[3] & 0x4000000); + hasClMul = ((cpuInfo[2] & 0x80202) == 0x80202); // SSE4.1 + SSSE3 + CLMUL + + family = ((cpuInfo[0]>>8) & 0xf) + ((cpuInfo[0]>>16) & 0xff0); + model = ((cpuInfo[0]>>4) & 0xf) + ((cpuInfo[0]>>12) & 0xf0); + + // TODO: check perf on small cores + if(family == 6) { + isSmallCore = CPU_MODEL_IS_BNL_SLM(model); + // Intel Sandy Bridge to Skylake has slow 3-component LEA + isLEASlow = (model == 0x2A || model == 0x2D || model == 0x3A || model == 0x3C || model == 0x3D || model == 0x3E || model == 0x3F || model == 0x45 || model == 0x46 || model == 0x47 || model == 0x4E || model == 0x4F || model == 0x55 || model == 0x56 || model == 0x5E || model == 0x66 || model == 0x67 || model == 0x8E || model == 0x9E || model == 0xA5 || model == 0xA6); + } else { + isSmallCore = CPU_FAMMDL_IS_AMDCAT(family, model); + } + + isVecRotSlow = (family == 0xaf); // vector rotate has 2 cycle latency on Zen4 + +#if !defined(_MSC_VER) || _MSC_VER >= 1600 + _cpuidX(cpuInfoX, 7, 0); + if((cpuInfo[2] & 0x1C000000) == 0x1C000000) { // has AVX + OSXSAVE + XSAVE + int xcr = _GET_XCR() & 0xff; + if((xcr & 6) == 6) { // AVX enabled + hasAVX = true; + hasBMI1 = hasAVX && (cpuInfoX[1] & 0x08); + hasAVX2 = cpuInfoX[1] & 0x20; + if((xcr & 0xE0) == 0xE0) { + hasAVX512F = ((cpuInfoX[1] & 0x10000) == 0x10000); + hasAVX512VLBW = ((cpuInfoX[1] & 0xC0010100) == 0xC0010100); // 
AVX512VL + AVX512BW + AVX512F + BMI2 + } + } + } +#endif + + _cpuid(cpuInfo, 0x80000001); + hasXOP = hasAVX && (cpuInfo[2] & 0x800); + } #endif #ifdef PLATFORM_ARM - bool hasNEON, hasSVE2; - _CpuCap() : hasNEON(false), hasSVE2(false) {} + bool hasCRC, hasNEON, hasSVE2; + CpuCap(bool detect) : hasCRC(true), hasNEON(true), hasSVE2(true) { + if(!detect) return; + hasCRC = CPU_HAS_ARMCRC; + hasNEON = CPU_HAS_NEON; + hasSVE2 = CPU_HAS_SVE2; + } #endif }; @@ -24,168 +77,67 @@ MD5MultiLevels HasherMD5Multi_level; void setup_hasher() { if(HasherInput_Create) return; - HasherInput_Create = &HasherInput_Scalar::create; - HasherInput_Method = INHASH_SCALAR; - MD5CRC_Calc = &MD5CRC_Calc_Scalar; - MD5CRC_Method = MD5CRCMETH_SCALAR; - CRC32_Calc = &CRC32_Calc_Slice4; - CRC32_Method = MD5CRCMETH_SCALAR; - - struct _CpuCap CpuCap; - (void)CpuCap; + set_hasherInput(INHASH_SCALAR); + set_hasherMD5CRC(MD5CRCMETH_SCALAR); - // CPU detection #ifdef PLATFORM_X86 - bool hasClMul = false, hasAVX = false; - bool isSmallCore = false, isLEASlow = false, isVecRotSlow = false; - - int cpuInfo[4]; - int cpuInfoX[4]; - int family, model; - _cpuid(cpuInfo, 1); - CpuCap.hasSSE2 = (cpuInfo[3] & 0x4000000); - hasClMul = ((cpuInfo[2] & 0x80202) == 0x80202); // SSE4.1 + SSSE3 + CLMUL - - family = ((cpuInfo[0]>>8) & 0xf) + ((cpuInfo[0]>>16) & 0xff0); - model = ((cpuInfo[0]>>4) & 0xf) + ((cpuInfo[0]>>12) & 0xf0); - - // TODO: check perf on small cores - if(family == 6) { - isSmallCore = CPU_MODEL_IS_BNL_SLM(model); - // Intel Sandy Bridge to Skylake has slow 3-component LEA - isLEASlow = (model == 0x2A || model == 0x2D || model == 0x3A || model == 0x3C || model == 0x3D || model == 0x3E || model == 0x3F || model == 0x45 || model == 0x46 || model == 0x47 || model == 0x4E || model == 0x4F || model == 0x55 || model == 0x56 || model == 0x5E || model == 0x66 || model == 0x67 || model == 0x8E || model == 0x9E || model == 0xA5 || model == 0xA6); - } else { - isSmallCore = CPU_FAMMDL_IS_AMDCAT(family, model); - 
} - - isVecRotSlow = (family == 0xaf); // vector rotate has 2 cycle latency on Zen4 + struct CpuCap caps(true); -#if !defined(_MSC_VER) || _MSC_VER >= 1600 - _cpuidX(cpuInfoX, 7, 0); - if((cpuInfo[2] & 0x1C000000) == 0x1C000000) { // has AVX + OSXSAVE + XSAVE - int xcr = _GET_XCR() & 0xff; - if((xcr & 6) == 6) { // AVX enabled - hasAVX = true; - CpuCap.hasBMI1 = hasAVX && (cpuInfoX[1] & 0x08); - CpuCap.hasAVX2 = cpuInfoX[1] & 0x20; - if((xcr & 0xE0) == 0xE0) { - CpuCap.hasAVX512F = ((cpuInfoX[1] & 0x10000) == 0x10000); - CpuCap.hasAVX512VLBW = ((cpuInfoX[1] & 0xC0010100) == 0xC0010100); // AVX512VL + AVX512BW + AVX512F + BMI2 - } - } - } -#endif - - _cpuid(cpuInfo, 0x80000001); - CpuCap.hasXOP = hasAVX && (cpuInfo[2] & 0x800); - - if(CpuCap.hasAVX512VLBW && hasClMul && !isVecRotSlow && HasherInput_AVX512::isAvailable) { - HasherInput_Create = &HasherInput_AVX512::create; - HasherInput_Method = INHASH_AVX512; - } + if(caps.hasAVX512VLBW && caps.hasClMul && !caps.isVecRotSlow && HasherInput_AVX512::isAvailable) + set_hasherInput(INHASH_AVX512); // SSE seems to be faster than scalar on Zen1/2, not Zen3; BMI > SSE on Zen1, unknown on Zen2 - else if(hasClMul && !isSmallCore && HasherInput_ClMulScalar::isAvailable) { + else if(caps.hasClMul && !caps.isSmallCore && HasherInput_ClMulScalar::isAvailable) { // Gracemont: SSE > scalar, but SSE ~= BMI - if(CpuCap.hasBMI1 && HasherInput_BMI1::isAvailable) { - HasherInput_Create = &HasherInput_BMI1::create; - HasherInput_Method = INHASH_BMI1; - } else { - HasherInput_Create = &HasherInput_ClMulScalar::create; - HasherInput_Method = INHASH_CRC; - } - } else if(hasClMul && isSmallCore && HasherInput_ClMulSSE::isAvailable) { - HasherInput_Create = &HasherInput_ClMulSSE::create; - HasherInput_Method = INHASH_SIMD_CRC; - } - else if(CpuCap.hasSSE2 && isSmallCore && HasherInput_SSE::isAvailable) { // TODO: CPU w/o ClMul might all be small enough - HasherInput_Create = &HasherInput_SSE::create; - HasherInput_Method = INHASH_SIMD; - } + 
if(caps.hasBMI1 && HasherInput_BMI1::isAvailable) + set_hasherInput(INHASH_BMI1); + else + set_hasherInput(INHASH_CRC); + } else if(caps.hasClMul && caps.isSmallCore && HasherInput_ClMulSSE::isAvailable) + set_hasherInput(INHASH_SIMD_CRC); + else if(caps.hasSSE2 && caps.isSmallCore && HasherInput_SSE::isAvailable) // TODO: CPU w/o ClMul might all be small enough + set_hasherInput(INHASH_SIMD); - if(CpuCap.hasAVX512VLBW && !isVecRotSlow && MD5Single_isAvailable_AVX512) { - MD5Single::_update = &MD5Single_update_AVX512; - MD5Single::_updateZero = &MD5Single_updateZero_AVX512; - MD5Single::method = MD5CRCMETH_AVX512; - } - else if(isLEASlow && hasClMul && MD5Single_isAvailable_NoLEA) { - MD5Single::_update = &MD5Single_update_NoLEA; - MD5Single::_updateZero = &MD5Single_updateZero_NoLEA; - MD5Single::method = MD5CRCMETH_NOLEA; - } + if(caps.hasAVX512VLBW && caps.hasClMul && !caps.isVecRotSlow && MD5CRC_isAvailable_AVX512) + set_hasherMD5CRC(MD5CRCMETH_AVX512); + else if(caps.isLEASlow && caps.hasClMul && MD5CRC_isAvailable_NoLEA) + set_hasherMD5CRC(MD5CRCMETH_NOLEA); // for some reason, single MD5 BMI1 seems to be slower on most cores, except Jaguar... 
unsure why - else if(CpuCap.hasBMI1 && isSmallCore && MD5Single_isAvailable_BMI1) { - MD5Single::_update = &MD5Single_update_BMI1; - MD5Single::_updateZero = &MD5Single_updateZero_BMI1; - MD5Single::method = MD5CRCMETH_BMI1; - } - - if(CpuCap.hasAVX512VLBW && hasClMul && !isVecRotSlow && MD5CRC_isAvailable_AVX512) { - MD5CRC_Calc = &MD5CRC_Calc_AVX512; - MD5CRC_Method = MD5CRCMETH_AVX512; - } - else if(isLEASlow && hasClMul && MD5CRC_isAvailable_NoLEA) { - MD5CRC_Calc = &MD5CRC_Calc_NoLEA; - MD5CRC_Method = MD5CRCMETH_NOLEA; - } - else if(CpuCap.hasBMI1 && hasClMul && isSmallCore && MD5CRC_isAvailable_BMI1) { - MD5CRC_Calc = &MD5CRC_Calc_BMI1; - MD5CRC_Method = MD5CRCMETH_BMI1; - } - else if(hasClMul && MD5CRC_isAvailable_ClMul) { - MD5CRC_Calc = &MD5CRC_Calc_ClMul; - MD5CRC_Method = MD5CRCMETH_PCLMUL; - } - - if(hasClMul && CRC32_isAvailable_ClMul) { - CRC32_Calc = &CRC32_Calc_ClMul; - CRC32_Method = MD5CRCMETH_PCLMUL; - } + else if(caps.hasBMI1 && caps.hasClMul && caps.isSmallCore && MD5CRC_isAvailable_BMI1) + set_hasherMD5CRC(MD5CRCMETH_BMI1); + else if(caps.hasClMul && MD5CRC_isAvailable_ClMul) + set_hasherMD5CRC(MD5CRCMETH_PCLMUL); #endif #ifdef PLATFORM_ARM - bool hasCRC = CPU_HAS_ARMCRC; + struct CpuCap caps(true); - CpuCap.hasNEON = CPU_HAS_NEON; - CpuCap.hasSVE2 = CPU_HAS_SVE2; - - if(hasCRC && HasherInput_ARMCRC::isAvailable) { // TODO: fast core only - HasherInput_Create = &HasherInput_ARMCRC::create; - HasherInput_Method = INHASH_CRC; - } - else if(CpuCap.hasNEON) { // TODO: slow core only - if(hasCRC && HasherInput_NEONCRC::isAvailable) { - HasherInput_Create = &HasherInput_NEONCRC::create; - HasherInput_Method = INHASH_SIMD_CRC; - } else if(HasherInput_NEON::isAvailable) { - HasherInput_Create = &HasherInput_NEON::create; - HasherInput_Method = INHASH_SIMD; - } + if(caps.hasCRC && HasherInput_ARMCRC::isAvailable) // TODO: fast core only + set_hasherInput(INHASH_CRC); + else if(caps.hasNEON) { // TODO: slow core only + if(caps.hasCRC && 
HasherInput_NEONCRC::isAvailable) + set_hasherInput(INHASH_SIMD_CRC); + else if(HasherInput_NEON::isAvailable) + set_hasherInput(INHASH_SIMD); } - if(hasCRC && MD5CRC_isAvailable_ARMCRC) { - MD5CRC_Calc = &MD5CRC_Calc_ARMCRC; - MD5CRC_Method = MD5CRCMETH_ARMCRC; - } - if(hasCRC && CRC32_isAvailable_ARMCRC) { - CRC32_Calc = &CRC32_Calc_ARMCRC; - CRC32_Method = MD5CRCMETH_ARMCRC; - } + if(caps.hasCRC && MD5CRC_isAvailable_ARMCRC) + set_hasherMD5CRC(MD5CRCMETH_ARMCRC); #endif // note that this logic assumes that if a compiler can compile for more advanced ISAs, it supports simpler ones as well #ifdef PLATFORM_X86 - if(CpuCap.hasAVX512VLBW && MD5Multi_AVX512_128::isAvailable) HasherMD5Multi_level = MD5MULT_AVX512VL; - else if(CpuCap.hasAVX512F && MD5Multi_AVX512::isAvailable) HasherMD5Multi_level = MD5MULT_AVX512F; - else if(CpuCap.hasXOP && MD5Multi_XOP::isAvailable) HasherMD5Multi_level = MD5MULT_XOP; // for the only CPU with AVX2 + XOP (Excavator) I imagine XOP works better than AVX2, due to half rate AVX - else if(CpuCap.hasAVX2 && MD5Multi_AVX2::isAvailable) HasherMD5Multi_level = MD5MULT_AVX2; - else if(CpuCap.hasSSE2 && MD5Multi_SSE::isAvailable) HasherMD5Multi_level = MD5MULT_SSE; + if(caps.hasAVX512VLBW && MD5Multi_AVX512_256::isAvailable) HasherMD5Multi_level = MD5MULT_AVX512VL; + else if(caps.hasAVX512F && MD5Multi_AVX512::isAvailable) HasherMD5Multi_level = MD5MULT_AVX512F; + else if(caps.hasXOP && MD5Multi_XOP::isAvailable) HasherMD5Multi_level = MD5MULT_XOP; // for the only CPU with AVX2 + XOP (Excavator) I imagine XOP works better than AVX2, due to half rate AVX + else if(caps.hasAVX2 && MD5Multi_AVX2::isAvailable) HasherMD5Multi_level = MD5MULT_AVX2; + else if(caps.hasSSE2 && MD5Multi_SSE::isAvailable) HasherMD5Multi_level = MD5MULT_SSE; else #endif #ifdef PLATFORM_ARM // TODO: if SVE2 width = 128b, prefer NEON? 
- if(CpuCap.hasSVE2 && MD5Multi_SVE2::isAvailable) HasherMD5Multi_level = MD5MULT_SVE2; - else if(CpuCap.hasNEON && MD5Multi_NEON::isAvailable) HasherMD5Multi_level = MD5MULT_NEON; + if(caps.hasSVE2 && MD5Multi_SVE2::isAvailable) HasherMD5Multi_level = MD5MULT_SVE2; + else if(caps.hasNEON && MD5Multi_NEON::isAvailable) HasherMD5Multi_level = MD5MULT_NEON; else #endif HasherMD5Multi_level = MD5MULT_SCALAR; @@ -216,16 +168,76 @@ bool set_hasherInput(HasherInputMethods method) { return false; } +bool set_hasherMD5CRC(MD5CRCMethods method) { +#define SET_HASHER(h, x, hMd5, hCrc) case h: { \ + if(!MD5CRC_isAvailable_##x) return false; \ + MD5CRC_Calc = &MD5CRC_Calc_##x; \ + MD5CRC_Method = h; \ + MD5Single::method = hMd5; \ + CRC32_Method = hCrc; \ + break; \ + } + + switch(method) { + SET_HASHER(MD5CRCMETH_SCALAR, Scalar, MD5CRCMETH_SCALAR, MD5CRCMETH_SCALAR) +#ifdef PLATFORM_X86 + SET_HASHER(MD5CRCMETH_BMI1, BMI1, MD5CRCMETH_BMI1, MD5CRCMETH_PCLMUL) + SET_HASHER(MD5CRCMETH_NOLEA, NoLEA, MD5CRCMETH_NOLEA, MD5CRCMETH_PCLMUL) + SET_HASHER(MD5CRCMETH_AVX512, AVX512, MD5CRCMETH_AVX512, MD5CRCMETH_PCLMUL) + SET_HASHER(MD5CRCMETH_PCLMUL, ClMul, MD5CRCMETH_SCALAR, MD5CRCMETH_PCLMUL) +#endif +#ifdef PLATFORM_ARM + SET_HASHER(MD5CRCMETH_ARMCRC, ARMCRC, MD5CRCMETH_SCALAR, MD5CRCMETH_ARMCRC) +#endif + default: return false; + } +#undef SET_HASHER + + switch(MD5Single::method) { + case MD5CRCMETH_AVX512: + MD5Single::_update = &MD5Single_update_AVX512; + MD5Single::_updateZero = &MD5Single_updateZero_AVX512; + break; + case MD5CRCMETH_NOLEA: + MD5Single::_update = &MD5Single_update_NoLEA; + MD5Single::_updateZero = &MD5Single_updateZero_NoLEA; + break; + case MD5CRCMETH_BMI1: + MD5Single::_update = &MD5Single_update_BMI1; + MD5Single::_updateZero = &MD5Single_updateZero_BMI1; + break; + case MD5CRCMETH_SCALAR: + MD5Single::_update = &MD5Single_update_Scalar; + MD5Single::_updateZero = &MD5Single_updateZero_Scalar; + break; + default: return false; // shouldn't happen + } + 
switch(CRC32_Method) { + case MD5CRCMETH_PCLMUL: + CRC32_Calc = &CRC32_Calc_ClMul; + break; + case MD5CRCMETH_ARMCRC: + CRC32_Calc = &CRC32_Calc_ARMCRC; + break; + case MD5CRCMETH_SCALAR: + CRC32_Calc = &CRC32_Calc_Slice4; + break; + default: return false; // shouldn't happen + } + + return true; +} + void set_hasherMD5MultiLevel(MD5MultiLevels level) { #define SET_LEVEL(h, l) \ if(h::isAvailable) { \ HasherMD5Multi_level = l; \ - break; \ + return; \ } switch(level) { #ifdef PLATFORM_X86 case MD5MULT_AVX512VL: - SET_LEVEL(MD5Multi_AVX512_128, MD5MULT_AVX512VL) + SET_LEVEL(MD5Multi_AVX512_256, MD5MULT_AVX512VL) // fallthrough case MD5MULT_AVX512F: SET_LEVEL(MD5Multi_AVX512, MD5MULT_AVX512F) @@ -456,29 +468,29 @@ void MD5Single::end(void* md5) { } -const char* hasherInput_methodName() { +const char* hasherInput_methodName(HasherInputMethods m) { const char* names[] = { - "Scalar + Slice4", + "Scalar+Generic", #ifdef PLATFORM_X86 - "SSE2 + Slice4", - "Scalar + PCLMUL", - "SSE2 + PCLMUL", + "SSE2+Generic", + "Scalar+PCLMUL", + "SSE2+PCLMUL", #elif defined(PLATFORM_ARM) - "NEON + Slice4", - "Scalar + ARMv8-CRC32", - "NEON + ARMv8-CRC32", + "NEON+Generic", + "Scalar+ARMCRC", + "NEON+ARMCRC", #else - "SIMD + Slice4", - "Scalar + CRC", - "SIMD + CRC", + "SIMD+Generic", + "Scalar+CRC", + "SIMD+CRC", #endif - "BMI1 + PCLMUL", + "BMI1+PCLMUL", "AVX512" }; - return names[(int)HasherInput_Method]; + return names[(int)m]; } -const char* hasherMD5Multi_methodName() { +const char* hasherMD5Multi_methodName(MD5MultiLevels l) { const char* names[] = { "Scalar", "SSE2", @@ -490,17 +502,103 @@ const char* hasherMD5Multi_methodName() { "SVE2" }; - return names[(int)HasherMD5Multi_level]; + return names[(int)l]; } const char* md5crc_methodName(MD5CRCMethods m) { const char* names[] = { - "Scalar", + "Generic", // or Slice4 for CRC "BMI1", "NoLEA", "AVX512", - "ARMv8-CRC32", + "ARMCRC", "PCLMUL" }; return names[(int)m]; } + + + +std::vector hasherInput_availableMethods(bool checkCpuid) 
{ + std::vector ret; + ret.push_back(INHASH_SCALAR); + +#ifdef PLATFORM_X86 + const CpuCap caps(checkCpuid); + if(caps.hasClMul) { + if(caps.hasAVX512VLBW && HasherInput_AVX512::isAvailable) + ret.push_back(INHASH_AVX512); + if(caps.hasBMI1 && HasherInput_BMI1::isAvailable) + ret.push_back(INHASH_BMI1); + if(HasherInput_ClMulSSE::isAvailable) + ret.push_back(INHASH_SIMD_CRC); + if(HasherInput_ClMulScalar::isAvailable) + ret.push_back(INHASH_CRC); + } + if(caps.hasSSE2 && HasherInput_SSE::isAvailable) + ret.push_back(INHASH_SIMD); +#endif +#ifdef PLATFORM_ARM + const CpuCap caps(checkCpuid); + if(caps.hasCRC && HasherInput_ARMCRC::isAvailable) + ret.push_back(INHASH_CRC); + if(caps.hasNEON && HasherInput_NEON::isAvailable) + ret.push_back(INHASH_SIMD); + if(caps.hasCRC && caps.hasNEON && HasherInput_NEONCRC::isAvailable) + ret.push_back(INHASH_SIMD_CRC); +#endif + + return ret; +} +std::vector hasherMD5CRC_availableMethods(bool checkCpuid) { + std::vector ret; + ret.push_back(MD5CRCMETH_SCALAR); + +#ifdef PLATFORM_X86 + const CpuCap caps(checkCpuid); + if(caps.hasClMul) { + if(caps.hasAVX512VLBW && MD5CRC_isAvailable_AVX512) + ret.push_back(MD5CRCMETH_AVX512); + if(MD5CRC_isAvailable_NoLEA) + ret.push_back(MD5CRCMETH_NOLEA); + if(caps.hasBMI1 && MD5CRC_isAvailable_BMI1) + ret.push_back(MD5CRCMETH_BMI1); + if(MD5CRC_isAvailable_ClMul) + ret.push_back(MD5CRCMETH_PCLMUL); + } +#endif +#ifdef PLATFORM_ARM + const CpuCap caps(checkCpuid); + if(caps.hasCRC && MD5CRC_isAvailable_ARMCRC) + ret.push_back(MD5CRCMETH_ARMCRC); +#endif + + return ret; +} +std::vector hasherMD5Multi_availableMethods(bool checkCpuid) { + std::vector ret; + ret.push_back(MD5MULT_SCALAR); + +#ifdef PLATFORM_X86 + const CpuCap caps(checkCpuid); + if(caps.hasAVX512VLBW && MD5Multi_AVX512_256::isAvailable) + ret.push_back(MD5MULT_AVX512VL); + if(caps.hasAVX512F && MD5Multi_AVX512::isAvailable) + ret.push_back(MD5MULT_AVX512F); + if(caps.hasXOP && MD5Multi_XOP::isAvailable) + ret.push_back(MD5MULT_XOP); 
+ if(caps.hasAVX2 && MD5Multi_AVX2::isAvailable) + ret.push_back(MD5MULT_AVX2); + if(caps.hasSSE2 && MD5Multi_SSE::isAvailable) + ret.push_back(MD5MULT_SSE); +#endif +#ifdef PLATFORM_ARM + const CpuCap caps(checkCpuid); + if(caps.hasSVE2 && MD5Multi_SVE2::isAvailable) + ret.push_back(MD5MULT_SVE2); + if(caps.hasNEON && MD5Multi_NEON::isAvailable) + ret.push_back(MD5MULT_NEON); +#endif + + return ret; +} diff --git a/hasher/hasher.h b/hasher/hasher.h index a52c0bc7..3eea2564 100644 --- a/hasher/hasher.h +++ b/hasher/hasher.h @@ -25,13 +25,37 @@ enum MD5MultiLevels { MD5MULT_SVE2 }; +// single hash instances +extern uint32_t(*CRC32_Calc)(const void*, size_t); +extern MD5CRCMethods CRC32_Method; +extern uint32_t(*MD5CRC_Calc)(const void*, size_t, size_t, void*); +extern MD5CRCMethods MD5CRC_Method; + + void setup_hasher(); bool set_hasherInput(HasherInputMethods method); +bool set_hasherMD5CRC(MD5CRCMethods method); void set_hasherMD5MultiLevel(MD5MultiLevels level); extern IHasherInput*(*HasherInput_Create)(); +extern HasherInputMethods HasherInput_Method; +extern MD5MultiLevels HasherMD5Multi_level; + +const char* hasherInput_methodName(HasherInputMethods m); +const char* md5crc_methodName(MD5CRCMethods m); +const char* hasherMD5Multi_methodName(MD5MultiLevels l); +inline const char* hasherInput_methodName() { + return hasherInput_methodName(HasherInput_Method); +} +inline const char* md5crc_methodName() { + return md5crc_methodName(MD5CRC_Method); +} +inline const char* hasherMD5Multi_methodName() { + return hasherMD5Multi_methodName(HasherMD5Multi_level); +} -const char* hasherInput_methodName(); -const char* hasherMD5Multi_methodName(); +std::vector hasherInput_availableMethods(bool checkCpuid); +std::vector hasherMD5CRC_availableMethods(bool checkCpuid); +std::vector hasherMD5Multi_availableMethods(bool checkCpuid); class MD5Multi { std::vector ctx; @@ -59,11 +83,4 @@ class MD5Multi { }; -// single hash instances -extern uint32_t(*CRC32_Calc)(const void*, 
size_t); -extern MD5CRCMethods CRC32_Method; -extern uint32_t(*MD5CRC_Calc)(const void*, size_t, size_t, void*); -extern MD5CRCMethods MD5CRC_Method; -const char* md5crc_methodName(MD5CRCMethods m); - #endif /* __HASHER_H */ From 583afb5bc26f3edae01eb0c7d730bf239190266e Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 17 Aug 2023 14:26:57 +1000 Subject: [PATCH 58/91] Hasher fixes --- hasher/hasher.cpp | 2 ++ hasher/hasher_base.h | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hasher/hasher.cpp b/hasher/hasher.cpp index ced8b76e..121d303f 100644 --- a/hasher/hasher.cpp +++ b/hasher/hasher.cpp @@ -31,6 +31,7 @@ struct CpuCap { model = ((cpuInfo[0]>>4) & 0xf) + ((cpuInfo[0]>>12) & 0xf0); // TODO: check perf on small cores + isLEASlow = false; if(family == 6) { isSmallCore = CPU_MODEL_IS_BNL_SLM(model); // Intel Sandy Bridge to Skylake has slow 3-component LEA @@ -41,6 +42,7 @@ struct CpuCap { isVecRotSlow = (family == 0xaf); // vector rotate has 2 cycle latency on Zen4 + hasAVX = false; hasBMI1 = false; hasAVX2 = false; hasAVX512F = false; hasAVX512VLBW = false; #if !defined(_MSC_VER) || _MSC_VER >= 1600 _cpuidX(cpuInfoX, 7, 0); if((cpuInfo[2] & 0x1C000000) == 0x1C000000) { // has AVX + OSXSAVE + XSAVE diff --git a/hasher/hasher_base.h b/hasher/hasher_base.h index 56eb7f1c..9464f463 100644 --- a/hasher/hasher_base.h +++ b/hasher/hasher_base.h @@ -56,7 +56,7 @@ const bool MD5CRC(isAvailable) = true; uint32_t MD5CRC(Calc)(const void* data, size_t length, size_t zeroPad, void* md5) { uint32_t md5State[4] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476}; #ifdef PLATFORM_X86 - char crcState[64]; // ClMul uses 4x16B state, others use 4B + ALIGN_TO(16, char crcState[64]); // ClMul uses 4x16B state, others use 4B #else char crcState[4]; #endif @@ -293,7 +293,7 @@ void MD5Multi::reset() { const bool CRC32Impl(CRC32_isAvailable) = true; uint32_t CRC32Impl(CRC32_Calc)(const void* data, size_t len) { #ifdef PLATFORM_X86 - char crcState[64]; // ClMul 
uses 4x16B state, others use 4B + ALIGN_TO(16, char crcState[64]); // ClMul uses 4x16B state, others use 4B #else char crcState[4]; #endif From 88b3f2086ba66704abe703883dd24e09f672e00a Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 20 Aug 2023 10:22:22 +1000 Subject: [PATCH 59/91] Fix compile on AArch32 + EOL fix --- gf16/gf16pmul_neon.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/gf16/gf16pmul_neon.c b/gf16/gf16pmul_neon.c index 52c68655..d2dc6398 100644 --- a/gf16/gf16pmul_neon.c +++ b/gf16/gf16pmul_neon.c @@ -20,14 +20,20 @@ void gf16pmul_neon(void *HEDLEY_RESTRICT dst, const void* src1, const void* src2 for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(uint8x16_t)*2) { poly8x16x2_t data1 = vld2q_p8(_src1+ptr); poly8x16x2_t data2 = vld2q_p8(_src2+ptr); - poly16x8_t low1 = pmull_low(data1.val[0], data2.val[0]); - poly16x8_t low2 = pmull_high(data1.val[0], data2.val[0]); + poly16x8_t low1 = vmull_p8(vget_low_p8(data1.val[0]), vget_low_p8(data2.val[0])); poly8x16_t dataMid1 = veorq_p8(data1.val[0], data1.val[1]); poly8x16_t dataMid2 = veorq_p8(data2.val[0], data2.val[1]); - poly16x8_t mid1 = pmull_low(dataMid1, dataMid2); + poly16x8_t mid1 = vmull_p8(vget_low_p8(dataMid1), vget_low_p8(dataMid2)); + poly16x8_t high1 = vmull_p8(vget_low_p8(data1.val[1]), vget_low_p8(data2.val[1])); +#ifdef __aarch64__ + poly16x8_t low2 = pmull_high(data1.val[0], data2.val[0]); poly16x8_t mid2 = pmull_high(dataMid1, dataMid2); - poly16x8_t high1 = pmull_low(data1.val[1], data2.val[1]); poly16x8_t high2 = pmull_high(data1.val[1], data2.val[1]); +#else + poly16x8_t low2 = vmull_p8(vget_high_p8(data1.val[0]), vget_high_p8(data2.val[0])); + poly16x8_t mid2 = vmull_p8(vget_high_p8(dataMid1), vget_high_p8(dataMid2)); + poly16x8_t high2 = vmull_p8(vget_high_p8(data1.val[1]), vget_high_p8(data2.val[1])); +#endif gf16_clmul_neon_reduction(&low1, low2, mid1, mid2, &high1, high2); uint8x16x2_t out; From 626482eda5233ea557c71ff383dc3824eddcc79b Mon 
Sep 17 00:00:00 2001 From: animetosho Date: Sun, 20 Aug 2023 17:25:11 +1000 Subject: [PATCH 60/91] Check for presence of GC in RVV (since it's compiled that way) --- gf16/gf16mul.cpp | 2 +- src/cpuid.h | 4 ++++ src/platform.h | 5 +++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 1aa04723..136e6829 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -226,7 +226,7 @@ struct CpuCap { bool hasVector; CpuCap(bool detect) : hasVector(true) { if(!detect) return; - hasVector = CPU_HAS_VECTOR; + hasVector = CPU_HAS_VECTOR && CPU_HAS_GC; } }; #endif diff --git a/src/cpuid.h b/src/cpuid.h index 56c95761..6d7c1c11 100644 --- a/src/cpuid.h +++ b/src/cpuid.h @@ -158,11 +158,15 @@ static unsigned long getauxval(unsigned long cap) { # endif # ifdef PARPAR_SKIP_AUX_CHECK +# define CPU_HAS_GC true # define CPU_HAS_VECTOR true # else +# define CPU_HAS_GC false # define CPU_HAS_VECTOR false # if defined(AT_HWCAP) +# undef CPU_HAS_GC +# define CPU_HAS_GC ((getauxval(AT_HWCAP) & 4397) == 4397) // 4397 = IMAFDC; TODO: how to detect Z* features of 'G'? 
# undef CPU_HAS_VECTOR # define CPU_HAS_VECTOR (getauxval(AT_HWCAP) & (1 << ('V'-'A'))) # endif diff --git a/src/platform.h b/src/platform.h index 11d7f6d6..3ee0d37d 100644 --- a/src/platform.h +++ b/src/platform.h @@ -203,6 +203,11 @@ HEDLEY_WARNING("GFNI disabled on GCC < 10 due to incorrect GF2P8AFFINEQB operand # endif #endif +#if defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(13,0,0) +// GCC added RVV intrinsics in GCC13 +# undef __riscv_vector +#endif + // Some environments lack ARM headers, so try to check for these #ifdef __has_include # if defined(__ARM_FEATURE_SVE) && !__has_include() From c9f864af95df817d0e1f04f296b92775b622018d Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 20 Aug 2023 18:48:34 +1000 Subject: [PATCH 61/91] Suppress build/UBSan warnings + upgrade Hedley --- gf16/gf16_xor_avx512.c | 4 +- gf16/gf16_xor_common.h | 2 +- gf16/gf16mul.cpp | 1 + hasher/hasher.cpp | 3 + hasher/md5x2-x86-asm.h | 4 +- src/hedley.h | 1001 ++++++++++++++++++++++++++++++---------- 6 files changed, 778 insertions(+), 237 deletions(-) diff --git a/gf16/gf16_xor_avx512.c b/gf16/gf16_xor_avx512.c index 694f5ef5..d87d79e1 100644 --- a/gf16/gf16_xor_avx512.c +++ b/gf16/gf16_xor_avx512.c @@ -851,7 +851,7 @@ void gf16_xor_jit_muladd_multi_avx512(const void *HEDLEY_RESTRICT scratch, unsig /* cmp/jcc */ write64(jitptr, 0x800FC03948 | (AX <<16) | (CX <<19) | ((uint64_t)JL <<32)); if(info->jitOptStrat == GF16_XOR_JIT_STRAT_COPYNT || info->jitOptStrat == GF16_XOR_JIT_STRAT_COPY) { - write32(jitptr +5, (int32_t)((jitTemp - (jitdst - (uint8_t*)jit->w)) - jitptr -9)); + write32(jitptr +5, (int32_t)(((intptr_t)jitTemp - (jitdst - (uint8_t*)jit->w)) - (intptr_t)jitptr -9)); jitptr[9] = 0xC3; /* ret */ /* memcpy to destination */ if(info->jitOptStrat == GF16_XOR_JIT_STRAT_COPYNT) { @@ -957,7 +957,7 @@ void gf16_xor_jit_muladd_multi_packed_avx512(const void *HEDLEY_RESTRICT scratch /* cmp/jcc */ write64(jitptr, 0x800FC03948 | (AX <<16) | (CX 
<<19) | ((uint64_t)JL <<32)); if(info->jitOptStrat == GF16_XOR_JIT_STRAT_COPYNT || info->jitOptStrat == GF16_XOR_JIT_STRAT_COPY) { - write32(jitptr +5, (int32_t)((jitTemp - (jitdst - (uint8_t*)jit->w)) - jitptr -9)); + write32(jitptr +5, (int32_t)(((intptr_t)jitTemp - (jitdst - (uint8_t*)jit->w)) - (intptr_t)jitptr -9)); jitptr[9] = 0xC3; /* ret */ /* memcpy to destination */ if(info->jitOptStrat == GF16_XOR_JIT_STRAT_COPYNT) { diff --git a/gf16/gf16_xor_common.h b/gf16/gf16_xor_common.h index 21b7ea80..7260172d 100644 --- a/gf16/gf16_xor_common.h +++ b/gf16/gf16_xor_common.h @@ -150,7 +150,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_xorjit_write_jit(const void *HEDLEY_RESTRI jitptr = jitTemp; jitptr = writeFunc(info, jitptr, coefficient, mode, prefetch); - write32(jitptr, (int32_t)(jitTemp - copyOffset - jitptr -4)); + write32(jitptr, (int32_t)((intptr_t)jitTemp - copyOffset - (intptr_t)jitptr -4)); jitptr[4] = 0xC3; /* ret */ jitptr += 5; diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 136e6829..142aba3d 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -1430,6 +1430,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu } std::vector Galois16Mul::availableMethods(bool checkCpuid) { + UNUSED(checkCpuid); std::vector ret; ret.push_back(GF16_LOOKUP); if(gf16_lookup3_stride()) diff --git a/hasher/hasher.cpp b/hasher/hasher.cpp index 121d303f..ce6e3f40 100644 --- a/hasher/hasher.cpp +++ b/hasher/hasher.cpp @@ -522,6 +522,7 @@ const char* md5crc_methodName(MD5CRCMethods m) { std::vector hasherInput_availableMethods(bool checkCpuid) { + (void)checkCpuid; std::vector ret; ret.push_back(INHASH_SCALAR); @@ -553,6 +554,7 @@ std::vector hasherInput_availableMethods(bool checkCpuid) { return ret; } std::vector hasherMD5CRC_availableMethods(bool checkCpuid) { + (void)checkCpuid; std::vector ret; ret.push_back(MD5CRCMETH_SCALAR); @@ -578,6 +580,7 @@ std::vector hasherMD5CRC_availableMethods(bool checkCpuid) { return ret; } std::vector 
hasherMD5Multi_availableMethods(bool checkCpuid) { + (void)checkCpuid; std::vector ret; ret.push_back(MD5MULT_SCALAR); diff --git a/hasher/md5x2-x86-asm.h b/hasher/md5x2-x86-asm.h index 64322d2f..325938a7 100644 --- a/hasher/md5x2-x86-asm.h +++ b/hasher/md5x2-x86-asm.h @@ -195,8 +195,8 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_scalar(uint32_t* state, co ROUND_I(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 21) \ : ASM_PARAMS(i2, i3)); - A1 += _data[0][0]; - A2 += _data[1][0]; + A1 += read32(_data[0]); + A2 += read32(_data[1]); RF4( 1, 2, 3, 4, -0x28955b88, -0x173848aa, 0x242070db, -0x3e423112) RF4( 5, 6, 7, 8, -0x0a83f051, 0x4787c62a, -0x57cfb9ed, -0x02b96aff) diff --git a/src/hedley.h b/src/hedley.h index d20c2297..8a713e67 100644 --- a/src/hedley.h +++ b/src/hedley.h @@ -10,11 +10,11 @@ * SPDX-License-Identifier: CC0-1.0 */ -#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 9) +#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 15) #if defined(HEDLEY_VERSION) # undef HEDLEY_VERSION #endif -#define HEDLEY_VERSION 9 +#define HEDLEY_VERSION 15 #if defined(HEDLEY_STRINGIFY_EX) # undef HEDLEY_STRINGIFY_EX @@ -36,6 +36,16 @@ #endif #define HEDLEY_CONCAT(a,b) HEDLEY_CONCAT_EX(a,b) +#if defined(HEDLEY_CONCAT3_EX) +# undef HEDLEY_CONCAT3_EX +#endif +#define HEDLEY_CONCAT3_EX(a,b,c) a##b##c + +#if defined(HEDLEY_CONCAT3) +# undef HEDLEY_CONCAT3 +#endif +#define HEDLEY_CONCAT3(a,b,c) HEDLEY_CONCAT3_EX(a,b,c) + #if defined(HEDLEY_VERSION_ENCODE) # undef HEDLEY_VERSION_ENCODE #endif @@ -77,18 +87,18 @@ #if defined(HEDLEY_MSVC_VERSION) # undef HEDLEY_MSVC_VERSION #endif -#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) +#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) && !defined(__ICL) # define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100) -#elif defined(_MSC_FULL_VER) +#elif defined(_MSC_FULL_VER) && !defined(__ICL) # define 
HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10) -#elif defined(_MSC_VER) +#elif defined(_MSC_VER) && !defined(__ICL) # define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0) #endif #if defined(HEDLEY_MSVC_VERSION_CHECK) # undef HEDLEY_MSVC_VERSION_CHECK #endif -#if !defined(_MSC_VER) +#if !defined(HEDLEY_MSVC_VERSION) # define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0) #elif defined(_MSC_VER) && (_MSC_VER >= 1400) # define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch))) @@ -101,9 +111,9 @@ #if defined(HEDLEY_INTEL_VERSION) # undef HEDLEY_INTEL_VERSION #endif -#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) +#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && !defined(__ICL) # define HEDLEY_INTEL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE) -#elif defined(__INTEL_COMPILER) +#elif defined(__INTEL_COMPILER) && !defined(__ICL) # define HEDLEY_INTEL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0) #endif @@ -116,6 +126,22 @@ # define HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0) #endif +#if defined(HEDLEY_INTEL_CL_VERSION) +# undef HEDLEY_INTEL_CL_VERSION +#endif +#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && defined(__ICL) +# define HEDLEY_INTEL_CL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER, __INTEL_COMPILER_UPDATE, 0) +#endif + +#if defined(HEDLEY_INTEL_CL_VERSION_CHECK) +# undef HEDLEY_INTEL_CL_VERSION_CHECK +#endif +#if defined(HEDLEY_INTEL_CL_VERSION) +# define HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (HEDLEY_INTEL_CL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (0) +#endif + #if defined(HEDLEY_PGI_VERSION) # undef HEDLEY_PGI_VERSION #endif @@ 
-211,8 +237,16 @@ #if defined(HEDLEY_TI_VERSION) # undef HEDLEY_TI_VERSION #endif -#if defined(__TI_COMPILER_VERSION__) -# define HEDLEY_TI_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#if \ + defined(__TI_COMPILER_VERSION__) && \ + ( \ + defined(__TMS470__) || defined(__TI_ARM__) || \ + defined(__MSP430__) || \ + defined(__TMS320C2000__) \ + ) +# if (__TI_COMPILER_VERSION__ >= 16000000) +# define HEDLEY_TI_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +# endif #endif #if defined(HEDLEY_TI_VERSION_CHECK) @@ -224,6 +258,102 @@ # define HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0) #endif +#if defined(HEDLEY_TI_CL2000_VERSION) +# undef HEDLEY_TI_CL2000_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__) +# define HEDLEY_TI_CL2000_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL2000_VERSION_CHECK) +# undef HEDLEY_TI_CL2000_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL2000_VERSION) +# define HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL2000_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_CL430_VERSION) +# undef HEDLEY_TI_CL430_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__) +# define HEDLEY_TI_CL430_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL430_VERSION_CHECK) +# undef HEDLEY_TI_CL430_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL430_VERSION) +# define HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL430_VERSION >= 
HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_ARMCL_VERSION) +# undef HEDLEY_TI_ARMCL_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && (defined(__TMS470__) || defined(__TI_ARM__)) +# define HEDLEY_TI_ARMCL_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_ARMCL_VERSION_CHECK) +# undef HEDLEY_TI_ARMCL_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_ARMCL_VERSION) +# define HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_ARMCL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_CL6X_VERSION) +# undef HEDLEY_TI_CL6X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__) +# define HEDLEY_TI_CL6X_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL6X_VERSION_CHECK) +# undef HEDLEY_TI_CL6X_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL6X_VERSION) +# define HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL6X_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_CL7X_VERSION) +# undef HEDLEY_TI_CL7X_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__C7000__) +# define HEDLEY_TI_CL7X_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CL7X_VERSION_CHECK) +# undef HEDLEY_TI_CL7X_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CL7X_VERSION) +# define HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL7X_VERSION >= 
HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0) +#endif + +#if defined(HEDLEY_TI_CLPRU_VERSION) +# undef HEDLEY_TI_CLPRU_VERSION +#endif +#if defined(__TI_COMPILER_VERSION__) && defined(__PRU__) +# define HEDLEY_TI_CLPRU_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) +#endif + +#if defined(HEDLEY_TI_CLPRU_VERSION_CHECK) +# undef HEDLEY_TI_CLPRU_VERSION_CHECK +#endif +#if defined(HEDLEY_TI_CLPRU_VERSION) +# define HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CLPRU_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0) +#endif + #if defined(HEDLEY_CRAY_VERSION) # undef HEDLEY_CRAY_VERSION #endif @@ -251,7 +381,7 @@ # if __VER__ > 1000 # define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000)) # else -# define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE(VER / 100, __VER__ % 100, 0) +# define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0) # endif #endif @@ -328,6 +458,22 @@ # define HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0) #endif +#if defined(HEDLEY_MCST_LCC_VERSION) +# undef HEDLEY_MCST_LCC_VERSION +#endif +#if defined(__LCC__) && defined(__LCC_MINOR__) +# define HEDLEY_MCST_LCC_VERSION HEDLEY_VERSION_ENCODE(__LCC__ / 100, __LCC__ % 100, __LCC_MINOR__) +#endif + +#if defined(HEDLEY_MCST_LCC_VERSION_CHECK) +# undef HEDLEY_MCST_LCC_VERSION_CHECK +#endif +#if defined(HEDLEY_MCST_LCC_VERSION) +# define HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (HEDLEY_MCST_LCC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) +#else +# define HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (0) +#endif + #if defined(HEDLEY_GCC_VERSION) # undef HEDLEY_GCC_VERSION #endif @@ -337,8 +483,16 @@ !defined(HEDLEY_INTEL_VERSION) && \ 
!defined(HEDLEY_PGI_VERSION) && \ !defined(HEDLEY_ARM_VERSION) && \ + !defined(HEDLEY_CRAY_VERSION) && \ !defined(HEDLEY_TI_VERSION) && \ - !defined(__COMPCERT__) + !defined(HEDLEY_TI_ARMCL_VERSION) && \ + !defined(HEDLEY_TI_CL430_VERSION) && \ + !defined(HEDLEY_TI_CL2000_VERSION) && \ + !defined(HEDLEY_TI_CL6X_VERSION) && \ + !defined(HEDLEY_TI_CL7X_VERSION) && \ + !defined(HEDLEY_TI_CLPRU_VERSION) && \ + !defined(__COMPCERT__) && \ + !defined(HEDLEY_MCST_LCC_VERSION) # define HEDLEY_GCC_VERSION HEDLEY_GNUC_VERSION #endif @@ -354,7 +508,11 @@ #if defined(HEDLEY_HAS_ATTRIBUTE) # undef HEDLEY_HAS_ATTRIBUTE #endif -#if defined(__has_attribute) +#if \ + defined(__has_attribute) && \ + ( \ + (!defined(HEDLEY_IAR_VERSION) || HEDLEY_IAR_VERSION_CHECK(8,5,9)) \ + ) # define HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute) #else # define HEDLEY_HAS_ATTRIBUTE(attribute) (0) @@ -364,7 +522,7 @@ # undef HEDLEY_GNUC_HAS_ATTRIBUTE #endif #if defined(__has_attribute) -# define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) __has_attribute(attribute) +# define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_HAS_ATTRIBUTE(attribute) #else # define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) #endif @@ -373,7 +531,7 @@ # undef HEDLEY_GCC_HAS_ATTRIBUTE #endif #if defined(__has_attribute) -# define HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) __has_attribute(attribute) +# define HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_HAS_ATTRIBUTE(attribute) #else # define HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) #endif @@ -381,12 +539,30 @@ #if defined(HEDLEY_HAS_CPP_ATTRIBUTE) # undef HEDLEY_HAS_CPP_ATTRIBUTE #endif -#if defined(__has_cpp_attribute) && defined(__cplusplus) +#if \ + defined(__has_cpp_attribute) && \ + defined(__cplusplus) && \ + (!defined(HEDLEY_SUNPRO_VERSION) || HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) # define 
HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute) #else # define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0) #endif +#if defined(HEDLEY_HAS_CPP_ATTRIBUTE_NS) +# undef HEDLEY_HAS_CPP_ATTRIBUTE_NS +#endif +#if !defined(__cplusplus) || !defined(__has_cpp_attribute) +# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#elif \ + !defined(HEDLEY_PGI_VERSION) && \ + !defined(HEDLEY_IAR_VERSION) && \ + (!defined(HEDLEY_SUNPRO_VERSION) || HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) +# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute) +#else +# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) +#endif + #if defined(HEDLEY_GNUC_HAS_CPP_ATTRIBUTE) # undef HEDLEY_GNUC_HAS_CPP_ATTRIBUTE #endif @@ -548,7 +724,13 @@ HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_TI_VERSION_CHECK(6,0,0) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \ HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \ HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \ @@ -575,13 +757,21 @@ #elif HEDLEY_GCC_VERSION_CHECK(4,6,0) # define HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") # define HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") -#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) +#elif \ + HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) # define HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push)) # define HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop)) #elif HEDLEY_ARM_VERSION_CHECK(5,6,0) # define HEDLEY_DIAGNOSTIC_PUSH _Pragma("push") # define HEDLEY_DIAGNOSTIC_POP _Pragma("pop") -#elif 
HEDLEY_TI_VERSION_CHECK(8,1,0) +#elif \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) # define HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push") # define HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop") #elif HEDLEY_PELLES_VERSION_CHECK(2,90,0) @@ -592,6 +782,102 @@ # define HEDLEY_DIAGNOSTIC_POP #endif +/* HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) +# undef HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ +#endif +#if defined(__cplusplus) +# if HEDLEY_HAS_WARNING("-Wc++98-compat") +# if HEDLEY_HAS_WARNING("-Wc++17-extensions") +# if HEDLEY_HAS_WARNING("-Wc++1z-extensions") +# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ + _Pragma("clang diagnostic ignored \"-Wc++1z-extensions\"") \ + xpr \ + HEDLEY_DIAGNOSTIC_POP +# else +# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ + xpr \ + HEDLEY_DIAGNOSTIC_POP +# endif +# else +# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ + xpr \ + HEDLEY_DIAGNOSTIC_POP +# endif +# endif +#endif +#if !defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) +# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x +#endif + +#if defined(HEDLEY_CONST_CAST) +# undef HEDLEY_CONST_CAST +#endif +#if defined(__cplusplus) +# define HEDLEY_CONST_CAST(T, expr) (const_cast(expr)) +#elif \ + 
HEDLEY_HAS_WARNING("-Wcast-qual") || \ + HEDLEY_GCC_VERSION_CHECK(4,6,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) +# define HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \ + HEDLEY_DIAGNOSTIC_PUSH \ + HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ + ((T) (expr)); \ + HEDLEY_DIAGNOSTIC_POP \ + })) +#else +# define HEDLEY_CONST_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(HEDLEY_REINTERPRET_CAST) +# undef HEDLEY_REINTERPRET_CAST +#endif +#if defined(__cplusplus) +# define HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast(expr)) +#else +# define HEDLEY_REINTERPRET_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(HEDLEY_STATIC_CAST) +# undef HEDLEY_STATIC_CAST +#endif +#if defined(__cplusplus) +# define HEDLEY_STATIC_CAST(T, expr) (static_cast(expr)) +#else +# define HEDLEY_STATIC_CAST(T, expr) ((T) (expr)) +#endif + +#if defined(HEDLEY_CPP_CAST) +# undef HEDLEY_CPP_CAST +#endif +#if defined(__cplusplus) +# if HEDLEY_HAS_WARNING("-Wold-style-cast") +# define HEDLEY_CPP_CAST(T, expr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \ + ((T) (expr)) \ + HEDLEY_DIAGNOSTIC_POP +# elif HEDLEY_IAR_VERSION_CHECK(8,3,0) +# define HEDLEY_CPP_CAST(T, expr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("diag_suppress=Pe137") \ + HEDLEY_DIAGNOSTIC_POP +# else +# define HEDLEY_CPP_CAST(T, expr) ((T) (expr)) +# endif +#else +# define HEDLEY_CPP_CAST(T, expr) (expr) +#endif + #if defined(HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED) # undef HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED #endif @@ -599,13 +885,30 @@ # define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") #elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) # define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)") +#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:1478 1786)) +#elif HEDLEY_PGI_VERSION_CHECK(20,7,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED 
_Pragma("diag_suppress 1215,1216,1444,1445") #elif HEDLEY_PGI_VERSION_CHECK(17,10,0) # define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") #elif HEDLEY_GCC_VERSION_CHECK(4,3,0) # define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") #elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) # define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996)) -#elif HEDLEY_TI_VERSION_CHECK(8,0,0) +#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") +#elif \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) # define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718") #elif HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus) # define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)") @@ -626,20 +929,62 @@ # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"") #elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)") +#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:161)) #elif HEDLEY_PGI_VERSION_CHECK(17,10,0) # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS 
_Pragma("diag_suppress 1675") #elif HEDLEY_GCC_VERSION_CHECK(4,3,0) # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") #elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068)) -#elif HEDLEY_TI_VERSION_CHECK(8,0,0) +#elif \ + HEDLEY_TI_VERSION_CHECK(16,9,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") +#elif HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") #elif HEDLEY_IAR_VERSION_CHECK(8,0,0) # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161") +#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 161") #else # define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS #endif +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES) +# undef HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif +#if HEDLEY_HAS_WARNING("-Wunknown-attributes") +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"") +#elif HEDLEY_GCC_VERSION_CHECK(4,6,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#elif HEDLEY_INTEL_VERSION_CHECK(17,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)") +#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:1292)) +#elif HEDLEY_MSVC_VERSION_CHECK(19,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030)) +#elif HEDLEY_PGI_VERSION_CHECK(20,7,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES 
_Pragma("diag_suppress 1097,1098") +#elif HEDLEY_PGI_VERSION_CHECK(17,10,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)") +#elif \ + HEDLEY_TI_VERSION_CHECK(18,1,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173") +#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097") +#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") +#else +# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES +#endif + #if defined(HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL) # undef HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL #endif @@ -653,40 +998,74 @@ # define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL #endif +#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION) +# undef HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION +#endif +#if HEDLEY_HAS_WARNING("-Wunused-function") +# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("clang diagnostic ignored \"-Wunused-function\"") +#elif HEDLEY_GCC_VERSION_CHECK(3,4,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("GCC diagnostic ignored \"-Wunused-function\"") +#elif HEDLEY_MSVC_VERSION_CHECK(1,0,0) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION __pragma(warning(disable:4505)) +#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("diag_suppress 3142") +#else +# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION +#endif + #if defined(HEDLEY_DEPRECATED) # undef HEDLEY_DEPRECATED #endif #if defined(HEDLEY_DEPRECATED_FOR) # undef HEDLEY_DEPRECATED_FOR #endif -#if defined(__cplusplus) && (__cplusplus >= 201402L) -# 
define HEDLEY_DEPRECATED(since) [[deprecated("Since " #since)]] -# define HEDLEY_DEPRECATED_FOR(since, replacement) [[deprecated("Since " #since "; use " #replacement)]] +#if \ + HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since)) +# define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement)) #elif \ - HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) || \ + (HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) && !defined(HEDLEY_IAR_VERSION)) || \ HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \ HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - HEDLEY_TI_VERSION_CHECK(8,3,0) + HEDLEY_TI_VERSION_CHECK(18,1,0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since))) # define HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement))) +#elif defined(__cplusplus) && (__cplusplus >= 201402L) +# define HEDLEY_DEPRECATED(since) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]]) +# define HEDLEY_DEPRECATED_FOR(since, replacement) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]]) #elif \ HEDLEY_HAS_ATTRIBUTE(deprecated) || \ HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + 
HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + HEDLEY_IAR_VERSION_CHECK(8,10,0) # define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__)) # define HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__)) -#elif HEDLEY_MSVC_VERSION_CHECK(14,0,0) -# define HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since)) -# define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement)) #elif \ HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - HEDLEY_PELLES_VERSION_CHECK(6,50,0) -# define HEDLEY_DEPRECATED(since) _declspec(deprecated) + HEDLEY_PELLES_VERSION_CHECK(6,50,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define HEDLEY_DEPRECATED(since) __declspec(deprecated) # define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated) #elif HEDLEY_IAR_VERSION_CHECK(8,0,0) # define HEDLEY_DEPRECATED(since) _Pragma("deprecated") @@ -702,7 +1081,8 @@ #if \ HEDLEY_HAS_ATTRIBUTE(warning) || \ HEDLEY_GCC_VERSION_CHECK(4,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since))) #else # define HEDLEY_UNAVAILABLE(available_since) @@ -711,21 +1091,41 @@ #if defined(HEDLEY_WARN_UNUSED_RESULT) # undef HEDLEY_WARN_UNUSED_RESULT #endif -#if defined(__cplusplus) && (__cplusplus >= 201703L) -# define 
HEDLEY_WARN_UNUSED_RESULT [[nodiscard]] -#elif \ +#if defined(HEDLEY_WARN_UNUSED_RESULT_MSG) +# undef HEDLEY_WARN_UNUSED_RESULT_MSG +#endif +#if \ HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \ HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ (HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) + HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) +# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__)) +#elif (HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L) +# define HEDLEY_WARN_UNUSED_RESULT HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) +# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]]) +#elif HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) +# define HEDLEY_WARN_UNUSED_RESULT HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) +# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) #elif defined(_Check_return_) /* SAL */ # define HEDLEY_WARN_UNUSED_RESULT _Check_return_ +# define 
HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_ #else # define HEDLEY_WARN_UNUSED_RESULT +# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) #endif #if defined(HEDLEY_SENTINEL) @@ -735,7 +1135,8 @@ HEDLEY_HAS_ATTRIBUTE(sentinel) || \ HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(5,4,0) + HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position))) #else # define HEDLEY_SENTINEL(position) @@ -746,24 +1147,40 @@ #endif #if HEDLEY_IAR_VERSION_CHECK(8,0,0) # define HEDLEY_NO_RETURN __noreturn -#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) +#elif \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_NO_RETURN __attribute__((__noreturn__)) #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L # define HEDLEY_NO_RETURN _Noreturn #elif defined(__cplusplus) && (__cplusplus >= 201103L) -# define HEDLEY_NO_RETURN [[noreturn]] +# define HEDLEY_NO_RETURN HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]]) #elif \ HEDLEY_HAS_ATTRIBUTE(noreturn) || \ HEDLEY_GCC_VERSION_CHECK(3,2,0) || \ HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(18,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(17,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + 
HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_IAR_VERSION_CHECK(8,10,0) # define HEDLEY_NO_RETURN __attribute__((__noreturn__)) -#elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define HEDLEY_NO_RETURN _Pragma("does_not_return") +#elif \ + HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) # define HEDLEY_NO_RETURN __declspec(noreturn) -#elif HEDLEY_TI_VERSION_CHECK(6,0,0) && defined(__cplusplus) +#elif HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) # define HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;") #elif HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) # define HEDLEY_NO_RETURN __attribute((noreturn)) @@ -773,67 +1190,82 @@ # define HEDLEY_NO_RETURN #endif +#if defined(HEDLEY_NO_ESCAPE) +# undef HEDLEY_NO_ESCAPE +#endif +#if HEDLEY_HAS_ATTRIBUTE(noescape) +# define HEDLEY_NO_ESCAPE __attribute__((__noescape__)) +#else +# define HEDLEY_NO_ESCAPE +#endif + #if defined(HEDLEY_UNREACHABLE) # undef HEDLEY_UNREACHABLE #endif #if defined(HEDLEY_UNREACHABLE_RETURN) # undef HEDLEY_UNREACHABLE_RETURN #endif -#if \ - (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(HEDLEY_ARM_VERSION))) || \ - HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,5) -# define HEDLEY_UNREACHABLE() __builtin_unreachable() -#elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) -# define HEDLEY_UNREACHABLE() __assume(0) -#elif HEDLEY_TI_VERSION_CHECK(6,0,0) -# if defined(__cplusplus) -# define HEDLEY_UNREACHABLE() std::_nassert(0) -# else -# define HEDLEY_UNREACHABLE() _nassert(0) -# endif -# define HEDLEY_UNREACHABLE_RETURN(value) return value -#elif defined(EXIT_FAILURE) -# define HEDLEY_UNREACHABLE() abort() -#else -# define HEDLEY_UNREACHABLE() -# define HEDLEY_UNREACHABLE_RETURN(value) return value -#endif -#if !defined(HEDLEY_UNREACHABLE_RETURN) -# define HEDLEY_UNREACHABLE_RETURN(value) HEDLEY_UNREACHABLE() -#endif - 
#if defined(HEDLEY_ASSUME) # undef HEDLEY_ASSUME #endif #if \ HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) # define HEDLEY_ASSUME(expr) __assume(expr) #elif HEDLEY_HAS_BUILTIN(__builtin_assume) # define HEDLEY_ASSUME(expr) __builtin_assume(expr) -#elif HEDLEY_TI_VERSION_CHECK(6,0,0) +#elif \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) # if defined(__cplusplus) # define HEDLEY_ASSUME(expr) std::_nassert(expr) # else # define HEDLEY_ASSUME(expr) _nassert(expr) # endif -#elif \ - (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && !defined(HEDLEY_ARM_VERSION)) || \ +#endif +#if \ + (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(HEDLEY_ARM_VERSION))) || \ HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ + HEDLEY_PGI_VERSION_CHECK(18,10,0) || \ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,5) -# define HEDLEY_ASSUME(expr) ((void) ((expr) ? 1 : (__builtin_unreachable(), 1))) + HEDLEY_IBM_VERSION_CHECK(13,1,5) || \ + HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define HEDLEY_UNREACHABLE() __builtin_unreachable() +#elif defined(HEDLEY_ASSUME) +# define HEDLEY_UNREACHABLE() HEDLEY_ASSUME(0) +#endif +#if !defined(HEDLEY_ASSUME) +# if defined(HEDLEY_UNREACHABLE) +# define HEDLEY_ASSUME(expr) HEDLEY_STATIC_CAST(void, ((expr) ? 
1 : (HEDLEY_UNREACHABLE(), 1))) +# else +# define HEDLEY_ASSUME(expr) HEDLEY_STATIC_CAST(void, expr) +# endif +#endif +#if defined(HEDLEY_UNREACHABLE) +# if \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) +# define HEDLEY_UNREACHABLE_RETURN(value) return (HEDLEY_STATIC_CAST(void, HEDLEY_ASSUME(0)), (value)) +# else +# define HEDLEY_UNREACHABLE_RETURN(value) HEDLEY_UNREACHABLE() +# endif #else -# define HEDLEY_ASSUME(expr) ((void) (expr)) +# define HEDLEY_UNREACHABLE_RETURN(value) return (value) +#endif +#if !defined(HEDLEY_UNREACHABLE) +# define HEDLEY_UNREACHABLE() HEDLEY_ASSUME(0) #endif - HEDLEY_DIAGNOSTIC_PUSH -#if \ - HEDLEY_HAS_WARNING("-Wvariadic-macros") || \ - HEDLEY_GCC_VERSION_CHECK(4,0,0) +#if HEDLEY_HAS_WARNING("-Wpedantic") +# pragma clang diagnostic ignored "-Wpedantic" +#endif +#if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus) +# pragma clang diagnostic ignored "-Wc++98-compat-pedantic" +#endif +#if HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0) # if defined(__clang__) # pragma clang diagnostic ignored "-Wvariadic-macros" # elif defined(HEDLEY_GCC_VERSION) @@ -867,8 +1299,18 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + 
HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check))) #elif HEDLEY_PELLES_VERSION_CHECK(6,0,0) # define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check)) @@ -881,7 +1323,7 @@ HEDLEY_DIAGNOSTIC_POP #endif #if defined(__cplusplus) # if __cplusplus >= 201103L -# define HEDLEY_CONSTEXPR constexpr +# define HEDLEY_CONSTEXPR HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr) # endif #endif #if !defined(HEDLEY_CONSTEXPR) @@ -901,44 +1343,50 @@ HEDLEY_DIAGNOSTIC_POP # undef HEDLEY_UNPREDICTABLE #endif #if HEDLEY_HAS_BUILTIN(__builtin_unpredictable) -# define HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable(!!(expr)) +# define HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr)) #endif #if \ - HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) || \ - HEDLEY_GCC_VERSION_CHECK(9,0,0) -# define HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability(expr, value, probability) -# define HEDLEY_PREDICT_TRUE(expr, probability) __builtin_expect_with_probability(!!(expr), 1, probability) -# define HEDLEY_PREDICT_FALSE(expr, probability) __builtin_expect_with_probability(!!(expr), 0, probability) -# define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) -# define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) -# if !defined(HEDLEY_BUILTIN_UNPREDICTABLE) -# define HEDLEY_BUILTIN_UNPREDICTABLE(expr) __builtin_expect_with_probability(!!(expr), 1, 0.5) -# endif + (HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) && !defined(HEDLEY_PGI_VERSION)) || \ + HEDLEY_GCC_VERSION_CHECK(9,0,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability( (expr), (value), (probability)) +# define HEDLEY_PREDICT_TRUE(expr, probability) 
__builtin_expect_with_probability(!!(expr), 1 , (probability)) +# define HEDLEY_PREDICT_FALSE(expr, probability) __builtin_expect_with_probability(!!(expr), 0 , (probability)) +# define HEDLEY_LIKELY(expr) __builtin_expect (!!(expr), 1 ) +# define HEDLEY_UNLIKELY(expr) __builtin_expect (!!(expr), 0 ) #elif \ - HEDLEY_HAS_BUILTIN(__builtin_expect) || \ + (HEDLEY_HAS_BUILTIN(__builtin_expect) && !defined(HEDLEY_INTEL_CL_VERSION)) || \ HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ (HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(6,1,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,27) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \ + HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_PREDICT(expr, expected, probability) \ - (((probability) >= 0.9) ? __builtin_expect(!!(expr), (expected)) : (((void) (expected)), !!(expr))) + (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (HEDLEY_STATIC_CAST(void, expected), (expr))) # define HEDLEY_PREDICT_TRUE(expr, probability) \ (__extension__ ({ \ - HEDLEY_CONSTEXPR double hedley_probability_ = (probability); \ + double hedley_probability_ = (probability); \ ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \ })) # define HEDLEY_PREDICT_FALSE(expr, probability) \ (__extension__ ({ \ - HEDLEY_CONSTEXPR double hedley_probability_ = (probability); \ + double hedley_probability_ = (probability); \ ((hedley_probability_ >= 0.9) ? 
__builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \ })) # define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) # define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) #else -# define HEDLEY_PREDICT(expr, expected, probability) (((void) (expected)), !!(expr)) +# define HEDLEY_PREDICT(expr, expected, probability) (HEDLEY_STATIC_CAST(void, expected), (expr)) # define HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr)) # define HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr)) # define HEDLEY_LIKELY(expr) (!!(expr)) @@ -958,10 +1406,24 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_MALLOC __attribute__((__malloc__)) -#elif HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define HEDLEY_MALLOC _Pragma("returns_new_memory") +#elif \ + HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) # define HEDLEY_MALLOC __declspec(restrict) #else # define HEDLEY_MALLOC @@ -977,11 +1439,28 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ 
HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_PURE __attribute__((__pure__)) -#elif HEDLEY_TI_VERSION_CHECK(6,0,0) && defined(__cplusplus) +#elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define HEDLEY_PURE _Pragma("does_not_write_global_data") +#elif defined(__cplusplus) && \ + ( \ + HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \ + ) # define HEDLEY_PURE _Pragma("FUNC_IS_PURE;") #else # define HEDLEY_PURE @@ -997,10 +1476,23 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + 
HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_CONST __attribute__((__const__)) +#elif \ + HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) +# define HEDLEY_CONST _Pragma("no_side_effect") #else # define HEDLEY_CONST HEDLEY_PURE #endif @@ -1014,13 +1506,18 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ (HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \ HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ - defined(__clang__) + defined(__clang__) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_RESTRICT __restrict #elif HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus) # define HEDLEY_RESTRICT _Restrict @@ -1041,8 +1538,15 @@ HEDLEY_DIAGNOSTIC_POP # define HEDLEY_INLINE __inline__ #elif \ HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) + HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + 
HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_INLINE __inline #else # define HEDLEY_INLINE @@ -1058,12 +1562,33 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + HEDLEY_IAR_VERSION_CHECK(8,10,0) # define HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) HEDLEY_INLINE -#elif HEDLEY_MSVC_VERSION_CHECK(12,0,0) +#elif \ + HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) # define HEDLEY_ALWAYS_INLINE __forceinline -#elif HEDLEY_TI_VERSION_CHECK(7,0,0) && defined(__cplusplus) +#elif defined(__cplusplus) && \ + ( \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \ + ) # define HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;") #elif HEDLEY_IAR_VERSION_CHECK(8,0,0) # define HEDLEY_ALWAYS_INLINE _Pragma("inline=forced") @@ -1081,14 +1606,27 @@ HEDLEY_DIAGNOSTIC_POP 
HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + HEDLEY_TI_VERSION_CHECK(15,12,0) || \ + (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ + (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ + (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ + HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ + HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ + HEDLEY_IAR_VERSION_CHECK(8,10,0) # define HEDLEY_NEVER_INLINE __attribute__((__noinline__)) -#elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) +#elif \ + HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) # define HEDLEY_NEVER_INLINE __declspec(noinline) #elif HEDLEY_PGI_VERSION_CHECK(10,2,0) # define HEDLEY_NEVER_INLINE _Pragma("noinline") -#elif HEDLEY_TI_VERSION_CHECK(6,0,0) && defined(__cplusplus) +#elif HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) # define HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;") #elif HEDLEY_IAR_VERSION_CHECK(8,0,0) # define HEDLEY_NEVER_INLINE _Pragma("inline=never") @@ -1121,8 +1659,14 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - HEDLEY_TI_VERSION_CHECK(8,0,0) || \ - (HEDLEY_TI_VERSION_CHECK(7,3,0) && defined(__TI_EABI__) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) + ( \ + defined(__TI_EABI__) && \ + ( \ + (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ + 
HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \ + ) \ + ) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_PRIVATE __attribute__((__visibility__("hidden"))) # define HEDLEY_PUBLIC __attribute__((__visibility__("default"))) # else @@ -1138,10 +1682,12 @@ HEDLEY_DIAGNOSTIC_POP #if \ HEDLEY_HAS_ATTRIBUTE(nothrow) || \ HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) + HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_NO_THROW __attribute__((__nothrow__)) #elif \ HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) # define HEDLEY_NO_THROW __declspec(nothrow) #else @@ -1149,30 +1695,21 @@ HEDLEY_DIAGNOSTIC_POP #endif #if defined(HEDLEY_FALL_THROUGH) -# undef HEDLEY_FALL_THROUGH +# undef HEDLEY_FALL_THROUGH #endif #if \ - defined(__cplusplus) && \ - (!defined(HEDLEY_SUNPRO_VERSION) || HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \ - !defined(HEDLEY_PGI_VERSION) -# if \ - (__cplusplus >= 201703L) || \ - ((__cplusplus >= 201103L) && HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough)) -# define HEDLEY_FALL_THROUGH [[fallthrough]] -# elif (__cplusplus >= 201103L) && HEDLEY_HAS_CPP_ATTRIBUTE(clang::fallthrough) -# define HEDLEY_FALL_THROUGH [[clang::fallthrough]] -# elif (__cplusplus >= 201103L) && HEDLEY_GCC_VERSION_CHECK(7,0,0) -# define HEDLEY_FALL_THROUGH [[gnu::fallthrough]] -# endif -#endif -#if !defined(HEDLEY_FALL_THROUGH) -# if HEDLEY_GNUC_HAS_ATTRIBUTE(fallthrough,7,0,0) && !defined(HEDLEY_PGI_VERSION) -# define HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) -# elif defined(__fallthrough) /* SAL */ -# define HEDLEY_FALL_THROUGH __fallthrough -# else -# define HEDLEY_FALL_THROUGH -# endif + HEDLEY_HAS_ATTRIBUTE(fallthrough) || \ + HEDLEY_GCC_VERSION_CHECK(7,0,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) +# define HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) +#elif HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough) +# define 
HEDLEY_FALL_THROUGH HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]]) +#elif HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough) +# define HEDLEY_FALL_THROUGH HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]]) +#elif defined(__fallthrough) /* SAL */ +# define HEDLEY_FALL_THROUGH __fallthrough +#else +# define HEDLEY_FALL_THROUGH #endif #if defined(HEDLEY_RETURNS_NON_NULL) @@ -1180,7 +1717,8 @@ HEDLEY_DIAGNOSTIC_POP #endif #if \ HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \ - HEDLEY_GCC_VERSION_CHECK(4,9,0) + HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__)) #elif defined(_Ret_notnull_) /* SAL */ # define HEDLEY_RETURNS_NON_NULL _Ret_notnull_ @@ -1208,12 +1746,11 @@ HEDLEY_DIAGNOSTIC_POP #if defined(HEDLEY_REQUIRE_CONSTEXPR) # undef HEDLEY_REQUIRE_CONSTEXPR #endif -/* Note the double-underscore. For internal use only; no API - * guarantees! */ -#if defined(HEDLEY__IS_CONSTEXPR) -# undef HEDLEY__IS_CONSTEXPR +/* HEDLEY_IS_CONSTEXPR_ is for + HEDLEY INTERNAL USE ONLY. API subject to change without notice. 
*/ +#if defined(HEDLEY_IS_CONSTEXPR_) +# undef HEDLEY_IS_CONSTEXPR_ #endif - #if \ HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ @@ -1221,9 +1758,10 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \ HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - HEDLEY_TI_VERSION_CHECK(6,1,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) + HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ + (HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \ + HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ + HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) # define HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr) #endif #if !defined(__cplusplus) @@ -1236,31 +1774,40 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ HEDLEY_TINYC_VERSION_CHECK(0,9,24) # if defined(__INTPTR_TYPE__) -# define HEDLEY__IS_CONSTEXPR(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*) +# define HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*) # else # include -# define HEDLEY__IS_CONSTEXPR(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*) +# define HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? 
(void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*) # endif # elif \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(HEDLEY_SUNPRO_VERSION) && !defined(HEDLEY_PGI_VERSION)) || \ - HEDLEY_HAS_EXTENSION(c_generic_selections) || \ + ( \ + defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(HEDLEY_SUNPRO_VERSION) && \ + !defined(HEDLEY_PGI_VERSION) && \ + !defined(HEDLEY_IAR_VERSION)) || \ + (HEDLEY_HAS_EXTENSION(c_generic_selections) && !defined(HEDLEY_IAR_VERSION)) || \ HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \ HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ HEDLEY_ARM_VERSION_CHECK(5,3,0) # if defined(__INTPTR_TYPE__) -# define HEDLEY__IS_CONSTEXPR(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) +# define HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) # else # include -# define HEDLEY__IS_CONSTEXPR(expr) _Generic((1 ? (void*) ((intptr_t) * 0) : (int*) 0), int*: 1, void*: 0) +# define HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) * 0) : (int*) 0), int*: 1, void*: 0) # endif # elif \ defined(HEDLEY_GCC_VERSION) || \ defined(HEDLEY_INTEL_VERSION) || \ defined(HEDLEY_TINYC_VERSION) || \ - defined(HEDLEY_TI_VERSION) || \ + defined(HEDLEY_TI_ARMCL_VERSION) || \ + HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \ + defined(HEDLEY_TI_CL2000_VERSION) || \ + defined(HEDLEY_TI_CL6X_VERSION) || \ + defined(HEDLEY_TI_CL7X_VERSION) || \ + defined(HEDLEY_TI_CLPRU_VERSION) || \ defined(__clang__) -# define HEDLEY__IS_CONSTEXPR(expr) ( \ +# define HEDLEY_IS_CONSTEXPR_(expr) ( \ sizeof(void) != \ sizeof(*( \ 1 ? 
\ @@ -1271,11 +1818,11 @@ HEDLEY_DIAGNOSTIC_POP ) # endif #endif -#if defined(HEDLEY__IS_CONSTEXPR) +#if defined(HEDLEY_IS_CONSTEXPR_) # if !defined(HEDLEY_IS_CONSTANT) -# define HEDLEY_IS_CONSTANT(expr) HEDLEY__IS_CONSTEXPR(expr) +# define HEDLEY_IS_CONSTANT(expr) HEDLEY_IS_CONSTEXPR_(expr) # endif -# define HEDLEY_REQUIRE_CONSTEXPR(expr) (HEDLEY__IS_CONSTEXPR(expr) ? (expr) : (-1)) +# define HEDLEY_REQUIRE_CONSTEXPR(expr) (HEDLEY_IS_CONSTEXPR_(expr) ? (expr) : (-1)) #else # if !defined(HEDLEY_IS_CONSTANT) # define HEDLEY_IS_CONSTANT(expr) (0) @@ -1308,67 +1855,36 @@ HEDLEY_DIAGNOSTIC_POP #if \ !defined(__cplusplus) && ( \ (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ - HEDLEY_HAS_FEATURE(c_static_assert) || \ + (HEDLEY_HAS_FEATURE(c_static_assert) && !defined(HEDLEY_INTEL_CL_VERSION)) || \ HEDLEY_GCC_VERSION_CHECK(6,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ defined(_Static_assert) \ ) # define HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message) #elif \ - (defined(__cplusplus) && (__cplusplus >= 201703L)) || \ + (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ HEDLEY_MSVC_VERSION_CHECK(16,0,0) || \ - (defined(__cplusplus) && HEDLEY_TI_VERSION_CHECK(8,3,0)) -# define HEDLEY_STATIC_ASSERT(expr, message) static_assert(expr, message) -#elif defined(__cplusplus) && (__cplusplus >= 201103L) -# define HEDLEY_STATIC_ASSERT(expr, message) static_assert(expr) + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define HEDLEY_STATIC_ASSERT(expr, message) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) #else # define HEDLEY_STATIC_ASSERT(expr, message) #endif -#if defined(HEDLEY_CONST_CAST) -# undef HEDLEY_CONST_CAST -#endif -#if defined(__cplusplus) -# define HEDLEY_CONST_CAST(T, expr) (const_cast(expr)) -#elif \ - HEDLEY_HAS_WARNING("-Wcast-qual") || \ - HEDLEY_GCC_VERSION_CHECK(4,6,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \ - HEDLEY_DIAGNOSTIC_PUSH 
\ - HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ - ((T) (expr)); \ - HEDLEY_DIAGNOSTIC_POP \ - })) -#else -# define HEDLEY_CONST_CAST(T, expr) ((T) (expr)) -#endif - -#if defined(HEDLEY_REINTERPRET_CAST) -# undef HEDLEY_REINTERPRET_CAST +#if defined(HEDLEY_NULL) +# undef HEDLEY_NULL #endif #if defined(__cplusplus) -# define HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast(expr)) -#else -# define HEDLEY_REINTERPRET_CAST(T, expr) (*((T*) &(expr))) -#endif - -#if defined(HEDLEY_STATIC_CAST) -# undef HEDLEY_STATIC_CAST -#endif -#if defined(__cplusplus) -# define HEDLEY_STATIC_CAST(T, expr) (static_cast(expr)) -#else -# define HEDLEY_STATIC_CAST(T, expr) ((T) (expr)) -#endif - -#if defined(HEDLEY_CPP_CAST) -# undef HEDLEY_CPP_CAST -#endif -#if defined(__cplusplus) -# define HEDLEY_CPP_CAST(T, expr) static_cast(expr) +# if __cplusplus >= 201103L +# define HEDLEY_NULL HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr) +# elif defined(NULL) +# define HEDLEY_NULL NULL +# else +# define HEDLEY_NULL HEDLEY_STATIC_CAST(void*, 0) +# endif +#elif defined(NULL) +# define HEDLEY_NULL NULL #else -# define HEDLEY_CPP_CAST(T, expr) (expr) +# define HEDLEY_NULL ((void*) 0) #endif #if defined(HEDLEY_MESSAGE) @@ -1405,41 +1921,51 @@ HEDLEY_DIAGNOSTIC_POP HEDLEY_DIAGNOSTIC_POP #elif \ HEDLEY_GCC_VERSION_CHECK(4,8,0) || \ - HEDLEY_PGI_VERSION_CHECK(18,4,0) + HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ + HEDLEY_INTEL_VERSION_CHECK(13,0,0) # define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(GCC warning msg) -#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) +#elif \ + HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) # define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(message(msg)) #else # define HEDLEY_WARNING(msg) HEDLEY_MESSAGE(msg) #endif +#if defined(HEDLEY_REQUIRE) +# undef HEDLEY_REQUIRE +#endif #if defined(HEDLEY_REQUIRE_MSG) # undef HEDLEY_REQUIRE_MSG #endif #if HEDLEY_HAS_ATTRIBUTE(diagnose_if) # if HEDLEY_HAS_WARNING("-Wgcc-compat") -# define HEDLEY_REQUIRE_MSG(expr, msg) \ - 
HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ - __attribute__((__diagnose_if__(!(expr), msg, "error"))) \ - HEDLEY_DIAGNOSTIC_POP +# define HEDLEY_REQUIRE(expr) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), #expr, "error"))) \ + HEDLEY_DIAGNOSTIC_POP +# define HEDLEY_REQUIRE_MSG(expr,msg) \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ + __attribute__((diagnose_if(!(expr), msg, "error"))) \ + HEDLEY_DIAGNOSTIC_POP # else -# define HEDLEY_REQUIRE_MSG(expr, msg) __attribute__((__diagnose_if__(!(expr), msg, "error"))) +# define HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error"))) +# define HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error"))) # endif #else -# define HEDLEY_REQUIRE_MSG(expr, msg) +# define HEDLEY_REQUIRE(expr) +# define HEDLEY_REQUIRE_MSG(expr,msg) #endif -#if defined(HEDLEY_REQUIRE) -# undef HEDLEY_REQUIRE -#endif -#define HEDLEY_REQUIRE(expr) HEDLEY_REQUIRE_MSG(expr, #expr) - #if defined(HEDLEY_FLAGS) # undef HEDLEY_FLAGS #endif -#if HEDLEY_HAS_ATTRIBUTE(flag_enum) +#if HEDLEY_HAS_ATTRIBUTE(flag_enum) && (!defined(__cplusplus) || HEDLEY_HAS_WARNING("-Wbitfield-enum-conversion")) # define HEDLEY_FLAGS __attribute__((__flag_enum__)) +#else +# define HEDLEY_FLAGS #endif #if defined(HEDLEY_FLAGS_CAST) @@ -1456,6 +1982,17 @@ HEDLEY_DIAGNOSTIC_POP # define HEDLEY_FLAGS_CAST(T, expr) HEDLEY_STATIC_CAST(T, expr) #endif +#if defined(HEDLEY_EMPTY_BASES) +# undef HEDLEY_EMPTY_BASES +#endif +#if \ + (HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !HEDLEY_MSVC_VERSION_CHECK(20,0,0)) || \ + HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) +# define HEDLEY_EMPTY_BASES __declspec(empty_bases) +#else +# define HEDLEY_EMPTY_BASES +#endif + /* Remaining macros are deprecated. 
*/ #if defined(HEDLEY_GCC_NOT_CLANG_VERSION_CHECK) From ca8bd0f1c91c110d7dd68b68a43a7480a58710ee Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 22 Aug 2023 15:25:27 +1000 Subject: [PATCH 62/91] Tweak C flags --- binding.gyp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/binding.gyp b/binding.gyp index 86757b38..cfdf1265 100644 --- a/binding.gyp +++ b/binding.gyp @@ -21,7 +21,7 @@ ] }] ], - "cflags": ["-std=c99", "-D_POSIX_C_SOURCE=200112L", "-D_DARWIN_C_SOURCE", "-D_GNU_SOURCE"], + "cflags_c": ["-std=c99", "-D_POSIX_C_SOURCE=200112L", "-D_DARWIN_C_SOURCE", "-D_GNU_SOURCE"], "cxxflags": ["-std=c++11"], "msvs_settings": {"VCCLCompilerTool": {"Optimization": "MaxSpeed"}}, "configurations": {"Release": { @@ -46,8 +46,8 @@ "cxxflags!": ["-fno-exceptions"], "cflags_cc!": ["-fno-exceptions"], "defines": ["USE_LIBUV"], - "cflags": ["-fexceptions", "-std=c++11"], - "cxxflags": ["-fexceptions"], + "cflags": ["-fexceptions"], + "cxxflags": ["-fexceptions", "-std=c++11"], "cflags_cc": ["-fexceptions"], "xcode_settings": { "OTHER_CFLAGS!": ["-fno-exceptions"], @@ -64,7 +64,7 @@ "defines": ["NDEBUG", "PARPAR_LIBDL_SUPPORT"], "sources": ["gf16/opencl-include/cl.c", "gf16/gfmat_coeff.c"], "include_dirs": ["gf16/opencl-include"], - "cflags": ["-Wno-unused-function", "-std=gnu99"], + "cflags": ["-Wno-unused-function", "-std=c99"], "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], "xcode_settings": { "OTHER_CFLAGS": ["-Wno-unused-function"], @@ -77,6 +77,7 @@ "defines": ["NDEBUG"], "sources": ["hasher/hasher.cpp", "hasher/hasher_scalar.cpp"], "dependencies": ["hasher_c"], + "cxxflags": ["-std=c++11"], "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], "cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], "xcode_settings": { @@ -90,7 +91,7 @@ "type": "static_library", "defines": ["NDEBUG"], "sources": ["hasher/crc_zeropad.c", "hasher/md5-final.c"], - 
"cflags": ["-Wno-unused-function", "-std=gnu99"], + "cflags": ["-Wno-unused-function", "-std=c99"], "cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], "xcode_settings": { "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], @@ -444,7 +445,7 @@ "gf16/gf_add_generic.c", "gf16/gf16_cksum_generic.c" ], - "cflags": ["-Wno-unused-function", "-std=gnu99"], + "cflags": ["-Wno-unused-function", "-std=c99"], "xcode_settings": { "OTHER_CFLAGS": ["-Wno-unused-function"], "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"], @@ -772,7 +773,7 @@ "gf16/gf_add_neon.c", "gf16/gf16_cksum_neon.c" ], - "cflags": ["-Wno-unused-function", "-std=gnu99"], + "cflags": ["-Wno-unused-function", "-std=c99"], "xcode_settings": { "OTHER_CFLAGS": ["-Wno-unused-function"], "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"] @@ -837,7 +838,7 @@ "gf16/gf_add_sve.c", "gf16/gf16_cksum_sve.c" ], - "cflags": ["-Wno-unused-function", "-std=gnu99"], + "cflags": ["-Wno-unused-function", "-std=c99"], "xcode_settings": { "OTHER_CFLAGS": ["-Wno-unused-function"], "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"] @@ -875,7 +876,7 @@ "gf16/gf16_clmul_sve2.c", "gf16/gf_add_sve2.c" ], - "cflags": ["-Wno-unused-function", "-std=gnu99"], + "cflags": ["-Wno-unused-function", "-std=c99"], "xcode_settings": { "OTHER_CFLAGS": ["-Wno-unused-function"], "OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"] From 3888616089db3d86692bba837087f69694d1566e Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 22 Aug 2023 16:31:13 +1000 Subject: [PATCH 63/91] Add gf16/hasher test suite --- .github/workflows/test.yml | 243 ++++++++ test/gf16/CMakeLists.txt | 314 ++++++++++ test/gf16/p2c-inv/galois.cpp | 28 + test/gf16/p2c-inv/galois.h | 317 ++++++++++ test/gf16/p2c-inv/reedsolomon.cpp | 253 ++++++++ test/gf16/p2c-inv/reedsolomon.h | 
45 ++ test/gf16/test-ctrl.cpp | 378 ++++++++++++ test/gf16/test-inv.cpp | 188 ++++++ test/gf16/test-pmul.cpp | 106 ++++ test/gf16/test.cpp | 956 ++++++++++++++++++++++++++++++ test/gf16/test.h | 115 ++++ test/hasher/CMakeLists.txt | 154 +++++ test/hasher/test.cpp | 312 ++++++++++ 13 files changed, 3409 insertions(+) create mode 100644 .github/workflows/test.yml create mode 100644 test/gf16/CMakeLists.txt create mode 100644 test/gf16/p2c-inv/galois.cpp create mode 100644 test/gf16/p2c-inv/galois.h create mode 100644 test/gf16/p2c-inv/reedsolomon.cpp create mode 100644 test/gf16/p2c-inv/reedsolomon.h create mode 100644 test/gf16/test-ctrl.cpp create mode 100644 test/gf16/test-inv.cpp create mode 100644 test/gf16/test-pmul.cpp create mode 100644 test/gf16/test.cpp create mode 100644 test/gf16/test.h create mode 100644 test/hasher/CMakeLists.txt create mode 100644 test/hasher/test.cpp diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..3bb6d5d1 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,243 @@ +name: Test GF16/Hasher +on: + workflow_dispatch: + +jobs: + test-win-x86: + strategy: + fail-fast: false + matrix: + config: [Debug, Release] + compiler: ['v141', 'v142', 'v143', 'ClangCL'] + arch: ['Win32', 'x64'] + name: Test VS ${{ matrix.compiler }} ${{ matrix.arch }} (${{ matrix.config }}) + runs-on: windows-latest + steps: + - uses: ilammy/setup-nasm@v1 + - uses: petarpetrovt/setup-sde@v2.1 + - uses: actions/checkout@v3 + - run: | + mkdir test\gf16\build + cmake -B test\gf16\build -S test\gf16 -G "Visual Studio 16 2019" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake --build test\gf16\build --config ${{ matrix.config }} + + mkdir test\hasher\build + cmake -B test\hasher\build -S test\hasher -G "Visual Studio 16 2019" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake --build test\hasher\build --config ${{ matrix.config }} + - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe + - 
run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f + if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} + - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f + if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} + - run: sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe + + # test SSE2-only to see if CPUID checking works + - run: | + sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe + sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe + if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} + + + # test building only + test-win-arm: + strategy: + fail-fast: false + matrix: + compiler: ['v142', 'v143', 'ClangCL'] + arch: ['ARM', 'ARM64'] + name: Test VS ${{ matrix.compiler }} ${{ matrix.arch }} + runs-on: windows-latest + steps: + - uses: actions/checkout@v3 + - run: | + mkdir test\gf16\build + cmake -B test\gf16\build -S test\gf16 -G "Visual Studio 16 2019" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake --build test\gf16\build --config Debug + + mkdir test\hasher\build + cmake -B test\hasher\build -S test\hasher -G "Visual Studio 16 2019" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake --build test\hasher\build --config Debug + + # TODO: test mingw + # https://github.com/msys2/setup-msys2 + # https://www.msys2.org/docs/ci/ + # TODO: test libuv, OpenCL + + test-linux-gcc: + strategy: + fail-fast: false + matrix: + config: [Debug, Release] + # GCC 8 available in 20.04 + cc_ver: ['9','12'] + t: + # qemu x86 doesn't support AVX, so we use Intel SDE instead + - {arch: 'i386', target: 'i686-linux-gnu', libc: 'i386', emu: '$SDE_PATH/sde -icl --'} + - {arch: 'amd64', target: 'x86-64-linux-gnu', libc: 'amd64', emu: 
'$SDE_PATH/sde64 -icl --'} + #- {arch: 'amd64', target: 'x86-64-linux-gnux32', libc: 'x32', emu: 'qemu-x86_64-static -cpu max'} + # TODO: how to test x32? + - {arch: 'aarch64', target: 'aarch64-linux-gnu', libc: 'arm64', emu: 'qemu-aarch64-static -L /usr/aarch64-linux-gnu -cpu max,sve-max-vq=4'} + - {arch: 'arm', target: 'arm-linux-gnueabihf', libc: 'armhf', emu: 'qemu-arm-static -L /usr/arm-linux-gnueabihf -cpu max'} + # RVV unavailable in Ubuntu 22.04's qemu + # TODO: consider using newer qemu + #- {arch: 'riscv64', target: 'riscv64-linux-gnu', libc: 'riscv64', emu: 'qemu-riscv64-static -L /usr/riscv64-linux-gnu -cpu rv64,v=true,vlen=512,elen=64,vext_spec=v1.0,zba=true,zbb=true,zbc=true'} + - {arch: 'ppc64', target: 'powerpc64-linux-gnu', libc: 'ppc64', emu: 'qemu-ppc64-static -L /usr/powerpc64-linux-gnu'} + name: Test Ubuntu GCC ${{ matrix.cc_ver }} ${{ matrix.t.arch }} (${{ matrix.config }}) + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - run: apt update + - uses: petarpetrovt/setup-sde@v2.1 + if: ${{ matrix.t.arch == 'amd64' || matrix.t.arch == 'i386' }} + - run: apt install -y qemu-user-static + if: ${{ matrix.t.arch != 'amd64' && matrix.t.arch != 'i386' }} + - run: | + apt install -y g++-${{ matrix.cc_ver }}-${{ matrix.t.target }} + echo "CC=${{ matrix.t.target }}-gcc-${{ matrix.cc_ver }}" >> $GITHUB_ENV + echo "CXX=${{ matrix.t.target }}-g++-${{ matrix.cc_ver }}" >> $GITHUB_ENV + if: ${{ matrix.t.arch != 'amd64' }} + - run: | + apt install -y g++-${{ matrix.cc_ver }} + echo "CC=gcc-${{ matrix.cc_ver }}" >> $GITHUB_ENV + echo "CXX=g++-${{ matrix.cc_ver }}" >> $GITHUB_ENV + if: ${{ matrix.t.arch == 'amd64' }} + - run: | + mkdir test/gf16/build + cmake -Btest/gf16/build -Stest/gf16 -DSKIP_AUX=1 -DCMAKE_BUILD_TYPE=${{ matrix.config }} \ + -DCMAKE_SYSTEM_PROCESSOR=${{ matrix.t.arch }} \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_FIND_ROOT_PATH=/usr/${{ matrix.t.target }} \ + -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ + 
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ + -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY + cmake --build test/gf16/build + + mkdir test/hasher/build + cmake -Btest/hasher/build -Stest/hasher -DSKIP_AUX=1 -DCMAKE_BUILD_TYPE=${{ matrix.config }} \ + -DCMAKE_SYSTEM_PROCESSOR=${{ matrix.t.arch }} \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_FIND_ROOT_PATH=/usr/${{ matrix.t.target }} \ + -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ + -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ + -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY + cmake --build test/hasher/build + - run: ${{ matrix.t.emu }} test/gf16/build/test + - run: ${{ matrix.t.emu }} test/gf16/build/test-pmul + - run: ${{ matrix.t.emu }} test/gf16/build/test-ctrl -f + if: ${{ matrix.config == 'Release' && matrix.cc_ver == '12' }} + - run: ${{ matrix.t.emu }} test/gf16/build/test-inv -f + if: ${{ matrix.config == 'Release' && matrix.cc_ver == '12' }} + - run: ${{ matrix.t.emu }} test/hasher/build/test + + test-linux-clang: + strategy: + fail-fast: false + matrix: + config: ['Debug', 'Release'] + # Clang 6 available in 20.04 + cc_ver: ['11','15'] + t: + - {arch: 'i386', target: 'i686-linux-gnu', cl_target: 'x86-linux-gnu', libc: 'i386', emu: '$SDE_PATH/sde -icl --'} + - {arch: 'amd64', target: 'x86-64-linux-gnu', cl_target: 'x86_64-linux-gnu', libc: 'amd64', emu: '$SDE_PATH/sde64 -icl --'} + #- {arch: 'amd64', target: 'x86-64-linux-gnux32', cl_target: 'x86-64-linux-gnux32', libc: 'x32', emu: 'qemu- -cpu max'} + # TODO: how to test x32? 
+ - {arch: 'aarch64', target: 'aarch64-linux-gnu', cl_target: 'aarch64-linux-gnu', libc: 'arm64', emu: 'qemu-aarch64-static -L /usr/aarch64-linux-gnu -cpu max,sve-max-vq=4'} + - {arch: 'arm', target: 'arm-linux-gnueabihf', cl_target: 'armv7a-linux-gnueabihf', libc: 'armhf', emu: 'qemu-arm-static -L /usr/arm-linux-gnueabihf -cpu max'} + # TODO: can't test ARM BE/RISCV32 without available libc + #- {arch: 'aarch64be', target: 'aarch64_be-linux-gnu', cl_target: 'aarch64_be-linux-gnu', libc: 'arm64be', emu: 'qemu-aarch64_be-static -L /usr/aarch64_be-linux-gnu -cpu max,sve-max-vq=4'} + #- {arch: 'arm_be', target: 'armeb-linux-gnu', cl_target: 'armebv7a-linux-gnu', libc: 'armeb', emu: 'qemu-armeb-static -L /usr/armeb-linux-gnu -cpu max'} + #- {arch: 'riscv32', target: 'riscv32-linux-gnu', cl_target: 'riscv32-linux-gnu', libc: 'riscv32', emu: 'qemu-riscv32-static -L /usr/riscv32-linux-gnu -cpu rv32,v=true,vlen=512,elen=64,vext_spec=v1.0,zba=true,zbb=true,zbc=true'} + # RVV unavailable in Ubuntu 22.04's qemu + #- {arch: 'riscv64', target: 'riscv64-linux-gnu', cl_target: 'riscv64-linux-gnu', libc: 'riscv64', emu: 'qemu-riscv64-static -L /usr/riscv64-linux-gnu -cpu rv64,v=true,vlen=512,elen=64,vext_spec=v1.0,zba=true,zbb=true,zbc=true'} + - {arch: 'ppc64', target: 'powerpc64-linux-gnu', cl_target: 'ppc64-linux-gnu', libc: 'ppc64', emu: 'qemu-ppc64-static -L /usr/powerpc64-linux-gnu'} + name: Test Ubuntu Clang ${{ matrix.cc_ver }} ${{ matrix.t.arch }} (${{ matrix.config }}) + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - run: apt update && apt install -y clang-${{ matrix.cc_ver }} + - uses: petarpetrovt/setup-sde@v2.1 + if: ${{ matrix.t.arch == 'amd64' || matrix.t.arch == 'i386' }} + - run: apt install -y qemu-user-static + if: ${{ matrix.t.arch != 'amd64' && matrix.t.arch != 'i386' }} + - run: apt install -y binutils-${{ matrix.t.target }} libgcc-12-dev-${{ matrix.t.libc }}-cross libstdc++-12-dev-${{ matrix.t.libc }}-cross + if: ${{ matrix.t.arch != 
'amd64' }} + - run: echo "SANITIZE=-DENABLE_SANITIZE=1" >> $GITHUB_ENV + if: ${{ matrix.config == 'Release' && matrix.t.arch == 'amd64' }} + - run: | + mkdir test/gf16/build + cmake -Btest/gf16/build -Stest/gf16 -DSKIP_AUX=1 -DCMAKE_BUILD_TYPE=${{ matrix.config }} $SANITIZE \ + -DCMAKE_C_COMPILER=clang-${{ matrix.cc_ver }} \ + -DCMAKE_CXX_COMPILER=clang++-${{ matrix.cc_ver }} \ + -DCMAKE_C_COMPILER_TARGET=${{ matrix.t.cl_target }} \ + -DCMAKE_CXX_COMPILER_TARGET=${{ matrix.t.cl_target }} \ + -DCMAKE_SYSTEM_PROCESSOR=${{ matrix.t.arch }} \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_FIND_ROOT_PATH="/usr/${{ matrix.t.target }};/usr/lib/llvm-${{ matrix.cc_ver }}/lib/clang/" \ + -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ + -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ + -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY \ + -DCMAKE_C_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ + -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ + -DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=/usr/bin/${{ matrix.t.target }}-ld + cmake --build test/gf16/build + + mkdir test/hasher/build + cmake -Btest/hasher/build -Stest/hasher -DSKIP_AUX=1 -DCMAKE_BUILD_TYPE=${{ matrix.config }} $SANITIZE \ + -DCMAKE_C_COMPILER=clang-${{ matrix.cc_ver }} \ + -DCMAKE_CXX_COMPILER=clang++-${{ matrix.cc_ver }} \ + -DCMAKE_C_COMPILER_TARGET=${{ matrix.t.cl_target }} \ + -DCMAKE_CXX_COMPILER_TARGET=${{ matrix.t.cl_target }} \ + -DCMAKE_SYSTEM_PROCESSOR=${{ matrix.t.arch }} \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_FIND_ROOT_PATH="/usr/${{ matrix.t.target }};/usr/lib/llvm-${{ matrix.cc_ver }}/lib/clang/" \ + -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ + -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ + -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY \ + -DCMAKE_C_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ + -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ + -DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=/usr/bin/${{ matrix.t.target }}-ld + cmake --build 
test/hasher/build + - run: ${{ matrix.t.emu }} test/gf16/build/test + - run: ${{ matrix.t.emu }} test/gf16/build/test-pmul + - run: ${{ matrix.t.emu }} test/gf16/build/test-ctrl -f + if: ${{ matrix.config == 'Release' && matrix.cc_ver == '15' }} + - run: ${{ matrix.t.emu }} test/gf16/build/test-inv -f + if: ${{ matrix.config == 'Release' && matrix.cc_ver == '15' }} + - run: ${{ matrix.t.emu }} test/hasher/build/test + + + test-mac-x86: + strategy: + fail-fast: false + matrix: + config: ['Debug', 'Release'] + compiler: + - {cc: 'gcc-12', cxx: 'g++-12'} + - {cc: 'clang', cxx: 'clang++'} + name: Test MacOS ${{ matrix.compiler.cc }} (${{ matrix.config }}) + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + - run: | + mkdir test/gf16/build + cmake -Btest/gf16/build -Stest/gf16 -DCMAKE_BUILD_TYPE=${{ matrix.config }} -DCMAKE_C_COMPILER=${{ matrix.compiler.cc }} -DCMAKE_CXX_COMPILER=${{ matrix.compiler.cxx }} + cmake --build test/gf16/build + + mkdir test/hasher/build + cmake -Btest/hasher/build -Stest/hasher -DCMAKE_BUILD_TYPE=${{ matrix.config }} -DCMAKE_C_COMPILER=${{ matrix.compiler.cc }} -DCMAKE_CXX_COMPILER=${{ matrix.compiler.cxx }} + cmake --build test/hasher/build + - run: test/gf16/build/test + - run: test/gf16/build/test-pmul + - run: test/gf16/build/test-ctrl -f + if: ${{ matrix.config == 'Release' && matrix.compiler.cc == 'clang' }} + - run: test/gf16/build/test-inv -f + if: ${{ matrix.config == 'Release' && matrix.compiler.cc == 'clang' }} + - run: test/hasher/build/test + + # TODO: test building on Mac ARM64? might not be necessary, given we build it in par2cmdline-turbo + + # TODO: BSD? 
+ # https://github.com/marketplace/actions/freebsd-vm + # https://github.com/vmactions diff --git a/test/gf16/CMakeLists.txt b/test/gf16/CMakeLists.txt new file mode 100644 index 00000000..37f7b578 --- /dev/null +++ b/test/gf16/CMakeLists.txt @@ -0,0 +1,314 @@ +cmake_minimum_required(VERSION 2.8.9...3.22) +project(gf16_test) + +option(USE_LIBUV "Use libuv interface with callbacks, instead of C++11 threading + futures" OFF) +option(ENABLE_OCL "Enable OpenCL" OFF) +option(SKIP_AUX "Bypass getauxval checks (for testing purposes)" OFF) +option(ENABLE_SANITIZE "Enable sanitizers" OFF) + +include(CheckCXXCompilerFlag) +include(CheckIncludeFileCXX) +include(CheckCXXSymbolExists) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_C_STANDARD 99) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Debug) +endif() +if(NOT TARGET_ARCH) + if(CMAKE_GENERATOR_PLATFORM) + set(TARGET_ARCH ${CMAKE_GENERATOR_PLATFORM}) + else() + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) + endif() +endif() + +message("Building for ${TARGET_ARCH}") +if (${TARGET_ARCH} MATCHES "i386|i686|x86|x86_64|x64|amd64|AMD64|win32|Win32") + set(IS_X86 TRUE) + if(${TARGET_ARCH} MATCHES "x86_64|x64|amd64|AMD64") + set(IS_X64 TRUE) + endif() +endif() +if (${TARGET_ARCH} MATCHES "arm|ARM|aarch64|arm64|ARM64") + set(IS_ARM TRUE) +endif() +if (${TARGET_ARCH} MATCHES "riscv64|rv64") + set(IS_RISCV64 TRUE) +endif() +if (${TARGET_ARCH} MATCHES "riscv32|rv32") + set(IS_RISCV32 TRUE) +endif() + +if(ENABLE_OCL) + add_compile_definitions(ENABLE_OCL=1) + add_compile_definitions(PARPAR_LIBDL_SUPPORT=1) +endif() +if(USE_LIBUV) + add_compile_definitions(USE_LIBUV=1) +endif() +if(SKIP_AUX) + add_compile_definitions(PARPAR_SKIP_AUX_CHECK=1) +endif() + +set(GF16_DIR ../../gf16) +set(SRC_DIR ../../src) +set(GF16_C_SOURCES + ${GF16_DIR}/gf_add_avx2.c + ${GF16_DIR}/gf_add_avx512.c + ${GF16_DIR}/gf_add_generic.c + ${GF16_DIR}/gf_add_neon.c + ${GF16_DIR}/gf_add_rvv.c + ${GF16_DIR}/gf_add_sse2.c + ${GF16_DIR}/gf_add_sve.c + 
${GF16_DIR}/gf_add_sve2.c + ${GF16_DIR}/gf16_affine_avx2.c + ${GF16_DIR}/gf16_affine_avx512.c + ${GF16_DIR}/gf16_affine_gfni.c + ${GF16_DIR}/gf16_cksum_avx2.c + ${GF16_DIR}/gf16_cksum_avx512.c + ${GF16_DIR}/gf16_cksum_generic.c + ${GF16_DIR}/gf16_cksum_neon.c + ${GF16_DIR}/gf16_cksum_rvv.c + ${GF16_DIR}/gf16_cksum_sse2.c + ${GF16_DIR}/gf16_cksum_sve.c + ${GF16_DIR}/gf16_clmul_neon.c + ${GF16_DIR}/gf16_clmul_sha3.c + ${GF16_DIR}/gf16_clmul_sve2.c + ${GF16_DIR}/gf16_lookup.c + ${GF16_DIR}/gf16_lookup_sse2.c + ${GF16_DIR}/gf16_shuffle_avx.c + ${GF16_DIR}/gf16_shuffle_avx2.c + ${GF16_DIR}/gf16_shuffle_avx512.c + ${GF16_DIR}/gf16_shuffle_neon.c + ${GF16_DIR}/gf16_shuffle_ssse3.c + ${GF16_DIR}/gf16_shuffle_vbmi.c + ${GF16_DIR}/gf16_shuffle2x128_sve2.c + ${GF16_DIR}/gf16_shuffle128_rvv.c + ${GF16_DIR}/gf16_shuffle128_sve.c + ${GF16_DIR}/gf16_shuffle128_sve2.c + ${GF16_DIR}/gf16_shuffle512_sve2.c + ${GF16_DIR}/gf16_xor_avx2.c + ${GF16_DIR}/gf16_xor_avx512.c + ${GF16_DIR}/gf16_xor_sse2.c + ${GF16_DIR}/gfmat_coeff.c + + ${GF16_DIR}/opencl-include/cl.c + ${SRC_DIR}/platform_warnings.c + + + ${GF16_DIR}/gf16pmul_avx2.c + ${GF16_DIR}/gf16pmul_neon.c + ${GF16_DIR}/gf16pmul_sse.c + ${GF16_DIR}/gf16pmul_sve2.c + ${GF16_DIR}/gf16pmul_vpclgfni.c + ${GF16_DIR}/gf16pmul_vpclmul.c +) + +if(MSVC AND IS_X64) + ENABLE_LANGUAGE(ASM_MASM) + set(GF16_C_SOURCES ${GF16_C_SOURCES} ${GF16_DIR}/xor_jit_stub_masm64.asm) +endif() + +set(GF16_CPP_SOURCES + ${GF16_DIR}/controller.cpp + ${GF16_DIR}/controller_cpu.cpp + ${GF16_DIR}/controller_ocl.cpp + ${GF16_DIR}/controller_ocl_init.cpp + ${GF16_DIR}/gf16mul.cpp + + + ${GF16_DIR}/gf16pmul.cpp + ${GF16_DIR}/gfmat_inv.cpp +) + +include_directories(${GF16_DIR}/opencl-include ${GF16_DIR}) + +if(MSVC) + set(RELEASE_COMPILE_FLAGS /GS- /Gy /sdl- /Oy /Oi) + set(RELEASE_LINK_FLAGS /OPT:REF /OPT:ICF) + add_compile_options(/W2 "$<$>:${RELEASE_COMPILE_FLAGS}>") + add_link_options("$<$>:${RELEASE_LINK_FLAGS}>") +else() + # TODO: consider -Werror + 
add_compile_options(-Wall -Wextra -Wno-unused-function) + if(${CMAKE_BUILD_TYPE} MATCHES "Debug") + add_compile_options(-ggdb) + else() + if(NOT ENABLE_SANITIZE) + add_compile_options(-fomit-frame-pointer) + endif() + endif() + + if(ENABLE_SANITIZE) + set(SANITIZE_OPTS -fsanitize=address -fsanitize=bool,builtin,bounds,enum,float-cast-overflow,function,integer-divide-by-zero,nonnull-attribute,null,object-size,return,returns-nonnull-attribute,shift,signed-integer-overflow,unreachable,vla-bound) + # -fsanitize=pointer-overflow causes compilation of shuffle_avx512 to freeze on clang10 + # -fsanitize=memory requires instrumented libraries, so not useful + add_compile_options(-fno-omit-frame-pointer ${SANITIZE_OPTS}) + add_link_options(${SANITIZE_OPTS}) + endif() + + #if(ENABLE_OCL) + # add_compile_options(-fexceptions) + #else() + # add_compile_options(-fno-exceptions) + #endif() +endif() + +add_compile_definitions(PARPAR_INVERT_SUPPORT=1) +add_library(gf16_c STATIC ${GF16_C_SOURCES}) +add_library(gf16_ctl STATIC ${GF16_CPP_SOURCES}) +target_link_libraries(gf16_ctl gf16_c) + +if(NOT MSVC) + if(NOT ENABLE_SANITIZE) + target_compile_options(gf16_ctl PRIVATE -fno-rtti) + endif() + target_compile_definitions(gf16_c PRIVATE _POSIX_C_SOURCE=200112L) + target_compile_definitions(gf16_c PRIVATE _DARWIN_C_SOURCE=) + target_compile_definitions(gf16_c PRIVATE _GNU_SOURCE=) + + if(ENABLE_SANITIZE) + # not supported on all platforms? 
+ #target_compile_options(gf16_ctl PRIVATE -fsanitize=thread) + endif() +endif() + +if(MSVC) + if(IS_X86) + set_source_files_properties(${GF16_DIR}/gf_add_avx2.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${GF16_DIR}/gf_add_avx512.c PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${GF16_DIR}/gf16_affine_avx2.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${GF16_DIR}/gf16_affine_avx512.c PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${GF16_DIR}/gf16_cksum_avx2.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${GF16_DIR}/gf16_cksum_avx512.c PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_avx.c PROPERTIES COMPILE_OPTIONS /arch:AVX) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_avx2.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_avx512.c PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_vbmi.c PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${GF16_DIR}/gf16_xor_avx2.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${GF16_DIR}/gf16_xor_avx512.c PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${GF16_DIR}/gf16pmul_avx2.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${GF16_DIR}/gf16pmul_vpclgfni.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${GF16_DIR}/gf16pmul_vpclmul.c PROPERTIES COMPILE_OPTIONS /arch:AVX2) + endif() +endif() +if(NOT MSVC OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(IS_X86) + set_source_files_properties(${GF16_DIR}/gf_add_avx2.c PROPERTIES COMPILE_OPTIONS -mavx2) + set_source_files_properties(${GF16_DIR}/gf_add_avx512.c PROPERTIES COMPILE_OPTIONS "-mavx512vl;-mavx512bw") + set_source_files_properties(${GF16_DIR}/gf_add_sse2.c PROPERTIES COMPILE_OPTIONS -msse2) + 
set_source_files_properties(${GF16_DIR}/gf16_cksum_avx2.c PROPERTIES COMPILE_OPTIONS -mavx2) + set_source_files_properties(${GF16_DIR}/gf16_cksum_avx512.c PROPERTIES COMPILE_OPTIONS "-mavx512vl;-mavx512bw") + set_source_files_properties(${GF16_DIR}/gf16_cksum_sse2.c PROPERTIES COMPILE_OPTIONS -msse2) + set_source_files_properties(${GF16_DIR}/gf16_lookup_sse2.c PROPERTIES COMPILE_OPTIONS -msse2) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_avx.c PROPERTIES COMPILE_OPTIONS -mavx) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_avx2.c PROPERTIES COMPILE_OPTIONS -mavx2) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_avx512.c PROPERTIES COMPILE_OPTIONS "-mavx512vl;-mavx512bw") + set_source_files_properties(${GF16_DIR}/gf16_shuffle_ssse3.c PROPERTIES COMPILE_OPTIONS -mssse3) + set_source_files_properties(${GF16_DIR}/gf16_xor_avx2.c PROPERTIES COMPILE_OPTIONS -mavx2) + set_source_files_properties(${GF16_DIR}/gf16_xor_avx512.c PROPERTIES COMPILE_OPTIONS "-mavx512vl;-mavx512bw") + set_source_files_properties(${GF16_DIR}/gf16_xor_sse2.c PROPERTIES COMPILE_OPTIONS -msse2) + set_source_files_properties(${GF16_DIR}/gf16pmul_avx2.c PROPERTIES COMPILE_OPTIONS "-mavx2;-mpclmul") + set_source_files_properties(${GF16_DIR}/gf16pmul_sse.c PROPERTIES COMPILE_OPTIONS "-msse4.1;-mpclmul") + + CHECK_CXX_COMPILER_FLAG("-mavx512vl -mavx512bw -mavx512vbmi" COMPILER_SUPPORTS_VBMI) + if(COMPILER_SUPPORTS_VBMI) + set_source_files_properties(${GF16_DIR}/gf16_shuffle_vbmi.c PROPERTIES COMPILE_OPTIONS "-mavx512vl;-mavx512bw;-mavx512vbmi") + endif() + CHECK_CXX_COMPILER_FLAG("-mgfni" COMPILER_SUPPORTS_GFNI) + if(COMPILER_SUPPORTS_GFNI) + set_source_files_properties(${GF16_DIR}/gf16_affine_avx2.c PROPERTIES COMPILE_OPTIONS "-mavx2;-mgfni") + set_source_files_properties(${GF16_DIR}/gf16_affine_avx512.c PROPERTIES COMPILE_OPTIONS "-mavx512vl;-mavx512bw;-mgfni") + set_source_files_properties(${GF16_DIR}/gf16_affine_gfni.c PROPERTIES COMPILE_OPTIONS "-mssse3;-mgfni") + + 
set_source_files_properties(${SRC_DIR}/platform_warnings.c PROPERTIES COMPILE_OPTIONS "-mavx2;-mgfni") + endif() + + CHECK_CXX_COMPILER_FLAG("-mvpclmulqdq" COMPILER_SUPPORTS_VPCLMULQDQ) + if(COMPILER_SUPPORTS_VPCLMULQDQ) + set_source_files_properties(${GF16_DIR}/gf16pmul_vpclmul.c PROPERTIES COMPILE_OPTIONS "-mavx2;-mvpclmulqdq") + endif() + if(COMPILER_SUPPORTS_VPCLMULQDQ AND COMPILER_SUPPORTS_GFNI) + set_source_files_properties(${GF16_DIR}/gf16pmul_vpclgfni.c PROPERTIES COMPILE_OPTIONS "-mavx2;-mvpclmulqdq;-mgfni") + endif() + endif() + + if(IS_ARM AND NOT APPLE) # M1 Macs don't seem to need these ARM options + CHECK_CXX_COMPILER_FLAG("-mfpu=neon -march=armv7-a" COMPILER_SUPPORTS_ARM32_NEON) + if(COMPILER_SUPPORTS_ARM32_NEON) + set_source_files_properties(${GF16_DIR}/gf_add_neon.c PROPERTIES COMPILE_OPTIONS "-mfpu=neon;-march=armv7-a") + set_source_files_properties(${GF16_DIR}/gf16_cksum_neon.c PROPERTIES COMPILE_OPTIONS "-mfpu=neon;-march=armv7-a") + set_source_files_properties(${GF16_DIR}/gf16_clmul_neon.c PROPERTIES COMPILE_OPTIONS "-mfpu=neon;-march=armv7-a") + set_source_files_properties(${GF16_DIR}/gf16_shuffle_neon.c PROPERTIES COMPILE_OPTIONS "-mfpu=neon;-march=armv7-a") + set_source_files_properties(${GF16_DIR}/gf16pmul_neon.c PROPERTIES COMPILE_OPTIONS "-mfpu=neon;-march=armv7-a") + endif() + CHECK_CXX_COMPILER_FLAG("-march=armv8.2-a+sha3" COMPILER_SUPPORTS_SHA3) + if(COMPILER_SUPPORTS_SHA3) + set_source_files_properties(${GF16_DIR}/gf16_clmul_sha3.c PROPERTIES COMPILE_OPTIONS -march=armv8.2-a+sha3) + endif() + + CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve" COMPILER_SUPPORTS_SVE) + if(COMPILER_SUPPORTS_SVE) + set_source_files_properties(${GF16_DIR}/gf_add_sve.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve) + set_source_files_properties(${GF16_DIR}/gf16_cksum_sve.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve) + set_source_files_properties(${GF16_DIR}/gf16_shuffle128_sve.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve) + endif() + + 
CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve2" COMPILER_SUPPORTS_SVE2) + if(COMPILER_SUPPORTS_SVE2) + set_source_files_properties(${GF16_DIR}/gf_add_sve2.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve2) + set_source_files_properties(${GF16_DIR}/gf16_clmul_sve2.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve2) + set_source_files_properties(${GF16_DIR}/gf16_shuffle2x128_sve2.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve2) + set_source_files_properties(${GF16_DIR}/gf16_shuffle128_sve2.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve2) + set_source_files_properties(${GF16_DIR}/gf16_shuffle512_sve2.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve2) + set_source_files_properties(${GF16_DIR}/gf16pmul_sve2.c PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve2) + endif() + endif() + + if(IS_RISCV64) + CHECK_CXX_COMPILER_FLAG("-march=rv64gcv" COMPILER_SUPPORTS_RVV) + if(COMPILER_SUPPORTS_RVV) + set_source_files_properties(${GF16_DIR}/gf_add_rvv.c PROPERTIES COMPILE_OPTIONS -march=rv64gcv) + set_source_files_properties(${GF16_DIR}/gf16_cksum_rvv.c PROPERTIES COMPILE_OPTIONS -march=rv64gcv) + set_source_files_properties(${GF16_DIR}/gf16_shuffle128_rvv.c PROPERTIES COMPILE_OPTIONS -march=rv64gcv) + endif() + endif() + if(IS_RISCV32) + CHECK_CXX_COMPILER_FLAG("-march=rv32gcv" COMPILER_SUPPORTS_RVV) + if(COMPILER_SUPPORTS_RVV) + set_source_files_properties(${GF16_DIR}/gf_add_rvv.c PROPERTIES COMPILE_OPTIONS -march=rv32gcv) + set_source_files_properties(${GF16_DIR}/gf16_cksum_rvv.c PROPERTIES COMPILE_OPTIONS -march=rv32gcv) + set_source_files_properties(${GF16_DIR}/gf16_shuffle128_rvv.c PROPERTIES COMPILE_OPTIONS -march=rv32gcv) + endif() + endif() +endif() + + + + +# binaries +set(TEST_DIR .) 
+add_executable(test ${TEST_DIR}/test.cpp) +target_link_libraries(test gf16_ctl) +add_executable(test-ctrl ${TEST_DIR}/test-ctrl.cpp) +target_link_libraries(test-ctrl gf16_ctl) +add_executable(test-inv ${TEST_DIR}/test-inv.cpp ${TEST_DIR}/p2c-inv/reedsolomon.cpp) +target_link_libraries(test-inv gf16_ctl) +add_executable(test-pmul ${TEST_DIR}/test-pmul.cpp) +target_link_libraries(test-pmul gf16_ctl) + +if(NOT MSVC) + target_link_libraries(test-ctrl -pthread) + target_link_libraries(test-inv -pthread) + + if(ENABLE_OCL) + target_link_libraries(test-ctrl dl) + endif() +endif() + +if(USE_LIBUV) + target_link_libraries(test-ctrl uv) + target_link_libraries(test-inv uv) +endif() diff --git a/test/gf16/p2c-inv/galois.cpp b/test/gf16/p2c-inv/galois.cpp new file mode 100644 index 00000000..a1a39030 --- /dev/null +++ b/test/gf16/p2c-inv/galois.cpp @@ -0,0 +1,28 @@ +// This file is part of par2cmdline (a PAR 2.0 compatible file verification and +// repair tool). See http://parchive.sourceforge.net for details of PAR 2.0. +// +// Copyright (c) 2003 Peter Brian Clements +// +// par2cmdline is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// par2cmdline is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +#include "libpar2internal.h" + +#ifdef _MSC_VER +#ifdef _DEBUG +#undef THIS_FILE +static char THIS_FILE[]=__FILE__; +#define new DEBUG_NEW +#endif +#endif diff --git a/test/gf16/p2c-inv/galois.h b/test/gf16/p2c-inv/galois.h new file mode 100644 index 00000000..7671c98b --- /dev/null +++ b/test/gf16/p2c-inv/galois.h @@ -0,0 +1,317 @@ +// This file is part of par2cmdline (a PAR 2.0 compatible file verification and +// repair tool). See http://parchive.sourceforge.net for details of PAR 2.0. +// +// Copyright (c) 2003 Peter Brian Clements +// +// par2cmdline is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// par2cmdline is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +#ifndef __GALOIS_H__ +#define __GALOIS_H__ + +#include + +template class GaloisTable; +template class Galois; + +template class GaloisLongMultiplyTable; + +// This source file defines the Galois object for carrying out +// arithmetic in GF(2^16) using the generator 0x1100B. 
+ +// Also defined are the GaloisTable object (which contains log and +// anti log tables for use in multiplication and division), and +// the GaloisLongMultiplyTable object (which contains tables for +// carrying out multiplation of 16-bit galois numbers 8 bits at a time). + +template +class GaloisTable +{ +public: + typedef valuetype ValueType; + + GaloisTable(void); + + enum + { + Bits = bits, + Count = 1< +class Galois +{ +public: + typedef valuetype ValueType; + + // Basic constructors + Galois(void) {}; + Galois(ValueType v); + + // Copy and assignment + Galois(const Galois &right) {value = right.value;} + Galois& operator = (const Galois &right) { value = right.value; return *this;} + + // Addition + Galois operator + (const Galois &right) const { return (value ^ right.value); } + Galois& operator += (const Galois &right) { value ^= right.value; return *this;} + + // Subtraction + Galois operator - (const Galois &right) const { return (value ^ right.value); } + Galois& operator -= (const Galois &right) { value ^= right.value; return *this;} + + // Multiplication + Galois operator * (const Galois &right) const; + Galois& operator *= (const Galois &right); + + // Division + Galois operator / (const Galois &right) const; + Galois& operator /= (const Galois &right); + + // Power + Galois pow(unsigned int right) const; + Galois operator ^ (unsigned int right) const; + Galois& operator ^= (unsigned int right); + + // Cast to value and value access + operator ValueType(void) const {return value;} + ValueType Value(void) const {return value;} + + // Direct log and antilog + ValueType Log(void) const; + ValueType ALog(void) const; + + enum + { + Bits = GaloisTable::Bits, + Count = GaloisTable::Count, + Limit = GaloisTable::Limit, + }; + +protected: + ValueType value; + + static GaloisTable table; +}; + +#ifdef LONGMULTIPLY +template +class GaloisLongMultiplyTable +{ +public: + GaloisLongMultiplyTable(void); + + typedef g G; + + enum + { + Bytes = ((G::Bits + 7) >> 
3), + Count = ((Bytes * (Bytes+1)) / 2), + }; + + G tables[Count * 256 * 256]; +}; +#endif + +// Construct the log and antilog tables from the generator + +template +inline GaloisTable::GaloisTable(void) +{ + u32 b = 1; + + for (u32 l=0; l +GaloisTable Galois::table; + + +template +inline Galois::Galois(typename Galois::ValueType v) +{ + value = v; +} + +template +inline Galois Galois::operator * (const Galois &right) const +{ + if (value == 0 || right.value == 0) return 0; + unsigned int sum = table.log[value] + table.log[right.value]; + if (sum >= Limit) + { + return table.antilog[sum-Limit]; + } + else + { + return table.antilog[sum]; + } +} + +template +inline Galois& Galois::operator *= (const Galois &right) +{ + if (value == 0 || right.value == 0) + { + value = 0; + } + else + { + unsigned int sum = table.log[value] + table.log[right.value]; + if (sum >= Limit) + { + value = table.antilog[sum-Limit]; + } + else + { + value = table.antilog[sum]; + } + } + + return *this; +} + +template +inline Galois Galois::operator / (const Galois &right) const +{ + if (value == 0) return 0; + + assert(right.value != 0); + if (right.value == 0) {return 0;} // Division by 0! + + int sum = table.log[value] - table.log[right.value]; + if (sum < 0) + { + return table.antilog[sum+Limit]; + } + else + { + return table.antilog[sum]; + } +} + +template +inline Galois& Galois::operator /= (const Galois &right) +{ + if (value == 0) return *this; + + assert(right.value != 0); + if (right.value == 0) {return *this;} // Division by 0! 
+ + int sum = table.log[value] - table.log[right.value]; + if (sum < 0) + { + value = table.antilog[sum+Limit]; + } + else + { + value = table.antilog[sum]; + } + + return *this; +} + +template +inline Galois Galois::pow(unsigned int right) const +{ + if (right == 0) return 1; + if (value == 0) return 0; + + unsigned int sum = table.log[value] * right; + + sum = (sum >> Bits) + (sum & Limit); + if (sum >= Limit) + { + return table.antilog[sum-Limit]; + } + else + { + return table.antilog[sum]; + } +} + +template +inline Galois Galois::operator ^ (unsigned int right) const +{ + if (right == 0) return 1; + if (value == 0) return 0; + + unsigned int sum = table.log[value] * right; + + sum = (sum >> Bits) + (sum & Limit); + if (sum >= Limit) + { + return table.antilog[sum-Limit]; + } + else + { + return table.antilog[sum]; + } +} + +template +inline Galois& Galois::operator ^= (unsigned int right) +{ + if (right == 0) {value = 1; return *this;} + if (value == 0) return *this; + + unsigned int sum = table.log[value] * right; + + sum = (sum >> Bits) + (sum & Limit); + if (sum >= Limit) + { + value = table.antilog[sum-Limit]; + } + else + { + value = table.antilog[sum]; + } + + return *this; +} + +template +inline valuetype Galois::Log(void) const +{ + return table.log[value]; +} + +template +inline valuetype Galois::ALog(void) const +{ + return table.antilog[value]; +} + +typedef Galois<16,0x1100B,u16> Galois16; + +#endif // __GALOIS_H__ diff --git a/test/gf16/p2c-inv/reedsolomon.cpp b/test/gf16/p2c-inv/reedsolomon.cpp new file mode 100644 index 00000000..17e41dec --- /dev/null +++ b/test/gf16/p2c-inv/reedsolomon.cpp @@ -0,0 +1,253 @@ +#include "reedsolomon.h" +#include +using namespace std; + + +static u32 gcd(u32 a, u32 b) +{ + if (a && b) + { + while (a && b) + { + if (a>b) + { + a = a%b; + } + else + { + b = b%a; + } + } + + return a+b; + } + else + { + return 0; + } +} + + +inline bool ReedSolomon_GaussElim(unsigned int rows, unsigned int leftcols, Galois16 
*leftmatrix, Galois16 *rightmatrix, unsigned int datamissing) +{ + // Because the matrices being operated on are Vandermonde matrices + // they are guaranteed not to be singular. + + // Additionally, because Galois arithmetic is being used, all calculations + // involve exact values with no loss of precision. It is therefore + // not necessary to carry out any row or column swapping. + + // Solve one row at a time + + // For each row in the matrix + for (unsigned int row=0; row &present, vector outputrows, Galois16*& leftmatrix) +{ + // SetInput + u32 inputcount = (u32)present.size(); + + u32* datapresentindex = new u32[inputcount]; + u32* datamissingindex = new u32[inputcount]; + Galois16::ValueType* database = new Galois16::ValueType[inputcount]; + u32 datapresent = 0, datamissing = 0; + + unsigned int logbase = 0; + + for (unsigned int index=0; index= Galois16::Limit) + { + return false; + } + Galois16::ValueType base = Galois16(logbase++).ALog(); + + database[index] = base; + } + + + + + // Compute + u32 outcount = datamissing; + u32 incount = datapresent + datamissing; + + if (datamissing > outputrows.size()) return false; + if (outcount == 0) + { + return false; + } + + // Allocate the left hand matrix + + leftmatrix = new Galois16[outcount * incount]; + for (unsigned int index=0; index < outcount * incount; index++) + leftmatrix[index] = 0; + + // Allocate the right hand matrix only if we are recovering + + Galois16 *rightmatrix = 0; + if (datamissing > 0) + { + rightmatrix = new Galois16[outcount * outcount]; + for (unsigned int index=0; index < outcount * outcount; index++) + rightmatrix[index] = 0; + } + + // Fill in the two matrices: + + vector::const_iterator outputrow = outputrows.begin(); + + // One row for each present recovery block that will be used for a missing data block + for (unsigned int row=0; rowpresent) + { + outputrow++; + } + u16 exponent = outputrow->exponent; + + // One column for each present data block + for (unsigned int col=0; col 
0) + { + // One column for each missing data block + for (unsigned int col=0; col 0) + { + // Perform Gaussian Elimination and then delete the right matrix (which + // will no longer be required). + bool success = ReedSolomon_GaussElim(outcount, incount, leftmatrix, rightmatrix, datamissing); + delete [] rightmatrix; + return success; + } + + return true; +} + +// Use Gaussian Elimination to solve the matrices + diff --git a/test/gf16/p2c-inv/reedsolomon.h b/test/gf16/p2c-inv/reedsolomon.h new file mode 100644 index 00000000..ba4aaab7 --- /dev/null +++ b/test/gf16/p2c-inv/reedsolomon.h @@ -0,0 +1,45 @@ +// This file is part of par2cmdline (a PAR 2.0 compatible file verification and +// repair tool). See http://parchive.sourceforge.net for details of PAR 2.0. +// +// Copyright (c) 2003 Peter Brian Clements +// Copyright (c) 2019 Michael D. Nahas +// +// par2cmdline is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// par2cmdline is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +#ifndef __REEDSOLOMON_H__ +#define __REEDSOLOMON_H__ + +#include +#include +typedef uint16_t u16; +typedef uint32_t u32; + +#include "galois.h" + +class RSOutputRow +{ +public: + RSOutputRow(void) {}; + RSOutputRow(bool _present, u16 _exponent) : present(_present), exponent(_exponent) {} + +public: + bool present; + u16 exponent; +}; + + +bool ReedSolomon_Compute(const std::vector &present, std::vector outputrows, Galois16*& leftmatrix); + +#endif // __REEDSOLOMON_H__ diff --git a/test/gf16/test-ctrl.cpp b/test/gf16/test-ctrl.cpp new file mode 100644 index 00000000..cd9d9193 --- /dev/null +++ b/test/gf16/test-ctrl.cpp @@ -0,0 +1,378 @@ +#if defined(_MSC_VER) && !defined(NDEBUG) +#define _CRTDBG_MAP_ALLOC +#include +#include +#endif + +#define NOMINMAX + +#include "controller.h" +#include "controller_cpu.h" +#include "controller_ocl.h" +#include "gfmat_coeff.h" +#include +#include +#include +#include +#include +#include +#include "test.h" + +const int MAX_TEST_REGIONS = 20; +const int MAX_TEST_OUTPUTS = 20; +const int REGION_SIZE = 20000; + + + +// globals +uint16_t* src[MAX_TEST_REGIONS]; +uint16_t* dst[MAX_TEST_OUTPUTS]; +uint16_t* ref[MAX_TEST_OUTPUTS]; +uint16_t inputIndicies[MAX_TEST_REGIONS]; +uint16_t outputIndicies[MAX_TEST_OUTPUTS*2]; +#ifdef USE_LIBUV +uv_loop_t *loop; +#endif + + +struct testProps { + size_t sliceSize, lastSliceSize; + unsigned numInputs, numOutputs; + Galois16Methods cpuMethod; + int cpuThreads; + Galois16OCLMethods oclMethod; + bool useCpu, useOcl; + + void print(const char* label) const { + std::cout << label << "(" << numInputs << "x" << numOutputs << ", sliceSize " << sliceSize << ", lastSliceSize " << lastSliceSize; + if(useCpu && !useOcl) + std::cout << ", method " << PAR2ProcCPU::info(cpuMethod).name << ", threads " 
<< cpuThreads; + if(!useCpu && useOcl) + std::cout << ", method " << PAR2ProcOCL::methodToText(oclMethod); + std::cout << ")"; + } +}; + +static void run_test(struct testProps test IF_LIBUV(, std::function cb)) { + auto* par2 = new PAR2Proc(); + PAR2ProcCPU* par2cpu = nullptr; + PAR2ProcOCL* par2ocl = nullptr; + + if(test.useCpu && test.useOcl && test.sliceSize < 3) + test.useOcl = false; // not enable space to split + if(test.useCpu) par2cpu = new PAR2ProcCPU(IF_LIBUV(loop)); + if(test.useOcl) par2ocl = new PAR2ProcOCL(IF_LIBUV(loop)); + // note the above needs to be allocated before this lambda, so that it captures the allocated values as opposed to nullptr + + auto endCb = [=]() { + std::shared_ptr doneCount(new unsigned(0)); + for(unsigned outputNum=0; outputNum>= 8; +#else + src[region][regionSize/2] &= 0xff; +#endif + regionSize++; + } + regionSize /= 2; + int printFrom = loc; + if(loc > regionSize) printFrom = 0; + size_t printTo = std::min((int)regionSize, printFrom+32); + std::cout << "Input " << region << ":" << std::endl; + print_mem_region(src[region], printFrom, printTo); + + + uint16_t coeff = gfmat_coeff(inputIndicies[region], outputIndicies[outputNum]); + std::cout << "Input " << region << " (*" << coeff << "):" << std::endl; + // since we're exiting, just edit in-place + for(unsigned iidx=printFrom; iidxdeinit(deinitCb); +#else + par2->deinit(); + deinitCb(); +#endif + } + }; +#ifdef USE_LIBUV + par2->getOutput(outputNum, buffer, outputCb); +#else + outputCb(par2->getOutput(outputNum, buffer).get()); +#endif + } + }; + + std::shared_ptr input(new unsigned(0)); + auto addInputCb = [=](unsigned) { + if(*input >= test.numInputs) return; + // TODO: make last chunk smaller + while(1) { + IF_NOT_LIBUV(par2->waitForAdd()); + auto added = par2->addInput(src[*input], *input == test.numInputs-1 ? 
test.lastSliceSize : test.sliceSize, inputIndicies[*input], false IF_LIBUV(, nullptr)); +#ifdef USE_LIBUV + if(!added) break; +#else + (void)added; +#endif + if(++(*input) == test.numInputs) { +#ifdef USE_LIBUV + par2->endInput(endCb); +#else + par2->endInput().get(); + endCb(); +#endif + break; + } + } + }; + + std::vector par2backends; + if(test.useCpu && test.useOcl) { + // split between the two evenly + // TODO: test different splits + size_t half = test.sliceSize >> 1; + half += half&1; + par2backends.push_back({par2ocl, 0, half}); + par2backends.push_back({par2cpu, half, test.sliceSize-half}); + } else if(test.useCpu) { + par2backends.push_back({par2cpu, 0, test.sliceSize}); + } else { + par2backends.push_back({par2ocl, 0, test.sliceSize}); + } + + par2->init(test.sliceSize, par2backends IF_LIBUV(, addInputCb)); + if(par2cpu) par2cpu->init(test.cpuMethod); + if(test.cpuThreads) par2cpu->setNumThreads(test.cpuThreads); + if(par2ocl) par2ocl->init(test.oclMethod); + if(!par2->setRecoverySlices(test.numOutputs, outputIndicies)) { + std::cout << "Init failed" << std::endl; + exit(1); + } + + // generate reference + for(unsigned output=0; output outputSizeTests{1, 15, 16}; // must be less than MAX_TEST_OUTPUTS + gf16_generate_log_tables(); + gfmat_init(); + + if(useOcl) { + if(PAR2ProcOCL::load_runtime()) { + std::cerr << "OpenCL load failed" << std::endl; + return 1; + } + } + + // generate source regions + srand(0x01020304); + for(unsigned i=0; i tests; + const std::vector sliceSizes{2, REGION_SIZE-2, REGION_SIZE}; + for(size_t sliceSize : sliceSizes) { + + std::vector lastSliceSizes{1, 2}; + if(sliceSize > 2) { + lastSliceSizes.push_back(sliceSize-1); + lastSliceSizes.push_back(sliceSize); + } + for(const auto& lastSliceSize : lastSliceSizes) { + if(lastSliceSize < 1) continue; + + for(unsigned numOutputs : outputSizeTests) { + const std::vector inputSizes{1, 15, 16}; // must be less than MAX_TEST_REGIONS + for(const auto& numRegions : inputSizes) { + 
if(numRegions == 1 && lastSliceSize != sliceSize) continue; // pointless test + if(lastSliceSize != sliceSize && lastSliceSize != 1 && (numRegions > 15 || numOutputs > 2)) + continue; // don't bother testing every lastSliceSize against all input/output region combinations (only test partial and full) + + + if(useCpu && useOcl) { + tests.push({ + sliceSize, lastSliceSize, numRegions, numOutputs, GF16_AUTO, 0, GF16OCL_AUTO, useCpu, useOcl + }); + } else if(useCpu) { + const std::vector methods = skipMethods ? std::vector{GF16_AUTO} : PAR2ProcCPU::availableMethods(); + const std::vector threadTests{1, 2, 23}; + for(auto threads : threadTests) { + for(const auto& method : methods) { + tests.push({ + sliceSize, lastSliceSize, numRegions, numOutputs, method, threads, GF16OCL_AUTO, useCpu, useOcl + }); + } + } + } else { + const std::vector methods = skipMethods ? std::vector{GF16OCL_AUTO} : PAR2ProcOCL::availableMethods(); + for(const auto& method : methods) { + tests.push({ + sliceSize, lastSliceSize, numRegions, numOutputs, GF16_AUTO, 0, method, useCpu, useOcl + }); + } + } + + } + } + } + } + + std::function testRunner; + testRunner = [=, &tests, &testRunner]() -> bool { + if(tests.empty()) return false; +#ifndef USE_LIBUV + (void)testRunner; +#endif + + auto test = tests.front(); + tests.pop(); + if(verbose) { + test.print("Test "); + std::cout << std::endl; + } + run_test(test IF_LIBUV(, testRunner)); + return true; + }; + +#ifdef USE_LIBUV + testRunner(); + uv_run(loop, UV_RUN_DEFAULT); + uv_loop_close(loop); + delete loop; +#else + while(testRunner()); +#endif + + + for(int i=0; i +#include +#include + +static bool p2c_invert(std::vector inputValid, std::vector recovery, Galois16*& leftmatrix) { + // get reference from par2cmdline + std::vector outputrows; + for(uint16_t r : recovery) + outputrows.push_back(RSOutputRow(true, r)); + return ReedSolomon_Compute(inputValid, outputrows, leftmatrix); +} + +static void compare_invert(const Galois16RecMatrix& mat, 
Galois16* leftmatrix, std::vector inputValid, std::vector recovery) { + unsigned validCount = std::count(inputValid.begin(), inputValid.end(), true); + unsigned invalidCount = inputValid.size()-validCount; + + if(recovery.size() != invalidCount) abort(); + + // compare + for(unsigned outRow = 0; outRow < invalidCount; outRow++) + for(unsigned inCol = 0; inCol < inputValid.size(); inCol++) { + if(leftmatrix[outRow * inputValid.size() + inCol] != mat.GetFactor(inCol, outRow)) + abort(); + } +} + +static void do_test(std::vector inputValid, std::vector recovery, Galois16Methods method) { + std::sort(recovery.begin(), recovery.end()); + + // get reference from par2cmdline + Galois16* leftmatrix = nullptr; + bool canInvert = p2c_invert(inputValid, recovery, leftmatrix); + + // do inversion + unsigned validCount = std::count(inputValid.begin(), inputValid.end(), true); + Galois16RecMatrix mat; + mat.regionMethod = (int)method; + if(mat.Compute(inputValid, validCount, recovery) != canInvert) abort(); + if(canInvert) { + compare_invert(mat, leftmatrix, inputValid, recovery); + } + if(leftmatrix) + delete[] leftmatrix; +} + +static void show_help() { + std::cout << "test-inv [-v] [-f]" << std::endl; + exit(0); +} + +int main(int argc, char** argv) { + bool verbose = false; + bool fast = false; // faster test: only test default method + fewer iterations + + for(int i=1; i methods = fast ? 
std::vector{GF16_AUTO} : Galois16Mul::availableMethods(true); + + for(auto method : methods) { + // one block only + do_test(std::vector{false}, std::vector{0}, method); + do_test(std::vector{false}, std::vector{1}, method); + do_test(std::vector{false}, std::vector{65534}, method); + // first block is bad + do_test(std::vector{false, true}, std::vector{0}, method); + // 3/4 bad blocks, just enough recovery + do_test(std::vector{false, false, true, false}, std::vector{0,1,2}, method); + // all bad blocks, insufficient recovery + do_test(std::vector{false, false, false, false}, std::vector{0,1,5}, method); + // all bad blocks, sufficient recovery + do_test(std::vector{false, false, false, false}, std::vector{1,5,8,100}, method); + // PAR2 flaw (can't invert matrix) [https://sourceforge.net/p/parchive/mailman/parchive-devel/thread/202374635.20040218104317%40pbclements.co.uk/] + std::vector flawedInput(6555, true); + flawedInput[0] = false; + flawedInput[6554] = false; + do_test(flawedInput, std::vector{0,5}, method); + // invertible + do_test(flawedInput, std::vector{0,6}, method); + + // PAR2 flaw, but invertible by discarding a bad recovery + { + Galois16RecMatrix mat; + std::vector recovery{0,5,6}; + mat.regionMethod = (int)method; + + unsigned validCount = std::count(flawedInput.begin(), flawedInput.end(), true); + if(!mat.Compute(flawedInput, validCount, recovery)) abort(); + if(recovery.size() != 2) abort(); + if(!((recovery.at(0) == 0 || recovery.at(0) == 5) && recovery.at(1) == 6)) abort(); + + Galois16* leftmatrix = nullptr; + bool canInvert = p2c_invert(flawedInput, recovery, leftmatrix); + if(!canInvert) abort(); + + compare_invert(mat, leftmatrix, flawedInput, recovery); + delete[] leftmatrix; + } + + // a few more tests to check multi-region multiplies work + do_test(std::vector{false, false, false, false, false}, std::vector{0,3,5,17,65534}, method); + do_test(std::vector{false, false, false, false, false, false}, std::vector{0,1,2,3,32768,65534}, 
method); + do_test(std::vector{false, false, false, false, false, false, false}, std::vector{0,1,2,3,4,5,6}, method); + do_test(std::vector{true, false, false, false, false, false, false, false, false}, std::vector{0,1,2,3,5,6,7,8}, method); + } + + + + std::cout << "Random tests..." << std::endl; + + std::mt19937 rnd; + rnd.seed(0x01020304); + std::vector recIdx(65535); + for(int i=0; i<65535; i++) recIdx[i] = i; + + const std::vector inputSizeTests{2, 100, 1234, 32768}; + for(uint16_t iSize : inputSizeTests) { + std::vector inputValid(iSize); + std::vector validProb{0.1f, 0.5f, 0.9f}; + if(iSize == 32768) { + validProb.clear(); + validProb.push_back(0.01f); // otherwise would be too slow + } + + for(int round=0; round<(iSize>100?(fast?1:2):10); round++) { + for(float pValid : validProb) { + uint16_t invalidCount = 0; + // generate distribution + for(int v=0; v pValid; + invalidCount += inputValid[v] ? 0 : 1; + } + if(invalidCount < 1) continue; + + + // num outputs = num failures + std::shuffle(recIdx.begin(), recIdx.end(), rnd); + std::vector recovery(recIdx.begin(), recIdx.begin() + invalidCount); + std::sort(recovery.begin(), recovery.end()); + + // get reference from par2cmdline + Galois16* leftmatrix = nullptr; + bool canInvert = p2c_invert(inputValid, recovery, leftmatrix); + + for(auto method : methods) { + if(verbose) std::cout << " " << iSize << "x" << invalidCount << " [" << (pValid*100) << "% validity] (" << Galois16Mul::methodToText(method) << ")" << std::endl; + + + recovery = std::vector(recIdx.begin(), recIdx.begin() + invalidCount); + std::sort(recovery.begin(), recovery.end()); + + // do inversion + Galois16RecMatrix mat; + mat.regionMethod = (int)method; + if(mat.Compute(inputValid, iSize-invalidCount, recovery) != canInvert) abort(); + if(canInvert) { + compare_invert(mat, leftmatrix, inputValid, recovery); + } + } + + if(leftmatrix) + delete[] leftmatrix; + } + } + } + + gfmat_free(); + std::cout << "Tests passed" << std::endl; + + return 0; 
+} \ No newline at end of file diff --git a/test/gf16/test-pmul.cpp b/test/gf16/test-pmul.cpp new file mode 100644 index 00000000..159ebbfa --- /dev/null +++ b/test/gf16/test-pmul.cpp @@ -0,0 +1,106 @@ + +#include "gf16pmul.h" +#include "test.h" + +const int MAX_TEST_REGIONS = 20; +// earlier GCC doesn't like `const int` used for alignment statements, so use a define instead +#define REGION_ALIGNMENT 4096 +const int REGION_SIZE = MAX_TEST_REGIONS * 1024; // largest stride = 1024 bytes from Xor512 + +struct TestFunc { + Galois16PointMulMethods id; + Gf16PMulFunc fn; + unsigned blocklen; +}; +static void show_help() { + std::cout << "test-pmul [-v]" << std::endl; + exit(0); +} + +int main(int argc, char** argv) { + bool verbose = false; + int seeds[] = {0x01020304, 0x50607080 }; + + for(int i=1; i funcs; + if(gf16pmul_available_sse) + funcs.push_back({ + GF16PMUL_PCLMUL, &gf16pmul_sse, 16 + }); + if(gf16pmul_available_avx2) + funcs.push_back({ + GF16PMUL_AVX2, &gf16pmul_avx2, 32 + }); + if(gf16pmul_available_vpclmul) + funcs.push_back({ + GF16PMUL_VPCLMUL, &gf16pmul_vpclmul, 32 + }); + if(gf16pmul_available_vpclgfni) + funcs.push_back({ + GF16PMUL_VPCLMUL_GFNI, &gf16pmul_vpclgfni, 64 + }); + if(gf16pmul_available_neon) + funcs.push_back({ + GF16PMUL_NEON, &gf16pmul_neon, 32 + }); + if(gf16pmul_available_sve2) + funcs.push_back({ + GF16PMUL_SVE2, &gf16pmul_sve2, gf16pmul_sve2_width()*2 + }); + + for(int seed : seeds) { + // generate source regions + ref + srand(seed); + for(size_t i=0; i methods = Galois16Mul::availableMethods(true); + std::vector gf; + std::vector gfScratch; + for(auto method : methods) { + gf.emplace_back(method); + } + gfScratch.reserve(methods.size()); + for(const auto& g : gf) { + gfScratch.push_back(g.mutScratch_alloc()); + } + + bool testAllFuncs = true; + bool testCksum = false, testPrep = false, testMul = false, testAdd = false, testPow = false, testWord = false; + for(int i=1; i outputSizeTests{1, 2, 15, 16, 17}; // must be less than 
MAX_TEST_OUTPUTS + + // allocate src/tmp regions + uint16_t* src; + uint16_t* tmp, * tmp2; + ALIGN_ALLOC(src, REGION_SIZE*MAX_TEST_REGIONS, REGION_ALIGNMENT); + ALIGN_ALLOC(tmp, REGION_SIZE*MAX_TEST_REGIONS, REGION_ALIGNMENT); + ALIGN_ALLOC(tmp2, REGION_SIZE*MAX_TEST_REGIONS, REGION_ALIGNMENT); + uint16_t* dst; + uint16_t* ref; + const unsigned allocOutputs = MAX_TEST_OUTPUTS > MAX_PACK_REGIONS ? MAX_TEST_OUTPUTS : MAX_PACK_REGIONS; + ALIGN_ALLOC(dst, (REGION_SIZE+MAX_MISALIGN*2)*allocOutputs, REGION_ALIGNMENT); + ALIGN_ALLOC(ref, REGION_SIZE*allocOutputs, REGION_ALIGNMENT); + if(!src || !tmp || !dst || !ref) { + std::cout << "Failed to allocate memory" << std::endl; + return 2; + } + + uint16_t* srcM[MAX_TEST_REGIONS]; + uint16_t* tmpM[MAX_TEST_REGIONS]; + for(size_t i=0; i regionSizes{g.info().stride, g.info().stride-1, REGION_SIZE, REGION_SIZE-1, REGION_SIZE+1}; + for(unsigned regionSize : regionSizes) { + if(verbose) std::cout << " " << g.info().name << ": regionSize=" << regionSize << std::endl; + memset(tmp, seed&0xff, REGION_SIZE*2); + memset(dst, seed&0xff, REGION_SIZE*2); + g.copy_cksum(tmp, src, regionSize, regionSize); + unsigned totalSize = regionSize + g.info().cksumSize; + if(memcmp(dst, (char*)tmp+totalSize, REGION_SIZE*2 - totalSize)) { + std::cout << "Cksum copy checksum wrote too much data: " << g.info().name << " (regionSize=" << regionSize << ")" << std::endl; + return 1; + } + if(!g.copy_cksum_check(dst, tmp, regionSize)) { + std::cout << "Cksum copy checksum failure: " << g.info().name << " (regionSize=" << regionSize << ")" << std::endl; + std::cout << "Checksum:" << std::endl; + print_mem_region((uint16_t*)((uintptr_t)tmp + regionSize), 0, g.info().cksumSize/2); + if(regionSize <= g.info().stride*2) { + std::cout << "Data:" << std::endl; + print_mem_region(src, 0, (regionSize+1)/2); + } + return 1; + } + if(memcmp(dst, src, regionSize)) { + std::cout << "Cksum copy data failure: " << g.info().name << " (regionSize=" << regionSize << ")" << 
std::endl; + display_mem_diff(src, dst, regionSize/2); + return 1; + } + // check that it detects failure + tmp[0] ^= 0x1111; + if(g.copy_cksum_check(dst, tmp, regionSize)) { + std::cout << "Cksum copy failed to detect checksum error: " << g.info().name << " (regionSize=" << regionSize << ")" << std::endl; + std::cout << "Checksum:" << std::endl; + print_mem_region((uint16_t*)((uintptr_t)tmp + regionSize), 0, g.info().cksumSize/2); + return 1; + } + + + // test with add + const std::vector lastRegionSizes{1, 2, REGION_SIZE/2-1, REGION_SIZE/2, REGION_SIZE/2+1, regionSize-1, regionSize}; + for(auto lastRegionSize : lastRegionSizes) { + if(lastRegionSize > regionSize) continue; + g.copy_cksum(tmp2, srcM[0], regionSize, regionSize); + g.copy_cksum(tmp, srcM[1], lastRegionSize, regionSize); + unsigned addSize = regionSize + g.info().stride; + while(addSize % g.info().stride) + addSize++; + g.mul_add(tmp2, tmp, addSize, 1, gfScratch[gi]); + if(!g.copy_cksum_check(dst, tmp2, regionSize)) { + std::cout << "Cksum copy checksum (with add) failure: " << g.info().name << " (regionSize=" << regionSize << ", lastRegionSize=" << lastRegionSize << ")" << std::endl; + return 1; + } + // the zeroed section of the second region should be the same + if(memcmp((char*)dst + lastRegionSize, (char*)src + lastRegionSize, regionSize - lastRegionSize)) { + std::cout << "Cksum copy data (with add) failure: " << g.info().name << " (regionSize=" << regionSize << ", lastRegionSize=" << lastRegionSize << ")" << std::endl; + display_mem_diff(src + lastRegionSize/2, dst + lastRegionSize/2, (regionSize-lastRegionSize+1)/2); + return 1; + } + } + } + } + } + + // test prepare/finish + if(testPrep) { + std::cout << "Testing prepare/finish..." 
<< std::endl; + for(const auto& g : gf) { + if(!g.needPrepare()) continue; + //const unsigned regionSize = rounddown_to(REGION_SIZE, g.info().stride); + const unsigned regionSize = MAX_TEST_REGIONS * g.info().stride; + if(verbose) std::cout << " " << g.info().name << std::endl; + memset(dst, seed&0xff, REGION_SIZE); // scramble, to ensure we're actually doing something + g.prepare(dst, src, regionSize); + g.finish(dst, regionSize); + if(memcmp(dst, src, regionSize)) { + std::cout << "Prepare/finish failure: " << g.info().name << std::endl; + display_mem_diff(src, dst, regionSize/2); + return 1; + } + // test prepare not aligned to stride + for(int offset = -(int)g.info().stride+1; offset < 0; offset++) { + memset(dst, seed&0xff, REGION_SIZE); // fill with non-zero to test zero-fill + g.prepare(dst, src, regionSize + offset); + g.finish(dst, regionSize); + if(memcmp(dst, src, regionSize + offset)) { + std::cout << "Prepare/finish misaligned (" << offset << ") failure: " << g.info().name << std::endl; + display_mem_diff(src, dst, regionSize/2); + return 1; + } + if(memcmp((uint8_t*)dst + regionSize + offset, zeroes, -offset)) { + std::cout << "Prepare/finish misaligned zero-fill (" << offset << ") failure: " << g.info().name << std::endl; + print_mem_region(dst, (regionSize-g.info().stride)>>1, regionSize>>1); + return 1; + } + } + // test in-situ prepare + memcpy(dst, src, regionSize); + g.prepare(dst, dst, regionSize); + g.finish(dst, regionSize); + if(memcmp(dst, src, regionSize)) { + std::cout << "Prepare/finish in-situ failure: " << g.info().name << std::endl; + display_mem_diff(src, dst, regionSize/2); + return 1; + } + } + + // test prepare packed + accumulate + std::cout << "Testing prepare packed..." 
<< std::endl; + for(unsigned gi = 0; gi < gf.size(); gi++) { + const auto& g = gf[gi]; + + const unsigned stride = g.info().stride; + //const unsigned regionSize = rounddown_to(REGION_SIZE, stride); + const unsigned regionSize = MAX_TEST_REGIONS * g.info().stride; + const std::vector srcLenOffsets{0, 1, 2, 3, stride, stride+1, regionSize/2, regionSize/2+1, regionSize/2+stride, regionSize-stride, regionSize-1}; + for(const auto& srcLenOffset : srcLenOffsets) { + size_t srcLen = regionSize - srcLenOffset; + for(const auto& srcLenLastOffset : srcLenOffsets) { + size_t srcLenLast = regionSize - srcLenLastOffset; + if(srcLenLast > srcLen) continue; + + const std::vector chunkLenOffsets{-(int)stride, 0, (int)stride, (int)stride*2, (int)rounddown_to(regionSize/2, (int)stride), (int)rounddown_to(regionSize/2, (int)stride)+(int)stride, (int)roundup_to(regionSize/3, (int)stride), (int)(regionSize-stride)}; + for(const auto& chunkLenOffset : chunkLenOffsets) { + size_t chunkLen = regionSize - chunkLenOffset; + for(unsigned inputPackSize = 1; inputPackSize <= MAX_PACK_REGIONS; inputPackSize++) { + if(inputPackSize == 1 && srcLenLast != srcLen) continue; // pointless test + + if(verbose) std::cout << " " << g.info().name << ": srcLen=" << srcLen << ", srcLenLast=" << srcLenLast << ", chunkLen=" << chunkLen << ", inputPackSize=" << inputPackSize << std::endl; + + // generate reference + memset(ref, 0, REGION_SIZE); + for(unsigned inputNum = 0; inputNum < inputPackSize; inputNum++) { + size_t len = (inputNum == inputPackSize-1) ? srcLenLast : srcLen; + for(size_t i=0; i= 0) { + memset(tmp, seed&0xff, REGION_SIZE*MAX_PACK_REGIONS); // scramble, to ensure we're actually doing something + memset(dst, 0, REGION_SIZE); + + // pack input + for(unsigned inputNum = 0; inputNum < inputPackSize; inputNum++) { + size_t len = (inputNum == inputPackSize-1) ? 
srcLenLast : srcLen; + g.prepare_packed(tmp, srcM[inputNum], len, regionSize, inputPackSize, inputNum, chunkLen); + } + // compute output + for(size_t sliceOffset=0; sliceOffset < regionSize; sliceOffset += chunkLen) { + size_t len = chunkLen; + if(regionSize - sliceOffset < len) + len = roundup_to(regionSize - sliceOffset, stride); + g.add_multi_packed(inputPackSize, inputPackSize, (uint8_t*)dst + sliceOffset, (uint8_t*)tmp + sliceOffset*inputPackSize, len); + } + g.finish(dst, regionSize); + + // test result + if(memcmp(dst, ref, regionSize)) { + std::cout << "Prepare packed failure: " << g.info().name << ": srcLen=" << srcLen << ", srcLenLast=" << srcLenLast << ", chunkLen=" << chunkLen << ", inputPackSize=" << inputPackSize << std::endl; + display_mem_diff(ref, dst, regionSize/2); + return 1; + } + } + + + // test again using checksumming variant + const size_t regionSizeWithCksum = regionSize+stride; + memset(tmp, seed&0xff, regionSizeWithCksum*MAX_PACK_REGIONS); + memset(dst, (seed>>8)&0xff, REGION_SIZE); + + for(unsigned inputNum = 0; inputNum < inputPackSize; inputNum++) { + size_t len = (inputNum == inputPackSize-1) ? srcLenLast : srcLen; + g.prepare_packed_cksum(tmp, srcM[inputNum], len, regionSize, inputPackSize, inputNum, chunkLen); + } + // check that the partial prepare matches against full prepare + const std::vector lastPartLens{0, (int)stride, (int)stride*2, -(int)stride}; + for(const int lastPartLen : lastPartLens) if(srcLenLast >= (unsigned)abs(lastPartLen)) { + memset(tmp2, seed&0xff, regionSizeWithCksum*MAX_PACK_REGIONS); + for(unsigned inputNum = 0; inputNum < inputPackSize; inputNum++) { + size_t len = (inputNum == inputPackSize-1) ? 
srcLenLast : srcLen; + size_t first, last; + if(lastPartLen < 0) { + first = -lastPartLen; + } else { + first = len-lastPartLen; + if(first % stride && lastPartLen) // align to stride if this is the first part + first += stride - (first % stride); + } + if(first > len) first = len; + last = len-first; + g.prepare_partial_packsum(tmp2, srcM[inputNum], len, regionSize, inputPackSize, inputNum, chunkLen, 0, first); + if(last) + g.prepare_partial_packsum(tmp2, (char*)(srcM[inputNum]) + first, len, regionSize, inputPackSize, inputNum, chunkLen, len-last, last); + } + if(memcmp(tmp2, tmp, regionSizeWithCksum*MAX_PACK_REGIONS)) { + std::cout << "Prepare packed-cksum differs from partial version: " << g.info().name << ": srcLen=" << srcLen << ", srcLenLast=" << srcLenLast << ", chunkLen=" << chunkLen << ", inputPackSize=" << inputPackSize << ", lastPartLen=" << lastPartLen << std::endl; + display_mem_diff(tmp, tmp2, (regionSizeWithCksum*MAX_PACK_REGIONS)/2); + return 1; + } + } + memset(tmp2, 0, regionSizeWithCksum); + + for(size_t sliceOffset=0; sliceOffset < regionSizeWithCksum; sliceOffset += chunkLen) { + size_t len = chunkLen; + if(regionSizeWithCksum - sliceOffset < len) + len = roundup_to(regionSizeWithCksum - sliceOffset, stride); + g.add_multi_packed(inputPackSize, inputPackSize, (uint8_t*)tmp2 + sliceOffset, (uint8_t*)tmp + sliceOffset*inputPackSize, len); + } + int checksumResult = g.finish_packed_cksum(dst, tmp2, regionSize, 1, 0, regionSizeWithCksum); + if(memcmp(dst, ref, regionSize)) { + std::cout << "Prepare packed-cksum failure: " << g.info().name << ": srcLen=" << srcLen << ", srcLenLast=" << srcLenLast << ", chunkLen=" << chunkLen << ", inputPackSize=" << inputPackSize << std::endl; + display_mem_diff(ref, dst, regionSize/2); + return 1; + } + if(!checksumResult) { + std::cout << "Prepare/finish packed checksum failure: " << g.info().name << ": srcLen=" << srcLen << ", srcLenLast=" << srcLenLast << ", chunkLen=" << chunkLen << ", inputPackSize=" << 
inputPackSize << std::endl; + return 1; + } + } + } + } + } + } + + std::cout << "Testing finish packed..." << std::endl; + { + uint16_t coeffs[MAX_PACK_REGIONS]; // used for finish-cksum + for(auto& coeff : coeffs) + coeff = rand() & 0xffff; + + for(unsigned gi = 0; gi < gf.size(); gi++) { + const auto& g = gf[gi]; + + const unsigned stride = g.info().stride; + //const unsigned alignedRegionSize = rounddown_to(REGION_SIZE, stride); + const unsigned alignedRegionSize = MAX_TEST_REGIONS * g.info().stride; + + const std::vector srcLenOffsets{0, 2, stride-2}; + for(const auto& srcLenOffset : srcLenOffsets) { + size_t srcLen = alignedRegionSize - srcLenOffset; + + const std::vector chunkLenOffsets{-(int)stride, 0, (int)stride, (int)stride*2, (int)rounddown_to(alignedRegionSize/2, (int)stride), (int)rounddown_to(alignedRegionSize/2, (int)stride)+(int)stride, (int)roundup_to(alignedRegionSize/3, (int)stride), (int)(alignedRegionSize-stride)}; + for(const auto& chunkLenOffset : chunkLenOffsets) { + size_t chunkLen = alignedRegionSize - chunkLenOffset; + for(unsigned numOutputs = 1; numOutputs <= MAX_PACK_REGIONS; numOutputs++) { + if(verbose) std::cout << " " << g.info().name << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << std::endl; + + if(chunkLenOffset >= 0) { + memset(dst, seed&0xff, REGION_SIZE*MAX_PACK_REGIONS); // scramble, to ensure we're actually doing something + + // pack input + // TODO: if there's output interleaving, this won't work :( + for(unsigned outputNum = 0; outputNum < numOutputs; outputNum++) { + unsigned chunk = 0; + for(size_t pos = 0; pos < srcLen; pos += chunkLen) { + size_t len = srcLen - pos; + if(len > chunkLen) len = chunkLen; + g.prepare(tmp + (chunk*numOutputs*chunkLen + outputNum*roundup_to(len, stride))/2, srcM[outputNum] + pos/2, len); + ++chunk; + } + } + /* + for(unsigned outputNum = 0; outputNum < numOutputs; outputNum++) { + g.prepare_packed(tmp, srcM[outputNum], srcLen, alignedRegionSize, 
numOutputs, outputNum, chunkLen); + } + // TODO: need to fix the below + for(unsigned outputNum = 0; outputNum < numOutputs; outputNum++) { + g.mul_add_multi_packed(numOutputs, numOutputs, tmp2, tmp, chunkLen, <0s>, gfScratch[gi]); + } + */ + // unpack output + for(unsigned misalign = 0; misalign < MAX_MISALIGN; misalign++) { + for(unsigned outputNum = 0; outputNum < numOutputs; outputNum++) { + // because dstM is region aligned and aliased, we need to hack around the fact that misalignment overflows the regions + uint8_t* outputDst = (uint8_t*)dstM[outputNum] + misalign + misalign * outputNum*2; + uint16_t* odPre = (uint16_t*)(outputDst - misalign); + uint16_t* odPost = (uint16_t*)(outputDst + srcLen); + memcpy(odPre, guard_magic, misalign); + memcpy(odPost, guard_magic, misalign); + g.finish_packed(outputDst, tmp, srcLen, numOutputs, outputNum, chunkLen); + + // test result + if(memcmp(outputDst, srcM[outputNum], srcLen)) { + std::cout << "Packed finish failure: " << g.info().name << ", output " << outputNum << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << std::endl; + display_mem_diff(srcM[outputNum], (uint16_t*)outputDst, (alignedRegionSize*numOutputs)/2); + return 1; + } + if(memcmp(odPre, guard_magic, misalign)) { + std::cout << "Packed finish pre-guard bytes corrupted: " << g.info().name << ", output " << outputNum << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << ", misalign=" << misalign << std::endl; + print_mem_region(odPre, 0, (misalign+1)/2); + return 1; + } + if(memcmp(odPost, guard_magic, misalign)) { + std::cout << "Packed finish post-guard bytes corrupted: " << g.info().name << ", output " << outputNum << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << ", misalign=" << misalign << std::endl; + print_mem_region(odPost, 0, (misalign+1)/2); + return 1; + } + } + } + } + + // test finish with checksum + const size_t 
regionSizeWithCksum = alignedRegionSize+stride; + memset(tmp, seed&0xff, regionSizeWithCksum*numOutputs); + memset(dst, seed&0xff, REGION_SIZE*numOutputs); + + g.prepare_packed_cksum(tmp2, src, srcLen, alignedRegionSize, 1, 0, chunkLen); + for(unsigned outputNum = 0; outputNum < numOutputs; outputNum++) { + for(size_t sliceOffset=0; sliceOffset < regionSizeWithCksum; sliceOffset += chunkLen) { + size_t len = chunkLen; + if(regionSizeWithCksum - sliceOffset < len) + len = roundup_to(regionSizeWithCksum - sliceOffset, stride); + //g.mul((uint8_t*)tmp + outputNum*len + sliceOffset*numOutputs, (uint8_t*)tmp2 + sliceOffset, len, coeffs[outputNum], gfScratch[gi]); + + uint8_t* tmpPtr = (uint8_t*)tmp + outputNum*len + sliceOffset*numOutputs; + memset(tmpPtr, 0, len); + g.mul_add_multi_packed(1, 1, tmpPtr, (uint8_t*)tmp2 + sliceOffset, len, coeffs + outputNum, gfScratch[gi]); + } + } + for(unsigned misalign = 0; misalign < MAX_MISALIGN; misalign++) { + for(unsigned outputNum = 0; outputNum < numOutputs; outputNum++) { + uint8_t* outputDst = (uint8_t*)dstM[outputNum] + misalign + misalign * outputNum*2; + uint16_t* odPre = (uint16_t*)(outputDst - misalign); + uint16_t* odPost = (uint16_t*)(outputDst + srcLen); + memcpy(odPre, guard_magic, misalign); + memcpy(odPost, guard_magic, misalign); + + // compute reference + for(size_t i=0; i firstLens{srcLen, 0, stride, stride*2}; + if(srcLen % stride) { + size_t srcLenAligned = srcLen - (srcLen % stride); + firstLens.push_back(srcLenAligned); + firstLens.push_back(srcLenAligned - stride); + } else + firstLens.push_back(srcLen - stride); + for(size_t firstLen : firstLens) { + int checksumResult; + if(firstLen == srcLen) + checksumResult = g.finish_packed_cksum(outputDst, tmp, srcLen, numOutputs, outputNum, chunkLen); + else { + memcpy(tmp2, tmp, regionSizeWithCksum*numOutputs); + if(firstLen) + g.finish_partial_packsum(outputDst, tmp2, srcLen, numOutputs, outputNum, chunkLen, 0, firstLen); + checksumResult = 
g.finish_partial_packsum(outputDst+firstLen, tmp2, srcLen, numOutputs, outputNum, chunkLen, firstLen, srcLen-firstLen); + } + if(memcmp(outputDst, ref, srcLen)) { + std::cout << "Packed finish-cksum failure: " << g.info().name << ", output " << outputNum << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << ", firstLen=" << firstLen << std::endl; + display_mem_diff(ref, (uint16_t*)outputDst, srcLen/2); + return 1; + } + if(memcmp(odPre, guard_magic, misalign)) { + std::cout << "Packed finish pre-guard bytes corrupted: " << g.info().name << ", output " << outputNum << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << ", misalign=" << misalign << ", firstLen=" << firstLen << std::endl; + print_mem_region(odPre, 0, (misalign+1)/2); + return 1; + } + if(memcmp(odPost, guard_magic, misalign)) { + std::cout << "Packed finish post-guard bytes corrupted: " << g.info().name << ", output " << outputNum << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << ", misalign=" << misalign << ", firstLen=" << firstLen << std::endl; + print_mem_region(odPost, 0, (misalign+1)/2); + return 1; + } + if(!checksumResult) { + std::cout << "Prepare/finish packed checksum failure: " << g.info().name << ", output " << outputNum << ": srcLen=" << srcLen << ", chunkLen=" << chunkLen << ", numOutputs=" << numOutputs << ", misalign=" << misalign << ", firstLen=" << firstLen << std::endl; + return 1; + } + } + } + } + } + } + } + } + } + } + + // test mul/mul_add + if(testMul) { + std::cout << "Testing mul/muladd..." << std::endl; + for(int test=0; test<(fastMul ? 
256 : 65536); test++) { + int coeff = test; + if(fastMul && test > 1) + coeff = rand() & 0xffff; + // compute mul reference + for(size_t i=0; i= MAX_TEST_REGIONS) break; + + // packed muladd_multi + g.prepare(dst, src2, regionSize); + for(unsigned region = 0; region < maxRegions; region++) + g.prepare_packed(tmp, srcM[region], regionSize, regionSize, maxRegions+blankRegions, region, regionSize); + g.mul_add_multi_packed(maxRegions+blankRegions, maxRegions, dst, tmp, regionSize, coeffs, gfScratch[gi]); + g.finish(dst, regionSize); + if(memcmp(dst, ref, regionSize)) { + std::cout << "Mul_add_multi_packed (" << maxRegions << "+" << blankRegions << ") failure: " << g.info().name << std::endl; + display_mem_diff(ref, dst, regionSize/2); + return 1; + } + + // packed muladd_multi with prefetch + // can't really test prefetch functionality, so just test it like above + g.prepare(dst, src2, regionSize); + for(unsigned region = 0; region < maxRegions; region++) + g.prepare_packed(tmp, srcM[region], regionSize, regionSize, maxRegions+blankRegions, region, regionSize); + g.mul_add_multi_packpf(maxRegions+blankRegions, maxRegions, dst, tmp, regionSize, coeffs, gfScratch[gi], tmp, tmp2 /*prefetch - any memory will do*/); + g.finish(dst, regionSize); + if(memcmp(dst, ref, regionSize)) { + std::cout << "Mul_add_multi_packpf (" << maxRegions << "+" << blankRegions << ") failure: " << g.info().name << std::endl; + display_mem_diff(ref, dst, regionSize/2); + return 1; + } + } + } + } + } + } + + + // test multi_add + if(testAdd) { + std::cout << "Testing multi add..." 
<< std::endl; + for(unsigned maxRegions=1; maxRegions= MAX_TEST_REGIONS) break; + + g.prepare(dst, src2, regionSize); + for(unsigned region = 0; region < maxRegions; region++) + g.prepare_packed(tmp, srcM[region], regionSize, regionSize, maxRegions+blankRegions, region, regionSize); + g.add_multi_packed(maxRegions+blankRegions, maxRegions, dst, tmp, regionSize); + g.finish(dst, regionSize); + if(memcmp(dst, ref, regionSize)) { + std::cout << "Add_multi_packed (" << maxRegions << "+" << blankRegions << ") failure: " << g.info().name << std::endl; + display_mem_diff(ref, dst, regionSize/2); + return 1; + } + + // packed add_multi with prefetch + // can't really test prefetch functionality, so just test it like above + g.prepare(dst, src2, regionSize); + for(unsigned region = 0; region < maxRegions; region++) + g.prepare_packed(tmp, srcM[region], regionSize, regionSize, maxRegions+blankRegions, region, regionSize); + g.add_multi_packpf(maxRegions+blankRegions, maxRegions, dst, tmp, regionSize, tmp, tmp2 /*prefetch - any memory will do*/); + g.finish(dst, regionSize); + if(memcmp(dst, ref, regionSize)) { + std::cout << "Add_multi_packpf (" << maxRegions << "+" << blankRegions << ") failure: " << g.info().name << std::endl; + display_mem_diff(ref, dst, regionSize/2); + return 1; + } + } + } + } + } + + + if(testPow) { + std::cout << "Testing pow..." << std::endl; + for(int outputs : outputSizeTests) { + for(int test=0; test<(fastMul ? 
256 : 65536); test++) { + int coeff = test; + if(fastMul && test > 1) + coeff = rand() & 0xffff; + + // compute pow reference + for(int output=0, curCoeff=coeff; output < outputs; output++, curCoeff = gf16_mul(curCoeff, coeff)) { + for(size_t i=0; i 1) + coeff = rand() & 0xffff; + + // compute pow reference + for(int output=0, curCoeff=coeff; output < outputs; output++, curCoeff = gf16_mul(curCoeff, coeff)) { + for(size_t i=0; i +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +# define ALIGN_TO(a, v) __declspec(align(a)) v +#else +# define ALIGN_TO(a, v) v __attribute__((aligned(a))) +#endif + +#include +#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__) + // MSVC doesn't support C11 aligned_alloc: https://stackoverflow.com/a/62963007 + #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = _aligned_malloc((len), align) + #define ALIGN_FREE _aligned_free +#elif defined(_ISOC11_SOURCE) + // C11 method + // len needs to be a multiple of alignment, although it sometimes works if it isn't... 
+ #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = aligned_alloc(align, ((len) + (align)-1) & ~((align)-1)) + #define ALIGN_FREE free +#elif defined(__cplusplus) && __cplusplus >= 201700 + // C++17 method + #include + #define ALIGN_ALLOC(buf, len, align) *(void**)&(buf) = std::aligned_alloc(align, ((len) + (align)-1) & ~((align)-1)) + #define ALIGN_FREE free +#else + #define ALIGN_ALLOC(buf, len, align) if(posix_memalign((void**)&(buf), align, (len))) (buf) = NULL + #define ALIGN_FREE free +#endif + +#ifdef _MSC_VER +# ifndef __BYTE_ORDER__ +# define __BYTE_ORDER__ 1234 +# endif +# ifndef __ORDER_BIG_ENDIAN__ +# define __ORDER_BIG_ENDIAN__ 4321 +# endif +#endif + + +const uint8_t guard_magic[] = { 0xdb, 0xef, 0x55, 0xf4 }; + +static inline size_t roundup_to(size_t n, size_t rounding) { + return ((n + rounding-1) / rounding) * rounding; +} +static inline size_t rounddown_to(size_t n, size_t rounding) { + return (n / rounding) * rounding; +} + +static uint16_t gf16_log[65536]; +static uint16_t gf16_antilog[65536]; +static void gf16_generate_log_tables(int polynomial = 0x1100b) { + int n = 1; + memset(gf16_log, 0, sizeof(gf16_log)); + for(int i=0; i<65535; i++) { + gf16_log[n] = i; + gf16_antilog[i] = n; + n <<= 1; + if(n > 0xffff) n ^= polynomial; + } + gf16_antilog[65535] = gf16_antilog[0]; +} +static inline uint16_t gf16_mul(uint16_t a, uint16_t b) { + if(a == 0 || b == 0) return 0; + int log_prod = (int)gf16_log[a] + (int)gf16_log[b]; + return gf16_antilog[(log_prod >> 16) + (log_prod & 0xffff)]; +} + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +static inline uint16_t gf16_mul_le(uint16_t src, uint16_t coeff) { + uint16_t r = gf16_mul((src>>8) | ((src&0xff)<<8), coeff); + return (r >> 8) | ((r & 0xff) << 8); +} +#else +# define gf16_mul_le gf16_mul +#endif + +static int find_mem_diff(const uint16_t* a, const uint16_t* b, int n) { + for(int i=0; i n) to = n; + + printf("Expected:\n"); + print_mem_region(a, from, to); + printf("Actual:\n"); + 
print_mem_region(b, from, to); + return from; +} diff --git a/test/hasher/CMakeLists.txt b/test/hasher/CMakeLists.txt new file mode 100644 index 00000000..507dd536 --- /dev/null +++ b/test/hasher/CMakeLists.txt @@ -0,0 +1,154 @@ +cmake_minimum_required(VERSION 2.8.9...3.22) +project(hasher_test) + +option(SKIP_AUX "Bypass getauxval checks (for testing purposes)" OFF) + +include(CheckCXXCompilerFlag) +include(CheckIncludeFileCXX) +include(CheckCXXSymbolExists) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_C_STANDARD 99) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Debug) +endif() +if(NOT TARGET_ARCH) + if(CMAKE_GENERATOR_PLATFORM) + set(TARGET_ARCH ${CMAKE_GENERATOR_PLATFORM}) + else() + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) + endif() +endif() + +message("Building for ${TARGET_ARCH}") +if (${TARGET_ARCH} MATCHES "i386|i686|x86|x86_64|x64|amd64|AMD64|win32|Win32") + set(IS_X86 TRUE) + if(${TARGET_ARCH} MATCHES "x86_64|x64|amd64|AMD64") + set(IS_X64 TRUE) + endif() +endif() +if (${TARGET_ARCH} MATCHES "arm|ARM|aarch64|arm64|ARM64") + set(IS_ARM TRUE) +endif() +if (${TARGET_ARCH} MATCHES "riscv64|rv64") + set(IS_RISCV64 TRUE) +endif() +if (${TARGET_ARCH} MATCHES "riscv32|rv32") + set(IS_RISCV32 TRUE) +endif() + +if(SKIP_AUX) + add_compile_definitions(PARPAR_SKIP_AUX_CHECK=1) +endif() + +set(HASHER_DIR ../../hasher) +set(SRC_DIR ../../src) +set(HASHER_C_SOURCES + ${HASHER_DIR}/crc_zeropad.c + ${HASHER_DIR}/md5-final.c +) + +set(HASHER_CPP_SOURCES + ${HASHER_DIR}/hasher.cpp + ${HASHER_DIR}/hasher_armcrc.cpp + ${HASHER_DIR}/hasher_avx2.cpp + ${HASHER_DIR}/hasher_avx512.cpp + ${HASHER_DIR}/hasher_avx512vl.cpp + ${HASHER_DIR}/hasher_bmi1.cpp + ${HASHER_DIR}/hasher_clmul.cpp + ${HASHER_DIR}/hasher_neon.cpp + ${HASHER_DIR}/hasher_neoncrc.cpp + ${HASHER_DIR}/hasher_scalar.cpp + ${HASHER_DIR}/hasher_sse.cpp + ${HASHER_DIR}/hasher_sve2.cpp + ${HASHER_DIR}/hasher_xop.cpp +) + +include_directories(${HASHER_DIR}) + +if(MSVC) + set(RELEASE_COMPILE_FLAGS /GS- /Gy /sdl- /Oy /Oi) 
+ set(RELEASE_LINK_FLAGS /OPT:REF /OPT:ICF) + add_compile_options(/W2 "$<$>:${RELEASE_COMPILE_FLAGS}>") + add_link_options("$<$>:${RELEASE_LINK_FLAGS}>") +else() + add_compile_options(-Wall -Wextra -Wno-unused-function) + if(${CMAKE_BUILD_TYPE} MATCHES "Debug") + add_compile_options(-ggdb) + else() + if(NOT ENABLE_SANITIZE) + add_compile_options(-fomit-frame-pointer) + endif() + endif() + + if(ENABLE_SANITIZE) + set(SANITIZE_OPTS -fsanitize=address -fsanitize=undefined) + add_compile_options(-fno-omit-frame-pointer ${SANITIZE_OPTS}) + add_link_options(${SANITIZE_OPTS}) + endif() +endif() + +add_compile_definitions(PARPAR_INVERT_SUPPORT=1) +add_library(hasher_c STATIC ${HASHER_C_SOURCES}) +add_library(hasher STATIC ${HASHER_CPP_SOURCES}) +target_link_libraries(hasher hasher_c) + +if(NOT MSVC) + if(ENABLE_SANITIZE) + target_compile_options(hasher PRIVATE -fno-exceptions) + else() + target_compile_options(hasher PRIVATE -fno-rtti -fno-exceptions) + endif() + target_compile_definitions(hasher_c PRIVATE _POSIX_C_SOURCE=200112L) + target_compile_definitions(hasher_c PRIVATE _DARWIN_C_SOURCE=) + target_compile_definitions(hasher_c PRIVATE _GNU_SOURCE=) +endif() + +if(MSVC) + if(IS_X86) + set_source_files_properties(${HASHER_DIR}/hasher_avx2.cpp PROPERTIES COMPILE_OPTIONS /arch:AVX2) + set_source_files_properties(${HASHER_DIR}/hasher_avx512.cpp PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${HASHER_DIR}/hasher_avx512vl.cpp PROPERTIES COMPILE_OPTIONS /arch:AVX512) + set_source_files_properties(${HASHER_DIR}/hasher_bmi1.cpp PROPERTIES COMPILE_OPTIONS /arch:AVX) + set_source_files_properties(${HASHER_DIR}/hasher_xop.cpp PROPERTIES COMPILE_OPTIONS /arch:AVX) + endif() +endif() +if(NOT MSVC OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(IS_X86) + set_source_files_properties(${HASHER_DIR}/hasher_avx2.cpp PROPERTIES COMPILE_OPTIONS -mavx2) + set_source_files_properties(${HASHER_DIR}/hasher_avx512.cpp PROPERTIES COMPILE_OPTIONS "-mavx512f") + 
set_source_files_properties(${HASHER_DIR}/hasher_avx512vl.cpp PROPERTIES COMPILE_OPTIONS "-mavx512vl;-mavx512bw;-mbmi2;-mpclmul") + set_source_files_properties(${HASHER_DIR}/hasher_bmi1.cpp PROPERTIES COMPILE_OPTIONS "-mpclmul;-mavx;-mbmi") + set_source_files_properties(${HASHER_DIR}/hasher_clmul.cpp PROPERTIES COMPILE_OPTIONS "-mpclmul;-msse4.1") + set_source_files_properties(${HASHER_DIR}/hasher_sse.cpp PROPERTIES COMPILE_OPTIONS -msse2) + set_source_files_properties(${HASHER_DIR}/hasher_xop.cpp PROPERTIES COMPILE_OPTIONS "-mxop;-mavx") + endif() + + if(IS_ARM AND NOT APPLE) # M1 Macs don't seem to need these ARM options + CHECK_CXX_COMPILER_FLAG("-mfpu=neon -march=armv7-a" COMPILER_SUPPORTS_ARM32_NEON) + if(COMPILER_SUPPORTS_ARM32_NEON) + set_source_files_properties(${HASHER_DIR}/hasher_neon.cpp PROPERTIES COMPILE_OPTIONS "-mfpu=neon;-march=armv7-a") + set_source_files_properties(${HASHER_DIR}/hasher_neoncrc.cpp PROPERTIES COMPILE_OPTIONS "-mfpu=neon;-march=armv8-a+crc") + set_source_files_properties(${HASHER_DIR}/hasher_armcrc.cpp PROPERTIES COMPILE_OPTIONS "-mfpu=fp-armv8;-march=armv8-a+crc") + else() + CHECK_CXX_COMPILER_FLAG("-march=armv8-a+crc" COMPILER_SUPPORTS_ARM_CRC) + if(COMPILER_SUPPORTS_ARM_CRC) + set_source_files_properties(${HASHER_DIR}/hasher_neoncrc.cpp PROPERTIES COMPILE_OPTIONS -march=armv8-a+crc) + set_source_files_properties(${HASHER_DIR}/hasher_armcrc.cpp PROPERTIES COMPILE_OPTIONS -march=armv8-a+crc) + endif() + endif() + CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve2" COMPILER_SUPPORTS_SVE2) + if(COMPILER_SUPPORTS_SVE2) + set_source_files_properties(${HASHER_DIR}/hasher_sve2.cpp PROPERTIES COMPILE_OPTIONS -march=armv8-a+sve2) + endif() + endif() +endif() + + + + +# binaries +set(TEST_DIR .) 
+add_executable(test ${TEST_DIR}/test.cpp) +target_link_libraries(test hasher) diff --git a/test/hasher/test.cpp b/test/hasher/test.cpp new file mode 100644 index 00000000..57203fd6 --- /dev/null +++ b/test/hasher/test.cpp @@ -0,0 +1,312 @@ +#include +#include +#include +#include +#include +#include +#include +#include "hasher.h" + +typedef char md5hash[16]; // add null byte for convenience + +typedef void(*MD5SingleUpdate_t)(uint32_t*, const void*, size_t); +typedef uint32_t(*CRC32_Calc_t)(const void*, size_t); +typedef uint32_t(*MD5CRC_Calc_t)(const void*, size_t, size_t, void*); + +uint32_t readUint32LE(uint8_t* p) { + return (*p) | (p[1] << 8) | (p[2] << 16) | (p[3] << 24); +} +void writeUint32LE(uint8_t* p, uint32_t v) { + p[0] = v & 0xff; + p[1] = (v >> 8) & 0xff; + p[2] = (v >> 16) & 0xff; + p[3] = (v >> 24) & 0xff; +} + +// TODO: test MD5Single::updateZero + +bool do_tests(IHasherInput* hasher, MD5SingleUpdate_t md5sgl, MD5CRC_Calc_t md5crcImpl, CRC32_Calc_t crc32impl) { + md5hash md5; + uint8_t md5crc[20]; + MD5Single md5hasher, md5extract; + if(md5sgl) md5hasher._update = md5sgl; + #define MD5_ACTION(act) if(hasher) hasher->act; if(md5sgl) md5hasher.act + #define DO_MD5CRC(data, zp) \ + if(md5crcImpl) { \ + uint32_t c = md5crcImpl(data, sizeof(data)-1, zp, md5crc); \ + writeUint32LE(md5crc+16, c); \ + } + #define ADD_DATA(data, zpMd5Crc) \ + MD5_ACTION(update(data, sizeof(data)-1)); \ + DO_MD5CRC(data, zpMd5Crc) + #define CHECK_BLOCK(zp, xMd5, xCrc, t) \ + if(md5crcImpl) { \ + if(memcmp(md5crc, xMd5, 16)) { printf("md5crc-md5 (" t "): "); return true; } \ + if(readUint32LE(md5crc+16) != xCrc) { printf("md5crc-crc (" t ") [%x <> %x]: ", readUint32LE(md5crc+16), xCrc); return true; } \ + } \ + if(hasher) { \ + hasher->getBlock(md5crc, zp); \ + if(memcmp(md5crc, xMd5, 16)) { printf("getBlock-md5 (" t "): "); return true; } \ + if(readUint32LE(md5crc+16) != xCrc) { printf("getBlock-crc (" t ") [%x <> %x]: ", readUint32LE(md5crc+16), xCrc); return true; } \ + 
} + #define CHECK_END(xMd5, t) \ + if(hasher) { \ + hasher->extractFileMD5(md5extract); \ + md5extract.end(md5); \ + if(memcmp(md5, xMd5, 16)) { printf("input-extract (" t "): "); return true; } \ + hasher->end(md5); \ + if(memcmp(md5, xMd5, 16)) { printf("input-end (" t "): "); return true; } \ + } \ + if(md5sgl) { \ + md5hasher.end(md5); \ + if(memcmp(md5, xMd5, 16)) { printf("single (" t "): "); return true; } \ + } + #define CHECK_CRC(data, xCrc, t) \ + if(crc32impl) { \ + if(crc32impl(data, sizeof(data)-1) != xCrc) { printf("crc (" t "): "); return true; } \ + } + // test blank + DO_MD5CRC("", 0) + CHECK_BLOCK(0, "\xd4\x1d\x8c\xd9\x8f\0\xb2\x04\xe9\x80\x09\x98\xec\xf8\x42\x7e", 0, "blank") + CHECK_END("\xd4\x1d\x8c\xd9\x8f\0\xb2\x04\xe9\x80\x09\x98\xec\xf8\x42\x7e", "blank") + CHECK_CRC("", 0, "blank") + + // zero padding tests + MD5_ACTION(reset()); + DO_MD5CRC("", 1) + CHECK_BLOCK(1, "\x93\xb8\x85\xad\xfe\x0d\xa0\x89\xcd\xf6\x34\x90\x4f\xd5\x9f\x71", 0xd202ef8d, "blank + 1 zero") + DO_MD5CRC("", 4) + CHECK_BLOCK(4, "\xf1\xd3\xff\x84\x43\x29\x77\x32\x86\x2d\xf2\x1d\xc4\xe5\x72\x62", 0x2144df1c, "blank + 4 zeroes") + DO_MD5CRC("", 55) + CHECK_BLOCK(55, "\xc9\xea\x33\x14\xb9\x1c\x9f\xd4\xe3\x8f\x94\x32\x06\x4f\xd1\xf2", 0x113bc241, "blank + 55 zeroes") + DO_MD5CRC("", 56) + CHECK_BLOCK(56, "\xe3\xc4\xdd\x21\xa9\x17\x1f\xd3\x9d\x20\x8e\xfa\x09\xbf\x78\x83", 0xd3c8a549, "blank + 56 zeroes") + DO_MD5CRC("", 57) + CHECK_BLOCK(57, "\xab\x9d\x8e\xf2\xff\xa9\x14\x5d\x6c\x32\x5c\xef\xa4\x1d\x5d\x4e", 0xddd1de1c, "blank + 57 zeroes") + DO_MD5CRC("", 63) + CHECK_BLOCK(63, "\x65\xce\xcf\xb9\x80\xd7\x2f\xde\x57\xd1\x75\xd6\xec\x1c\x3f\x64", 0xe8aadae4, "blank + 63 zeroes") + DO_MD5CRC("", 64) + CHECK_BLOCK(64, "\x3b\x5d\x3c\x7d\x20\x7e\x37\xdc\xee\xed\xd3\x01\xe3\x5e\x2e\x58", 0x758d6336, "blank + 64 zeroes") + DO_MD5CRC("", 65) + CHECK_BLOCK(65, "\x1e\xf5\xe8\x29\x30\x3a\x13\x9c\xe9\x67\x44\x0e\x0c\xdc\xa1\x0c", 0x1dcdf777, "blank + 65 zeroes") + DO_MD5CRC("", 128) + 
CHECK_BLOCK(128, "\xf0\x9f\x35\xa5\x63\x78\x39\x45\x8e\x46\x2e\x63\x50\xec\xbc\xe4", 0xc2a8fa9d, "blank + 128 zeroes") + + ADD_DATA("a", 0) + CHECK_BLOCK(0, "\x0c\xc1\x75\xb9\xc0\xf1\xb6\xa8\x31\xc3\x99\xe2\x69\x77\x26\x61", 0xe8b7be43, "single byte") + CHECK_END("\x0c\xc1\x75\xb9\xc0\xf1\xb6\xa8\x31\xc3\x99\xe2\x69\x77\x26\x61", "single byte") + CHECK_CRC("a", 0xe8b7be43, "single byte") + + MD5_ACTION(reset()); + MD5_ACTION(update("ab", 2)); + DO_MD5CRC("ab", 1) + CHECK_BLOCK(1, "\x5d\x36\xfe\x0e\x22\x1c\x3f\xd9\x7c\x6b\x87\xa4\x6c\x9f\xaf\x43", 0xe19f7120, "two bytes + 1 zero") + MD5_ACTION(update("cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ012", 53)); + DO_MD5CRC("cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ012", 3) + CHECK_BLOCK(3, "\x39\xf1\xb4\x77\xb1\x7a\x07\xb1\xa4\x73\xef\xe9\x2c\x28\xc1\x1f", 0x13c041ef, "53 bytes + 3 zeroes") + CHECK_END("\x3d\x37\x3b\x8c\xd6\xfd\x06\x9d\x31\x3c\xdc\x3f\x38\xa1\x89\x63", "55 bytes") + + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123", 7); + CHECK_BLOCK(7, "\x85\xea\x2f\x1f\xb8\x4a\x41\x48\x6b\xfe\xc6\x74\x69\x65\x7f\xae", 0x776f469a, "56 bytes + 7 zeroes") + CHECK_END("\xd4\x3e\x61\xe9\xb5\xf8\xc9\xd2\x2c\x4d\xc5\xdb\x6e\x6d\xf7\x75", "56 bytes") + + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-", 1); + CHECK_BLOCK(1, "\xe2\x11\x99\xfd\x5d\x1c\xc7\xe4\x20\xd5\xd2\xec\xd6\xa2\x62\xb3", 0xc65ef97b, "63 bytes + 1 zero") + DO_MD5CRC("", 0) + CHECK_BLOCK(0, "\xd4\x1d\x8c\xd9\x8f\0\xb2\x04\xe9\x80\x09\x98\xec\xf8\x42\x7e", 0, "2nd block blank") + CHECK_END("\xce\x3a\x13\xcb\x6c\x59\xe1\xda\xd8\xa1\x70\xec\xd5\x0f\x0c\xe8", "63 bytes") + + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_", 1); + CHECK_BLOCK(1, "\x70\x4f\x4c\x47\x80\xc9\x02\x56\x4a\x7b\xcc\xe6\x6a\x6d\x03\x3a", 0x2830585b, "64 bytes + 1 zero") + 
CHECK_END("\x2a\x37\x87\xf9\x92\x07\xe3\x6b\x2c\xb2\xc3\x40\x68\x92\xde\xf0", "64 bytes") + + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_=", 0); + CHECK_BLOCK(0, "\x77\xf8\x6b\xd2\x20\x76\xca\x4e\x99\x0f\xc7\xba\x77\x78\x11\x13", 0x7058144a, "65 bytes") + CHECK_END("\x77\xf8\x6b\xd2\x20\x76\xca\x4e\x99\x0f\xc7\xba\x77\x78\x11\x13", "65 bytes") + CHECK_CRC("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_=", 0x7058144a, "65 bytes") + + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-", 2); + CHECK_BLOCK(2, "\x93\xc9\x82\x8d\x41\x99\xd6\xb6\xfa\xee\x9b\xe5\xef\xfd\xd9\xee", 0x151319c0, "63 bytes + 2 zeroes") + MD5_ACTION(update("_a", 2)); + MD5_ACTION(update("b", 1)); + MD5_ACTION(update("cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_", 62)); + CHECK_END("\x9b\x27\x94\x27\xd4\x81\xc9\xc9\xc7\x1d\x9a\xcb\x4f\xc9\xe9\x9a", "128 bytes") + + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-", 0); + CHECK_BLOCK(0, "\xce\x3a\x13\xcb\x6c\x59\xe1\xda\xd8\xa1\x70\xec\xd5\x0f\x0c\xe8", 0x5d4ab91c, "63 bytes") + CHECK_CRC("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-", 0x5d4ab91c, "63 bytes") + MD5_ACTION(update("_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_", 65)); + CHECK_END("\x9b\x27\x94\x27\xd4\x81\xc9\xc9\xc7\x1d\x9a\xcb\x4f\xc9\xe9\x9a", "128 bytes (2)") + + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_", 0); + CHECK_BLOCK(0, "\x9b\x27\x94\x27\xd4\x81\xc9\xc9\xc7\x1d\x9a\xcb\x4f\xc9\xe9\x9a", 0xcf479cf1, "128 bytes") + CHECK_CRC("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_", 0xcf479cf1, "128 bytes") + 
CHECK_END("\x9b\x27\x94\x27\xd4\x81\xc9\xc9\xc7\x1d\x9a\xcb\x4f\xc9\xe9\x9a", "128 bytes (single update)") + + // test block slipping case + MD5_ACTION(reset()); + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-", 0); + CHECK_BLOCK(0, "\xce\x3a\x13\xcb\x6c\x59\xe1\xda\xd8\xa1\x70\xec\xd5\x0f\x0c\xe8", 0x5d4ab91c, "63 bytes (1)") + ADD_DATA("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-", 0); + CHECK_BLOCK(0, "\xce\x3a\x13\xcb\x6c\x59\xe1\xda\xd8\xa1\x70\xec\xd5\x0f\x0c\xe8", 0x5d4ab91c, "63 bytes (2)") + CHECK_END("\xb7\x8f\x77\xf2\x49\xd1\x1b\xab\x5f\xcd\x04\xc3\x34\x85\xde\x56", "2x63 bytes") + + + // TODO: need more tests with mismatched block/file offsets + + // long tests + MD5_ACTION(reset()); + DO_MD5CRC("", 10000) + CHECK_BLOCK(10000, "\xb8\x5d\x6f\xb9\xef\x42\x60\xdc\xf1\xce\x0a\x1b\x0b\xff\x80\xd3", 0x4d3bca2e, "10000 zeroes") + // randomish mix + uint8_t stuff[8128]; // == (1+127)*63.5 + for(unsigned c=0; creset(); + hasher->update(src, len); + hasher->end(); + +#ifdef _MSC_VER + md5hash results[MAX_REGIONS]; +#else + md5hash results[regions]; +#endif + hasher->get(results); + for(int i=0; iget1(i, result); + if(memcmp(expected[i], result, 16)) { + return true; + } + if(memcmp(results[i], result, 16)) { + return true; + } + } + + // test multi-part update + if(len > 1) { + int firstChunk = len >= 64 ? 64-1 : 1; + const void* src2[MAX_REGIONS]; + for(int i=0; ireset(); + hasher->update(src, firstChunk); + hasher->update(src2, len - firstChunk); + hasher->end(); + hasher->get(results); + + for(int i=0; iget1(i, result); + if(memcmp(expected[i], result, 16)) { + return true; + } + if(memcmp(results[i], result, 16)) { + return true; + } + } + } + + return false; +} + + +int main(void) { + #define ERROR(s) { std::cout << s << std::endl; return 1; } + + std::cout << "Testing individual hashers..." 
<< std::endl; + auto singleHashers = hasherMD5CRC_availableMethods(true); + for(auto hId : singleHashers) { + set_hasherMD5CRC(hId); + std::cout << " " << md5crc_methodName(); + if(do_tests(nullptr, MD5Single::_update, MD5CRC_Calc, CRC32_Calc)) ERROR(" - FAILED"); + std::cout << std::endl; + } + + std::cout << "Testing input hashers..." << std::endl; + auto inputHashers = hasherInput_availableMethods(true); + for(auto hId : inputHashers) { + set_hasherInput(hId); + std::cout << " " << hasherInput_methodName(); + auto hasher = HasherInput_Create(); + if(do_tests(hasher, nullptr, nullptr, nullptr)) ERROR(" - FAILED"); + hasher->destroy(); + std::cout << std::endl; + } + + set_hasherInput(inputHashers[0]); + IHasherInput* hiScalar = HasherInput_Create(); + + srand(0x12345678); + // test multi-buffer + // (this assumes the input hasher works) + char data[MAX_REGIONS][128]; + const void* dataPtr[MAX_REGIONS]; + md5hash ref[MAX_REGIONS]; + for(int i=0; ireset(); + hiScalar->update(dataPtr[region], size); + hiScalar->end(ref[region]); + } + + auto hasher = new MD5Multi(numRegions); + + if(do_mb_tests(hasher, ref, dataPtr, size, numRegions)) + ERROR(" - FAILED: regions=" << numRegions << "; size=" << size); + delete hasher; + } + } + + hiScalar->destroy(); + + std::cout << "All tests passed" << std::endl; + return 0; +} From 64c684fa3a5546424d15cf15a7501b8fa1cf6507 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 22 Aug 2023 16:49:35 +1000 Subject: [PATCH 64/91] Python2.7 fix for Windows builds --- .github/workflows/build-dev-win64.yml | 2 +- .github/workflows/build.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-dev-win64.yml b/.github/workflows/build-dev-win64.yml index f82cd12d..1fdeffe3 100644 --- a/.github/workflows/build-dev-win64.yml +++ b/.github/workflows/build-dev-win64.yml @@ -11,7 +11,7 @@ jobs: BUILD_LOGLEVEL: verbose steps: - uses: ilammy/setup-nasm@v1 - - uses: actions/setup-python@v4 + - uses: 
MatteoH2O1999/setup-python@v1 id: py with: python-version: '2.7' diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 84c55000..f17d667a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,7 @@ jobs: BUILD_LOGLEVEL: verbose steps: - uses: ilammy/setup-nasm@v1 - - uses: actions/setup-python@v4 + - uses: MatteoH2O1999/setup-python@v1 id: py with: python-version: '2.7' From cb3c15af3be625326fea10f7f4af237857e1b101 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 22 Aug 2023 17:12:47 +1000 Subject: [PATCH 65/91] Test workflow typo fix --- .github/workflows/test.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3bb6d5d1..eaa55095 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,9 +27,9 @@ jobs: - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f - if: ${{ matrix.config == 'Release' && (matric.compiler == 'ClangCL' || matric.compiler == 'v143') }} + if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f - if: ${{ matrix.config == 'Release' && (matric.compiler == 'ClangCL' || matric.compiler == 'v143') }} + if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - run: sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe # test SSE2-only to see if CPUID checking works @@ -126,9 +126,9 @@ jobs: - run: ${{ matrix.t.emu }} test/gf16/build/test - run: ${{ matrix.t.emu }} test/gf16/build/test-pmul - run: ${{ matrix.t.emu }} test/gf16/build/test-ctrl -f - if: ${{ matrix.config == 'Release' && matric.cc_ver == '12' }} + if: ${{ matrix.config == 'Release' && matrix.cc_ver 
== '12' }} - run: ${{ matrix.t.emu }} test/gf16/build/test-inv -f - if: ${{ matrix.config == 'Release' && matric.cc_ver == '12' }} + if: ${{ matrix.config == 'Release' && matrix.cc_ver == '12' }} - run: ${{ matrix.t.emu }} test/hasher/build/test test-linux-clang: @@ -202,9 +202,9 @@ jobs: - run: ${{ matrix.t.emu }} test/gf16/build/test - run: ${{ matrix.t.emu }} test/gf16/build/test-pmul - run: ${{ matrix.t.emu }} test/gf16/build/test-ctrl -f - if: ${{ matrix.config == 'Release' && matric.cc_ver == '15' }} + if: ${{ matrix.config == 'Release' && matrix.cc_ver == '15' }} - run: ${{ matrix.t.emu }} test/gf16/build/test-inv -f - if: ${{ matrix.config == 'Release' && matric.cc_ver == '15' }} + if: ${{ matrix.config == 'Release' && matrix.cc_ver == '15' }} - run: ${{ matrix.t.emu }} test/hasher/build/test From 4732f5f9d08d72ed1b6e633583007321bf90e0ea Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 22 Aug 2023 17:49:33 +1000 Subject: [PATCH 66/91] Test workflow fix --- .github/workflows/test.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index eaa55095..77c5d681 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -11,18 +11,18 @@ jobs: compiler: ['v141', 'v142', 'v143', 'ClangCL'] arch: ['Win32', 'x64'] name: Test VS ${{ matrix.compiler }} ${{ matrix.arch }} (${{ matrix.config }}) - runs-on: windows-latest + runs-on: windows-2022 steps: - uses: ilammy/setup-nasm@v1 - uses: petarpetrovt/setup-sde@v2.1 - uses: actions/checkout@v3 - run: | mkdir test\gf16\build - cmake -B test\gf16\build -S test\gf16 -G "Visual Studio 16 2019" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake -B test\gf16\build -S test\gf16 -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\gf16\build --config ${{ matrix.config }} mkdir test\hasher\build - cmake -B test\hasher\build -S test\hasher -G "Visual Studio 16 2019" 
-T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe @@ -48,16 +48,16 @@ jobs: compiler: ['v142', 'v143', 'ClangCL'] arch: ['ARM', 'ARM64'] name: Test VS ${{ matrix.compiler }} ${{ matrix.arch }} - runs-on: windows-latest + runs-on: windows-2022 steps: - uses: actions/checkout@v3 - run: | mkdir test\gf16\build - cmake -B test\gf16\build -S test\gf16 -G "Visual Studio 16 2019" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake -B test\gf16\build -S test\gf16 -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\gf16\build --config Debug mkdir test\hasher\build - cmake -B test\hasher\build -S test\hasher -G "Visual Studio 16 2019" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} + cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config Debug # TODO: test mingw @@ -88,18 +88,18 @@ jobs: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 - - run: apt update + - run: sudo apt update - uses: petarpetrovt/setup-sde@v2.1 if: ${{ matrix.t.arch == 'amd64' || matrix.t.arch == 'i386' }} - - run: apt install -y qemu-user-static + - run: sudo apt install -y qemu-user-static if: ${{ matrix.t.arch != 'amd64' && matrix.t.arch != 'i386' }} - run: | - apt install -y g++-${{ matrix.cc_ver }}-${{ matrix.t.target }} + sudo apt install -y g++-${{ matrix.cc_ver }}-${{ matrix.t.target }} echo "CC=${{ matrix.t.target }}-gcc-${{ matrix.cc_ver }}" >> $GITHUB_ENV echo "CXX=${{ matrix.t.target }}-g++-${{ matrix.cc_ver }}" >> $GITHUB_ENV if: ${{ matrix.t.arch != 'amd64' }} - run: | - apt install -y g++-${{ matrix.cc_ver }} + sudo apt install 
-y g++-${{ matrix.cc_ver }} echo "CC=gcc-${{ matrix.cc_ver }}" >> $GITHUB_ENV echo "CXX=g++-${{ matrix.cc_ver }}" >> $GITHUB_ENV if: ${{ matrix.t.arch == 'amd64' }} @@ -156,12 +156,12 @@ jobs: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 - - run: apt update && apt install -y clang-${{ matrix.cc_ver }} + - run: sudo apt update && sudo apt install -y clang-${{ matrix.cc_ver }} - uses: petarpetrovt/setup-sde@v2.1 if: ${{ matrix.t.arch == 'amd64' || matrix.t.arch == 'i386' }} - - run: apt install -y qemu-user-static + - run: sudo apt install -y qemu-user-static if: ${{ matrix.t.arch != 'amd64' && matrix.t.arch != 'i386' }} - - run: apt install -y binutils-${{ matrix.t.target }} libgcc-12-dev-${{ matrix.t.libc }}-cross libstdc++-12-dev-${{ matrix.t.libc }}-cross + - run: sudo apt install -y binutils-${{ matrix.t.target }} libgcc-12-dev-${{ matrix.t.libc }}-cross libstdc++-12-dev-${{ matrix.t.libc }}-cross if: ${{ matrix.t.arch != 'amd64' }} - run: echo "SANITIZE=-DENABLE_SANITIZE=1" >> $GITHUB_ENV if: ${{ matrix.config == 'Release' && matrix.t.arch == 'amd64' }} From 4145211c99e50261566add0c431c50df37e919a1 Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 23 Aug 2023 23:41:27 +1000 Subject: [PATCH 67/91] Test workflow fix --- .github/workflows/test.yml | 44 ++++++++++++++++++++++---------------- test/gf16/test-inv.cpp | 3 ++- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 77c5d681..358a48ff 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -8,7 +8,7 @@ jobs: fail-fast: false matrix: config: [Debug, Release] - compiler: ['v141', 'v142', 'v143', 'ClangCL'] + compiler: ['v141', 'v143', 'ClangCL'] arch: ['Win32', 'x64'] name: Test VS ${{ matrix.compiler }} ${{ matrix.arch }} (${{ matrix.config }}) runs-on: windows-2022 @@ -24,19 +24,19 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ 
matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe - - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f + - run: $SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe + - run: $SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + - run: $SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f + - run: $SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe + - run: $SDE_PATH\sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe # test SSE2-only to see if CPUID checking works - run: | - sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe - sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe + $SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe + $SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + $SDE_PATH\sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} @@ -47,6 +47,9 @@ jobs: matrix: compiler: ['v142', 'v143', 'ClangCL'] arch: ['ARM', 'ARM64'] + exclude: + - compiler: ClangCL + arch: ARM name: Test VS ${{ matrix.compiler }} ${{ matrix.arch }} runs-on: windows-2022 steps: @@ -74,8 +77,8 @@ jobs: cc_ver: ['9','12'] t: # qemu x86 doesn't support AVX, so we use Intel SDE instead - - {arch: 'i386', target: 'i686-linux-gnu', libc: 
'i386', emu: '$SDE_PATH/sde -icl --'} - - {arch: 'amd64', target: 'x86-64-linux-gnu', libc: 'amd64', emu: '$SDE_PATH/sde64 -icl --'} + - {arch: 'i386', target: 'i686-linux-gnu', libc: 'i386', emu: '$SDE_PATH/sde -icx --'} + - {arch: 'amd64', target: 'x86-64-linux-gnu', libc: 'amd64', emu: '$SDE_PATH/sde64 -icx --'} #- {arch: 'amd64', target: 'x86-64-linux-gnux32', libc: 'x32', emu: 'qemu-x86_64-static -cpu max'} # TODO: how to test x32? - {arch: 'aarch64', target: 'aarch64-linux-gnu', libc: 'arm64', emu: 'qemu-aarch64-static -L /usr/aarch64-linux-gnu -cpu max,sve-max-vq=4'} @@ -139,8 +142,8 @@ jobs: # Clang 6 available in 20.04 cc_ver: ['11','15'] t: - - {arch: 'i386', target: 'i686-linux-gnu', cl_target: 'x86-linux-gnu', libc: 'i386', emu: '$SDE_PATH/sde -icl --'} - - {arch: 'amd64', target: 'x86-64-linux-gnu', cl_target: 'x86_64-linux-gnu', libc: 'amd64', emu: '$SDE_PATH/sde64 -icl --'} + - {arch: 'i386', target: 'i686-linux-gnu', cl_target: 'i386-linux-gnu', libc: 'i386', emu: '$SDE_PATH/sde -icx --'} + - {arch: 'amd64', target: 'x86-64-linux-gnu', cl_target: 'x86_64-linux-gnu', libc: 'amd64', emu: '$SDE_PATH/sde64 -icx --'} #- {arch: 'amd64', target: 'x86-64-linux-gnux32', cl_target: 'x86-64-linux-gnux32', libc: 'x32', emu: 'qemu- -cpu max'} # TODO: how to test x32? 
- {arch: 'aarch64', target: 'aarch64-linux-gnu', cl_target: 'aarch64-linux-gnu', libc: 'arm64', emu: 'qemu-aarch64-static -L /usr/aarch64-linux-gnu -cpu max,sve-max-vq=4'} @@ -166,6 +169,9 @@ jobs: - run: echo "SANITIZE=-DENABLE_SANITIZE=1" >> $GITHUB_ENV if: ${{ matrix.config == 'Release' && matrix.t.arch == 'amd64' }} - run: | + if [ '${{ matrix.t.arch }}' != 'amd64' ]; then + LINKER_FLAG=-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=/usr/bin/${{ matrix.t.target }}-ld + fi mkdir test/gf16/build cmake -Btest/gf16/build -Stest/gf16 -DSKIP_AUX=1 -DCMAKE_BUILD_TYPE=${{ matrix.config }} $SANITIZE \ -DCMAKE_C_COMPILER=clang-${{ matrix.cc_ver }} \ @@ -178,9 +184,9 @@ jobs: -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY \ - -DCMAKE_C_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ - -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ - -DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=/usr/bin/${{ matrix.t.target }}-ld + -DCMAKE_C_STANDARD_INCLUDE_DIRECTORIES="/usr/${{ matrix.t.target }}/include;`ls -d /usr/${{ matrix.t.target }}/include/c++/*|head -n1`/${{ matrix.t.target }}" \ + -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES="/usr/${{ matrix.t.target }}/include;`ls -d /usr/${{ matrix.t.target }}/include/c++/*|head -n1`/${{ matrix.t.target }}" \ + $LINKER_FLAG cmake --build test/gf16/build mkdir test/hasher/build @@ -195,9 +201,9 @@ jobs: -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY \ - -DCMAKE_C_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ - -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=/usr/${{ matrix.t.target }}/include \ - -DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=/usr/bin/${{ matrix.t.target }}-ld + -DCMAKE_C_STANDARD_INCLUDE_DIRECTORIES="/usr/${{ matrix.t.target }}/include;`ls -d /usr/${{ matrix.t.target }}/include/c++/*|head -n1`/${{ matrix.t.target }}" \ + 
-DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES="/usr/${{ matrix.t.target }}/include;`ls -d /usr/${{ matrix.t.target }}/include/c++/*|head -n1`/${{ matrix.t.target }}" \ + $LINKER_FLAG cmake --build test/hasher/build - run: ${{ matrix.t.emu }} test/gf16/build/test - run: ${{ matrix.t.emu }} test/gf16/build/test-pmul diff --git a/test/gf16/test-inv.cpp b/test/gf16/test-inv.cpp index a42c2c27..58e3fef5 100644 --- a/test/gf16/test-inv.cpp +++ b/test/gf16/test-inv.cpp @@ -21,11 +21,12 @@ static void compare_invert(const Galois16RecMatrix& mat, Galois16* leftmatrix, s if(recovery.size() != invalidCount) abort(); // compare - for(unsigned outRow = 0; outRow < invalidCount; outRow++) + for(unsigned outRow = 0; outRow < invalidCount; outRow++) { for(unsigned inCol = 0; inCol < inputValid.size(); inCol++) { if(leftmatrix[outRow * inputValid.size() + inCol] != mat.GetFactor(inCol, outRow)) abort(); } + } } static void do_test(std::vector inputValid, std::vector recovery, Galois16Methods method) { From d0b72690c76d9e10843f5503c998afcd3a1271d0 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 24 Aug 2023 11:15:54 +1000 Subject: [PATCH 68/91] Disable SVE on Clang<12 Produces suspect code --- src/platform.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/platform.h b/src/platform.h index 3ee0d37d..d4e8ebf2 100644 --- a/src/platform.h +++ b/src/platform.h @@ -203,6 +203,16 @@ HEDLEY_WARNING("GFNI disabled on GCC < 10 due to incorrect GF2P8AFFINEQB operand # endif #endif +#if defined(__ARM_FEATURE_SVE) && defined(__clang__) && __clang_major__<12 +// Clang < 12 has issues with SVE +# ifdef __ARM_FEATURE_SVE +# undef __ARM_FEATURE_SVE +# endif +# ifdef __ARM_FEATURE_SVE2 +# undef __ARM_FEATURE_SVE2 +# endif +#endif + #if defined(__riscv_vector) && defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(13,0,0) // GCC added RVV intrinsics in GCC13 # undef __riscv_vector From 0d852f8eb2c5e23dca11aae5ae9d5a323e81192b Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 
24 Aug 2023 15:57:58 +1000 Subject: [PATCH 69/91] Try to fix 'out of registers' errors for MD5 ASM --- hasher/md5x2-sse.h | 2 +- hasher/md5x2-x86-asm.h | 73 ++++++++++++++++++++++++------------------ 2 files changed, 42 insertions(+), 33 deletions(-) diff --git a/hasher/md5x2-sse.h b/hasher/md5x2-sse.h index c77e4ac1..c860262f 100644 --- a/hasher/md5x2-sse.h +++ b/hasher/md5x2-sse.h @@ -1,5 +1,5 @@ -#if defined(__GNUC__) || defined(__clang__) +#if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE__) # define MD5_USE_ASM # include "md5x2-sse-asm.h" #endif diff --git a/hasher/md5x2-x86-asm.h b/hasher/md5x2-x86-asm.h index 325938a7..d6c61926 100644 --- a/hasher/md5x2-x86-asm.h +++ b/hasher/md5x2-x86-asm.h @@ -155,45 +155,52 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_scalar(uint32_t* state, co #else # define ASM_PARAMS_ONES #endif -#define ASM_PARAMS(i0, i1) \ +#define ASM_PARAMS(in) \ [A1]"+&r"(A1), [B1]"+&r"(B1), [C1]"+&r"(C1), [D1]"+&r"(D1), \ [A2]"+&r"(A2), [B2]"+&r"(B2), [C2]"+&r"(C2), [D2]"+&r"(D2), \ [TMP1]"=&r"(tmp1), [TMP2]"=&r"(tmp2) \ -: [i0_0]"m"(_data[0][i0]), [i0_1]"m"(_data[0][i1]), \ - [i1_0]"m"(_data[1][i0]), [i1_1]"m"(_data[1][i1]) ASM_PARAMS_ONES \ +: [i0]"m"(_data[0][in]), [i1]"m"(_data[1][in]) ASM_PARAMS_ONES \ : #define RF4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ - ROUND_F(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", k0, 7) \ - ROUND_F(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 12) \ -: ASM_PARAMS(i0, i1)); __asm__( \ - ROUND_F(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 17) \ - ROUND_F(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 22) \ -: ASM_PARAMS(i2, i3)); + ROUND_F(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0]", "%[i1]", k0, 7) \ +: ASM_PARAMS(i0)); __asm__( \ + ROUND_F(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0]", "%[i1]", k1, 12) \ +: ASM_PARAMS(i1)); __asm__( \ + ROUND_F(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0]", "%[i1]", k2, 17) \ +: ASM_PARAMS(i2)); __asm__( \ + 
ROUND_F(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0]", "%[i1]", k3, 22) \ +: ASM_PARAMS(i3)); #define RG4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ - ROUND_G(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", k0, 5) \ - ROUND_G(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 9) \ -: ASM_PARAMS(i0, i1)); __asm__( \ - ROUND_G(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 14) \ - ROUND_G(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 20) \ -: ASM_PARAMS(i2, i3)); + ROUND_G(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0]", "%[i1]", k0, 5) \ +: ASM_PARAMS(i0)); __asm__( \ + ROUND_G(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0]", "%[i1]", k1, 9) \ +: ASM_PARAMS(i1)); __asm__( \ + ROUND_G(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0]", "%[i1]", k2, 14) \ +: ASM_PARAMS(i2)); __asm__( \ + ROUND_G(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0]", "%[i1]", k3, 20) \ +: ASM_PARAMS(i3)); #define RH4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ - ROUND_H(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", k0, 4) \ - ROUND_H(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 11) \ -: ASM_PARAMS(i0, i1)); __asm__( \ - ROUND_H(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 16) \ - ROUND_H(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 23) \ -: ASM_PARAMS(i2, i3)); + ROUND_H(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0]", "%[i1]", k0, 4) \ +: ASM_PARAMS(i0)); __asm__( \ + ROUND_H(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0]", "%[i1]", k1, 11) \ +: ASM_PARAMS(i1)); __asm__( \ + ROUND_H(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0]", "%[i1]", k2, 16) \ +: ASM_PARAMS(i2)); __asm__( \ + ROUND_H(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0]", "%[i1]", k3, 23) \ +: ASM_PARAMS(i3)); #define RI4(i0, i1, i2, i3, k0, k1, k2, k3) __asm__( \ - ROUND_I(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", k0, 6) \ - ROUND_I(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", k1, 10) \ -: ASM_PARAMS(i0, i1)); __asm__( \ - ROUND_I(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", k2, 15) \ 
- ROUND_I(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0_1]", "%[i1_1]", k3, 21) \ -: ASM_PARAMS(i2, i3)); + ROUND_I(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0]", "%[i1]", k0, 6) \ +: ASM_PARAMS(i0)); __asm__( \ + ROUND_I(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0]", "%[i1]", k1, 10) \ +: ASM_PARAMS(i1)); __asm__( \ + ROUND_I(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0]", "%[i1]", k2, 15) \ +: ASM_PARAMS(i2)); __asm__( \ + ROUND_I(B1, C1, D1, A1, B2, C2, D2, A2, "%[i0]", "%[i1]", k3, 21) \ +: ASM_PARAMS(i3)); A1 += read32(_data[0]); A2 += read32(_data[1]); @@ -218,12 +225,14 @@ static HEDLEY_ALWAYS_INLINE void md5_process_block_x2_scalar(uint32_t* state, co RI4(15, 6, 13, 4, 0x6fa87e4f, -0x01d31920, -0x5cfebcec, 0x4e0811a1) __asm__( - ROUND_I(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0_0]", "%[i1_0]", -0x08ac817e, 6) - ROUND_I(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0_1]", "%[i1_1]", -0x42c50dcb, 10) - : ASM_PARAMS(11, 2)); __asm__( - ROUND_I(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0_0]", "%[i1_0]", 0x2ad7d2bb, 15) + ROUND_I(A1, B1, C1, D1, A2, B2, C2, D2, "%[i0]", "%[i1]", -0x08ac817e, 6) + : ASM_PARAMS(11)); __asm__( + ROUND_I(D1, A1, B1, C1, D2, A2, B2, C2, "%[i0]", "%[i1]", -0x42c50dcb, 10) + : ASM_PARAMS(2)); __asm__( + ROUND_I(C1, D1, A1, B1, C2, D2, A2, B2, "%[i0]", "%[i1]", 0x2ad7d2bb, 15) + : ASM_PARAMS(9)); __asm__( ROUND_I_LAST(B1, C1, D1, A1, B2, C2, D2, A2, -0x14792c6f, 21) - : ASM_PARAMS(9, 0)); + : ASM_PARAMS(0)); state[0] += A1; state[1] += B1; state[2] += C1; From b4858c9c2e26d081670e08f56bc23a73b63da66d Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 24 Aug 2023 16:13:57 +1000 Subject: [PATCH 70/91] Windows workflow fixes --- .github/workflows/build-dev-win64.yml | 2 +- .github/workflows/build.yml | 2 +- .github/workflows/test.yml | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-dev-win64.yml b/.github/workflows/build-dev-win64.yml index 1fdeffe3..6e7a2a2a 100644 --- a/.github/workflows/build-dev-win64.yml +++ 
b/.github/workflows/build-dev-win64.yml @@ -20,7 +20,7 @@ jobs: - run: (cd nexe && npm install --production) - run: (cd nexe && node build) - run: nexe\parpar.exe --version - - run: nexe\parpar.exe -r1 -s1M -onexe\test.par2 nexe\parpar.exe + - run: nexe\parpar.exe -r1 -s1M -o nexe\test.par2 nexe\parpar.exe - run: move nexe\parpar.exe parpar.exe && 7z a -t7z -mx=9 parpar.7z parpar.exe - uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f17d667a..2c5c36d5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -29,7 +29,7 @@ jobs: - run: (cd nexe && npm install --production) - run: (cd nexe && node build) - run: nexe\parpar.exe --version - - run: nexe\parpar.exe -r1 -s1M -onexe\test.par2 nexe\parpar.exe + - run: nexe\parpar.exe -r1 -s1M -o nexe\test.par2 nexe\parpar.exe - run: move nexe\parpar.exe parpar.exe && 7z a -t7z -mx=9 parpar.7z parpar.exe - uses: actions/upload-release-asset@v1 env: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 358a48ff..57b0bf61 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,19 +24,19 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - - run: $SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe - - run: $SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - - run: $SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f + - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe" + - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe" + - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: 
$SDE_PATH\sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f + - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: $SDE_PATH\sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe + - run: "%SDE_PATH%\sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe" # test SSE2-only to see if CPUID checking works - run: | - $SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe - $SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - $SDE_PATH\sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe + %SDE_PATH%\sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe + %SDE_PATH%\sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + %SDE_PATH%\sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} From 0b8bc40bc0c61035324f8a3c31c7ff2dce9e4465 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 24 Aug 2023 17:18:15 +1000 Subject: [PATCH 71/91] Fix matrix inverse on big endian --- gf16/gf16mul.h | 7 ++++--- gf16/gfmat_inv.cpp | 6 +++--- gf16/gfmat_inv.h | 5 ++++- hasher/crc_arm.h | 18 ------------------ src/platform.h | 15 +++++++++++++++ test/gf16/test.cpp | 8 ++++++++ 6 files changed, 34 insertions(+), 25 deletions(-) diff --git a/gf16/gf16mul.h b/gf16/gf16mul.h index 5979d93a..c8b0e722 100644 --- a/gf16/gf16mul.h +++ b/gf16/gf16mul.h @@ -134,9 +134,10 @@ class Galois16Mul { static void _finish_none(void *HEDLEY_RESTRICT, size_t) {} static void _prepare_packed_none(void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t srcLen, size_t sliceLen, unsigned inputPackSize, unsigned inputNum, size_t chunkLen); static uint16_t _replace_word(void* data, size_t index, uint16_t newValue) { - uint16_t* p = (uint16_t*)data + index; - uint16_t oldValue = *p; - *p = 
newValue; + uint8_t* p = (uint8_t*)data + index*2; + uint16_t oldValue = p[0] | (p[1]<<8); + p[0] = newValue & 0xff; + p[1] = newValue>>8; return oldValue; } diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index 0697b788..ad01643f 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -467,7 +467,7 @@ void Galois16RecMatrix::Construct(const std::vector& inputValid, unsigned if(recovery.at(0) == 0) { // first recovery having exponent 0 is a common case for(unsigned stripe=0; stripe& inputValid, unsigned for(loopcond) { \ uint16_t exp = recovery.at(rec); \ for(unsigned i=0; i& inputValid, unsigned unsigned targetCol = inputValid.at(input) ? validCol++ : missingCol++; \ targetCol = (targetCol/sw16)*sw16*numRec + (targetCol%sw16); \ for(loopcond) { \ - mat[rec * sw16 + targetCol] = gfmat_coeff_from_log(inputLog, recovery.at(rec)); \ + mat[rec * sw16 + targetCol] = _LE16(gfmat_coeff_from_log(inputLog, recovery.at(rec))); \ } \ } \ assert(validCol == validCount) diff --git a/gf16/gfmat_inv.h b/gf16/gfmat_inv.h index 0e18a096..681b0216 100644 --- a/gf16/gfmat_inv.h +++ b/gf16/gfmat_inv.h @@ -6,8 +6,10 @@ #include "../src/stdint.h" #ifdef PARPAR_INVERT_SUPPORT +#include "../src/platform.h" const unsigned PP_INVERT_MAX_MULTI_ROWS = 6; // process up to 6 rows in a multi-mul call + class Galois16Mul; class Galois16RecMatrixWorker; struct Galois16RecMatrixComputeState; @@ -39,13 +41,14 @@ class Galois16RecMatrix { // TODO: check if numStripes==1? consider optimising division? unsigned sw = stripeWidth/sizeof(uint16_t); unsigned stripe = inIdx / sw; - return mat[stripe * numRec*sw + recIdx * sw + (inIdx % sw)]; + return _LE16(mat[stripe * numRec*sw + recIdx * sw + (inIdx % sw)]); } // these should only be queried after Compute has started (i.e. 
from the progressCb, or after it returns) /*Galois16Methods*/ int regionMethod; const char* getPointMulMethodName() const; }; + #endif #endif diff --git a/hasher/crc_arm.h b/hasher/crc_arm.h index 7c84ed7c..938e1a29 100644 --- a/hasher/crc_arm.h +++ b/hasher/crc_arm.h @@ -18,21 +18,6 @@ static HEDLEY_ALWAYS_INLINE void crc_init_arm(void* crc) { memset(crc, 0xff, sizeof(uint32_t)); } -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -# ifdef __GNUC__ -# define _LE16 __builtin_bswap16 -# define _LE32 __builtin_bswap32 -# define _LE64 __builtin_bswap64 -# else -// currently not supported -# error No endian swap intrinsic defined -# endif -#else -# define _LE16(x) (x) -# define _LE32(x) (x) -# define _LE64(x) (x) -#endif - static HEDLEY_ALWAYS_INLINE void crc_process_block_arm(void* HEDLEY_RESTRICT crc, const void* HEDLEY_RESTRICT src) { uint32_t* _crc = (uint32_t*)crc; #ifdef __aarch64__ @@ -74,6 +59,3 @@ static HEDLEY_ALWAYS_INLINE uint32_t crc_finish_arm(void* HEDLEY_RESTRICT state, return ~crc; } -#undef _LE16 -#undef _LE32 -#undef _LE64 diff --git a/src/platform.h b/src/platform.h index d4e8ebf2..0d57a3bf 100644 --- a/src/platform.h +++ b/src/platform.h @@ -44,6 +44,21 @@ # endif #endif +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# ifdef __GNUC__ +# define _LE16 __builtin_bswap16 +# define _LE32 __builtin_bswap32 +# define _LE64 __builtin_bswap64 +# else +// currently not supported +# error No endian swap intrinsic defined +# endif +#else +# define _LE16(x) (x) +# define _LE32(x) (x) +# define _LE64(x) (x) +#endif + # ifdef _M_ARM64 #define __ARM_NEON 1 #define __aarch64__ 1 diff --git a/test/gf16/test.cpp b/test/gf16/test.cpp index e90067ac..244a0ad7 100644 --- a/test/gf16/test.cpp +++ b/test/gf16/test.cpp @@ -920,7 +920,15 @@ int main(int argc, char** argv) { memcpy(dst, src, regionSize); for(unsigned i=0; i>8) | ((w&0xff) <<8); + w = g.replace_word(dst, i, w); + w = (w>>8) | ((w&0xff) <<8); + tmp[i] = w; +#else tmp[i] = g.replace_word(dst, i, src2[i]); +#endif } 
if(g.needPrepare()) g.finish(dst, regionSize); From 8fd18a0ad602e889c78227d17327a656b77d0ac7 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 24 Aug 2023 17:19:04 +1000 Subject: [PATCH 72/91] Add error messages to test-inv failures --- test/gf16/test-inv.cpp | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/test/gf16/test-inv.cpp b/test/gf16/test-inv.cpp index 58e3fef5..f3b0d024 100644 --- a/test/gf16/test-inv.cpp +++ b/test/gf16/test-inv.cpp @@ -18,13 +18,20 @@ static void compare_invert(const Galois16RecMatrix& mat, Galois16* leftmatrix, s unsigned validCount = std::count(inputValid.begin(), inputValid.end(), true); unsigned invalidCount = inputValid.size()-validCount; - if(recovery.size() != invalidCount) abort(); + if(recovery.size() != invalidCount) { + std::cout << "Count mismatch: " << recovery.size() << "!=" << invalidCount << std::endl; + abort(); + } // compare for(unsigned outRow = 0; outRow < invalidCount; outRow++) { for(unsigned inCol = 0; inCol < inputValid.size(); inCol++) { - if(leftmatrix[outRow * inputValid.size() + inCol] != mat.GetFactor(inCol, outRow)) + auto expected = leftmatrix[outRow * inputValid.size() + inCol]; + auto actual = mat.GetFactor(inCol, outRow); + if(expected != actual) { + std::cout << "Value mismatch at " << outRow << "x" << inCol << ": " << expected << "!=" << actual << std::endl; abort(); + } } } } @@ -40,7 +47,10 @@ static void do_test(std::vector inputValid, std::vector recovery unsigned validCount = std::count(inputValid.begin(), inputValid.end(), true); Galois16RecMatrix mat; mat.regionMethod = (int)method; - if(mat.Compute(inputValid, validCount, recovery) != canInvert) abort(); + if(mat.Compute(inputValid, validCount, recovery) != canInvert) { + std::cout << "Inversion success mismatch" << std::endl; + abort(); + } if(canInvert) { compare_invert(mat, leftmatrix, inputValid, recovery); } @@ -102,9 +112,18 @@ int main(int argc, char** argv) { mat.regionMethod = 
(int)method; unsigned validCount = std::count(flawedInput.begin(), flawedInput.end(), true); - if(!mat.Compute(flawedInput, validCount, recovery)) abort(); - if(recovery.size() != 2) abort(); - if(!((recovery.at(0) == 0 || recovery.at(0) == 5) && recovery.at(1) == 6)) abort(); + if(!mat.Compute(flawedInput, validCount, recovery)) { + std::cout << "Failed to invert PAR2 flaw" << std::endl; + abort(); + } + if(recovery.size() != 2) { + std::cout << "Recovery size mismatch: 2 != " << recovery.size() << std::endl; + abort(); + } + if(!((recovery.at(0) == 0 || recovery.at(0) == 5) && recovery.at(1) == 6)) { + std::cout << "Recovery exponent incorrect" << std::endl; + abort(); + } Galois16* leftmatrix = nullptr; bool canInvert = p2c_invert(flawedInput, recovery, leftmatrix); @@ -170,7 +189,10 @@ int main(int argc, char** argv) { // do inversion Galois16RecMatrix mat; mat.regionMethod = (int)method; - if(mat.Compute(inputValid, iSize-invalidCount, recovery) != canInvert) abort(); + if(mat.Compute(inputValid, iSize-invalidCount, recovery) != canInvert) { + std::cout << "Inversion success mismatch" << std::endl; + abort(); + } if(canInvert) { compare_invert(mat, leftmatrix, inputValid, recovery); } From abd07c8a2f1ae337f088e9a7113facdba1434bd8 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 24 Aug 2023 17:39:10 +1000 Subject: [PATCH 73/91] Move some CLI logic into util file + add unclean exit detector Ref #51 --- bin/parpar.js | 42 +++++++++++++++------------------ cli/util.js | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 23 deletions(-) create mode 100644 cli/util.js diff --git a/bin/parpar.js b/bin/parpar.js index f134bf5b..ed3094ac 100755 --- a/bin/parpar.js +++ b/bin/parpar.js @@ -3,6 +3,7 @@ "use strict"; var ParPar = require('../lib/parpar.js'); +var cliUtil = require('../cli/util'); var cliFormat = process.stderr.isTTY ? 
function(code, msg) { return '\x1b[' + code + 'm' + msg + '\x1b[0m'; } : function(code, msg) { return msg; }; @@ -19,15 +20,6 @@ var print_json = function(type, obj) { }; var arg_parser = require('../lib/arg_parser.js'); -var friendlySize = function(s) { - var units = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB']; - for(var i=0; i 1024*1048576) { // par2j has 1GB slice size limit hard-coded; 32-bit version supports 1GB slices // some 32-bit applications seem to have issues with 1GB slices as well (phpar2 v1.4 win32 seems to have trouble with 854M slices, 848M works in the test I did) - process.stderr.write(cliFormat('33', 'Warning') + ': selected slice size (' + friendlySize(g.opts.sliceSize) + ') is larger than 1GB, which is beyond what a number of PAR2 clients support. Consider increasing the number of slices or reducing the slice size so that it is under 1GB\n'); + process.stderr.write(cliFormat('33', 'Warning') + ': selected slice size (' + cliUtil.friendlySize(g.opts.sliceSize) + ') is larger than 1GB, which is beyond what a number of PAR2 clients support. 
Consider increasing the number of slices or reducing the slice size so that it is under 1GB\n'); } else if(g.opts.sliceSize > 100*1000000 && g.totalSize <= 32768*100*1000000) { // we also check whether 100MB slices are viable by checking the input size - essentially there's a max of 32768 slices, so at 100MB, max size would be 3051.76GB - process.stderr.write(cliFormat('33', 'Warning') + ': selected slice size (' + friendlySize(g.opts.sliceSize) + ') may be too large to be compatible with QuickPar\n'); + process.stderr.write(cliFormat('33', 'Warning') + ': selected slice size (' + cliUtil.friendlySize(g.opts.sliceSize) + ') may be too large to be compatible with QuickPar\n'); } process.stderr.write('Input data : ' + sizeDisp(g.totalSize) + ' (' + pluralDisp(g.inputSlices, 'slice') + ' from ' + pluralDisp(info.length, 'file') + ')\n'); @@ -921,6 +906,17 @@ var inputFiles = argv._; else process.stderr.write('\nProcessing time : ' + cliFormat('1', timeTaken + ' s') + '\n'); } + + setTimeout(function() { + if(!argv.quiet) { + process.stderr.write('Process did not terminate cleanly'); + var handles = cliUtil.activeHandleCounts(); + if(handles) + process.stderr.write('; active handles: ' + cliUtil.activeHandlesStr(handles[0])); + process.stderr.write('\n'); + } + process.exit(); + }, 5000).unref(); }); }); diff --git a/cli/util.js b/cli/util.js new file mode 100644 index 00000000..37267312 --- /dev/null +++ b/cli/util.js @@ -0,0 +1,64 @@ +"use strict"; + +module.exports = { + decimalPoint: ('' + 1.1).replace(/1/g, ''), + + friendlySize: function(s) { + var units = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB']; + for(var i=0; i l) return s; + return module.exports.repeatChar((c || ' '), l-s.length) + s; + }, + rpad: function(s, l, c) { + if(s.length > l) return s; + return s + module.exports.repeatChar((c || ' '), l-s.length); + }, + activeHandleCounts: function() { + if(!process._getActiveHandles && !process.getActiveResourcesInfo) + return null; + var hTypes = {}; + 
var ah; + if(process._getActiveHandles) { // undocumented function, but seems to always work + ah = process._getActiveHandles().filter(function(h) { + // exclude stdout/stderr from count + return !h.constructor || h.constructor.name != 'WriteStream' || (h.fd != 1 && h.fd != 2); + }); + ah.forEach(function(h) { + var cn = (h.constructor ? h.constructor.name : 0) || 'unknown'; + if(cn in hTypes) + hTypes[cn]++; + else + hTypes[cn] = 1; + }); + } else { + process.getActiveResourcesInfo().forEach(function(h) { + if(h in hTypes) + hTypes[h]++; + else + hTypes[h] = 1; + }); + // TODO: is there any way to exclude stdout/stderr? + } + return [hTypes, ah]; + }, + activeHandlesStr: function(hTypes) { + var handleStr = ''; + for(var hn in hTypes) { + handleStr += ', ' + hn + (hTypes[hn] > 1 ? ' (' + hTypes[hn] + ')' : ''); + } + return handleStr.substring(2); + } + +}; From 87b969edb47e84b89453abcaaabb028dfb634684 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 24 Aug 2023 20:39:17 +1000 Subject: [PATCH 74/91] Test workflow fix --- .github/workflows/test.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 57b0bf61..904762d5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,19 +24,19 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe" - - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe" - - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f" + - run: "%SDE_PATH%\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test.exe" + - run: "%SDE_PATH%\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-pmul.exe" + - run: "%SDE_PATH%\\sde -icx -- 
test\\gf16\\build\\${{ matrix.config }}\\test-ctrl.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: "%SDE_PATH%\sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f" + - run: "%SDE_PATH%\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-inv.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: "%SDE_PATH%\sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe" + - run: "%SDE_PATH%\\sde -icx -- test\hasher\\build\\${{ matrix.config }}\\test.exe" # test SSE2-only to see if CPUID checking works - run: | - %SDE_PATH%\sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe - %SDE_PATH%\sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - %SDE_PATH%\sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe + %SDE_PATH%\\sde -p4 -- test\\gf16\\build\\${{ matrix.config }}\\test.exe + %SDE_PATH%\\sde -p4 -- test\\gf16\\build\\${{ matrix.config }}\test-pmul.exe + %SDE_PATH%\\sde -p4 -- test\\hasher\\build\\${{ matrix.config }}\\test.exe if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} From a10998930b6ab77d24c1fc2a41f9f0d268449af0 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 24 Aug 2023 20:40:40 +1000 Subject: [PATCH 75/91] Test workflow fix --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 904762d5..cb302ea8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,7 +30,7 @@ jobs: if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - run: "%SDE_PATH%\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-inv.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: "%SDE_PATH%\\sde -icx -- test\hasher\\build\\${{ 
matrix.config }}\\test.exe" + - run: "%SDE_PATH%\\sde -icx -- test\\hasher\\build\\${{ matrix.config }}\\test.exe" # test SSE2-only to see if CPUID checking works - run: | From dd000ca97abc3f72b96ad75ff31b20efc6e8a84c Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 26 Aug 2023 20:45:01 +1000 Subject: [PATCH 76/91] Fix leak in par2cmdline inversion test --- test/gf16/p2c-inv/reedsolomon.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/gf16/p2c-inv/reedsolomon.cpp b/test/gf16/p2c-inv/reedsolomon.cpp index 17e41dec..3f86b287 100644 --- a/test/gf16/p2c-inv/reedsolomon.cpp +++ b/test/gf16/p2c-inv/reedsolomon.cpp @@ -135,9 +135,9 @@ bool ReedSolomon_Compute(const vector &present, vector output // SetInput u32 inputcount = (u32)present.size(); - u32* datapresentindex = new u32[inputcount]; - u32* datamissingindex = new u32[inputcount]; - Galois16::ValueType* database = new Galois16::ValueType[inputcount]; + vector datapresentindex(inputcount); + vector datamissingindex(inputcount); + vector database(inputcount); u32 datapresent = 0, datamissing = 0; unsigned int logbase = 0; From e391ad4319f096295974236bfe5c0b9b90e268cb Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 26 Aug 2023 20:45:26 +1000 Subject: [PATCH 77/91] Fixes to test workflow --- .github/workflows/test.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index cb302ea8..b75387b3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,19 +24,19 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - - run: "%SDE_PATH%\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test.exe" - - run: "%SDE_PATH%\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-pmul.exe" - - run: "%SDE_PATH%\\sde -icx 
-- test\\gf16\\build\\${{ matrix.config }}\\test-ctrl.exe -f" + - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test.exe" + - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-pmul.exe" + - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-ctrl.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: "%SDE_PATH%\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-inv.exe -f" + - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-inv.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: "%SDE_PATH%\\sde -icx -- test\\hasher\\build\\${{ matrix.config }}\\test.exe" + - run: "$env:SDE_PATH\\sde -icx -- test\\hasher\\build\\${{ matrix.config }}\\test.exe" # test SSE2-only to see if CPUID checking works - run: | - %SDE_PATH%\\sde -p4 -- test\\gf16\\build\\${{ matrix.config }}\\test.exe - %SDE_PATH%\\sde -p4 -- test\\gf16\\build\\${{ matrix.config }}\test-pmul.exe - %SDE_PATH%\\sde -p4 -- test\\hasher\\build\\${{ matrix.config }}\\test.exe + $env:SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe + $env:SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + $env:SDE_PATH\sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} @@ -167,7 +167,7 @@ jobs: - run: sudo apt install -y binutils-${{ matrix.t.target }} libgcc-12-dev-${{ matrix.t.libc }}-cross libstdc++-12-dev-${{ matrix.t.libc }}-cross if: ${{ matrix.t.arch != 'amd64' }} - run: echo "SANITIZE=-DENABLE_SANITIZE=1" >> $GITHUB_ENV - if: ${{ matrix.config == 'Release' && matrix.t.arch == 'amd64' }} + if: ${{ matrix.config == 'Release' && matrix.t.arch == 'amd64' && matrix.cc_ver == '15' }} # SDE+ASAN problematic with Clang 11 - run: | if [ '${{ 
matrix.t.arch }}' != 'amd64' ]; then LINKER_FLAG=-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=/usr/bin/${{ matrix.t.target }}-ld From 5901d7addfe7cf4c90a2747eab2aa43f09641e9d Mon Sep 17 00:00:00 2001 From: animetosho Date: Sun, 27 Aug 2023 16:51:12 +1000 Subject: [PATCH 78/91] Updates to build workflow --- .github/workflows/build.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2c5c36d5..21444993 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -22,7 +22,7 @@ jobs: - uses: actions/checkout@v3 - name: Get release id: get_release - uses: bruceadams/get-release@v1.2.3 + uses: bruceadams/get-release@v1.3.2 env: GITHUB_TOKEN: ${{ github.token }} - run: npm install --production --ignore-scripts @@ -31,7 +31,7 @@ jobs: - run: nexe\parpar.exe --version - run: nexe\parpar.exe -r1 -s1M -o nexe\test.par2 nexe\parpar.exe - run: move nexe\parpar.exe parpar.exe && 7z a -t7z -mx=9 parpar.7z parpar.exe - - uses: actions/upload-release-asset@v1 + - uses: sekwah41/upload-release-assets@v1 env: GITHUB_TOKEN: ${{ github.token }} with: @@ -63,7 +63,7 @@ jobs: python-version: '3.9' # workaround "cannot import name 'Mapping' from 'collections'" error - name: Get release id: get_release - uses: bruceadams/get-release@v1.2.3 + uses: bruceadams/get-release@v1.3.2 env: GITHUB_TOKEN: ${{ github.token }} - run: | @@ -78,7 +78,7 @@ jobs: - run: nexe/parpar --version - run: nexe/parpar -r1 -s1M -onexe/test.par2 nexe/parpar - run: xz -9e --x86 --lzma2 nexe/parpar -c > parpar.xz - - uses: actions/upload-release-asset@v1 + - uses: sekwah41/upload-release-assets@v1 env: GITHUB_TOKEN: ${{ github.token }} with: @@ -113,7 +113,7 @@ jobs: arch: arm64 - name: Get release id: get_release - uses: bruceadams/get-release@v1.2.3 + uses: bruceadams/get-release@v1.3.2 env: GITHUB_TOKEN: ${{ github.token }} - run: npm install --production --ignore-scripts @@ -125,7 +125,7 @@ jobs: 
CC_host: cc CXX_host: c++ - run: xz -9e --lzma2 nexe/parpar -c > parpar.xz - - uses: actions/upload-release-asset@v1 + - uses: sekwah41/upload-release-assets@v1 env: GITHUB_TOKEN: ${{ github.token }} with: @@ -166,7 +166,7 @@ jobs: # packages: "libstdc++-$(c++ -dumpversion)-dev:i386 libc6-dev:i386" # - name: Get release # id: get_release -# uses: bruceadams/get-release@v1.2.3 +# uses: bruceadams/get-release@v1.3.2 # env: # GITHUB_TOKEN: ${{ github.token }} # - run: npm install --production --ignore-scripts @@ -178,7 +178,7 @@ jobs: # CC_host: cc # CXX_host: c++ # - run: xz -9e --arm --lzma2 nexe/parpar -c > parpar.xz -# - uses: actions/upload-release-asset@v1 +# - uses: sekwah41/upload-release-assets@v1 # env: # GITHUB_TOKEN: ${{ github.token }} # with: @@ -197,7 +197,7 @@ jobs: - uses: actions/checkout@v3 - name: Get release id: get_release - uses: bruceadams/get-release@v1.2.3 + uses: bruceadams/get-release@v1.3.2 env: GITHUB_TOKEN: ${{ github.token }} - run: npm install --production --ignore-scripts @@ -206,7 +206,7 @@ jobs: - run: nexe/parpar --version - run: nexe/parpar -r1 -s1M -onexe/test.par2 nexe/parpar - run: xz -9e --x86 --lzma2 nexe/parpar -c > parpar.xz - - uses: actions/upload-release-asset@v1 + - uses: sekwah41/upload-release-assets@v1 env: GITHUB_TOKEN: ${{ github.token }} with: From edcb353ba76f4913f662fa5f91ad9ee4ac03c6f1 Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 28 Aug 2023 22:11:21 +1000 Subject: [PATCH 79/91] Add MSYS/POCL/FreeBSD tests --- .github/workflows/test.yml | 84 ++++++++++++++++++++++++++++++++++---- 1 file changed, 76 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b75387b3..83645a83 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -62,11 +62,47 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config Debug - 
- # TODO: test mingw - # https://github.com/msys2/setup-msys2 - # https://www.msys2.org/docs/ci/ - # TODO: test libuv, OpenCL + + test-msys: + runs-on: windows-latest + strategy: + fail-fast: false + matrix: + sys: [mingw32, ucrt64, clang64] + compiler: + - {cc: gcc, cxx: g++} + - {cc: clang, cxx: clang++} + #- { sys: mingw32, env: i686 } + #- { sys: ucrt64, env: ucrt-x86_64 } + #- { sys: clang64, env: clang-x86_64 } + name: Test MSYS ${{matrix.sys}} ${{matrix.compiler.cc}} + defaults: + run: + shell: msys2 {0} + steps: + #- uses: petarpetrovt/setup-sde@v2.1 + - uses: msys2/setup-msys2@v2 + with: + msystem: ${{matrix.sys}} + #update: true + install: cmake ${{matrix.compiler.cc}} make git + - uses: actions/checkout@v3 + - run: | + mkdir test/gf16/build + cmake -B test/gf16/build -S test/gf16 -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=${{matrix.compiler.cc}} -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} + cmake --build test/gf16/build + + mkdir test/hasher/build + cmake -B test/hasher/build -S test/hasher -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=${{matrix.compiler.cc}} -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} + cmake --build test/hasher/build + # TODO: test under SDE (needs static linking, or calling SDE from within MSYS) + - run: test/gf16/build/test + - run: test/gf16/build/test-pmul + - run: test/gf16/build/test-ctrl -f + - run: test/gf16/build/test-inv -f + - run: test/hasher/build/test + + # TODO: test libuv test-linux-gcc: strategy: @@ -213,6 +249,17 @@ jobs: if: ${{ matrix.config == 'Release' && matrix.cc_ver == '15' }} - run: ${{ matrix.t.emu }} test/hasher/build/test + test-linux-pocl: + name: Test POCL (OpenCL) + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - run: sudo apt update && sudo apt install -y pocl-opencl-icd + - run: | + mkdir test/gf16/build + cmake -Btest/gf16/build -Stest/gf16 -DCMAKE_BUILD_TYPE=Release -DENABLE_OCL=1 + cmake --build test/gf16/build + - run: test/gf16/build/test-ctrl -pg test-mac-x86: strategy: @@ 
-244,6 +291,27 @@ jobs: # TODO: test building on Mac ARM64? might not be necessary, given we build it in par2cmdline-turbo - # TODO: BSD? - # https://github.com/marketplace/actions/freebsd-vm - # https://github.com/vmactions + test-fbsd-x86: + runs-on: macos-12 + name: Test FreeBSD amd64 + steps: + - uses: actions/checkout@v3 + - id: fbsd_test + uses: vmactions/freebsd-vm@v0 + with: + usesh: true + prepare: pkg install -y cmake lang/gcc gmake + run: | + mkdir test/gf16/build + cmake -Btest/gf16/build -Stest/gf16 -DCMAKE_BUILD_TYPE=Release + cmake --build test/gf16/build + + mkdir test/hasher/build + cmake -Btest/hasher/build -Stest/hasher -DCMAKE_BUILD_TYPE=Release + cmake --build test/hasher/build + + test/gf16/build/test + test/gf16/build/test-pmul + test/gf16/build/test-ctrl -f + test/gf16/build/test-inv -f + test/hasher/build/test From a54eb75fbe38442ee6c56881d296483c3494cd91 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 11:53:17 +1000 Subject: [PATCH 80/91] Fix possible leak in matrix inversion --- gf16/gfmat_inv.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gf16/gfmat_inv.cpp b/gf16/gfmat_inv.cpp index ad01643f..0c85ed70 100644 --- a/gf16/gfmat_inv.cpp +++ b/gf16/gfmat_inv.cpp @@ -625,6 +625,9 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va if(rowGroupSize < rowMultiple*2) rowGroupSize = rowMultiple*2; if(rowGroupSize > numRec) rowGroupSize = numRec; + std::vector stateCoeff(rowGroupSize*rowGroupSize); + state.coeff = stateCoeff.data(); + invert_loop: { // loop, in the unlikely case we hit the PAR2 un-invertability flaw; TODO: is there a faster way than just retrying? if(numRec > recovery.size()) { // not enough recovery if(_numThreads <= 1) @@ -658,14 +661,12 @@ bool Galois16RecMatrix::Compute(const std::vector& inputValid, unsigned va } \ } // max out at 6 groups (registers + cache assoc?) 
- state.coeff = new uint16_t[rowGroupSize*rowGroupSize]; INVERT_GROUP(6) INVERT_GROUP(5) INVERT_GROUP(4) INVERT_GROUP(3) INVERT_GROUP(2) INVERT_GROUP(1) - delete[] state.coeff; #undef INVERT_GROUP // post transform From ff6e0af925dfa882c1d0e4294e25e070d099a2db Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 12:23:11 +1000 Subject: [PATCH 81/91] Build/test fixes/tweaks --- .github/workflows/test.yml | 27 ++++++++++------------ binding.gyp | 10 +++++++- test/gf16/CMakeLists.txt | 47 +++++++++++++++++++++++++++++--------- test/hasher/CMakeLists.txt | 15 +++++++++++- 4 files changed, 71 insertions(+), 28 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 83645a83..da70268a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,19 +24,19 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test.exe" - - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-pmul.exe" - - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-ctrl.exe -f" + - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe + - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: "$env:SDE_PATH\\sde -icx -- test\\gf16\\build\\${{ matrix.config }}\\test-inv.exe -f" + - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: "$env:SDE_PATH\\sde -icx -- 
test\\hasher\\build\\${{ matrix.config }}\\test.exe" + - run: $env:SDE_PATH/sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe # test SSE2-only to see if CPUID checking works - run: | - $env:SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe - $env:SDE_PATH\sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - $env:SDE_PATH\sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe + $env:SDE_PATH/sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe + $env:SDE_PATH/sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe + $env:SDE_PATH/sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} @@ -70,11 +70,8 @@ jobs: matrix: sys: [mingw32, ucrt64, clang64] compiler: - - {cc: gcc, cxx: g++} - - {cc: clang, cxx: clang++} - #- { sys: mingw32, env: i686 } - #- { sys: ucrt64, env: ucrt-x86_64 } - #- { sys: clang64, env: clang-x86_64 } + - {cc: gcc, cxx: g++, cc_extra: ""} + - {cc: clang, cxx: clang++, cc_extra: gcc} name: Test MSYS ${{matrix.sys}} ${{matrix.compiler.cc}} defaults: run: @@ -85,7 +82,7 @@ jobs: with: msystem: ${{matrix.sys}} #update: true - install: cmake ${{matrix.compiler.cc}} make git + install: cmake ${{matrix.compiler.cc}} ${{matrix.compiler.cc_extra}} make git - uses: actions/checkout@v3 - run: | mkdir test/gf16/build @@ -299,7 +296,7 @@ jobs: - id: fbsd_test uses: vmactions/freebsd-vm@v0 with: - usesh: true + copyback: false prepare: pkg install -y cmake lang/gcc gmake run: | mkdir test/gf16/build diff --git a/binding.gyp b/binding.gyp index cfdf1265..84b47dac 100644 --- a/binding.gyp +++ b/binding.gyp @@ -19,9 +19,17 @@ } }] ] + }], + ['OS!="win"', { + "variables": {"missing_memalign%": "/dev/null || echo failed)"}, + "conditions": [ + ['missing_memalign!=""', { + "cflags_c": ["-D_POSIX_C_SOURCE=200112L"], + }] + ] }] ], - "cflags_c": ["-std=c99", "-D_POSIX_C_SOURCE=200112L", "-D_DARWIN_C_SOURCE", 
"-D_GNU_SOURCE"], + "cflags_c": ["-std=c99", "-D_DARWIN_C_SOURCE", "-D_GNU_SOURCE", "-D_DEFAULT_SOURCE"], "cxxflags": ["-std=c++11"], "msvs_settings": {"VCCLCompilerTool": {"Optimization": "MaxSpeed"}}, "configurations": {"Release": { diff --git a/test/gf16/CMakeLists.txt b/test/gf16/CMakeLists.txt index 37f7b578..49b716ae 100644 --- a/test/gf16/CMakeLists.txt +++ b/test/gf16/CMakeLists.txt @@ -116,11 +116,6 @@ set(GF16_CPP_SOURCES ${GF16_DIR}/controller_cpu.cpp ${GF16_DIR}/controller_ocl.cpp ${GF16_DIR}/controller_ocl_init.cpp - ${GF16_DIR}/gf16mul.cpp - - - ${GF16_DIR}/gf16pmul.cpp - ${GF16_DIR}/gfmat_inv.cpp ) include_directories(${GF16_DIR}/opencl-include ${GF16_DIR}) @@ -142,11 +137,23 @@ else() endif() if(ENABLE_SANITIZE) - set(SANITIZE_OPTS -fsanitize=address -fsanitize=bool,builtin,bounds,enum,float-cast-overflow,function,integer-divide-by-zero,nonnull-attribute,null,object-size,return,returns-nonnull-attribute,shift,signed-integer-overflow,unreachable,vla-bound) + set(SANITIZE_OPTS -fsanitize=address -fsanitize=bool,builtin,bounds,enum,float-cast-overflow,function,integer-divide-by-zero,nonnull-attribute,null,object-size,return,returns-nonnull-attribute,shift,signed-integer-overflow,unreachable,vla-bound -fno-sanitize-recover=all) # -fsanitize=pointer-overflow causes compilation of shuffle_avx512 to freeze on clang10 # -fsanitize=memory requires instrumented libraries, so not useful add_compile_options(-fno-omit-frame-pointer ${SANITIZE_OPTS}) add_link_options(${SANITIZE_OPTS}) + + #include(CheckLinkerFlag) + #check_linker_flag(C -static-libasan HAS_LIBASAN) # GCC + #check_linker_flag(C -static-libsan HAS_LIBSAN) # Clang + CHECK_CXX_COMPILER_FLAG(-static-libasan HAS_LIBASAN) + CHECK_CXX_COMPILER_FLAG(-static-libsan HAS_LIBSAN) + if(HAS_LIBASAN) + add_link_options(-static-libasan) + endif() + if(HAS_LIBSAN) + add_link_options(-static-libsan) + endif() endif() #if(ENABLE_OCL) @@ -158,20 +165,38 @@ endif() add_compile_definitions(PARPAR_INVERT_SUPPORT=1) 
add_library(gf16_c STATIC ${GF16_C_SOURCES}) +add_library(gf16_base STATIC ${GF16_DIR}/gf16mul.cpp) +add_library(gf16_pmul STATIC ${GF16_DIR}/gf16pmul.cpp) +add_library(gf16_inv STATIC ${GF16_DIR}/gfmat_inv.cpp) add_library(gf16_ctl STATIC ${GF16_CPP_SOURCES}) -target_link_libraries(gf16_ctl gf16_c) +target_link_libraries(gf16_base gf16_c) +target_link_libraries(gf16_pmul gf16_c) +target_link_libraries(gf16_inv gf16_base gf16_pmul) +target_link_libraries(gf16_ctl gf16_base) if(NOT MSVC) if(NOT ENABLE_SANITIZE) + target_compile_options(gf16_base PRIVATE -fno-rtti -fno-exceptions) + target_compile_options(gf16_pmul PRIVATE -fno-rtti -fno-exceptions) + target_compile_options(gf16_inv PRIVATE -fno-rtti -fno-exceptions) target_compile_options(gf16_ctl PRIVATE -fno-rtti) endif() - target_compile_definitions(gf16_c PRIVATE _POSIX_C_SOURCE=200112L) + + # posix_memalign may require _POSIX_C_SOURCE, but doing that on FreeBSD causes MAP_ANON* to disappear + # try to work around this by checking if posix_memalign exists without the define + include(CheckSymbolExists) + check_symbol_exists(posix_memalign "stdlib.h" HAVE_MEMALIGN) + if(NOT HAVE_MEMALIGN) + target_compile_definitions(gf16_c PRIVATE _POSIX_C_SOURCE=200112L) + endif() target_compile_definitions(gf16_c PRIVATE _DARWIN_C_SOURCE=) target_compile_definitions(gf16_c PRIVATE _GNU_SOURCE=) + target_compile_definitions(gf16_c PRIVATE _DEFAULT_SOURCE=) if(ENABLE_SANITIZE) # not supported on all platforms? #target_compile_options(gf16_ctl PRIVATE -fsanitize=thread) + #target_compile_options(gf16_inv PRIVATE -fsanitize=thread) endif() endif() @@ -291,13 +316,13 @@ endif() # binaries set(TEST_DIR .) 
add_executable(test ${TEST_DIR}/test.cpp) -target_link_libraries(test gf16_ctl) +target_link_libraries(test gf16_base) add_executable(test-ctrl ${TEST_DIR}/test-ctrl.cpp) target_link_libraries(test-ctrl gf16_ctl) add_executable(test-inv ${TEST_DIR}/test-inv.cpp ${TEST_DIR}/p2c-inv/reedsolomon.cpp) -target_link_libraries(test-inv gf16_ctl) +target_link_libraries(test-inv gf16_inv) add_executable(test-pmul ${TEST_DIR}/test-pmul.cpp) -target_link_libraries(test-pmul gf16_ctl) +target_link_libraries(test-pmul gf16_pmul) if(NOT MSVC) target_link_libraries(test-ctrl -pthread) diff --git a/test/hasher/CMakeLists.txt b/test/hasher/CMakeLists.txt index 507dd536..6c0efa82 100644 --- a/test/hasher/CMakeLists.txt +++ b/test/hasher/CMakeLists.txt @@ -83,9 +83,21 @@ else() endif() if(ENABLE_SANITIZE) - set(SANITIZE_OPTS -fsanitize=address -fsanitize=undefined) + set(SANITIZE_OPTS -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all) add_compile_options(-fno-omit-frame-pointer ${SANITIZE_OPTS}) add_link_options(${SANITIZE_OPTS}) + + #include(CheckLinkerFlag) + #check_linker_flag(C -static-libasan HAS_LIBASAN) # GCC + #check_linker_flag(C -static-libsan HAS_LIBSAN) # Clang + CHECK_CXX_COMPILER_FLAG(-static-libasan HAS_LIBASAN) + CHECK_CXX_COMPILER_FLAG(-static-libsan HAS_LIBSAN) + if(HAS_LIBASAN) + add_link_options(-static-libasan) + endif() + if(HAS_LIBSAN) + add_link_options(-static-libsan) + endif() endif() endif() @@ -103,6 +115,7 @@ if(NOT MSVC) target_compile_definitions(hasher_c PRIVATE _POSIX_C_SOURCE=200112L) target_compile_definitions(hasher_c PRIVATE _DARWIN_C_SOURCE=) target_compile_definitions(hasher_c PRIVATE _GNU_SOURCE=) + target_compile_definitions(hasher_c PRIVATE _DEFAULT_SOURCE=) endif() if(MSVC) From e9b6d3344908c727651e7cb0c733a10d3f247cb7 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 12:31:32 +1000 Subject: [PATCH 82/91] Avoid CpuCap name conflict --- gf16/gf16mul.cpp | 33 ++++++++++++++------------------- hasher/hasher.cpp | 
22 +++++++++++----------- 2 files changed, 25 insertions(+), 30 deletions(-) diff --git a/gf16/gf16mul.cpp b/gf16/gf16mul.cpp index 142aba3d..34ad4ba3 100644 --- a/gf16/gf16mul.cpp +++ b/gf16/gf16mul.cpp @@ -13,21 +13,20 @@ extern "C" { } // CPUID stuff -#include "../src/platform.h" +#include "../src/cpuid.h" #ifdef PLATFORM_X86 -# include "../src/cpuid.h" # ifdef __APPLE__ # include # include # endif # include "x86_jit.h" -struct CpuCap { +struct GF16CpuCap { bool hasSSE2, hasSSSE3, hasAVX, hasAVX2, hasAVX512VLBW, hasAVX512VBMI, hasGFNI; size_t propPrefShuffleThresh; bool propFastJit, propHT; bool canMemWX, isEmulated; int jitOptStrat; - CpuCap(bool detect) : + GF16CpuCap(bool detect) : hasSSE2(true), hasSSSE3(true), hasAVX(true), @@ -196,14 +195,12 @@ struct CpuCap { }; #endif #ifdef PLATFORM_ARM -# include "../src/cpuid.h" - -struct CpuCap { +struct GF16CpuCap { bool hasNEON; bool hasSHA3; bool hasSVE; bool hasSVE2; - CpuCap(bool detect) : hasNEON(true), hasSVE(true), hasSVE2(true) { + GF16CpuCap(bool detect) : hasNEON(true), hasSVE(true), hasSVE2(true) { if(!detect) return; hasNEON = CPU_HAS_NEON; hasSHA3 = CPU_HAS_NEON_SHA3; @@ -220,11 +217,9 @@ struct CpuCap { }; #endif #ifdef __riscv -# include "../src/cpuid.h" - -struct CpuCap { +struct GF16CpuCap { bool hasVector; - CpuCap(bool detect) : hasVector(true) { + GF16CpuCap(bool detect) : hasVector(true) { if(!detect) return; hasVector = CPU_HAS_VECTOR && CPU_HAS_GC; } @@ -1117,7 +1112,7 @@ void Galois16Mul::setupMethod(Galois16Methods _method) { case GF16_XOR_JIT_SSE2: case GF16_XOR_SSE2: { #ifdef PLATFORM_X86 - int jitOptStrat = CpuCap(true).jitOptStrat; + int jitOptStrat = GF16CpuCap(true).jitOptStrat; switch(method) { case GF16_XOR_JIT_SSE2: @@ -1359,7 +1354,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu (void)forInvert; #ifdef PLATFORM_X86 - const CpuCap caps(true); + const GF16CpuCap caps(true); if(caps.hasGFNI) { if(gf16_affine_available_avx512 && caps.hasAVX512VLBW) 
return GF16_AFFINE_AVX512; @@ -1397,7 +1392,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu return GF16_XOR_SSE2; #endif #ifdef PLATFORM_ARM - const CpuCap caps(true); + const GF16CpuCap caps(true); if(caps.hasSVE2) { if(gf16_sve_get_size() >= 64) return GF16_SHUFFLE_512_SVE2; @@ -1419,7 +1414,7 @@ Galois16Methods Galois16Mul::default_method(size_t regionSizeHint, unsigned inpu ? GF16_CLMUL_NEON : GF16_SHUFFLE_NEON; #endif #ifdef __riscv_ - const CpuCap caps(true); + const GF16CpuCap caps(true); if(caps.hasVector && gf16_available_rvv && gf16_rvv_get_size() >= 16) return GF16_SHUFFLE_128_RVV; #endif @@ -1437,7 +1432,7 @@ std::vector Galois16Mul::availableMethods(bool checkCpuid) { ret.push_back(GF16_LOOKUP3); #ifdef PLATFORM_X86 - const CpuCap caps(checkCpuid); + const GF16CpuCap caps(checkCpuid); if(gf16_shuffle_available_ssse3 && caps.hasSSSE3) ret.push_back(GF16_SHUFFLE_SSSE3); if(gf16_shuffle_available_avx && caps.hasAVX) @@ -1485,7 +1480,7 @@ std::vector Galois16Mul::availableMethods(bool checkCpuid) { } #endif #ifdef PLATFORM_ARM - const CpuCap caps(checkCpuid); + const GF16CpuCap caps(checkCpuid); if(gf16_available_neon && caps.hasNEON) { ret.push_back(GF16_SHUFFLE_NEON); ret.push_back(GF16_CLMUL_NEON); @@ -1505,7 +1500,7 @@ std::vector Galois16Mul::availableMethods(bool checkCpuid) { } #endif #ifdef __riscv - const CpuCap caps(checkCpuid); + const GF16CpuCap caps(checkCpuid); if(gf16_available_rvv && caps.hasVector && gf16_rvv_get_size() >= 16) ret.push_back(GF16_SHUFFLE_128_RVV); #endif diff --git a/hasher/hasher.cpp b/hasher/hasher.cpp index ce6e3f40..f3d23d59 100644 --- a/hasher/hasher.cpp +++ b/hasher/hasher.cpp @@ -8,11 +8,11 @@ uint32_t(*MD5CRC_Calc)(const void*, size_t, size_t, void*) = NULL; MD5CRCMethods MD5CRC_Method = MD5CRCMETH_SCALAR; uint32_t(*CRC32_Calc)(const void*, size_t) = NULL; MD5CRCMethods CRC32_Method = MD5CRCMETH_SCALAR; -struct CpuCap { +struct HasherCpuCap { #ifdef PLATFORM_X86 bool hasSSE2, 
hasClMul, hasXOP, hasBMI1, hasAVX2, hasAVX512F, hasAVX512VLBW; bool isSmallCore, isLEASlow, isVecRotSlow; - CpuCap(bool detect) : + HasherCpuCap(bool detect) : hasSSE2(true), hasClMul(true), hasXOP(true), hasBMI1(true), hasAVX2(true), hasAVX512F(true), hasAVX512VLBW(true), isSmallCore(false), isLEASlow(false), isVecRotSlow(false) { @@ -65,7 +65,7 @@ struct CpuCap { #endif #ifdef PLATFORM_ARM bool hasCRC, hasNEON, hasSVE2; - CpuCap(bool detect) : hasCRC(true), hasNEON(true), hasSVE2(true) { + HasherCpuCap(bool detect) : hasCRC(true), hasNEON(true), hasSVE2(true) { if(!detect) return; hasCRC = CPU_HAS_ARMCRC; hasNEON = CPU_HAS_NEON; @@ -83,7 +83,7 @@ void setup_hasher() { set_hasherMD5CRC(MD5CRCMETH_SCALAR); #ifdef PLATFORM_X86 - struct CpuCap caps(true); + struct HasherCpuCap caps(true); if(caps.hasAVX512VLBW && caps.hasClMul && !caps.isVecRotSlow && HasherInput_AVX512::isAvailable) set_hasherInput(INHASH_AVX512); @@ -111,7 +111,7 @@ void setup_hasher() { #endif #ifdef PLATFORM_ARM - struct CpuCap caps(true); + struct HasherCpuCap caps(true); if(caps.hasCRC && HasherInput_ARMCRC::isAvailable) // TODO: fast core only set_hasherInput(INHASH_CRC); @@ -527,7 +527,7 @@ std::vector hasherInput_availableMethods(bool checkCpuid) { ret.push_back(INHASH_SCALAR); #ifdef PLATFORM_X86 - const CpuCap caps(checkCpuid); + const HasherCpuCap caps(checkCpuid); if(caps.hasClMul) { if(caps.hasAVX512VLBW && HasherInput_AVX512::isAvailable) ret.push_back(INHASH_AVX512); @@ -542,7 +542,7 @@ std::vector hasherInput_availableMethods(bool checkCpuid) { ret.push_back(INHASH_SIMD); #endif #ifdef PLATFORM_ARM - const CpuCap caps(checkCpuid); + const HasherCpuCap caps(checkCpuid); if(caps.hasCRC && HasherInput_ARMCRC::isAvailable) ret.push_back(INHASH_CRC); if(caps.hasNEON && HasherInput_NEON::isAvailable) @@ -559,7 +559,7 @@ std::vector hasherMD5CRC_availableMethods(bool checkCpuid) { ret.push_back(MD5CRCMETH_SCALAR); #ifdef PLATFORM_X86 - const CpuCap caps(checkCpuid); + const HasherCpuCap 
caps(checkCpuid); if(caps.hasClMul) { if(caps.hasAVX512VLBW && MD5CRC_isAvailable_AVX512) ret.push_back(MD5CRCMETH_AVX512); @@ -572,7 +572,7 @@ std::vector hasherMD5CRC_availableMethods(bool checkCpuid) { } #endif #ifdef PLATFORM_ARM - const CpuCap caps(checkCpuid); + const HasherCpuCap caps(checkCpuid); if(caps.hasCRC && MD5CRC_isAvailable_ARMCRC) ret.push_back(MD5CRCMETH_ARMCRC); #endif @@ -585,7 +585,7 @@ std::vector hasherMD5Multi_availableMethods(bool checkCpuid) { ret.push_back(MD5MULT_SCALAR); #ifdef PLATFORM_X86 - const CpuCap caps(checkCpuid); + const HasherCpuCap caps(checkCpuid); if(caps.hasAVX512VLBW && MD5Multi_AVX512_256::isAvailable) ret.push_back(MD5MULT_AVX512VL); if(caps.hasAVX512F && MD5Multi_AVX512::isAvailable) @@ -598,7 +598,7 @@ std::vector hasherMD5Multi_availableMethods(bool checkCpuid) { ret.push_back(MD5MULT_SSE); #endif #ifdef PLATFORM_ARM - const CpuCap caps(checkCpuid); + const HasherCpuCap caps(checkCpuid); if(caps.hasSVE2 && MD5Multi_SVE2::isAvailable) ret.push_back(MD5MULT_SVE2); if(caps.hasNEON && MD5Multi_NEON::isAvailable) From 2757f9caf61c7a5a00c860e6d66aeaea460ad841 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 17:07:00 +1000 Subject: [PATCH 83/91] Fix UBSan warnings --- .github/workflows/test.yml | 2 +- gf16/gf16_affine_avx10.h | 16 ++++++++-------- gf16/gf16_muladd_multi.h | 15 +++++++++++---- gf16/gf16_shuffle_avx2.c | 14 +++++++------- gf16/gf16_xor_avx2.c | 16 ++++++++-------- gf16/gf16_xor_avx512.c | 16 ++++++++-------- gf16/gf16_xor_sse2.c | 26 +++++++++++++------------- hasher/md5mb-sse.h | 6 +++--- test/gf16/CMakeLists.txt | 3 +-- 9 files changed, 60 insertions(+), 54 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index da70268a..543b284e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -38,7 +38,7 @@ jobs: $env:SDE_PATH/sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe $env:SDE_PATH/sde -p4 -- test\hasher\build\${{ 
matrix.config }}\test.exe if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} - + # TODO: XOP tests for hasher? # test building only test-win-arm: diff --git a/gf16/gf16_affine_avx10.h b/gf16/gf16_affine_avx10.h index f2e9de1a..54ff86c7 100644 --- a/gf16/gf16_affine_avx10.h +++ b/gf16/gf16_affine_avx10.h @@ -199,10 +199,10 @@ GF16_MULADD_MULTI_FUNCS_STUB(gf16_affine, _FNSUFFIX) #ifdef _AVAILABLE -static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_muladd_2round)(const int srcCountOffs, const void* _src1, const void* _src2, _mword* result, _mword* swapped, _mword matNorm1, _mword matSwap1, _mword matNorm2, _mword matSwap2) { +static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_muladd_2round)(const int srcCountOffs, const uint8_t* _src1, const uint8_t* _src2, intptr_t srcOffset, _mword* result, _mword* swapped, _mword matNorm1, _mword matSwap1, _mword matNorm2, _mword matSwap2) { if(srcCountOffs < 0) return; - _mword data1 = _MMI(load)(_src1); + _mword data1 = _MMI(load)((const _mword*)(_src1 + srcOffset)); if(srcCountOffs == 0) { *result = _MMI(xor)( *result, @@ -214,7 +214,7 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_muladd_2round)(const int srcC ); } else { // if(srcCountOffs > 0) - _mword data2 = _MMI(load)(_src2); + _mword data2 = _MMI(load)((const _mword*)(_src2 + srcOffset)); *result = _MM(ternarylogic_epi32)( *result, _MM(gf2p8affine_epi64_epi8)(data1, matNorm1, 0), @@ -391,11 +391,11 @@ static HEDLEY_ALWAYS_INLINE void _FN(gf16_affine2x_muladd_x)( ); } - _FN(gf16_affine2x_muladd_2round)(srcCount - 4, _src4 + ptr*srcScale, _src5 + ptr*srcScale, &result, &swapped, matNormD, matSwapD, matNormE, matSwapE); - _FN(gf16_affine2x_muladd_2round)(srcCount - 6, _src6 + ptr*srcScale, _src7 + ptr*srcScale, &result, &swapped, matNormF, matSwapF, matNormG, matSwapG); - _FN(gf16_affine2x_muladd_2round)(srcCount - 8, _src8 + ptr*srcScale, _src9 + ptr*srcScale, &result, &swapped, matNormH, matSwapH, matNormI, matSwapI); - 
_FN(gf16_affine2x_muladd_2round)(srcCount - 10, _src10 + ptr*srcScale, _src11 + ptr*srcScale, &result, &swapped, matNormJ, matSwapJ, matNormK, matSwapK); - _FN(gf16_affine2x_muladd_2round)(srcCount - 12, _src12 + ptr*srcScale, _src13 + ptr*srcScale, &result, &swapped, matNormL, matSwapL, matNormM, matSwapM); + _FN(gf16_affine2x_muladd_2round)(srcCount - 4, _src4, _src5, ptr*srcScale, &result, &swapped, matNormD, matSwapD, matNormE, matSwapE); + _FN(gf16_affine2x_muladd_2round)(srcCount - 6, _src6, _src7, ptr*srcScale, &result, &swapped, matNormF, matSwapF, matNormG, matSwapG); + _FN(gf16_affine2x_muladd_2round)(srcCount - 8, _src8, _src9, ptr*srcScale, &result, &swapped, matNormH, matSwapH, matNormI, matSwapI); + _FN(gf16_affine2x_muladd_2round)(srcCount - 10, _src10, _src11, ptr*srcScale, &result, &swapped, matNormJ, matSwapJ, matNormK, matSwapK); + _FN(gf16_affine2x_muladd_2round)(srcCount - 12, _src12, _src13, ptr*srcScale, &result, &swapped, matNormL, matSwapL, matNormM, matSwapM); result = _MM(ternarylogic_epi32)( result, diff --git a/gf16/gf16_muladd_multi.h b/gf16/gf16_muladd_multi.h index f445dbb3..fc10c97e 100644 --- a/gf16/gf16_muladd_multi.h +++ b/gf16/gf16_muladd_multi.h @@ -78,6 +78,12 @@ typedef void (*const fMuladdPF) const int doPrefetch, const char* _pf ); +// suppress UBSan warning about adding to a NULL pointer; `coefficients` can be NULL from gf_add*, but it's never used there, and it's annoying to have to check and branch on these +#if defined(__clang__) +# define IGNORE_NULL_ADD __attribute__((no_sanitize("pointer-overflow"))) +#else +# define IGNORE_NULL_ADD +#endif static HEDLEY_ALWAYS_INLINE void gf16_muladd_single(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, uint16_t val) { muladd_pf( @@ -107,7 +113,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_prefetch_single(const void *HEDLEY_ #define REMAINING_CASES CASE(17); CASE(16); CASE(15); CASE(14); CASE(13); 
CASE(12); CASE(11); CASE(10); CASE( 9); CASE( 8); CASE( 7); CASE( 6); CASE( 5); CASE( 4); CASE( 3); CASE( 2); CASE( 1) -static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients) { +static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regions, size_t offset, void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients) IGNORE_NULL_ADD { uint8_t* _dst = (uint8_t*)dst + offset + len; #define _SRC(limit, n) limit > n ? (const uint8_t*)src[region+n] + offset + len : NULL @@ -148,7 +154,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi(const void *HEDLEY_RESTRICT s #undef _SRC } -static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_stridepf(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, const unsigned pfFactor, const void* HEDLEY_RESTRICT prefetch) { +static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_stridepf(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regions, size_t srcStride, void *HEDLEY_RESTRICT dst, const void *HEDLEY_RESTRICT src, size_t len, const uint16_t *HEDLEY_RESTRICT coefficients, const unsigned pfFactor, const void* HEDLEY_RESTRICT prefetch) IGNORE_NULL_ADD { uint8_t* _dst = (uint8_t*)dst + len; uint8_t* srcEnd = (uint8_t*)src + len; @@ -232,7 +238,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_stridepf(const void *HEDLEY_R #undef _SRC } -static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_packed(const void *HEDLEY_RESTRICT 
scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regionsPerCall, unsigned inputPackSize, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, size_t blockLen, const uint16_t *HEDLEY_RESTRICT coefficients) { +static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_packed(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regionsPerCall, unsigned inputPackSize, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, size_t blockLen, const uint16_t *HEDLEY_RESTRICT coefficients) IGNORE_NULL_ADD { ASSUME(regions <= inputPackSize); uint8_t* _dst = (uint8_t*)dst + len; @@ -351,7 +357,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_packed(const void *HEDLEY_RES # define MM_HINT_WT1 _MM_HINT_ET1 #endif -static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_packpf(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regionsPerCall, unsigned inputPackSize, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, size_t blockLen, const uint16_t *HEDLEY_RESTRICT coefficients, const unsigned pfFactor, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) { +static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_packpf(const void *HEDLEY_RESTRICT scratch, fMuladdPF muladd_pf, const unsigned interleave, unsigned regionsPerCall, unsigned inputPackSize, unsigned regions, void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, size_t blockLen, const uint16_t *HEDLEY_RESTRICT coefficients, const unsigned pfFactor, const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut) IGNORE_NULL_ADD { ASSUME(regions <= inputPackSize); uint8_t* _dst = (uint8_t*)dst + len; @@ -615,3 +621,4 @@ static HEDLEY_ALWAYS_INLINE void gf16_muladd_multi_packpf(const void *HEDLEY_RES } #undef REMAINING_CASES +#undef IGNORE_NULL_ADD diff --git 
a/gf16/gf16_shuffle_avx2.c b/gf16/gf16_shuffle_avx2.c index 28a7991e..5899d4c7 100644 --- a/gf16/gf16_shuffle_avx2.c +++ b/gf16/gf16_shuffle_avx2.c @@ -17,8 +17,8 @@ #include "gf16_muladd_multi.h" #if defined(_AVAILABLE) -static HEDLEY_ALWAYS_INLINE void gf16_shuffle2x_muladd_round_avx2(__m256i* _dst, const int srcCount, __m256i* _src1, __m256i* _src2, __m256i shufNormLoA, __m256i shufNormLoB, __m256i shufNormHiA, __m256i shufNormHiB, __m256i shufSwapLoA, __m256i shufSwapLoB, __m256i shufSwapHiA, __m256i shufSwapHiB) { - __m256i data = _mm256_load_si256(_src1); +static HEDLEY_ALWAYS_INLINE void gf16_shuffle2x_muladd_round_avx2(__m256i* _dst, const int srcCount, const uint8_t* _src1, const uint8_t* _src2, intptr_t srcOffset, __m256i shufNormLoA, __m256i shufNormLoB, __m256i shufNormHiA, __m256i shufNormHiB, __m256i shufSwapLoA, __m256i shufSwapLoB, __m256i shufSwapHiA, __m256i shufSwapHiB) { + __m256i data = _mm256_load_si256((const __m256i*)(_src1 + srcOffset)); __m256i mask = _mm256_set1_epi8(0x0f); __m256i ti = _mm256_and_si256(mask, data); @@ -32,7 +32,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle2x_muladd_round_avx2(__m256i* _dst, result = _mm256_xor_si256(result, _mm256_load_si256(_dst)); if(srcCount > 1) { - data = _mm256_load_si256(_src2); + data = _mm256_load_si256((const __m256i*)(_src2 + srcOffset)); ti = _mm256_and_si256(mask, data); result = _mm256_xor_si256(_mm256_shuffle_epi8(shufNormLoB, ti), result); @@ -117,7 +117,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle2x_muladd_x_avx2(const void *HEDLEY intptr_t ptr = -(intptr_t)len; if(len & (sizeof(__m256i)*2-1)) { // number of loop iterations isn't even, so do one iteration to make it even gf16_shuffle2x_muladd_round_avx2( - (__m256i*)(_dst+ptr), srcCount, (__m256i*)(_src1+ptr*srcScale), (__m256i*)(_src2+ptr*srcScale), + (__m256i*)(_dst+ptr), srcCount, _src1, _src2, ptr*srcScale, shufNormLoA, shufNormLoB, shufNormHiA, shufNormHiB, shufSwapLoA, shufSwapLoB, shufSwapHiA, shufSwapHiB ); 
if(doPrefetch == 1) @@ -128,12 +128,12 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle2x_muladd_x_avx2(const void *HEDLEY } while(ptr) { gf16_shuffle2x_muladd_round_avx2( - (__m256i*)(_dst+ptr), srcCount, (__m256i*)(_src1+ptr*srcScale), (__m256i*)(_src2+ptr*srcScale), + (__m256i*)(_dst+ptr), srcCount, _src1, _src2, ptr*srcScale, shufNormLoA, shufNormLoB, shufNormHiA, shufNormHiB, shufSwapLoA, shufSwapLoB, shufSwapHiA, shufSwapHiB ); ptr += sizeof(__m256i); gf16_shuffle2x_muladd_round_avx2( - (__m256i*)(_dst+ptr), srcCount, (__m256i*)(_src1+ptr*srcScale), (__m256i*)(_src2+ptr*srcScale), + (__m256i*)(_dst+ptr), srcCount, _src1, _src2, ptr*srcScale, shufNormLoA, shufNormLoB, shufNormHiA, shufNormHiB, shufSwapLoA, shufSwapLoB, shufSwapHiA, shufSwapHiB ); @@ -146,7 +146,7 @@ static HEDLEY_ALWAYS_INLINE void gf16_shuffle2x_muladd_x_avx2(const void *HEDLEY } else { for(intptr_t ptr = -(intptr_t)len; ptr; ptr += sizeof(__m256i)) { gf16_shuffle2x_muladd_round_avx2( - (__m256i*)(_dst+ptr), srcCount, (__m256i*)(_src1+ptr*srcScale), (__m256i*)(_src2+ptr*srcScale), + (__m256i*)(_dst+ptr), srcCount, _src1, _src2, ptr*srcScale, shufNormLoA, shufNormLoB, shufNormHiA, shufNormHiB, shufSwapLoA, shufSwapLoB, shufSwapHiA, shufSwapHiB ); } diff --git a/gf16/gf16_xor_avx2.c b/gf16/gf16_xor_avx2.c index 219b05ae..019af819 100644 --- a/gf16/gf16_xor_avx2.c +++ b/gf16/gf16_xor_avx2.c @@ -477,22 +477,22 @@ static HEDLEY_ALWAYS_INLINE __m256i gf16_xor_finish_extract_bits(__m256i src) { static HEDLEY_ALWAYS_INLINE void gf16_xor_finish_extract_bits_store(uint32_t* dst, __m256i src) { __m256i srcShifted = _mm256_add_epi8(src, src); __m256i lane = _mm256_inserti128_si256(srcShifted, _mm256_castsi256_si128(src), 1); - dst[3] = _mm256_movemask_epi8(lane); + write32(dst+3, _mm256_movemask_epi8(lane)); lane = _mm256_slli_epi16(lane, 2); - dst[2] = _mm256_movemask_epi8(lane); + write32(dst+2, _mm256_movemask_epi8(lane)); lane = _mm256_slli_epi16(lane, 2); - dst[1] = _mm256_movemask_epi8(lane); + 
write32(dst+1, _mm256_movemask_epi8(lane)); lane = _mm256_slli_epi16(lane, 2); - dst[0] = _mm256_movemask_epi8(lane); + write32(dst+0, _mm256_movemask_epi8(lane)); lane = _mm256_permute2x128_si256(srcShifted, src, 0x31); - dst[7] = _mm256_movemask_epi8(lane); + write32(dst+7, _mm256_movemask_epi8(lane)); lane = _mm256_slli_epi16(lane, 2); - dst[6] = _mm256_movemask_epi8(lane); + write32(dst+6, _mm256_movemask_epi8(lane)); lane = _mm256_slli_epi16(lane, 2); - dst[5] = _mm256_movemask_epi8(lane); + write32(dst+5, _mm256_movemask_epi8(lane)); lane = _mm256_slli_epi16(lane, 2); - dst[4] = _mm256_movemask_epi8(lane); + write32(dst+4, _mm256_movemask_epi8(lane)); } #define LOAD_HALVES(a, b, upper) \ diff --git a/gf16/gf16_xor_avx512.c b/gf16/gf16_xor_avx512.c index d87d79e1..6fea216d 100644 --- a/gf16/gf16_xor_avx512.c +++ b/gf16/gf16_xor_avx512.c @@ -1024,20 +1024,20 @@ static HEDLEY_ALWAYS_INLINE void gf16_xor_finish_bit_extract(uint64_t* dst, __m5 0x10101010, 0x10101010, 0x10101010, 0x10101010 ); __m512i lane = _mm512_shuffle_i32x4(src, src, _MM_SHUFFLE(0,0,0,0)); - dst[0] = _mm512_test_epi8_mask(lane, lo_nibble_test); - dst[1] = _mm512_test_epi8_mask(lane, hi_nibble_test); + write64(dst+0, _mm512_test_epi8_mask(lane, lo_nibble_test)); + write64(dst+1, _mm512_test_epi8_mask(lane, hi_nibble_test)); lane = _mm512_shuffle_i32x4(src, src, _MM_SHUFFLE(1,1,1,1)); - dst[32 +0] = _mm512_test_epi8_mask(lane, lo_nibble_test); - dst[32 +1] = _mm512_test_epi8_mask(lane, hi_nibble_test); + write64(dst+32 +0, _mm512_test_epi8_mask(lane, lo_nibble_test)); + write64(dst+32 +1, _mm512_test_epi8_mask(lane, hi_nibble_test)); lane = _mm512_shuffle_i32x4(src, src, _MM_SHUFFLE(2,2,2,2)); - dst[64 +0] = _mm512_test_epi8_mask(lane, lo_nibble_test); - dst[64 +1] = _mm512_test_epi8_mask(lane, hi_nibble_test); + write64(dst+64 +0, _mm512_test_epi8_mask(lane, lo_nibble_test)); + write64(dst+64 +1, _mm512_test_epi8_mask(lane, hi_nibble_test)); lane = _mm512_shuffle_i32x4(src, src, 
_MM_SHUFFLE(3,3,3,3)); - dst[96 +0] = _mm512_test_epi8_mask(lane, lo_nibble_test); - dst[96 +1] = _mm512_test_epi8_mask(lane, hi_nibble_test); + write64(dst+96 +0, _mm512_test_epi8_mask(lane, lo_nibble_test)); + write64(dst+96 +1, _mm512_test_epi8_mask(lane, hi_nibble_test)); } static HEDLEY_ALWAYS_INLINE void _gf16_xor_finish_copy_block_avx512(void* dst, const void* src) { diff --git a/gf16/gf16_xor_sse2.c b/gf16/gf16_xor_sse2.c index fe68fbbf..c5dcb948 100644 --- a/gf16/gf16_xor_sse2.c +++ b/gf16/gf16_xor_sse2.c @@ -271,7 +271,7 @@ static HEDLEY_ALWAYS_INLINE void STOREU_XMM(void* dest, __m128i xmm) { #define CMOV(c, d, s) if(c) (d) = (s) #endif -static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3(uint8_t* dest, uint_fast32_t mask, __m128i* tCode, uint16_t* tInfo, intptr_t* posC, unsigned long* movC, uint_fast8_t isR64) { +static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3(uint8_t* dest, uint_fast32_t mask, __m128i* tCode, uint16_t* tInfo, intptr_t* posC, long* movC, uint_fast8_t isR64) { uint_fast16_t info = tInfo[mask>>1]; intptr_t pC = info >> 12; @@ -281,12 +281,12 @@ static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3(uint8_t* dest, uint_f // handle conditional move for common mask (since it's always done) CMOV(*movC, *posC, pC+isR64); *posC -= info & 0xF; - *movC &= -(pC == 0); + *movC &= -(long)(pC == 0); return info; } -static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3_noxor(uint8_t* dest, uint_fast16_t info, intptr_t* pos1, unsigned long* mov1, intptr_t* pos2, unsigned long* mov2, int isR64) { +static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3_noxor(uint8_t* dest, uint_fast16_t info, intptr_t* pos1, long* mov1, intptr_t* pos2, long* mov2, int isR64) { UNUSED(dest); uintptr_t p1 = (info >> 4) & 0xF; uintptr_t p2 = (info >> 8) & 0xF; @@ -294,12 +294,12 @@ static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3_noxor(uint8_t* dest, CMOV(*mov2, *pos2, p2+isR64); *pos1 -= info & 0xF; *pos2 -= info & 0xF; - *mov1 &= 
-(p1 == 0); - *mov2 &= -(p2 == 0); + *mov1 &= -(long)(p1 == 0); + *mov2 &= -(long)(p2 == 0); return info & 0xF; } -static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3_nc_noxor(uint8_t* dest, uint_fast16_t info, intptr_t* pos1, unsigned long* mov1, intptr_t* pos2, unsigned long* mov2, int isR64) { +static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3_nc_noxor(uint8_t* dest, uint_fast16_t info, intptr_t* pos1, long* mov1, intptr_t* pos2, long* mov2, int isR64) { UNUSED(dest); uintptr_t p1 = (info >> 8) & 0xF; uintptr_t p2 = info >> 12; @@ -307,8 +307,8 @@ static HEDLEY_ALWAYS_INLINE uint_fast16_t xor_jit_bitpair3_nc_noxor(uint8_t* des CMOV(*mov2, *pos2, p2+isR64); *pos1 -= info & 0xF; *pos2 -= info & 0xF; - *mov1 &= -(p1 == 0); - *mov2 &= -(p2 == 0); + *mov1 &= -(long)(p1 == 0); + *mov2 &= -(long)(p2 == 0); return info & 0xF; } #undef CMOV @@ -499,7 +499,7 @@ static inline void* xor_write_jit_sse(const struct gf16_xor_scratch *HEDLEY_REST for(bit=0; bit<8; bit++) { int destOffs = (bit<<5)-128; int destOffs2 = destOffs+16; - unsigned long movC = 0xFF; + long movC = 0xFF; intptr_t posC = 0; uint_fast32_t mask = lumask[bit]; _LD_APS(0, DX, destOffs); @@ -572,8 +572,8 @@ static inline void* xor_write_jit_sse(const struct gf16_xor_scratch *HEDLEY_REST for(bit=0; bit<8; bit++) { int destOffs = (bit<<5)-128; int destOffs2 = destOffs+16; - unsigned long mov1 = 0xFF, mov2 = 0xFF, - movC = 0xFF; + long mov1 = 0xFF, mov2 = 0xFF, + movC = 0xFF; intptr_t pos1 = 0, pos2 = 0, posC = 0; uint_fast32_t mask = lumask[bit]; @@ -1012,10 +1012,10 @@ void gf16_xor_muladd_sse2(const void *HEDLEY_RESTRICT scratch, void *HEDLEY_REST srcDQh = _mm_unpackhi_epi64(srcQ0d, srcQ8d) #define EXTRACT_BITS(target, srcVec) \ - (target)[7] = _mm_movemask_epi8(srcVec); \ + write16((target) + 7, _mm_movemask_epi8(srcVec)); \ for(int i=6; i>=0; i--) { \ srcVec = _mm_add_epi8(srcVec, srcVec); \ - (target)[i] = _mm_movemask_epi8(srcVec); \ + write16((target) + i, _mm_movemask_epi8(srcVec)); \ } 
void gf16_xor_finish_block_sse2(void *HEDLEY_RESTRICT dst) { uint16_t* _dst = (uint16_t*)dst; diff --git a/hasher/md5mb-sse.h b/hasher/md5mb-sse.h index 81e2503a..3eb26329 100644 --- a/hasher/md5mb-sse.h +++ b/hasher/md5mb-sse.h @@ -59,7 +59,7 @@ static HEDLEY_ALWAYS_INLINE void md5_extract_mb_sse(void* dst, void* state, int idx) { - HEDLEY_ASSUME(idx < md5mb_regions_sse); + HEDLEY_ASSUME(idx >= 0 && idx < md5mb_regions_sse*2); // 2 = md5mb_interleave __m128i* state_ = (__m128i*)state + (idx & 4); __m128i tmp1 = _mm_unpacklo_epi32(state_[0], state_[1]); __m128i tmp2 = _mm_unpackhi_epi32(state_[0], state_[1]); @@ -269,7 +269,7 @@ static HEDLEY_ALWAYS_INLINE void md5_extract_all_mb_sse(void* dst, void* state, static HEDLEY_ALWAYS_INLINE void md5_extract_mb_avx2(void* dst, void* state, int idx) { - HEDLEY_ASSUME(idx < md5mb_regions_avx2); + HEDLEY_ASSUME(idx >= 0 && idx < md5mb_regions_avx2*2); __m256i* state_ = (__m256i*)state + ((idx & 8) >> 1); __m256i tmpAB0 = _mm256_unpacklo_epi32(state_[0], state_[1]); __m256i tmpAB2 = _mm256_unpackhi_epi32(state_[0], state_[1]); @@ -477,7 +477,7 @@ static HEDLEY_ALWAYS_INLINE void md5_extract_all_mb_avx2(void* dst, void* state, #undef LOAD16 static HEDLEY_ALWAYS_INLINE void md5_extract_mb_avx512(void* dst, void* state, int idx) { - HEDLEY_ASSUME(idx < md5mb_regions_avx512); + HEDLEY_ASSUME(idx >= 0 && idx < md5mb_regions_avx512*2); __m512i* state_ = (__m512i*)state + ((idx & 16) >> 2); __m512i tmpAB0 = _mm512_unpacklo_epi32(state_[0], state_[1]); __m512i tmpAB2 = _mm512_unpackhi_epi32(state_[0], state_[1]); diff --git a/test/gf16/CMakeLists.txt b/test/gf16/CMakeLists.txt index 49b716ae..66af243b 100644 --- a/test/gf16/CMakeLists.txt +++ b/test/gf16/CMakeLists.txt @@ -137,8 +137,7 @@ else() endif() if(ENABLE_SANITIZE) - set(SANITIZE_OPTS -fsanitize=address 
-fsanitize=bool,builtin,bounds,enum,float-cast-overflow,function,integer-divide-by-zero,nonnull-attribute,null,object-size,return,returns-nonnull-attribute,shift,signed-integer-overflow,unreachable,vla-bound -fno-sanitize-recover=all) - # -fsanitize=pointer-overflow causes compilation of shuffle_avx512 to freeze on clang10 + set(SANITIZE_OPTS -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all) # -fsanitize=memory requires instrumented libraries, so not useful add_compile_options(-fno-omit-frame-pointer ${SANITIZE_OPTS}) add_link_options(${SANITIZE_OPTS}) From 91b8c39f3771b7a9819c402ceac3cf7f52e95d44 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 20:14:18 +1000 Subject: [PATCH 84/91] Test workflow fixes --- .github/workflows/test.yml | 31 +++++++++++++++++-------------- gf16/threadqueue.h | 4 +++- gf16/x86_jit.h | 4 +++- src/cpuid.h | 4 +++- 4 files changed, 26 insertions(+), 17 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 543b284e..8357eec4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,19 +24,19 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test.exe - - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test-ctrl.exe -f + - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test.exe" + - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test-pmul.exe" + - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test-ctrl.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 
'v143') }} - - run: $env:SDE_PATH/sde -icx -- test\gf16\build\${{ matrix.config }}\test-inv.exe -f + - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test-inv.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: $env:SDE_PATH/sde -icx -- test\hasher\build\${{ matrix.config }}\test.exe + - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/hasher/build/${{ matrix.config }}/test.exe" # test SSE2-only to see if CPUID checking works - run: | - $env:SDE_PATH/sde -p4 -- test\gf16\build\${{ matrix.config }}\test.exe - $env:SDE_PATH/sde -p4 -- test\gf16\build\${{ matrix.config }}\test-pmul.exe - $env:SDE_PATH/sde -p4 -- test\hasher\build\${{ matrix.config }}\test.exe + Invoke-Expression "$env:SDE_PATH/sde -p4 -- test/gf16/build/${{ matrix.config }}/test.exe" + Invoke-Expression "$env:SDE_PATH/sde -p4 -- test/gf16/build/${{ matrix.config }}/test-pmul.exe" + Invoke-Expression "$env:SDE_PATH/sde -p4 -- test/hasher/build/${{ matrix.config }}/test.exe" if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} # TODO: XOP tests for hasher? 
@@ -68,11 +68,14 @@ jobs: strategy: fail-fast: false matrix: - sys: [mingw32, ucrt64, clang64] + sys: + - { sys: mingw32, env: i686 } + - { sys: ucrt64, env: ucrt-x86_64 } + - { sys: clang64, env: clang-x86_64 } compiler: - - {cc: gcc, cxx: g++, cc_extra: ""} - - {cc: clang, cxx: clang++, cc_extra: gcc} - name: Test MSYS ${{matrix.sys}} ${{matrix.compiler.cc}} + - {cc: gcc, cxx: g++} + - {cc: clang, cxx: clang++} + name: Test MSYS ${{matrix.sys.sys}} ${{matrix.compiler.cc}} defaults: run: shell: msys2 {0} @@ -80,9 +83,9 @@ jobs: #- uses: petarpetrovt/setup-sde@v2.1 - uses: msys2/setup-msys2@v2 with: - msystem: ${{matrix.sys}} + msystem: ${{matrix.sys.sys}} #update: true - install: cmake ${{matrix.compiler.cc}} ${{matrix.compiler.cc_extra}} make git + install: cmake mingw-w64-${{matrix.sys.env}}-${{matrix.compiler.cc}} make git - uses: actions/checkout@v3 - run: | mkdir test/gf16/build diff --git a/gf16/threadqueue.h b/gf16/threadqueue.h index 3db53cb3..0b3c23f8 100644 --- a/gf16/threadqueue.h +++ b/gf16/threadqueue.h @@ -201,7 +201,9 @@ typedef std::function&)> thread_cb_t; #if defined(_WINDOWS) || defined(__WINDOWS__) || defined(_WIN32) || defined(_WIN64) -# define NOMINMAX +# ifndef NOMINMAX +# define NOMINMAX +# endif # define WIN32_LEAN_AND_MEAN # include #else diff --git a/gf16/x86_jit.h b/gf16/x86_jit.h index 2597d9d8..08636a94 100644 --- a/gf16/x86_jit.h +++ b/gf16/x86_jit.h @@ -702,7 +702,9 @@ typedef struct { } jit_wx_pair; #if defined(_WINDOWS) || defined(__WINDOWS__) || defined(_WIN32) || defined(_WIN64) -# define NOMINMAX +# ifndef NOMINMAX +# define NOMINMAX +# endif # include static HEDLEY_ALWAYS_INLINE jit_wx_pair* jit_alloc(size_t len) { void* mem = VirtualAlloc(NULL, len, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE); diff --git a/src/cpuid.h b/src/cpuid.h index 6d7c1c11..c44cabd3 100644 --- a/src/cpuid.h +++ b/src/cpuid.h @@ -46,7 +46,9 @@ # include # elif defined(_WIN32) # define WIN32_LEAN_AND_MEAN -# define NOMINMAX +# ifndef NOMINMAX +# 
define NOMINMAX +# endif # include # elif defined(__APPLE__) # include From 7b6fb6660a492a5fbbdbea9ea46bc5aba801d7bf Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 20:36:46 +1000 Subject: [PATCH 85/91] Add full test workflow --- .github/workflows/test-full.yml | 36 ++++++++ test/cached-cmpref-fast.json | 1 + test/par-compare.js | 149 ++++++++++++++++++-------------- 3 files changed, 120 insertions(+), 66 deletions(-) create mode 100644 .github/workflows/test-full.yml create mode 100644 test/cached-cmpref-fast.json diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml new file mode 100644 index 00000000..b936c7cc --- /dev/null +++ b/.github/workflows/test-full.yml @@ -0,0 +1,36 @@ +name: Run PAR2 Create Tests +on: + workflow_dispatch: + push: + +jobs: + test-node: + strategy: + fail-fast: false + matrix: + include: + - version: '0.10.40' + flags: '' + python2: true + - version: '4.9.1' + flags: '' + python2: true + - version: '12.22.12' + flags: '--trace-warnings' + python2: false + - version: '20.5.1' + flags: '--pending-deprecation --throw-deprecation --trace-warnings' + python2: false + name: Test on Node v${{ matrix.version }} + runs-on: ubuntu-latest + steps: + - uses: MatteoH2O1999/setup-python@v1 + with: + python-version: '2.7' + if: ${{ matrix.python2 }} + - uses: actions/checkout@v3 + - uses: actions/setup-node@v3 + with: + node-version: ${{ matrix.version }} + - run: (npm install --production + - run: node ${{ matrix.flags }} test/par-compare.js -f diff --git a/test/cached-cmpref-fast.json b/test/cached-cmpref-fast.json new file mode 100644 index 00000000..3b0ac589 --- /dev/null +++ b/test/cached-cmpref-fast.json @@ -0,0 +1 @@ +{"0":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"384329a66d8557f9c35b05e2d391b2db","len":262152},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"2e968c2b42a5e148ee1da7556645ec19","len":262152},"recovery2":{"type":"PAR 
2.0\u0000RecvSlic","md5":"f9cbb890066d9b19df7fb43cdec29f89","len":262152},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee74b923884c67a0f9c75f8db2a4946a","len":262152},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"c95cc5d5456cc427eb3f05cab6e7712d","len":262152},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"b789836bb5fda2ad650ed492693c24d0","len":262152},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"30c5398aa3faf8b4306d4010dde7f34a","len":262152},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"3652d2c728cf4e5cf0305e05237bc8b4","len":262152},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"8c51f2e4e80ac1f825bc98ce1ee50137","len":262152},"main":{"type":"PAR 2.0\u0000Main","md5":"a1ab3aa1dd29953af5f118e683a1ebef","len":92},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"527d15c6ae89dd498b68d283cb13a04d","len":262152},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"5a841a77e7876ad22c0860da5f6ed754","len":262152},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"2d942f333b87e8242d5addbfc309c9c0","len":262152},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"f8c66e0c1496a0ab21c6af961c8a8a73","len":262152},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"3419f14c6492702df0b75e1eac8a0e07","len":262152},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"50b3d0e1f17909a0d32b1a9c5a983ce2","len":262152},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab9251ff754860ecff97ecb45d113171","len":262152},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"eda7ed48e29ee51e543eae412afa3e38","len":262152},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"ba3580d34dbf456def2775e5f47fe32f","len":132},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"fe34f3c978e36050cc0324ebd10f2113","len":262152},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"770f8e4351f416d01b4e5905e6182114","len":262152},"recovery19":{"type":"PAR 
2.0\u0000RecvSlic","md5":"3ffc5b30c2ae5f68d97db47093f093b7","len":262152},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"203bf0ad82cefd3d9240a50883226500","len":262152},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"7083ddb527d8f284a27b965dbb3f71ce","len":262152},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"8972d12d517f515153e4217593e38cbd","len":262152},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"378c7d54aa0e47c1be08062302c2d615","len":262152},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"d3b09495cfc309312879658b0e636063","len":262152},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1883c8c7fa2b7da5978e4b5199f6e168","len":5220},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"2c57c1edbd2ecdb08710d02cd8b708c6","len":262152},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"8dea37ee27350c5d4e14f60749484d58","len":262152},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"b0dee7d46d4c23673394b693f27f3766","len":262152},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"c62c595a6e2990882d7cadff631c56de","len":262152},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"08eff6bf1aed164dab1b5144c93a339e","len":262152},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"c9e1ea86f490cb436315b2a5dfe27700","len":262152},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"62c45fbbc54d5ba2aab1e4f219639e16","len":262152},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"76d12aa5f889db92da1b4d90cb70ec28","len":262152},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"cbb84f9b1dd02e5e6cce7d385aefc47e","len":262152},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"65ea8fae55f8e2b93ffca5cfa59417b3","len":262152},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"308bd5740cd38138b61a6a659ab3489d","len":262152},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"725f7e7221786949fb166a17cea3d4fe","len":262152},"recovery37":{"type":"PAR 
2.0\u0000RecvSlic","md5":"94b230fc6b49f004a0b35fca8321953f","len":262152},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"37d8119968567d0a7e7f04b0e0e10d7e","len":262152},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"381fdb3aa50e50cbfc3692633c89c847","len":262152},"recovery40":{"type":"PAR 2.0\u0000RecvSlic","md5":"a9dfadf596069976c88a1943099e0284","len":262152},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"2968cf0b54fc2d2f28777d67a6cb74ba","len":262152},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"59682b0bd874f1f6f2dc8f8a37d065f7","len":262152},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"f4abc2e61efa29e839e8c54a8597b3cd","len":262152},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"d827bb49770c8dd9397e1dc3e435f7b4","len":262152},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"572d530fcddea95b69daa2c5c49ca59e","len":262152},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"2eea5c67990da60d4af649369ee57a4e","len":262152},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"fbdde1ef3f31e9b49e200583f3ea3162","len":262152},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"c6d04af4d5bddebd010e2c9087835ad6","len":262152},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"67d807315ac9bdf41ff67f27a1b9e9da","len":262152},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"cc9a11df7a97fb9a0169f63150aff0d2","len":262152},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"8fa64d50c2e72c70d47c527a412c6ebc","len":262152},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b18bcf9f7017a7a8a261cf34c570817","len":262152},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"5dac9d8f1e5754d818a724841712d518","len":262152},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"99395b4ea95dfb1e30421850562fccae","len":262152},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"e2470652540238a6d4e4ee4a0f41b288","len":262152},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"7578ea0996471412fa31d42b4b6793fc","len":262152},"recovery57":{"type":"PAR 
2.0\u0000RecvSlic","md5":"c324e0207bf219380c8498200d187e09","len":262152},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f5237508b02edaeea3b7566aaab1c8f","len":262152},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"45b6aa9de72aa9ecb63b77a4c6469f08","len":262152},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"d171092245e7736d740668d4bf88b69d","len":262152},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"0a55521c05a72864671e695a953d7f63","len":262152},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"352e09046f010c26bcebf1caf6a31008","len":262152},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"1a88541f965eea59e425f5a0ee54e7ea","len":262152},"recovery64":{"type":"PAR 2.0\u0000RecvSlic","md5":"16de857133d3118e86a0ea8ec2b32108","len":262152},"recovery65":{"type":"PAR 2.0\u0000RecvSlic","md5":"7e86600a79c8e4255d6b16051997160c","len":262152},"recovery66":{"type":"PAR 2.0\u0000RecvSlic","md5":"685f9d66019a07b7663c1cc3333d27ab","len":262152},"recovery67":{"type":"PAR 2.0\u0000RecvSlic","md5":"bd358825cd83587525a2707e3d966696","len":262152},"recovery68":{"type":"PAR 2.0\u0000RecvSlic","md5":"98db58ba135f4006057cef5ad69029f8","len":262152},"recovery69":{"type":"PAR 2.0\u0000RecvSlic","md5":"4ee1489fa618a21a3b3366e0c8322165","len":262152},"recovery70":{"type":"PAR 2.0\u0000RecvSlic","md5":"d236c0a9ed4b8e5915bc8c265bb06aca","len":262152},"recovery71":{"type":"PAR 2.0\u0000RecvSlic","md5":"b9d1304bd26aaefa615f00737a864779","len":262152},"recovery72":{"type":"PAR 2.0\u0000RecvSlic","md5":"d8b0767636b21e45d207cc97617666a3","len":262152},"recovery73":{"type":"PAR 2.0\u0000RecvSlic","md5":"e104cc83447db74e477a6dc172e4a44b","len":262152},"recovery74":{"type":"PAR 2.0\u0000RecvSlic","md5":"47a8035c958e238f18d8365c27d4ed34","len":262152},"recovery75":{"type":"PAR 2.0\u0000RecvSlic","md5":"b1d0bd7ef446e2cfcbbc882324f6a64b","len":262152},"recovery76":{"type":"PAR 2.0\u0000RecvSlic","md5":"d66d76a5e5ea3aaeb7693bfd4d79c38b","len":262152},"recovery77":{"type":"PAR 
2.0\u0000RecvSlic","md5":"92f12b3316b91c0e478b71db38f662b3","len":262152},"recovery78":{"type":"PAR 2.0\u0000RecvSlic","md5":"19640a449419963e7035979c5a420833","len":262152},"recovery79":{"type":"PAR 2.0\u0000RecvSlic","md5":"0a6755283ec0bea1915a63ddff53f349","len":262152},"recovery80":{"type":"PAR 2.0\u0000RecvSlic","md5":"ca91dc007eb44c1f55f889cc7375748a","len":262152},"recovery81":{"type":"PAR 2.0\u0000RecvSlic","md5":"592ea92cac06a328f6b7b00153ce343a","len":262152},"recovery82":{"type":"PAR 2.0\u0000RecvSlic","md5":"5154fae148e8c21e507215d740ce275c","len":262152},"recovery83":{"type":"PAR 2.0\u0000RecvSlic","md5":"79a4101d6c828627657c6c1dd0b775e0","len":262152},"recovery84":{"type":"PAR 2.0\u0000RecvSlic","md5":"98b70a8f1dda25aaf4cd6490e7d75a71","len":262152},"recovery85":{"type":"PAR 2.0\u0000RecvSlic","md5":"6f04b148eadd8c49cd620f50cc4d254d","len":262152},"recovery86":{"type":"PAR 2.0\u0000RecvSlic","md5":"182191f40c369d41af2d331056975cbd","len":262152},"recovery87":{"type":"PAR 2.0\u0000RecvSlic","md5":"eb514ff59b226f072aed7f351ef62890","len":262152},"recovery88":{"type":"PAR 2.0\u0000RecvSlic","md5":"cfe974ad0e408adcae5c4359e9333167","len":262152},"recovery89":{"type":"PAR 2.0\u0000RecvSlic","md5":"19550d9807d635fa0c76430a90ef9b2d","len":262152},"recovery90":{"type":"PAR 2.0\u0000RecvSlic","md5":"d8b0bed1ee4a5791b1bf6b339096cb5f","len":262152},"recovery91":{"type":"PAR 2.0\u0000RecvSlic","md5":"a5d9e00c46253847f07171849d6eadc7","len":262152},"recovery92":{"type":"PAR 2.0\u0000RecvSlic","md5":"82d7d59990475c76797adafe0dc40696","len":262152},"recovery93":{"type":"PAR 2.0\u0000RecvSlic","md5":"097f1e8dd4ccb3592b8da5723ef59cd4","len":262152},"recovery94":{"type":"PAR 2.0\u0000RecvSlic","md5":"ded7762317423bd6ba8c1c70ee003895","len":262152},"recovery95":{"type":"PAR 2.0\u0000RecvSlic","md5":"b20168baef01265ca59af7749a004f19","len":262152},"recovery96":{"type":"PAR 2.0\u0000RecvSlic","md5":"f2a61d2c37afd4e32380c5f974c718c4","len":262152},"recovery97":{"type":"PAR 
2.0\u0000RecvSlic","md5":"c505c8d454e6bd6805c19983c4b3becb","len":262152},"recovery98":{"type":"PAR 2.0\u0000RecvSlic","md5":"451a64afafa5742e04a006595e3b76ed","len":262152},"recovery99":{"type":"PAR 2.0\u0000RecvSlic","md5":"27122f17a65d56f09a5a99ec15023906","len":262152},"recovery100":{"type":"PAR 2.0\u0000RecvSlic","md5":"dd77e86b9c6a708f0fdef896794e1416","len":262152},"recovery101":{"type":"PAR 2.0\u0000RecvSlic","md5":"4b66fb1a91751f8fa60a26d01266773b","len":262152},"recovery102":{"type":"PAR 2.0\u0000RecvSlic","md5":"17396f15a240e560fd0abbf793cff777","len":262152},"recovery103":{"type":"PAR 2.0\u0000RecvSlic","md5":"db3e3eefb31e2e13de7f6b792d08fab6","len":262152},"recovery104":{"type":"PAR 2.0\u0000RecvSlic","md5":"96c045e794657f81be4f550931d09f86","len":262152},"recovery105":{"type":"PAR 2.0\u0000RecvSlic","md5":"8b2d795b329310ade27c930c0c9ac3a5","len":262152},"recovery106":{"type":"PAR 2.0\u0000RecvSlic","md5":"5429af73f35f0c3aec714c0c2bb6fd63","len":262152},"recovery107":{"type":"PAR 2.0\u0000RecvSlic","md5":"ba08b07cbfaa853ad84d02e16862b0c4","len":262152},"recovery108":{"type":"PAR 2.0\u0000RecvSlic","md5":"73a6530cfe5bba58e1fec23eabeb8e6f","len":262152},"recovery109":{"type":"PAR 2.0\u0000RecvSlic","md5":"01eb64dc438d15b3d37ad07a751ba19e","len":262152},"recovery110":{"type":"PAR 2.0\u0000RecvSlic","md5":"055ff87be7d79d6a503ec16287cd7e1d","len":262152},"recovery111":{"type":"PAR 2.0\u0000RecvSlic","md5":"f1d7485cfe6bceb4eaba44d23362a168","len":262152},"recovery112":{"type":"PAR 2.0\u0000RecvSlic","md5":"60f1e6f7d1317bf376fb6c3fd6470db0","len":262152},"recovery113":{"type":"PAR 2.0\u0000RecvSlic","md5":"371f5d440f6fb38a8dbd1bb08426d734","len":262152},"recovery114":{"type":"PAR 2.0\u0000RecvSlic","md5":"1088a949951fc9d698682d724e67dd7b","len":262152},"recovery115":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4a1947699b7877aca19f2e9ba66d3a6","len":262152},"recovery116":{"type":"PAR 
2.0\u0000RecvSlic","md5":"39aecd2559c82e37e66328bcd8ece41d","len":262152},"recovery117":{"type":"PAR 2.0\u0000RecvSlic","md5":"764fab8b1ebda52512b3bbeefb9a9bc2","len":262152},"recovery118":{"type":"PAR 2.0\u0000RecvSlic","md5":"b9be02bf929a30beec27ada23f59b6a3","len":262152},"recovery119":{"type":"PAR 2.0\u0000RecvSlic","md5":"03c4ba2e7b40c211a91300091cdfaea1","len":262152},"recovery120":{"type":"PAR 2.0\u0000RecvSlic","md5":"7f161539917d7aed2bdcfc6a97bf1243","len":262152},"recovery121":{"type":"PAR 2.0\u0000RecvSlic","md5":"06cb8f2d83a03ad41d991c95cd93df59","len":262152},"recovery122":{"type":"PAR 2.0\u0000RecvSlic","md5":"03c36c76eeaf4c7c3d669133896b673e","len":262152},"recovery123":{"type":"PAR 2.0\u0000RecvSlic","md5":"2701161fade73a745dfe8fc06d097aa5","len":262152},"recovery124":{"type":"PAR 2.0\u0000RecvSlic","md5":"02579b843bcf0750cf7595c84c4a1b2c","len":262152},"recovery125":{"type":"PAR 2.0\u0000RecvSlic","md5":"8af83a50b1e0c00dc811d24090466dc6","len":262152},"recovery126":{"type":"PAR 2.0\u0000RecvSlic","md5":"5fbcd8d0454c6674bd236ccfab72f784","len":262152},"recovery127":{"type":"PAR 2.0\u0000RecvSlic","md5":"2aae76e0ee91e98cfcad8b6edf64b30a","len":262152},"recovery128":{"type":"PAR 2.0\u0000RecvSlic","md5":"3c80e0a3b4aced780158b2cda147025b","len":262152},"recovery129":{"type":"PAR 2.0\u0000RecvSlic","md5":"0d714844585e05d9b8ecca9b3ce144e2","len":262152},"recovery130":{"type":"PAR 2.0\u0000RecvSlic","md5":"3ff70a539d48bf46bddc4a4d604ef122","len":262152},"recovery131":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac9e11b7553248cc9cab4ffa4b0eadc0","len":262152},"recovery132":{"type":"PAR 2.0\u0000RecvSlic","md5":"a338dfa5fb3dfda5d0844ccf33b7a0a0","len":262152},"recovery133":{"type":"PAR 2.0\u0000RecvSlic","md5":"55816e85a627b359f1ce6abcf07a80f8","len":262152},"recovery134":{"type":"PAR 2.0\u0000RecvSlic","md5":"62989b2c6a48c883b7153f8841c57fbf","len":262152},"recovery135":{"type":"PAR 
2.0\u0000RecvSlic","md5":"0d714d1f7c84c36bba5211a441322f1f","len":262152},"recovery136":{"type":"PAR 2.0\u0000RecvSlic","md5":"fe16f8e7183642779c2fda0bdd2e69c7","len":262152},"recovery137":{"type":"PAR 2.0\u0000RecvSlic","md5":"959e3ed520135276ce507bfb974170e5","len":262152},"recovery138":{"type":"PAR 2.0\u0000RecvSlic","md5":"38755e49834c707a18ebc78153dd309f","len":262152},"recovery139":{"type":"PAR 2.0\u0000RecvSlic","md5":"861c385818cc82e20538517a860ac822","len":262152},"recovery140":{"type":"PAR 2.0\u0000RecvSlic","md5":"573401fb45c72080a5fbe59575daf1f0","len":262152},"recovery141":{"type":"PAR 2.0\u0000RecvSlic","md5":"cbfa6d65eeffb234b4b35bb0101813c2","len":262152},"recovery142":{"type":"PAR 2.0\u0000RecvSlic","md5":"a915d1e365b2f6667b561f15074c246f","len":262152},"recovery143":{"type":"PAR 2.0\u0000RecvSlic","md5":"aaa5d236c4b0ab8bca3837357a676828","len":262152},"recovery144":{"type":"PAR 2.0\u0000RecvSlic","md5":"a5e825e51919a1526298b2a63a67cd18","len":262152},"recovery145":{"type":"PAR 2.0\u0000RecvSlic","md5":"935be6ea3de1a2b7662868af15db66ae","len":262152},"recovery146":{"type":"PAR 2.0\u0000RecvSlic","md5":"20927e4c32ca839d9bc2e32dda3b88bb","len":262152},"recovery147":{"type":"PAR 2.0\u0000RecvSlic","md5":"734b9f415296856d8a9935673a6359ce","len":262152},"recovery148":{"type":"PAR 2.0\u0000RecvSlic","md5":"00df8edd547ee5f396ff09c1b0c8a979","len":262152},"recovery149":{"type":"PAR 2.0\u0000RecvSlic","md5":"2c7d4748b05c963294d008bcd4d9abd4","len":262152},"recovery150":{"type":"PAR 2.0\u0000RecvSlic","md5":"136eefb57c6163ba9ee4361bc30a946b","len":262152},"recovery151":{"type":"PAR 2.0\u0000RecvSlic","md5":"eb45e084b5f5d3c1659825791ebe9dcf","len":262152},"recovery152":{"type":"PAR 2.0\u0000RecvSlic","md5":"b1167f13fd2ab1a49de63f0c23d97b30","len":262152},"recovery153":{"type":"PAR 2.0\u0000RecvSlic","md5":"ec60aca69736e5c01c0bc0a4a5fc20a7","len":262152},"recovery154":{"type":"PAR 
2.0\u0000RecvSlic","md5":"1eeecfb1e63674b87f0d47736f77cf8a","len":262152},"recovery155":{"type":"PAR 2.0\u0000RecvSlic","md5":"d24ea57a9b206855ce59c28a29a609b5","len":262152},"recovery156":{"type":"PAR 2.0\u0000RecvSlic","md5":"adaad933db3226d3778da61397198f17","len":262152},"recovery157":{"type":"PAR 2.0\u0000RecvSlic","md5":"36e8c301bcbcb253546bb3672400f0f2","len":262152},"recovery158":{"type":"PAR 2.0\u0000RecvSlic","md5":"acb88c3d8b4676e5b2a1d07240ecbf3a","len":262152},"recovery159":{"type":"PAR 2.0\u0000RecvSlic","md5":"6ec97320f986ffc6dc8884d762784e64","len":262152},"recovery160":{"type":"PAR 2.0\u0000RecvSlic","md5":"ba205e752f5743e7fb56f91159f11638","len":262152},"recovery161":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee0bc353e44cb3a0a6674bebb8f2b02f","len":262152},"recovery162":{"type":"PAR 2.0\u0000RecvSlic","md5":"4ffb47c96191de05ee92d65d17e6d1b5","len":262152},"recovery163":{"type":"PAR 2.0\u0000RecvSlic","md5":"cab383c52fd7edf43c813a02b0e9f715","len":262152},"recovery164":{"type":"PAR 2.0\u0000RecvSlic","md5":"7579d93610965f3fa9d5380b8a05d179","len":262152},"recovery165":{"type":"PAR 2.0\u0000RecvSlic","md5":"79ebb790e0f5b3878e8efa9e31a1ec2f","len":262152},"recovery166":{"type":"PAR 2.0\u0000RecvSlic","md5":"d98d9e9fcaeab7644965287fdfa14927","len":262152},"recovery167":{"type":"PAR 2.0\u0000RecvSlic","md5":"5c4d552661fe8a972e9a3451829f467f","len":262152},"recovery168":{"type":"PAR 2.0\u0000RecvSlic","md5":"e48d561f1fbc700b8c676ee59c4a9f5d","len":262152},"recovery169":{"type":"PAR 2.0\u0000RecvSlic","md5":"585373bf1021202ac1a871dd1325e797","len":262152},"recovery170":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab371b66e23bf6b2f143a97162857c7f","len":262152},"recovery171":{"type":"PAR 2.0\u0000RecvSlic","md5":"e73956ff54bc05bd6f269456952de3f1","len":262152},"recovery172":{"type":"PAR 2.0\u0000RecvSlic","md5":"1908334591c7199fb59474e189631053","len":262152},"recovery173":{"type":"PAR 
2.0\u0000RecvSlic","md5":"c2fe26625667e1d867eb435a7b542044","len":262152},"recovery174":{"type":"PAR 2.0\u0000RecvSlic","md5":"54a87b2c409465195efcac62cf90c0d0","len":262152},"recovery175":{"type":"PAR 2.0\u0000RecvSlic","md5":"b067362a194e73030224dcb460b406f5","len":262152},"recovery176":{"type":"PAR 2.0\u0000RecvSlic","md5":"ce671aa8082cea7d459d1f23b43ed566","len":262152},"recovery177":{"type":"PAR 2.0\u0000RecvSlic","md5":"37770253e086905a8273fa7681e120eb","len":262152},"recovery178":{"type":"PAR 2.0\u0000RecvSlic","md5":"891a00e3973ede2824a25a87243e8847","len":262152},"recovery179":{"type":"PAR 2.0\u0000RecvSlic","md5":"44cb5fa5db70c22cb3d382bf4fe76924","len":262152},"recovery180":{"type":"PAR 2.0\u0000RecvSlic","md5":"27ca455202140b26fb48af7db5c559eb","len":262152},"recovery181":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ac3a13daccd4e334fb2cb4ba8806931","len":262152},"recovery182":{"type":"PAR 2.0\u0000RecvSlic","md5":"6d218c46b445edd8b584b1079aea762c","len":262152},"recovery183":{"type":"PAR 2.0\u0000RecvSlic","md5":"c288a78496f8e466bc2ca2555ab57651","len":262152},"recovery184":{"type":"PAR 2.0\u0000RecvSlic","md5":"2e2d494f19d7e12728b2813959f363b9","len":262152},"recovery185":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ac64828b943e917b0b327ba4e3e1d7a","len":262152},"recovery186":{"type":"PAR 2.0\u0000RecvSlic","md5":"64149d71b23f3b48601a1f66d38d2ce9","len":262152},"recovery187":{"type":"PAR 2.0\u0000RecvSlic","md5":"cb81ff2511a7b5d0cd9f4c88ebef6f4b","len":262152},"recovery188":{"type":"PAR 2.0\u0000RecvSlic","md5":"6fc5b1e6e008764933bb831afe224ea7","len":262152},"recovery189":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ad66c7aeb321475b766456499f641ea","len":262152},"recovery190":{"type":"PAR 2.0\u0000RecvSlic","md5":"15a3793c1c67f7a9cf718d331d34f41d","len":262152},"recovery191":{"type":"PAR 2.0\u0000RecvSlic","md5":"40d0e37fbc8aa71fb21b18da1b7d025c","len":262152},"recovery192":{"type":"PAR 
2.0\u0000RecvSlic","md5":"c650dffc875af3e180761d3eef8994c9","len":262152},"recovery193":{"type":"PAR 2.0\u0000RecvSlic","md5":"4c22cdc5d78c408671a55c724affe633","len":262152},"recovery194":{"type":"PAR 2.0\u0000RecvSlic","md5":"80e4eca09aeed7823945af8434d4ccd8","len":262152},"recovery195":{"type":"PAR 2.0\u0000RecvSlic","md5":"3fe2b6e3f848a643fd8e77a05d795486","len":262152},"recovery196":{"type":"PAR 2.0\u0000RecvSlic","md5":"9ab3653b65ae244124b97817498673f0","len":262152},"recovery197":{"type":"PAR 2.0\u0000RecvSlic","md5":"3499ed0cd0b8e789b4622ebc71c49e1b","len":262152},"recovery198":{"type":"PAR 2.0\u0000RecvSlic","md5":"853ea439016d879ca104b2ea5621c320","len":262152},"recovery199":{"type":"PAR 2.0\u0000RecvSlic","md5":"99af3d84da4967f583714a44b8c5129b","len":262152},"creator":{"type":"PAR 2.0\u0000Creator","md5":"75978a963bad01ec4845ee37a32b8523","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"a1ab3aa1dd29953af5f118e683a1ebef","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"ba3580d34dbf456def2775e5f47fe32f","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1883c8c7fa2b7da5978e4b5199f6e168","len":5220},"creator":{"type":"PAR 2.0\u0000Creator","md5":"75978a963bad01ec4845ee37a32b8523","len":104}}],"1":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"6281dd7c3c3938b3e6185a0477e6e287","len":65608},"main":{"type":"PAR 2.0\u0000Main","md5":"0b675e25887343203a39a3e2c8d1ed28","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"67c7299fe12d06356026566cc7084417","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1ae47841abe78267f3850e465307d555","len":20560},"creator":{"type":"PAR 2.0\u0000Creator","md5":"498e695c1f6fd9d63f95c51ad93d14b0","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"0b675e25887343203a39a3e2c8d1ed28","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 
2.0\u0000FileDesc","md5":"67c7299fe12d06356026566cc7084417","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1ae47841abe78267f3850e465307d555","len":20560},"creator":{"type":"PAR 2.0\u0000Creator","md5":"498e695c1f6fd9d63f95c51ad93d14b0","len":104}}],"2":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"80ee3e17eb31ff1aeb2b4b1299f54110","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"47d39160c9dd187d9602b395a9960adc","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"b2fce1dc1ba509aae7225e828a735ca3","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"93c8615d044e46d375da63c6e5c6999e","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"fcef287602419071bdf23c06a665b7dd","len":1048644},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery5":{"type":"PAR 
2.0\u0000RecvSlic","md5":"73901dbf207c0a3ce23b257df0c90f1c","len":1048644},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e455419ba916f25c94ac35c7fdf339b","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"30b4e3c59fa3f4dfce5440d6494f2eed","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"7868a53c236c0bdb9cd3a2f2a167766b","len":1048644},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"5081c501ed7ef9de8960c89020e87192","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"3984f758117ba91e1a6f4a4a0371d017","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"12b685ecaae6d8aa9e6a993fdfd07e0c","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"f50c1e1f38f49140b7a0601896740fa1","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"c6f795b7c34ab97e65dd3bca71698c5c","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"96ec24936af60d8ae873945cf38b9091","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"73a8d86b450e2da2d84a606e4bf97079","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery16":{"type":"PAR 
2.0\u0000RecvSlic","md5":"c867457ea6e5d2a1cf07091126ab98eb","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}}],"3":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"74520873b88f74d1de0e6b8f5c54006f","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac1164d1c1d45c3e959976aa8d9c82ed","len":1048644},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"09a680944b9e3751e0397455e7f0561b","len":1048644},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"cd4b284c1cfb31f1f9dc4d2cb3b46f74","len":1048644},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"f86624cb3556a6c9ad0f97831984fd4b","len":1048644},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"0282fb1156b97afeb15889463fe6fb38","len":1048644},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"recovery6":{"type":"PAR 
2.0\u0000RecvSlic","md5":"f4366cec10c3c406ecf39967a3488d00","len":1048644},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"debbeee2d37ef4bb964a68adfda5a836","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"1ca06454a70ccef99e374792f85ccb8c","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"f38a39a88bfcf792866e55a8103347d7","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"962eb1b20273dc63ac3e1afa4995ab10","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"58f929cac57b0a76a830ea74a0eac1c4","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"8b6c986373a5068a2afc6857243f2b7f","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"22bcda081f15c8d4ae9297dfe0ac4f5e","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee302032bc01cd26256dbf9e984d5f6f","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"084d81c51717cf4b55eb2510bfab58eb","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac7539f5e66f2a7c029b596fa2ef20fa","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"749080994e1077637af1a102f5734400","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"b66e22198f11e1eb81c03fb83ac9f243","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"6b2a3aaf0a6af110d986adc5dcbbb4d9","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"b4c20ebe89a5409958eeab01b7fdc6d7","len":1048644},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"41d9af908d53898507506f5bff21a546","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"693981d690733efd35c8594f2b88afe3","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b5b5257da59294665c41eb6ef8d6d72","len":1048644},"recovery24":{"type":"PAR 
2.0\u0000RecvSlic","md5":"bd20bc56ef956ab4ae3bccead46ff165","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"6156ade2b63b682f8719e01cd98802c9","len":1048644},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff7336302eea1d8c43a55084ddfbbf3b","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"5d0621c20b187eced33a3de5d14fa1b0","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"9c492e6e9078f3d5221d0de69a638ac2","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4b0c879fe80608a1ff5f460330ecb48","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"63516826cd3746a78cd797f5a849bdb4","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"35cce3895f113abc206b169fcee3dd0b","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"97f39c926596f591f608439ec2d6e50b","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e6dcf153973245f9f41f2f6d68dcf3e","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff89baf219572f14f59a7f08a193c155","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"d72a195ba2006e3d70e3d970d9710daa","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"7461a500af7f1e129bc9d347a60c0a62","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"dbb3c6a53d9d9ca3e152ecb21f5cefe9","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"89b986d2b3aa9550a1d32f046436cc23","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f0454501952dac6bc2d1242629bd829","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 
2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}}],"4":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"edd1ade340f0f983fbfbcf844d801cda","len":2097304},"main":{"type":"PAR 2.0\u0000Main","md5":"09dfad62ef619decf35f32c2bf0a1522","len":124},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"b10d9aba7502ed7811bb25f22caa8e89","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"a245487ad0d095cfd6add75baa138e1f","len":132},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"dd6685f87867e9c3b0990dd40ba8fc1d","len":2097304},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"7c84d68381da44721063e3c93bae8301","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"49d38647c2861e08ba2cb852dfe4062b","len":100},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"eb959263385ec801a008a8adb7bc86ec","len":100},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"6fe15849c49662577aea77449e41e12e","len":2097304},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"e35965c8be6de77d07426b0d067b3626","len":220},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"720cf3d8b68b0312ba6972859c9adac5","len":2097304},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"4588d83af606eb1601a1b5ff510b7879","len":2097304},"recovery5":{"type":"PAR 
2.0\u0000RecvSlic","md5":"2f302e2675b82a7e510046d3053d9919","len":2097304},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"dfe8e1351ed2ac4376d0309aa01301b1","len":2097304},"creator":{"type":"PAR 2.0\u0000Creator","md5":"f00c87d06d608c53d16e6cbe3735a766","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"09dfad62ef619decf35f32c2bf0a1522","len":124},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"b10d9aba7502ed7811bb25f22caa8e89","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"a245487ad0d095cfd6add75baa138e1f","len":132},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"7c84d68381da44721063e3c93bae8301","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"49d38647c2861e08ba2cb852dfe4062b","len":100},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"eb959263385ec801a008a8adb7bc86ec","len":100},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"e35965c8be6de77d07426b0d067b3626","len":220},"creator":{"type":"PAR 2.0\u0000Creator","md5":"f00c87d06d608c53d16e6cbe3735a766","len":104}}],"5":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"227b1235272f1640a2f3f1f5ac13109f","len":4194372},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"4b43e42ea2ec0617637ab5e1cfb98c0d","len":4194372},"main":{"type":"PAR 2.0\u0000Main","md5":"76a2c8053eca00b765ada9fdcde12245","len":92},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"612f84ace10d12ed08b83ccdd9d66511","len":4194372},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"f3595e2157d1f7b0864a88e07c1f5442","len":4194372},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"49439ce8a8009014fd812470983aedfe","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"e1ca7d5e9064e21e1777d44aacf4fb30","len":4194372},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 
2.0\u0000IFSC","md5":"3da3ef1c9cde24fce3e996add1d608a9","len":400},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"c1a5cf7498aaafb919379feccf77ad64","len":4194372},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"e1de02388438296cf7a44cdef5c38856","len":4194372},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"1283885f4c351acdab8eef7df7258942","len":4194372},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f9d71ae23872e39f122c02e3f403cbc","len":4194372},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"662220713f645187c50d0f950bafc747","len":4194372},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"63bafc1c7d16744ec4904235621d3be1","len":4194372},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"d67b2fdd14e40510d595897300ff8889","len":4194372},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"50ef19310cad87061283fa3ce62188a3","len":4194372},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b79e5fac9942dc229eef88d1b65c15c","len":4194372},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"dfc8f3cf562f919f39e681a0207afa00","len":4194372},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"80e536d4be26737e0f2fb2941c652b9b","len":4194372},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"e9afe78529cbfaffceba5e2601f13279","len":4194372},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"1240e4446fc41a8805e0b328ff3fef7c","len":4194372},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"7d6562c315684126e67926609d6ea1c7","len":4194372},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"ddeef17620d1515a91bb6d195e627438","len":4194372},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"e03cdfc92253706ac7392426134ce613","len":4194372},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"05d9de37cb8619790dabc4686a6465ed","len":4194372},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"ddd9a65e990a88b69ba76360900162bd","len":4194372},"recovery23":{"type":"PAR 
2.0\u0000RecvSlic","md5":"79a1dac3c4b04b02bd63db1169606f96","len":4194372},"creator":{"type":"PAR 2.0\u0000Creator","md5":"4b6de922b9d6c30abe117f55842b19ef","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"76a2c8053eca00b765ada9fdcde12245","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"49439ce8a8009014fd812470983aedfe","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"3da3ef1c9cde24fce3e996add1d608a9","len":400},"creator":{"type":"PAR 2.0\u0000Creator","md5":"4b6de922b9d6c30abe117f55842b19ef","len":104}}],"6":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"74520873b88f74d1de0e6b8f5c54006f","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac1164d1c1d45c3e959976aa8d9c82ed","len":1048644},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"09a680944b9e3751e0397455e7f0561b","len":1048644},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"cd4b284c1cfb31f1f9dc4d2cb3b46f74","len":1048644},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"f86624cb3556a6c9ad0f97831984fd4b","len":1048644},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"0282fb1156b97afeb15889463fe6fb38","len":1048644},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"recovery6":{"type":"PAR 
2.0\u0000RecvSlic","md5":"f4366cec10c3c406ecf39967a3488d00","len":1048644},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"debbeee2d37ef4bb964a68adfda5a836","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"1ca06454a70ccef99e374792f85ccb8c","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"f38a39a88bfcf792866e55a8103347d7","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"962eb1b20273dc63ac3e1afa4995ab10","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"58f929cac57b0a76a830ea74a0eac1c4","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"8b6c986373a5068a2afc6857243f2b7f","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"22bcda081f15c8d4ae9297dfe0ac4f5e","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee302032bc01cd26256dbf9e984d5f6f","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"084d81c51717cf4b55eb2510bfab58eb","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac7539f5e66f2a7c029b596fa2ef20fa","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"749080994e1077637af1a102f5734400","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"b66e22198f11e1eb81c03fb83ac9f243","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"6b2a3aaf0a6af110d986adc5dcbbb4d9","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"b4c20ebe89a5409958eeab01b7fdc6d7","len":1048644},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"41d9af908d53898507506f5bff21a546","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"693981d690733efd35c8594f2b88afe3","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b5b5257da59294665c41eb6ef8d6d72","len":1048644},"recovery24":{"type":"PAR 
2.0\u0000RecvSlic","md5":"bd20bc56ef956ab4ae3bccead46ff165","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"6156ade2b63b682f8719e01cd98802c9","len":1048644},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff7336302eea1d8c43a55084ddfbbf3b","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"5d0621c20b187eced33a3de5d14fa1b0","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"9c492e6e9078f3d5221d0de69a638ac2","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4b0c879fe80608a1ff5f460330ecb48","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"63516826cd3746a78cd797f5a849bdb4","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"35cce3895f113abc206b169fcee3dd0b","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"97f39c926596f591f608439ec2d6e50b","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e6dcf153973245f9f41f2f6d68dcf3e","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff89baf219572f14f59a7f08a193c155","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"d72a195ba2006e3d70e3d970d9710daa","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"7461a500af7f1e129bc9d347a60c0a62","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"dbb3c6a53d9d9ca3e152ecb21f5cefe9","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"89b986d2b3aa9550a1d32f046436cc23","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f0454501952dac6bc2d1242629bd829","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 
2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}}],"7":[{"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"13d75e3ff2871a1f3554d0810d758a9b","len":12292},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"0d61059e69c4520710a1bf6157339902","len":12292},"main":{"type":"PAR 2.0\u0000Main","md5":"b546f34896688623139361b73f7466da","len":140},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"ad306d05c1f843b71793db9705949994","len":12292},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"9df88ae2f95bb7ac4d7756cff03be42e","len":12292},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"bdf7fd79dc9be3242911d615013b4675","len":132},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"019ef2edd1afe0d9e3a4fae44fcc4dcb","len":12292},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"9a296997ecfb7742bd8adcb4408f3b1e","len":12292},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"1074e6bd114064d4010cba8850279936","len":132},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"396efc5068f49b51ff1c78438defd032","len":12292},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"e29e2769b588fdd6073366f0c69ba755","len":12292},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"6c37323d51a7cfcc68cd519a949ed0b9","len":132},"recovery15":{"type":"PAR 
2.0\u0000RecvSlic","md5":"59175e7ac2d1d2364a469e6213bae609","len":12292},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"10df8b27966f711989172bc5bb351909","len":132},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"423e090521d2e6e445f3ab47a591f170","len":12292},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"2ce148ae06221e99da47f28ef4fa0c2e","len":12292},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"939568408c85444683327985736473f1","len":200},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"61c196fcab5efb2423e912493deb174f","len":12292},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"2a95543e8ad6e15bd03ee56837995b56","len":12292},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"4b33cc66f96237779dac4a2920e5c36b","len":100},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"3b48bff16a3a8eeebaef276c142aa66b","len":12292},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"6018b4d628e4267080dfbfbaa11f8482","len":12292},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"7900826aa0cdcd35f5b1b6d309285106","len":100},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"6355f957b5299352ceb471cc7aaab9de","len":12292},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"b0f14355deaf0bae05b3a601788f952c","len":12292},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"89550dc87a9e0098a8035d643c7c1fa9","len":22400},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a9553081f2bcb2568cc3d49e817c12d","len":12292},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"64c64c2c2aac4fa2cf1dc5abe9e79e40","len":12292},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"11e3ecabd3e549dc7bd276d0ef06e4f2","len":12292},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"045ba81e58ecde2ae2e5e10d6ff52deb","len":12292},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"7c2a0122a173e9c24288c6fdff476f75","len":12292},"recovery29":{"type":"PAR 
2.0\u0000RecvSlic","md5":"7fada7e5fd40793968feb8708724e76f","len":12292},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"6051d8fff3ba7d1a9ef0b89dc3c099fe","len":12292},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"a20b372c84071403679473808447e1d1","len":12292},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"a073c7ab64609655101b3f9d43f83217","len":12292},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef752e17c8404a77522e886979da238e","len":12292},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"9489fae5a303376fcf21bdeca134a01e","len":12292},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"cd47dc979bcdc24667a1743097d6d57b","len":12292},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"6576e487f28903ad92ac1bb9afde1cfa","len":12292},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"a77c05ca3f1390283d1fbb1c0332a0d0","len":12292},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"3923f44f59d0ba6e84ce1d30a44962b7","len":12292},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"d956db1617f11f33b3663c81e560bae8","len":12292},"recovery40":{"type":"PAR 2.0\u0000RecvSlic","md5":"740ded4b4c20fe1f6799961a9f4a863e","len":12292},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"90781f2934736ba360ceb99d95652fde","len":12292},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"9a4bdc4e888708e7639a073817e96be3","len":12292},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"ad671f3f22846043eee50812b7841d19","len":12292},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"370998274b1b0348d66f79f98f277707","len":12292},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"c8491de0561f52f26a01dd5b1fc0a680","len":12292},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"9108adfad172584acad344c253d6407c","len":12292},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"64d94e06c9c8076fb98148977aabada7","len":12292},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"934a2da041551ca688d4cd65201de9de","len":12292},"recovery49":{"type":"PAR 
2.0\u0000RecvSlic","md5":"47ad1b8f9e972ff43bc55d8a7c7c8b82","len":12292},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"dff77ec66b2bb8226d40075e7dafd65d","len":12292},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"947c5b6bded6872b18f8b0eafbb75a82","len":12292},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"9dc6162f9bf3e41821ff1aa49b02fd7c","len":12292},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"0936daced4a42c3bdd97bcf7bed560be","len":12292},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"3d3733e28add7241ce6c160afc499514","len":12292},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"3e457b7d118e652d35420859f0c2cb0c","len":12292},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"89751e1de0aea82ecff902443fc29a65","len":12292},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"0bcb7ff73a2b4185a25634b41fd282db","len":12292},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"0051a97153a602a5da921ce4cc39ab8c","len":12292},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"c33f2d2904d814872fe43dcb1ba83fa6","len":12292},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"44d12ca49b89389205fa867b31ef821f","len":12292},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"908fa6ddb6e46fbc79cd7e8f36fdc91c","len":12292},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"e01263d2374aca3358fafd4f1e31dd50","len":12292},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"d87a878179763b3b553850bcc3f37463","len":12292},"recovery64":{"type":"PAR 2.0\u0000RecvSlic","md5":"4748999b7a65523b2c40bdba325fa0d3","len":12292},"recovery65":{"type":"PAR 2.0\u0000RecvSlic","md5":"abf213d33e195ca39e86262a57f5a335","len":12292},"recovery66":{"type":"PAR 2.0\u0000RecvSlic","md5":"ba9e9db844838a23d3a845cc180aa7a9","len":12292},"recovery67":{"type":"PAR 2.0\u0000RecvSlic","md5":"4598550a4ee7bb79427d2baa85ed5341","len":12292},"recovery68":{"type":"PAR 2.0\u0000RecvSlic","md5":"3aeb560438ea23e78d2d990b70579d84","len":12292},"recovery69":{"type":"PAR 
2.0\u0000RecvSlic","md5":"b21c0f8558b48322af337d9502ff7410","len":12292},"recovery70":{"type":"PAR 2.0\u0000RecvSlic","md5":"f965345c3ebc04d23a8a7343d3f4573a","len":12292},"recovery71":{"type":"PAR 2.0\u0000RecvSlic","md5":"1ad9c5792fcc6ed1d0d6889b8067ebc1","len":12292},"recovery72":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b73c50d6e875d041d47d371bebdeaff","len":12292},"recovery73":{"type":"PAR 2.0\u0000RecvSlic","md5":"d3bbd4599eaf7e264910b462fa59ee48","len":12292},"recovery74":{"type":"PAR 2.0\u0000RecvSlic","md5":"052c9e79b7f175a837a46923f0a80ee2","len":12292},"recovery75":{"type":"PAR 2.0\u0000RecvSlic","md5":"be0f6bfabf1bf6ba6508237f216475a1","len":12292},"recovery76":{"type":"PAR 2.0\u0000RecvSlic","md5":"ea169208817c6d7fe336c901bf4a5ea0","len":12292},"recovery77":{"type":"PAR 2.0\u0000RecvSlic","md5":"625407da280dfbfc7cacc8fb4f8bb265","len":12292},"recovery78":{"type":"PAR 2.0\u0000RecvSlic","md5":"1c32d547d7d52d43cebc4d996c262111","len":12292},"recovery79":{"type":"PAR 2.0\u0000RecvSlic","md5":"cedc3a8750dfef198ad10d9dccefe339","len":12292},"recovery80":{"type":"PAR 2.0\u0000RecvSlic","md5":"ec4ef290224420846245b9375b814f8b","len":12292},"recovery81":{"type":"PAR 2.0\u0000RecvSlic","md5":"4bad80c7f5a56d2fc87937642b18aa0c","len":12292},"recovery82":{"type":"PAR 2.0\u0000RecvSlic","md5":"c4b0db1068256aa8c1e07f98a2d16339","len":12292},"recovery83":{"type":"PAR 2.0\u0000RecvSlic","md5":"375911da8153e31766960340b1626520","len":12292},"recovery84":{"type":"PAR 2.0\u0000RecvSlic","md5":"38b8edbd15145ef6126699807dc73da5","len":12292},"recovery85":{"type":"PAR 2.0\u0000RecvSlic","md5":"6e66ffe7118eb850b326d9a3daba5d97","len":12292},"recovery86":{"type":"PAR 2.0\u0000RecvSlic","md5":"d12c98c5352c1db7a19cd928513e66c3","len":12292},"recovery87":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef8dc2866ac48b1831afacf9e0750164","len":12292},"recovery88":{"type":"PAR 2.0\u0000RecvSlic","md5":"bd23bf25bed444e4bdc22b9fc65288bd","len":12292},"recovery89":{"type":"PAR 
2.0\u0000RecvSlic","md5":"a82f0d4f7663ac83d47ca10ac3eac1a7","len":12292},"recovery90":{"type":"PAR 2.0\u0000RecvSlic","md5":"340d4ae8bbe9f403176a4a001ecede09","len":12292},"recovery91":{"type":"PAR 2.0\u0000RecvSlic","md5":"1465427d8499f1cc00977b1bde040381","len":12292},"recovery92":{"type":"PAR 2.0\u0000RecvSlic","md5":"961b476e9df27b019b129aa37e9b4c87","len":12292},"recovery93":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4743689d9571c1ecc2c4341a8400e40","len":12292},"recovery94":{"type":"PAR 2.0\u0000RecvSlic","md5":"4f6d7fead6d5b60a7fdbe453abd63fa5","len":12292},"recovery95":{"type":"PAR 2.0\u0000RecvSlic","md5":"b56ed13c80fed786dfaec38591a7f849","len":12292},"recovery96":{"type":"PAR 2.0\u0000RecvSlic","md5":"3dffd1a0424af8dae4cf74d936aff7ea","len":12292},"recovery97":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8b1e226311ae44d16bcc0f56c68c1a3","len":12292},"recovery98":{"type":"PAR 2.0\u0000RecvSlic","md5":"d3fc2ca676e4c262dd57e61cf68c5361","len":12292},"recovery99":{"type":"PAR 2.0\u0000RecvSlic","md5":"42b3a719315c551d3c91ee822c2da7ed","len":12292},"recovery100":{"type":"PAR 2.0\u0000RecvSlic","md5":"928fc76675423931cbabc3c07b638294","len":12292},"recovery101":{"type":"PAR 2.0\u0000RecvSlic","md5":"e3928b4db0b5b11d3cafba8c20f77376","len":12292},"recovery102":{"type":"PAR 2.0\u0000RecvSlic","md5":"69092ea4065e6a6cbec1bf3841321cc4","len":12292},"recovery103":{"type":"PAR 2.0\u0000RecvSlic","md5":"011dcea0d6d2d576f76d0a06f7f745b8","len":12292},"recovery104":{"type":"PAR 2.0\u0000RecvSlic","md5":"e48dc08c95b0176c34116600eb14961c","len":12292},"recovery105":{"type":"PAR 2.0\u0000RecvSlic","md5":"51ecf96adae6cab2ddb04d94f1e31d78","len":12292},"recovery106":{"type":"PAR 2.0\u0000RecvSlic","md5":"8fa7ffc715bf00106a5d7bc6d0da3ab2","len":12292},"recovery107":{"type":"PAR 2.0\u0000RecvSlic","md5":"c4cf770599853908d4cea098a1298970","len":12292},"recovery108":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac084faa95146eeed6664cc3e029e6ce","len":12292},"recovery109":{"type":"PAR 
2.0\u0000RecvSlic","md5":"52f195fc483df0404a6c742750023de0","len":12292},"recovery110":{"type":"PAR 2.0\u0000RecvSlic","md5":"61ce493a2aa951375af43e9d1ca79d4f","len":12292},"recovery111":{"type":"PAR 2.0\u0000RecvSlic","md5":"fefb4a742be24ac3731afe307f5b82ba","len":12292},"recovery112":{"type":"PAR 2.0\u0000RecvSlic","md5":"9c4e137dade4e9642b809f09dc050eaa","len":12292},"recovery113":{"type":"PAR 2.0\u0000RecvSlic","md5":"af7592f4713e63de24ff2a591dd4fe26","len":12292},"recovery114":{"type":"PAR 2.0\u0000RecvSlic","md5":"c9b995c9fe0601e5e333f734fb387047","len":12292},"recovery115":{"type":"PAR 2.0\u0000RecvSlic","md5":"7f0bc8e940bfc1ee4a8088af4d36ea80","len":12292},"recovery116":{"type":"PAR 2.0\u0000RecvSlic","md5":"1a3fef678dfb898d2a6a2fc349ab8885","len":12292},"recovery117":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4c5d8d29385549ee22e2e3a796ecbce","len":12292},"recovery118":{"type":"PAR 2.0\u0000RecvSlic","md5":"a94ce91cec61f2add2f780412f6688dd","len":12292},"recovery119":{"type":"PAR 2.0\u0000RecvSlic","md5":"45117210792cfdca35871f0625f29f50","len":12292},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a1304668f393486b4bf99741bcc8c0e4","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"b546f34896688623139361b73f7466da","len":140},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"bdf7fd79dc9be3242911d615013b4675","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"1074e6bd114064d4010cba8850279936","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"6c37323d51a7cfcc68cd519a949ed0b9","len":132},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"10df8b27966f711989172bc5bb351909","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"939568408c85444683327985736473f1","len":200},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 
2.0\u0000IFSC","md5":"4b33cc66f96237779dac4a2920e5c36b","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"7900826aa0cdcd35f5b1b6d309285106","len":100},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"89550dc87a9e0098a8035d643c7c1fa9","len":22400},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a1304668f393486b4bf99741bcc8c0e4","len":104}}],"8":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"8f490677c7370c81405b5dc720de5259","len":76},"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}},{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab4b40f2f8c79be1b72fc3c297034e18","len":76},"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}},{"main":{"type":"PAR 
2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}}],"9":[{"main":{"type":"PAR 2.0\u0000Main","md5":"d2e9f5f81e8780b703db895c249cbd68","len":92},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"4a2d665a1e6879cd1b9d04025a7a5f80","len":132},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"59a78d4061c80c5b686bc85f1ae871e1","len":120},"creator":{"type":"PAR 2.0\u0000Creator","md5":"aef6213477d5a93d8932e106b971214e","len":104}}],"10":[{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"2b7cd021389fdb5ed3791087cab67848","len":16777284},"main":{"type":"PAR 2.0\u0000Main","md5":"c3e44363bd4b65f4942581572936dff4","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"c295b42879c2d2a3ee4843a3bb0f5ee3","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"13b6852f097a72fc7cc04761a10a480c","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"1ad880202a49781b87e335d349930f26","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"910c17cd5227f95a8f2c837c237ed405","len":160},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"8cbc4d1dad7ea1f63678571057c323a1","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 
2.0\u0000IFSC","md5":"f5a9392a75d31501b020918c799fab52","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a19208469e9ef2c68f2ce37e0cb36780","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"c3e44363bd4b65f4942581572936dff4","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"c295b42879c2d2a3ee4843a3bb0f5ee3","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"13b6852f097a72fc7cc04761a10a480c","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"1ad880202a49781b87e335d349930f26","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"910c17cd5227f95a8f2c837c237ed405","len":160},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"8cbc4d1dad7ea1f63678571057c323a1","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f5a9392a75d31501b020918c799fab52","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a19208469e9ef2c68f2ce37e0cb36780","len":104}}],"11":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"5ae3fa27bb30388ee1999b3e5d295b1e","len":1048644},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"76549588cac2b8d3f6ffb53ef2e5e5d5","len":1048644},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"46a19207e25504383c733859fa23a5d3","len":1048644},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"e39bd46ac1405579ea58ad6990a52e5f","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"3796a7821994867eb08d0b9ecfb1f022","len":92},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"3aa647a57f56be00b66830ebeed8d766","len":1048644},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"8e3df5e94c2602a027a43df9a79340fa","len":1048644},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"97253b6d0f8d827181326470066333e4","len":1048644},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 
2.0\u0000FileDesc","md5":"7388491d9d881d220d94b5a26306cbdc","len":132},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"5d3dc575cd28880a5218b9c5d0762efb","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"696a55821a69dc48cde7e7b7dbfe18e7","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"4f8aa5bf1983752b0122f91c6dc46dae","len":1048644},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"dc3363b470db43fc863d728a48decd34","len":340},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"60ab3ac1c0b22bb2660ec3b1a44a454f","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"32b0c0ccf4da378145ec2881d06a0104","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"fe3cf4681ff6dab4fe48baadeefa8b9f","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"75a0660ce94daa96fefcdb8ecb7ffe2d","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"e9fc5ad5e0214fb90588695cf1bea1d4","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a7a7c816d43b31ffd1f1ab7ff93cb78","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"6514fb821fb9d012cedb09a5cdb5ce81","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"c55c09da6d1858b24764007cc42d01e1","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"5ee9880f40b8062e5748abe8b5fb2579","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"329cb0d4712fc629414fbb2aa1c15454","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8b03860153ecb1c0b15b785422cf36c","len":1048644},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"b4ffc6740802ea6274134e905fe8ec1b","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"36c944effcf8db4f5098fdef4d7f9fb7","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"33dddc46138a869d45473597bd8a0885","len":1048644},"recovery24":{"type":"PAR 
2.0\u0000RecvSlic","md5":"52e988fed07f5c02b94695f129ffee05","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"860ce0c08f14146f6545f8b455388457","len":1048644},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"02fd2a64a0440a35e571383c3d0a62ef","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"557a895ab8b60101fc681cd5908d4bfe","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"2045f82f13b16f36b30986653111655d","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"ce6fad602d7745fc6b87ce0cf9adf74b","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"f916de512a753ad776ee91d1cd0c5e88","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"5f9b5225d53a5df264e98d57ef41488f","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"b35b8b7c45908e260ff729d0a927ed93","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"f976fa312351c40ff2c6fb8022aabf6d","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef96b2f79e199d7bb037fe0b05056211","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"96647c04ece1d25fb0f6b1eca3ae2219","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"7bf3716a84aed5b1321b19e18a445eb4","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"257500d3e77cf74b1efa67606d4ec1d0","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"696781c6156f397df6106e2d51819aa4","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"9046f4b89e3a13e3947e785eae760ea7","len":1048644},"recovery40":{"type":"PAR 2.0\u0000RecvSlic","md5":"7fad2133a7b1672ed31694f89fe8b1c1","len":1048644},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"285897a69aa8e07e6f4f82025f25455f","len":1048644},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"424f74e6c405f183cb635524ae086d36","len":1048644},"recovery43":{"type":"PAR 
2.0\u0000RecvSlic","md5":"d7285bfef36a2314e8d5cc2323a91625","len":1048644},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"bf8285ec53c4c06737ba3f1ffd792760","len":1048644},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef6616466b9fefebfaddfadcf25b8fa9","len":1048644},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"3b64f6fc3816ff762953d256f6be5c09","len":1048644},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a6690f87f54eb8769c236e5972f4817","len":1048644},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"9fcdc6307e18029a7306c24d6e23a798","len":1048644},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"291516ea56111c0400a6a12f676e5a53","len":1048644},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"c87b641d60c32ea5d2883509304abc60","len":1048644},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8a1d99820b3030161654ce66e732522","len":1048644},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b7465b4e0aa16ed2531a4a10297c4d3","len":1048644},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"6a0f4220cc76d6ab4aa7d537c1f7c77b","len":1048644},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"9143733d0076ab08719a05ca13f1f422","len":1048644},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"5266852e8cce0de780bce15fb1a608dd","len":1048644},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"19a85bb67233bbe31319d61afa482c25","len":1048644},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"777e26b540a27ea9480817bad4ba5eb1","len":1048644},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"a6559b6d4dc1280cbcf748e968b4dbae","len":1048644},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"29a393bd18579d5ae612a73b5ad5471f","len":1048644},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"0c61aafcedeeb70a5677bac1f64ace26","len":1048644},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"18c227628a2b46e865649fbe7ed1c9ed","len":1048644},"recovery62":{"type":"PAR 
2.0\u0000RecvSlic","md5":"4dea15252f8943bc37ccfd8dede9ca4d","len":1048644},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"a437635c37da22690a51bcca8af94066","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"79b7e4a62fce4bbc39cb1d35a3b28150","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"3796a7821994867eb08d0b9ecfb1f022","len":92},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"7388491d9d881d220d94b5a26306cbdc","len":132},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"dc3363b470db43fc863d728a48decd34","len":340},"creator":{"type":"PAR 2.0\u0000Creator","md5":"79b7e4a62fce4bbc39cb1d35a3b28150","len":104}}],"14":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"bdb6fb8f2a0d2a5902cabcde8b859035","len":4294967364},"main":{"type":"PAR 2.0\u0000Main","md5":"854e212b116ec286bc7a0254029f405c","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"43a52a9af71e0d95624092ff681fbdd6","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"0af69ba8da0e6e6ac51e55f8d36c4b10","len":100},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b2721e854623f19ede417796298e409","len":4294967364},"creator":{"type":"PAR 2.0\u0000Creator","md5":"f85ecf559c229c58166c2b3a837817d5","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"854e212b116ec286bc7a0254029f405c","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"43a52a9af71e0d95624092ff681fbdd6","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"0af69ba8da0e6e6ac51e55f8d36c4b10","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"f85ecf559c229c58166c2b3a837817d5","len":104}}],"18":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"5c59db66d27af0dd33c5c76649c80596","len":268435528},"main":{"type":"PAR 2.0\u0000Main","md5":"d4aaedb226e0b58bad779c6228832593","len":92},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 
2.0\u0000FileDesc","md5":"413399233805cfe58ec209a1bd76bd06","len":136},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"cd00ea091d2134ba1a49b9f0683b3367","len":260},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"b997d60937956d94164d0d4afbb9efe3","len":268435528},"creator":{"type":"PAR 2.0\u0000Creator","md5":"66b1394753c32007408e5bfd2d973ec4","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"d4aaedb226e0b58bad779c6228832593","len":92},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000FileDesc","md5":"413399233805cfe58ec209a1bd76bd06","len":136},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"cd00ea091d2134ba1a49b9f0683b3367","len":260},"creator":{"type":"PAR 2.0\u0000Creator","md5":"66b1394753c32007408e5bfd2d973ec4","len":104}}],"20":[{"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}}],"21":[{"main":{"type":"PAR 2.0\u0000Main","md5":"979bf683edbeb3c67b54eefe72ca796e","len":92},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"572fd34269714193e5755b1c9b4e3dcd","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"787536bb15eb4279f56212f01d1fdc1d","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"0769f180278adcffd6dcdfbaa0987882","len":104}}],"22":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"dc21985927d59404c3a10a1523e53e0c","len":262212},"main":{"type":"PAR 2.0\u0000Main","md5":"819d570e0ad997c9e3852a8df6bb59a8","len":140},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 
2.0\u0000FileDesc","md5":"bbacaf20c3142b7a9fcd00fe2653e19e","len":132},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"8ec3f9507a26ff1d780cbac99bc97fd8","len":132},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"2042cbf8d665fba64349918e12425f02","len":262212},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000FileDesc","md5":"7c287f4caeaee8170bee952ec02c6248","len":136},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"a2c6b03ef481ef57b2e7d70dc964ee13","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"e670a9d5df380732a1518e3f5be7422f","len":5200},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"d3baa6921143e1a203ac14a502ab6c8a","len":100},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"010700d8dce400b258d57b93189f6261","len":262212},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"35caeff6d05c414457ba3fa8973417bd","len":176080},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"ded26a35cec9f7aafdaf3f2fc53d5378","len":1120},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"a0871e7345520f1e61b2a99292b01a18","len":262212},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"6e9c8c7d20fe75165ee4f0812ebfd434","len":262212},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"f6b02899978c2d52177f754cb1708bc1","len":262212},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"55ec6d9b1af64f6fa6ceda6fa43fd722","len":262212},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"1b50a45fdef6a9878bcf7de3e56cc23a","len":262212},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"36d262d667af95827a870ca0379516fb","len":262212},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"d0a7a6cecea8434b3de7934ecb982e53","len":262212},"creator":{"type":"PAR 2.0\u0000Creator","md5":"8cb6790dad8ccd9f59af46152131e65a","len":104}},{"main":{"type":"PAR 
2.0\u0000Main","md5":"819d570e0ad997c9e3852a8df6bb59a8","len":140},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"bbacaf20c3142b7a9fcd00fe2653e19e","len":132},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"8ec3f9507a26ff1d780cbac99bc97fd8","len":132},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000FileDesc","md5":"7c287f4caeaee8170bee952ec02c6248","len":136},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"a2c6b03ef481ef57b2e7d70dc964ee13","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"e670a9d5df380732a1518e3f5be7422f","len":5200},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"d3baa6921143e1a203ac14a502ab6c8a","len":100},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"35caeff6d05c414457ba3fa8973417bd","len":176080},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"ded26a35cec9f7aafdaf3f2fc53d5378","len":1120},"creator":{"type":"PAR 2.0\u0000Creator","md5":"8cb6790dad8ccd9f59af46152131e65a","len":104}}]} \ No newline at end of file diff --git a/test/par-compare.js b/test/par-compare.js index 60832de8..79c2ab72 100644 --- a/test/par-compare.js +++ b/test/par-compare.js @@ -13,8 +13,11 @@ var exeParpar = '../bin/parpar'; var exePar2 = 'par2'; var skipFileCreate = true; // skip creating test files if they already exist (speeds up repeated failing tests, but existing files aren't checked) +var pruneCache = false; // prune unused keys from cached results +var fastTest = process.argv.slice(2).indexOf('-f') > -1; + var fs = require('fs'); var crypto = require('crypto'); @@ -311,7 +314,8 @@ function writeRndFile(name, size) { } writeRndFile('test64m.bin', 64*1048576); writeRndFile('test2200m.bin', 2200*1048576); -writeRndFile('test4100m.bin', 4100*1048576); // >4GB to test 32-bit overflows +if(!fastTest) + writeRndFile('test4100m.bin', 4100*1048576); // >4GB 
to test 32-bit overflows // we don't test 0 byte files - different implementations seem to treat it differently: // - par2cmdline: skips all 0 byte files @@ -326,14 +330,15 @@ writeRndFile('test13m.bin', 13631477); var cachedResults = {}; +var setCacheKeys = {}; var sourceFiles = {}; - +var cacheFileName = fastTest ? 'cached-cmpref-fast.json' : 'cached-cmpref.json'; try { - cachedResults = require(tmpDir + 'cached-cmpref.json'); + cachedResults = require(tmpDir + cacheFileName); } catch(x) { try { // try current folder as well, since I tend to stick it there - cachedResults = require('./cached-cmpref.json'); + cachedResults = require('./' + cacheFileName); } catch(x) { cachedResults = {}; } @@ -456,15 +461,6 @@ var allTests = [ cacheKey: '22' }, - // issue #6 - { - in: [tmpDir + 'test64m.bin'], - blockSize: 40000, - blocks: 10000, - singleFile: true, - cacheKey: '12' - }, - // no recovery test { in: [tmpDir + 'test64m.bin'], @@ -481,15 +477,7 @@ var allTests = [ cacheKey: '21' }, - // 2x large block size test - { - in: [tmpDir + 'test64m.bin'], - blockSize: 2048*1048576 - 1024-68, - blocks: 1, - memory: is64bPlatform ? 
2560*1048576 : 1536*1048576, - singleFile: true, - cacheKey: '13' - }, + // large block size test { in: [tmpDir + 'test64m.bin'], blockSize: 4294967296, // 4GB, should exceed node's limit @@ -499,30 +487,6 @@ var allTests = [ cacheKey: '14' }, - // 2x large input file test - { - in: [tmpDir + 'test4100m.bin'], - blockSize: 1048576, - blocks: 64, - singleFile: true, - cacheKey: '15' - }, - { - in: [tmpDir + 'test2200m.bin', tmpDir + 'test1b.bin'], - blockSize: 768000, - blocks: 2800, - singleFile: true, - cacheKey: '16' - }, - - { // max number of blocks test - in: [tmpDir + 'test64m.bin'], - blockSize: 2048, - blocks: 32768, // max allowed by par2cmdline; TODO: test w/ 65535 - singleFile: true, - cacheKey: '17' - }, - { // skewed slice size to test chunk miscalculation bug in: [tmpDir + 'test2200m.bin'], blockSize: 256*1048576 + 4, @@ -531,25 +495,70 @@ var allTests = [ cacheKey: '18' }, - { // slice > 4GB (generally unsupported, but can be made via par2cmdline with some trickery) - in: [tmpDir + 'test4100m.bin'], - inBlocks: 1, // 4100MB slice - blocks: 2, - singleFile: true, - cacheKey: '19' - }, - ]; -if(is64bPlatform) { - allTests.push({ // recovery > 4GB in memory [https://github.com/animetosho/par2cmdline-turbo/issues/7] - in: [tmpDir + 'test4100m.bin'], - blockSize: 100*1048576, - blocks: 41, - singleFile: true, - memory: 8192*1048576, - cacheKey: '23', - readSize: '100M' - }); +if(!fastTest) { + allTests.push( + // issue #6 + { + in: [tmpDir + 'test64m.bin'], + blockSize: 40000, + blocks: 10000, + singleFile: true, + cacheKey: '12' + }, + // large block+mem test + { + in: [tmpDir + 'test64m.bin'], + blockSize: 2048*1048576 - 1024-68, + blocks: 1, + memory: is64bPlatform ? 
2560*1048576 : 1536*1048576, + singleFile: true, + cacheKey: '13' + }, + + // 2x large input file test + { + in: [tmpDir + 'test4100m.bin'], + blockSize: 1048576, + blocks: 64, + singleFile: true, + cacheKey: '15' + }, + { + in: [tmpDir + 'test2200m.bin', tmpDir + 'test1b.bin'], + blockSize: 768000, + blocks: 2800, + singleFile: true, + cacheKey: '16' + }, + + { // max number of blocks test + in: [tmpDir + 'test64m.bin'], + blockSize: 2048, + blocks: 32768, // max allowed by par2cmdline; TODO: test w/ 65535 + singleFile: true, + cacheKey: '17' + }, + + { // slice > 4GB (generally unsupported, but can be made via par2cmdline with some trickery) + in: [tmpDir + 'test4100m.bin'], + inBlocks: 1, // 4100MB slice + blocks: 2, + singleFile: true, + cacheKey: '19' + }, + ); + if(is64bPlatform) { + allTests.push({ // recovery > 4GB in memory [https://github.com/animetosho/par2cmdline-turbo/issues/7] + in: [tmpDir + 'test4100m.bin'], + blockSize: 100*1048576, + blocks: 41, + singleFile: true, + memory: 8192*1048576, + cacheKey: '23', + readSize: '100M' + }); + } } @@ -635,6 +644,7 @@ async.timesSeries(allTests.length, function(testNum, cb) { } return ret; }))); + setCacheKeys[test.cacheKey] = 1; delOutput(); cb(); @@ -650,12 +660,19 @@ async.timesSeries(allTests.length, function(testNum, cb) { fs.unlinkSync(tmpDir + 'test65k.bin'); fs.unlinkSync(tmpDir + 'test13m.bin'); fs.unlinkSync(tmpDir + 'test2200m.bin'); - fs.unlinkSync(tmpDir + 'test4100m.bin'); + if(!fastTest) + fs.unlinkSync(tmpDir + 'test4100m.bin'); } if(!err) { + if(pruneCache) { + for(var k in cachedResults) + if(!(k in setCacheKeys)) + delete cachedResults[k]; + } + try { - fs.writeFileSync(tmpDir + 'cached-cmpref.json', JSON.stringify(cachedResults)); + fs.writeFileSync(tmpDir + cacheFileName, JSON.stringify(cachedResults)); } catch(x) { console.log(x); } From 63b1a40778a38ed80e799dbbc579bc65491c31f7 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 20:43:00 +1000 Subject: [PATCH 86/91] Fix test 
workflow --- .github/workflows/test-full.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index b936c7cc..d573e406 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -1,7 +1,7 @@ name: Run PAR2 Create Tests on: workflow_dispatch: - push: + #push: jobs: test-node: @@ -32,5 +32,6 @@ jobs: - uses: actions/setup-node@v3 with: node-version: ${{ matrix.version }} - - run: (npm install --production + - run: npm install --production + - run: node-gyp rebuild - run: node ${{ matrix.flags }} test/par-compare.js -f From afe543d48f7d77d51e348fb71eb8c83590f4a9b3 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 21:30:02 +1000 Subject: [PATCH 87/91] More build fixes --- .github/workflows/test-full.yml | 2 +- .github/workflows/test.yml | 17 +++++++++-------- gf16/threadqueue.h | 2 +- src/cpuid.h | 4 ++-- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index d573e406..2dc57ad1 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -1,7 +1,7 @@ name: Run PAR2 Create Tests on: workflow_dispatch: - #push: + push: jobs: test-node: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8357eec4..13356699 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,19 +24,20 @@ jobs: mkdir test\hasher\build cmake -B test\hasher\build -S test\hasher -G "Visual Studio 17 2022" -T ${{ matrix.compiler }} -A ${{ matrix.arch }} cmake --build test\hasher\build --config ${{ matrix.config }} - - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test.exe" - - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test-pmul.exe" - - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test-ctrl.exe -f" + - 
run: dir $env:SDE_PATH + - run: Invoke-Expression "$env:SDE_PATH/sde.exe -icx -- test/gf16/build/${{ matrix.config }}/test.exe" + - run: Invoke-Expression "$env:SDE_PATH/sde.exe -icx -- test/gf16/build/${{ matrix.config }}/test-pmul.exe" + - run: Invoke-Expression "$env:SDE_PATH/sde.exe -icx -- test/gf16/build/${{ matrix.config }}/test-ctrl.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/gf16/build/${{ matrix.config }}/test-inv.exe -f" + - run: Invoke-Expression "$env:SDE_PATH/sde.exe -icx -- test/gf16/build/${{ matrix.config }}/test-inv.exe -f" if: ${{ matrix.config == 'Release' && (matrix.compiler == 'ClangCL' || matrix.compiler == 'v143') }} - - run: Invoke-Expression "$env:SDE_PATH/sde -icx -- test/hasher/build/${{ matrix.config }}/test.exe" + - run: Invoke-Expression "$env:SDE_PATH/sde.exe -icx -- test/hasher/build/${{ matrix.config }}/test.exe" # test SSE2-only to see if CPUID checking works - run: | - Invoke-Expression "$env:SDE_PATH/sde -p4 -- test/gf16/build/${{ matrix.config }}/test.exe" - Invoke-Expression "$env:SDE_PATH/sde -p4 -- test/gf16/build/${{ matrix.config }}/test-pmul.exe" - Invoke-Expression "$env:SDE_PATH/sde -p4 -- test/hasher/build/${{ matrix.config }}/test.exe" + Invoke-Expression "$env:SDE_PATH/sde.exe -p4 -- test/gf16/build/${{ matrix.config }}/test.exe" + Invoke-Expression "$env:SDE_PATH/sde.exe -p4 -- test/gf16/build/${{ matrix.config }}/test-pmul.exe" + Invoke-Expression "$env:SDE_PATH/sde.exe -p4 -- test/hasher/build/${{ matrix.config }}/test.exe" if: ${{ matrix.config == 'Release' && matrix.arch == 'x64' && matrix.compiler == 'ClangCL' }} # TODO: XOP tests for hasher? 
diff --git a/gf16/threadqueue.h b/gf16/threadqueue.h index 0b3c23f8..a203d050 100644 --- a/gf16/threadqueue.h +++ b/gf16/threadqueue.h @@ -280,7 +280,7 @@ class MessageThread { #if defined(_WINDOWS) || defined(__WINDOWS__) || defined(_WIN32) || defined(_WIN64) HMODULE h = GetModuleHandleA("kernelbase.dll"); if(h) { - HRESULT(__stdcall *fnSetTD)(HANDLE, PCWSTR) = (HRESULT(__stdcall *)(HANDLE, PCWSTR))GetProcAddress(h, "SetThreadDescription"); + HRESULT(__stdcall *fnSetTD)(HANDLE, PCWSTR) = (HRESULT(__stdcall *)(HANDLE, PCWSTR))((void*)GetProcAddress(h, "SetThreadDescription")); if(fnSetTD) { wchar_t nameUCS2[17]; //assert(strlen(self->name) <= 16); // always hard-coded string, plus Linux limits it to 16 chars, so shouldn't ever overflow diff --git a/src/cpuid.h b/src/cpuid.h index c44cabd3..2fa6cf38 100644 --- a/src/cpuid.h +++ b/src/cpuid.h @@ -16,8 +16,8 @@ # else #include /* GCC seems to support this, I assume everyone else does too? */ - #define _cpuid(ar, eax) __cpuid(eax, ar[0], ar[1], ar[2], ar[3]) - #define _cpuidX(ar, eax, ecx) __cpuid_count(eax, ecx, ar[0], ar[1], ar[2], ar[3]) + #define _cpuid(ar, eax) __cpuid(eax, (ar)[0], (ar)[1], (ar)[2], (ar)[3]) + #define _cpuidX(ar, eax, ecx) __cpuid_count(eax, ecx, (ar)[0], (ar)[1], (ar)[2], (ar)[3]) static inline int _GET_XCR() { int xcr0; From ce88e2825843c90a31bf2feccbb2d744aafeabd1 Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 21:54:35 +1000 Subject: [PATCH 88/91] Test workflow fixes --- .github/workflows/test-full.yml | 1 - .github/workflows/test.yml | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index 2dc57ad1..90b17d37 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -33,5 +33,4 @@ jobs: with: node-version: ${{ matrix.version }} - run: npm install --production - - run: node-gyp rebuild - run: node ${{ matrix.flags }} test/par-compare.js -f diff --git 
a/.github/workflows/test.yml b/.github/workflows/test.yml index 13356699..2e1c3534 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,6 +15,7 @@ jobs: steps: - uses: ilammy/setup-nasm@v1 - uses: petarpetrovt/setup-sde@v2.1 + sdeVersion: 8.69.1 - uses: actions/checkout@v3 - run: | mkdir test\gf16\build From 72d5382c25896a4615abb56ab16b70b342a0c3ff Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 22:09:54 +1000 Subject: [PATCH 89/91] Test workflow fixes --- .github/workflows/test-full.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index 90b17d37..9474aa31 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -33,4 +33,4 @@ jobs: with: node-version: ${{ matrix.version }} - run: npm install --production - - run: node ${{ matrix.flags }} test/par-compare.js -f + - run: (cd test && node ${{ matrix.flags }} par-compare.js -f) From d56cc9989a7eaf52658e18b4140995a4a97c6edc Mon Sep 17 00:00:00 2001 From: animetosho Date: Tue, 29 Aug 2023 22:49:24 +1000 Subject: [PATCH 90/91] Test workflow fixes --- .github/workflows/test-full.yml | 2 +- test/cached-cmpref-fast.json | 2 +- test/par-compare.js | 11 +++++------ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index 9474aa31..562917d9 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -19,7 +19,7 @@ jobs: flags: '--trace-warnings' python2: false - version: '20.5.1' - flags: '--pending-deprecation --throw-deprecation --trace-warnings' + flags: '--pending-deprecation --throw-deprecation --trace-warnings --openssl-legacy-provider' python2: false name: Test on Node v${{ matrix.version }} runs-on: ubuntu-latest diff --git a/test/cached-cmpref-fast.json b/test/cached-cmpref-fast.json index 3b0ac589..c6232270 100644 --- a/test/cached-cmpref-fast.json +++ 
b/test/cached-cmpref-fast.json @@ -1 +1 @@ -{"0":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"384329a66d8557f9c35b05e2d391b2db","len":262152},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"2e968c2b42a5e148ee1da7556645ec19","len":262152},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"f9cbb890066d9b19df7fb43cdec29f89","len":262152},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee74b923884c67a0f9c75f8db2a4946a","len":262152},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"c95cc5d5456cc427eb3f05cab6e7712d","len":262152},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"b789836bb5fda2ad650ed492693c24d0","len":262152},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"30c5398aa3faf8b4306d4010dde7f34a","len":262152},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"3652d2c728cf4e5cf0305e05237bc8b4","len":262152},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"8c51f2e4e80ac1f825bc98ce1ee50137","len":262152},"main":{"type":"PAR 2.0\u0000Main","md5":"a1ab3aa1dd29953af5f118e683a1ebef","len":92},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"527d15c6ae89dd498b68d283cb13a04d","len":262152},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"5a841a77e7876ad22c0860da5f6ed754","len":262152},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"2d942f333b87e8242d5addbfc309c9c0","len":262152},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"f8c66e0c1496a0ab21c6af961c8a8a73","len":262152},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"3419f14c6492702df0b75e1eac8a0e07","len":262152},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"50b3d0e1f17909a0d32b1a9c5a983ce2","len":262152},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab9251ff754860ecff97ecb45d113171","len":262152},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"eda7ed48e29ee51e543eae412afa3e38","len":262152},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"ba3580d34dbf456def2775e5f47fe32f","len":132},"recovery17":{"type":"PAR 
2.0\u0000RecvSlic","md5":"fe34f3c978e36050cc0324ebd10f2113","len":262152},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"770f8e4351f416d01b4e5905e6182114","len":262152},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"3ffc5b30c2ae5f68d97db47093f093b7","len":262152},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"203bf0ad82cefd3d9240a50883226500","len":262152},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"7083ddb527d8f284a27b965dbb3f71ce","len":262152},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"8972d12d517f515153e4217593e38cbd","len":262152},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"378c7d54aa0e47c1be08062302c2d615","len":262152},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"d3b09495cfc309312879658b0e636063","len":262152},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1883c8c7fa2b7da5978e4b5199f6e168","len":5220},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"2c57c1edbd2ecdb08710d02cd8b708c6","len":262152},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"8dea37ee27350c5d4e14f60749484d58","len":262152},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"b0dee7d46d4c23673394b693f27f3766","len":262152},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"c62c595a6e2990882d7cadff631c56de","len":262152},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"08eff6bf1aed164dab1b5144c93a339e","len":262152},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"c9e1ea86f490cb436315b2a5dfe27700","len":262152},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"62c45fbbc54d5ba2aab1e4f219639e16","len":262152},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"76d12aa5f889db92da1b4d90cb70ec28","len":262152},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"cbb84f9b1dd02e5e6cce7d385aefc47e","len":262152},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"65ea8fae55f8e2b93ffca5cfa59417b3","len":262152},"recovery35":{"type":"PAR 
2.0\u0000RecvSlic","md5":"308bd5740cd38138b61a6a659ab3489d","len":262152},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"725f7e7221786949fb166a17cea3d4fe","len":262152},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"94b230fc6b49f004a0b35fca8321953f","len":262152},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"37d8119968567d0a7e7f04b0e0e10d7e","len":262152},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"381fdb3aa50e50cbfc3692633c89c847","len":262152},"recovery40":{"type":"PAR 2.0\u0000RecvSlic","md5":"a9dfadf596069976c88a1943099e0284","len":262152},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"2968cf0b54fc2d2f28777d67a6cb74ba","len":262152},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"59682b0bd874f1f6f2dc8f8a37d065f7","len":262152},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"f4abc2e61efa29e839e8c54a8597b3cd","len":262152},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"d827bb49770c8dd9397e1dc3e435f7b4","len":262152},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"572d530fcddea95b69daa2c5c49ca59e","len":262152},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"2eea5c67990da60d4af649369ee57a4e","len":262152},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"fbdde1ef3f31e9b49e200583f3ea3162","len":262152},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"c6d04af4d5bddebd010e2c9087835ad6","len":262152},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"67d807315ac9bdf41ff67f27a1b9e9da","len":262152},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"cc9a11df7a97fb9a0169f63150aff0d2","len":262152},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"8fa64d50c2e72c70d47c527a412c6ebc","len":262152},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b18bcf9f7017a7a8a261cf34c570817","len":262152},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"5dac9d8f1e5754d818a724841712d518","len":262152},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"99395b4ea95dfb1e30421850562fccae","len":262152},"recovery55":{"type":"PAR 
2.0\u0000RecvSlic","md5":"e2470652540238a6d4e4ee4a0f41b288","len":262152},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"7578ea0996471412fa31d42b4b6793fc","len":262152},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"c324e0207bf219380c8498200d187e09","len":262152},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f5237508b02edaeea3b7566aaab1c8f","len":262152},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"45b6aa9de72aa9ecb63b77a4c6469f08","len":262152},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"d171092245e7736d740668d4bf88b69d","len":262152},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"0a55521c05a72864671e695a953d7f63","len":262152},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"352e09046f010c26bcebf1caf6a31008","len":262152},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"1a88541f965eea59e425f5a0ee54e7ea","len":262152},"recovery64":{"type":"PAR 2.0\u0000RecvSlic","md5":"16de857133d3118e86a0ea8ec2b32108","len":262152},"recovery65":{"type":"PAR 2.0\u0000RecvSlic","md5":"7e86600a79c8e4255d6b16051997160c","len":262152},"recovery66":{"type":"PAR 2.0\u0000RecvSlic","md5":"685f9d66019a07b7663c1cc3333d27ab","len":262152},"recovery67":{"type":"PAR 2.0\u0000RecvSlic","md5":"bd358825cd83587525a2707e3d966696","len":262152},"recovery68":{"type":"PAR 2.0\u0000RecvSlic","md5":"98db58ba135f4006057cef5ad69029f8","len":262152},"recovery69":{"type":"PAR 2.0\u0000RecvSlic","md5":"4ee1489fa618a21a3b3366e0c8322165","len":262152},"recovery70":{"type":"PAR 2.0\u0000RecvSlic","md5":"d236c0a9ed4b8e5915bc8c265bb06aca","len":262152},"recovery71":{"type":"PAR 2.0\u0000RecvSlic","md5":"b9d1304bd26aaefa615f00737a864779","len":262152},"recovery72":{"type":"PAR 2.0\u0000RecvSlic","md5":"d8b0767636b21e45d207cc97617666a3","len":262152},"recovery73":{"type":"PAR 2.0\u0000RecvSlic","md5":"e104cc83447db74e477a6dc172e4a44b","len":262152},"recovery74":{"type":"PAR 2.0\u0000RecvSlic","md5":"47a8035c958e238f18d8365c27d4ed34","len":262152},"recovery75":{"type":"PAR 
2.0\u0000RecvSlic","md5":"b1d0bd7ef446e2cfcbbc882324f6a64b","len":262152},"recovery76":{"type":"PAR 2.0\u0000RecvSlic","md5":"d66d76a5e5ea3aaeb7693bfd4d79c38b","len":262152},"recovery77":{"type":"PAR 2.0\u0000RecvSlic","md5":"92f12b3316b91c0e478b71db38f662b3","len":262152},"recovery78":{"type":"PAR 2.0\u0000RecvSlic","md5":"19640a449419963e7035979c5a420833","len":262152},"recovery79":{"type":"PAR 2.0\u0000RecvSlic","md5":"0a6755283ec0bea1915a63ddff53f349","len":262152},"recovery80":{"type":"PAR 2.0\u0000RecvSlic","md5":"ca91dc007eb44c1f55f889cc7375748a","len":262152},"recovery81":{"type":"PAR 2.0\u0000RecvSlic","md5":"592ea92cac06a328f6b7b00153ce343a","len":262152},"recovery82":{"type":"PAR 2.0\u0000RecvSlic","md5":"5154fae148e8c21e507215d740ce275c","len":262152},"recovery83":{"type":"PAR 2.0\u0000RecvSlic","md5":"79a4101d6c828627657c6c1dd0b775e0","len":262152},"recovery84":{"type":"PAR 2.0\u0000RecvSlic","md5":"98b70a8f1dda25aaf4cd6490e7d75a71","len":262152},"recovery85":{"type":"PAR 2.0\u0000RecvSlic","md5":"6f04b148eadd8c49cd620f50cc4d254d","len":262152},"recovery86":{"type":"PAR 2.0\u0000RecvSlic","md5":"182191f40c369d41af2d331056975cbd","len":262152},"recovery87":{"type":"PAR 2.0\u0000RecvSlic","md5":"eb514ff59b226f072aed7f351ef62890","len":262152},"recovery88":{"type":"PAR 2.0\u0000RecvSlic","md5":"cfe974ad0e408adcae5c4359e9333167","len":262152},"recovery89":{"type":"PAR 2.0\u0000RecvSlic","md5":"19550d9807d635fa0c76430a90ef9b2d","len":262152},"recovery90":{"type":"PAR 2.0\u0000RecvSlic","md5":"d8b0bed1ee4a5791b1bf6b339096cb5f","len":262152},"recovery91":{"type":"PAR 2.0\u0000RecvSlic","md5":"a5d9e00c46253847f07171849d6eadc7","len":262152},"recovery92":{"type":"PAR 2.0\u0000RecvSlic","md5":"82d7d59990475c76797adafe0dc40696","len":262152},"recovery93":{"type":"PAR 2.0\u0000RecvSlic","md5":"097f1e8dd4ccb3592b8da5723ef59cd4","len":262152},"recovery94":{"type":"PAR 2.0\u0000RecvSlic","md5":"ded7762317423bd6ba8c1c70ee003895","len":262152},"recovery95":{"type":"PAR 
2.0\u0000RecvSlic","md5":"b20168baef01265ca59af7749a004f19","len":262152},"recovery96":{"type":"PAR 2.0\u0000RecvSlic","md5":"f2a61d2c37afd4e32380c5f974c718c4","len":262152},"recovery97":{"type":"PAR 2.0\u0000RecvSlic","md5":"c505c8d454e6bd6805c19983c4b3becb","len":262152},"recovery98":{"type":"PAR 2.0\u0000RecvSlic","md5":"451a64afafa5742e04a006595e3b76ed","len":262152},"recovery99":{"type":"PAR 2.0\u0000RecvSlic","md5":"27122f17a65d56f09a5a99ec15023906","len":262152},"recovery100":{"type":"PAR 2.0\u0000RecvSlic","md5":"dd77e86b9c6a708f0fdef896794e1416","len":262152},"recovery101":{"type":"PAR 2.0\u0000RecvSlic","md5":"4b66fb1a91751f8fa60a26d01266773b","len":262152},"recovery102":{"type":"PAR 2.0\u0000RecvSlic","md5":"17396f15a240e560fd0abbf793cff777","len":262152},"recovery103":{"type":"PAR 2.0\u0000RecvSlic","md5":"db3e3eefb31e2e13de7f6b792d08fab6","len":262152},"recovery104":{"type":"PAR 2.0\u0000RecvSlic","md5":"96c045e794657f81be4f550931d09f86","len":262152},"recovery105":{"type":"PAR 2.0\u0000RecvSlic","md5":"8b2d795b329310ade27c930c0c9ac3a5","len":262152},"recovery106":{"type":"PAR 2.0\u0000RecvSlic","md5":"5429af73f35f0c3aec714c0c2bb6fd63","len":262152},"recovery107":{"type":"PAR 2.0\u0000RecvSlic","md5":"ba08b07cbfaa853ad84d02e16862b0c4","len":262152},"recovery108":{"type":"PAR 2.0\u0000RecvSlic","md5":"73a6530cfe5bba58e1fec23eabeb8e6f","len":262152},"recovery109":{"type":"PAR 2.0\u0000RecvSlic","md5":"01eb64dc438d15b3d37ad07a751ba19e","len":262152},"recovery110":{"type":"PAR 2.0\u0000RecvSlic","md5":"055ff87be7d79d6a503ec16287cd7e1d","len":262152},"recovery111":{"type":"PAR 2.0\u0000RecvSlic","md5":"f1d7485cfe6bceb4eaba44d23362a168","len":262152},"recovery112":{"type":"PAR 2.0\u0000RecvSlic","md5":"60f1e6f7d1317bf376fb6c3fd6470db0","len":262152},"recovery113":{"type":"PAR 2.0\u0000RecvSlic","md5":"371f5d440f6fb38a8dbd1bb08426d734","len":262152},"recovery114":{"type":"PAR 
2.0\u0000RecvSlic","md5":"1088a949951fc9d698682d724e67dd7b","len":262152},"recovery115":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4a1947699b7877aca19f2e9ba66d3a6","len":262152},"recovery116":{"type":"PAR 2.0\u0000RecvSlic","md5":"39aecd2559c82e37e66328bcd8ece41d","len":262152},"recovery117":{"type":"PAR 2.0\u0000RecvSlic","md5":"764fab8b1ebda52512b3bbeefb9a9bc2","len":262152},"recovery118":{"type":"PAR 2.0\u0000RecvSlic","md5":"b9be02bf929a30beec27ada23f59b6a3","len":262152},"recovery119":{"type":"PAR 2.0\u0000RecvSlic","md5":"03c4ba2e7b40c211a91300091cdfaea1","len":262152},"recovery120":{"type":"PAR 2.0\u0000RecvSlic","md5":"7f161539917d7aed2bdcfc6a97bf1243","len":262152},"recovery121":{"type":"PAR 2.0\u0000RecvSlic","md5":"06cb8f2d83a03ad41d991c95cd93df59","len":262152},"recovery122":{"type":"PAR 2.0\u0000RecvSlic","md5":"03c36c76eeaf4c7c3d669133896b673e","len":262152},"recovery123":{"type":"PAR 2.0\u0000RecvSlic","md5":"2701161fade73a745dfe8fc06d097aa5","len":262152},"recovery124":{"type":"PAR 2.0\u0000RecvSlic","md5":"02579b843bcf0750cf7595c84c4a1b2c","len":262152},"recovery125":{"type":"PAR 2.0\u0000RecvSlic","md5":"8af83a50b1e0c00dc811d24090466dc6","len":262152},"recovery126":{"type":"PAR 2.0\u0000RecvSlic","md5":"5fbcd8d0454c6674bd236ccfab72f784","len":262152},"recovery127":{"type":"PAR 2.0\u0000RecvSlic","md5":"2aae76e0ee91e98cfcad8b6edf64b30a","len":262152},"recovery128":{"type":"PAR 2.0\u0000RecvSlic","md5":"3c80e0a3b4aced780158b2cda147025b","len":262152},"recovery129":{"type":"PAR 2.0\u0000RecvSlic","md5":"0d714844585e05d9b8ecca9b3ce144e2","len":262152},"recovery130":{"type":"PAR 2.0\u0000RecvSlic","md5":"3ff70a539d48bf46bddc4a4d604ef122","len":262152},"recovery131":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac9e11b7553248cc9cab4ffa4b0eadc0","len":262152},"recovery132":{"type":"PAR 2.0\u0000RecvSlic","md5":"a338dfa5fb3dfda5d0844ccf33b7a0a0","len":262152},"recovery133":{"type":"PAR 
2.0\u0000RecvSlic","md5":"55816e85a627b359f1ce6abcf07a80f8","len":262152},"recovery134":{"type":"PAR 2.0\u0000RecvSlic","md5":"62989b2c6a48c883b7153f8841c57fbf","len":262152},"recovery135":{"type":"PAR 2.0\u0000RecvSlic","md5":"0d714d1f7c84c36bba5211a441322f1f","len":262152},"recovery136":{"type":"PAR 2.0\u0000RecvSlic","md5":"fe16f8e7183642779c2fda0bdd2e69c7","len":262152},"recovery137":{"type":"PAR 2.0\u0000RecvSlic","md5":"959e3ed520135276ce507bfb974170e5","len":262152},"recovery138":{"type":"PAR 2.0\u0000RecvSlic","md5":"38755e49834c707a18ebc78153dd309f","len":262152},"recovery139":{"type":"PAR 2.0\u0000RecvSlic","md5":"861c385818cc82e20538517a860ac822","len":262152},"recovery140":{"type":"PAR 2.0\u0000RecvSlic","md5":"573401fb45c72080a5fbe59575daf1f0","len":262152},"recovery141":{"type":"PAR 2.0\u0000RecvSlic","md5":"cbfa6d65eeffb234b4b35bb0101813c2","len":262152},"recovery142":{"type":"PAR 2.0\u0000RecvSlic","md5":"a915d1e365b2f6667b561f15074c246f","len":262152},"recovery143":{"type":"PAR 2.0\u0000RecvSlic","md5":"aaa5d236c4b0ab8bca3837357a676828","len":262152},"recovery144":{"type":"PAR 2.0\u0000RecvSlic","md5":"a5e825e51919a1526298b2a63a67cd18","len":262152},"recovery145":{"type":"PAR 2.0\u0000RecvSlic","md5":"935be6ea3de1a2b7662868af15db66ae","len":262152},"recovery146":{"type":"PAR 2.0\u0000RecvSlic","md5":"20927e4c32ca839d9bc2e32dda3b88bb","len":262152},"recovery147":{"type":"PAR 2.0\u0000RecvSlic","md5":"734b9f415296856d8a9935673a6359ce","len":262152},"recovery148":{"type":"PAR 2.0\u0000RecvSlic","md5":"00df8edd547ee5f396ff09c1b0c8a979","len":262152},"recovery149":{"type":"PAR 2.0\u0000RecvSlic","md5":"2c7d4748b05c963294d008bcd4d9abd4","len":262152},"recovery150":{"type":"PAR 2.0\u0000RecvSlic","md5":"136eefb57c6163ba9ee4361bc30a946b","len":262152},"recovery151":{"type":"PAR 2.0\u0000RecvSlic","md5":"eb45e084b5f5d3c1659825791ebe9dcf","len":262152},"recovery152":{"type":"PAR 
2.0\u0000RecvSlic","md5":"b1167f13fd2ab1a49de63f0c23d97b30","len":262152},"recovery153":{"type":"PAR 2.0\u0000RecvSlic","md5":"ec60aca69736e5c01c0bc0a4a5fc20a7","len":262152},"recovery154":{"type":"PAR 2.0\u0000RecvSlic","md5":"1eeecfb1e63674b87f0d47736f77cf8a","len":262152},"recovery155":{"type":"PAR 2.0\u0000RecvSlic","md5":"d24ea57a9b206855ce59c28a29a609b5","len":262152},"recovery156":{"type":"PAR 2.0\u0000RecvSlic","md5":"adaad933db3226d3778da61397198f17","len":262152},"recovery157":{"type":"PAR 2.0\u0000RecvSlic","md5":"36e8c301bcbcb253546bb3672400f0f2","len":262152},"recovery158":{"type":"PAR 2.0\u0000RecvSlic","md5":"acb88c3d8b4676e5b2a1d07240ecbf3a","len":262152},"recovery159":{"type":"PAR 2.0\u0000RecvSlic","md5":"6ec97320f986ffc6dc8884d762784e64","len":262152},"recovery160":{"type":"PAR 2.0\u0000RecvSlic","md5":"ba205e752f5743e7fb56f91159f11638","len":262152},"recovery161":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee0bc353e44cb3a0a6674bebb8f2b02f","len":262152},"recovery162":{"type":"PAR 2.0\u0000RecvSlic","md5":"4ffb47c96191de05ee92d65d17e6d1b5","len":262152},"recovery163":{"type":"PAR 2.0\u0000RecvSlic","md5":"cab383c52fd7edf43c813a02b0e9f715","len":262152},"recovery164":{"type":"PAR 2.0\u0000RecvSlic","md5":"7579d93610965f3fa9d5380b8a05d179","len":262152},"recovery165":{"type":"PAR 2.0\u0000RecvSlic","md5":"79ebb790e0f5b3878e8efa9e31a1ec2f","len":262152},"recovery166":{"type":"PAR 2.0\u0000RecvSlic","md5":"d98d9e9fcaeab7644965287fdfa14927","len":262152},"recovery167":{"type":"PAR 2.0\u0000RecvSlic","md5":"5c4d552661fe8a972e9a3451829f467f","len":262152},"recovery168":{"type":"PAR 2.0\u0000RecvSlic","md5":"e48d561f1fbc700b8c676ee59c4a9f5d","len":262152},"recovery169":{"type":"PAR 2.0\u0000RecvSlic","md5":"585373bf1021202ac1a871dd1325e797","len":262152},"recovery170":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab371b66e23bf6b2f143a97162857c7f","len":262152},"recovery171":{"type":"PAR 
2.0\u0000RecvSlic","md5":"e73956ff54bc05bd6f269456952de3f1","len":262152},"recovery172":{"type":"PAR 2.0\u0000RecvSlic","md5":"1908334591c7199fb59474e189631053","len":262152},"recovery173":{"type":"PAR 2.0\u0000RecvSlic","md5":"c2fe26625667e1d867eb435a7b542044","len":262152},"recovery174":{"type":"PAR 2.0\u0000RecvSlic","md5":"54a87b2c409465195efcac62cf90c0d0","len":262152},"recovery175":{"type":"PAR 2.0\u0000RecvSlic","md5":"b067362a194e73030224dcb460b406f5","len":262152},"recovery176":{"type":"PAR 2.0\u0000RecvSlic","md5":"ce671aa8082cea7d459d1f23b43ed566","len":262152},"recovery177":{"type":"PAR 2.0\u0000RecvSlic","md5":"37770253e086905a8273fa7681e120eb","len":262152},"recovery178":{"type":"PAR 2.0\u0000RecvSlic","md5":"891a00e3973ede2824a25a87243e8847","len":262152},"recovery179":{"type":"PAR 2.0\u0000RecvSlic","md5":"44cb5fa5db70c22cb3d382bf4fe76924","len":262152},"recovery180":{"type":"PAR 2.0\u0000RecvSlic","md5":"27ca455202140b26fb48af7db5c559eb","len":262152},"recovery181":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ac3a13daccd4e334fb2cb4ba8806931","len":262152},"recovery182":{"type":"PAR 2.0\u0000RecvSlic","md5":"6d218c46b445edd8b584b1079aea762c","len":262152},"recovery183":{"type":"PAR 2.0\u0000RecvSlic","md5":"c288a78496f8e466bc2ca2555ab57651","len":262152},"recovery184":{"type":"PAR 2.0\u0000RecvSlic","md5":"2e2d494f19d7e12728b2813959f363b9","len":262152},"recovery185":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ac64828b943e917b0b327ba4e3e1d7a","len":262152},"recovery186":{"type":"PAR 2.0\u0000RecvSlic","md5":"64149d71b23f3b48601a1f66d38d2ce9","len":262152},"recovery187":{"type":"PAR 2.0\u0000RecvSlic","md5":"cb81ff2511a7b5d0cd9f4c88ebef6f4b","len":262152},"recovery188":{"type":"PAR 2.0\u0000RecvSlic","md5":"6fc5b1e6e008764933bb831afe224ea7","len":262152},"recovery189":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ad66c7aeb321475b766456499f641ea","len":262152},"recovery190":{"type":"PAR 
2.0\u0000RecvSlic","md5":"15a3793c1c67f7a9cf718d331d34f41d","len":262152},"recovery191":{"type":"PAR 2.0\u0000RecvSlic","md5":"40d0e37fbc8aa71fb21b18da1b7d025c","len":262152},"recovery192":{"type":"PAR 2.0\u0000RecvSlic","md5":"c650dffc875af3e180761d3eef8994c9","len":262152},"recovery193":{"type":"PAR 2.0\u0000RecvSlic","md5":"4c22cdc5d78c408671a55c724affe633","len":262152},"recovery194":{"type":"PAR 2.0\u0000RecvSlic","md5":"80e4eca09aeed7823945af8434d4ccd8","len":262152},"recovery195":{"type":"PAR 2.0\u0000RecvSlic","md5":"3fe2b6e3f848a643fd8e77a05d795486","len":262152},"recovery196":{"type":"PAR 2.0\u0000RecvSlic","md5":"9ab3653b65ae244124b97817498673f0","len":262152},"recovery197":{"type":"PAR 2.0\u0000RecvSlic","md5":"3499ed0cd0b8e789b4622ebc71c49e1b","len":262152},"recovery198":{"type":"PAR 2.0\u0000RecvSlic","md5":"853ea439016d879ca104b2ea5621c320","len":262152},"recovery199":{"type":"PAR 2.0\u0000RecvSlic","md5":"99af3d84da4967f583714a44b8c5129b","len":262152},"creator":{"type":"PAR 2.0\u0000Creator","md5":"75978a963bad01ec4845ee37a32b8523","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"a1ab3aa1dd29953af5f118e683a1ebef","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"ba3580d34dbf456def2775e5f47fe32f","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1883c8c7fa2b7da5978e4b5199f6e168","len":5220},"creator":{"type":"PAR 2.0\u0000Creator","md5":"75978a963bad01ec4845ee37a32b8523","len":104}}],"1":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"6281dd7c3c3938b3e6185a0477e6e287","len":65608},"main":{"type":"PAR 2.0\u0000Main","md5":"0b675e25887343203a39a3e2c8d1ed28","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"67c7299fe12d06356026566cc7084417","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1ae47841abe78267f3850e465307d555","len":20560},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"498e695c1f6fd9d63f95c51ad93d14b0","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"0b675e25887343203a39a3e2c8d1ed28","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"67c7299fe12d06356026566cc7084417","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"1ae47841abe78267f3850e465307d555","len":20560},"creator":{"type":"PAR 2.0\u0000Creator","md5":"498e695c1f6fd9d63f95c51ad93d14b0","len":104}}],"2":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"80ee3e17eb31ff1aeb2b4b1299f54110","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"47d39160c9dd187d9602b395a9960adc","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"b2fce1dc1ba509aae7225e828a735ca3","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"93c8615d044e46d375da63c6e5c6999e","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"recovery4":{"type":"PAR 
2.0\u0000RecvSlic","md5":"fcef287602419071bdf23c06a665b7dd","len":1048644},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"73901dbf207c0a3ce23b257df0c90f1c","len":1048644},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e455419ba916f25c94ac35c7fdf339b","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"30b4e3c59fa3f4dfce5440d6494f2eed","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"7868a53c236c0bdb9cd3a2f2a167766b","len":1048644},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"5081c501ed7ef9de8960c89020e87192","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"3984f758117ba91e1a6f4a4a0371d017","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"12b685ecaae6d8aa9e6a993fdfd07e0c","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"f50c1e1f38f49140b7a0601896740fa1","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"c6f795b7c34ab97e65dd3bca71698c5c","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"96ec24936af60d8ae873945cf38b9091","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"73a8d86b450e2da2d84a606e4bf97079","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 
2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"c867457ea6e5d2a1cf07091126ab98eb","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}}],"3":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"74520873b88f74d1de0e6b8f5c54006f","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac1164d1c1d45c3e959976aa8d9c82ed","len":1048644},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"09a680944b9e3751e0397455e7f0561b","len":1048644},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"cd4b284c1cfb31f1f9dc4d2cb3b46f74","len":1048644},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"f86624cb3556a6c9ad0f97831984fd4b","len":1048644},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"recovery5":{"type":"PAR 
2.0\u0000RecvSlic","md5":"0282fb1156b97afeb15889463fe6fb38","len":1048644},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"f4366cec10c3c406ecf39967a3488d00","len":1048644},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"debbeee2d37ef4bb964a68adfda5a836","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"1ca06454a70ccef99e374792f85ccb8c","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"f38a39a88bfcf792866e55a8103347d7","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"962eb1b20273dc63ac3e1afa4995ab10","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"58f929cac57b0a76a830ea74a0eac1c4","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"8b6c986373a5068a2afc6857243f2b7f","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"22bcda081f15c8d4ae9297dfe0ac4f5e","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee302032bc01cd26256dbf9e984d5f6f","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"084d81c51717cf4b55eb2510bfab58eb","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac7539f5e66f2a7c029b596fa2ef20fa","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"749080994e1077637af1a102f5734400","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"b66e22198f11e1eb81c03fb83ac9f243","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"6b2a3aaf0a6af110d986adc5dcbbb4d9","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"b4c20ebe89a5409958eeab01b7fdc6d7","len":1048644},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"41d9af908d53898507506f5bff21a546","len":1048644},"recovery22":{"type":"PAR 
2.0\u0000RecvSlic","md5":"693981d690733efd35c8594f2b88afe3","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b5b5257da59294665c41eb6ef8d6d72","len":1048644},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"bd20bc56ef956ab4ae3bccead46ff165","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"6156ade2b63b682f8719e01cd98802c9","len":1048644},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff7336302eea1d8c43a55084ddfbbf3b","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"5d0621c20b187eced33a3de5d14fa1b0","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"9c492e6e9078f3d5221d0de69a638ac2","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4b0c879fe80608a1ff5f460330ecb48","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"63516826cd3746a78cd797f5a849bdb4","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"35cce3895f113abc206b169fcee3dd0b","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"97f39c926596f591f608439ec2d6e50b","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e6dcf153973245f9f41f2f6d68dcf3e","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff89baf219572f14f59a7f08a193c155","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"d72a195ba2006e3d70e3d970d9710daa","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"7461a500af7f1e129bc9d347a60c0a62","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"dbb3c6a53d9d9ca3e152ecb21f5cefe9","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"89b986d2b3aa9550a1d32f046436cc23","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f0454501952dac6bc2d1242629bd829","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}},{"main":{"type":"PAR 
2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}}],"4":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"edd1ade340f0f983fbfbcf844d801cda","len":2097304},"main":{"type":"PAR 2.0\u0000Main","md5":"09dfad62ef619decf35f32c2bf0a1522","len":124},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"b10d9aba7502ed7811bb25f22caa8e89","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"a245487ad0d095cfd6add75baa138e1f","len":132},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"dd6685f87867e9c3b0990dd40ba8fc1d","len":2097304},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"7c84d68381da44721063e3c93bae8301","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"49d38647c2861e08ba2cb852dfe4062b","len":100},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"eb959263385ec801a008a8adb7bc86ec","len":100},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"6fe15849c49662577aea77449e41e12e","len":2097304},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 
2.0\u0000IFSC","md5":"e35965c8be6de77d07426b0d067b3626","len":220},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"720cf3d8b68b0312ba6972859c9adac5","len":2097304},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"4588d83af606eb1601a1b5ff510b7879","len":2097304},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"2f302e2675b82a7e510046d3053d9919","len":2097304},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"dfe8e1351ed2ac4376d0309aa01301b1","len":2097304},"creator":{"type":"PAR 2.0\u0000Creator","md5":"f00c87d06d608c53d16e6cbe3735a766","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"09dfad62ef619decf35f32c2bf0a1522","len":124},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"b10d9aba7502ed7811bb25f22caa8e89","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"a245487ad0d095cfd6add75baa138e1f","len":132},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"7c84d68381da44721063e3c93bae8301","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"49d38647c2861e08ba2cb852dfe4062b","len":100},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"eb959263385ec801a008a8adb7bc86ec","len":100},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"e35965c8be6de77d07426b0d067b3626","len":220},"creator":{"type":"PAR 2.0\u0000Creator","md5":"f00c87d06d608c53d16e6cbe3735a766","len":104}}],"5":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"227b1235272f1640a2f3f1f5ac13109f","len":4194372},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"4b43e42ea2ec0617637ab5e1cfb98c0d","len":4194372},"main":{"type":"PAR 2.0\u0000Main","md5":"76a2c8053eca00b765ada9fdcde12245","len":92},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"612f84ace10d12ed08b83ccdd9d66511","len":4194372},"recovery3":{"type":"PAR 
2.0\u0000RecvSlic","md5":"f3595e2157d1f7b0864a88e07c1f5442","len":4194372},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"49439ce8a8009014fd812470983aedfe","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"e1ca7d5e9064e21e1777d44aacf4fb30","len":4194372},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"3da3ef1c9cde24fce3e996add1d608a9","len":400},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"c1a5cf7498aaafb919379feccf77ad64","len":4194372},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"e1de02388438296cf7a44cdef5c38856","len":4194372},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"1283885f4c351acdab8eef7df7258942","len":4194372},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f9d71ae23872e39f122c02e3f403cbc","len":4194372},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"662220713f645187c50d0f950bafc747","len":4194372},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"63bafc1c7d16744ec4904235621d3be1","len":4194372},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"d67b2fdd14e40510d595897300ff8889","len":4194372},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"50ef19310cad87061283fa3ce62188a3","len":4194372},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b79e5fac9942dc229eef88d1b65c15c","len":4194372},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"dfc8f3cf562f919f39e681a0207afa00","len":4194372},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"80e536d4be26737e0f2fb2941c652b9b","len":4194372},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"e9afe78529cbfaffceba5e2601f13279","len":4194372},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"1240e4446fc41a8805e0b328ff3fef7c","len":4194372},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"7d6562c315684126e67926609d6ea1c7","len":4194372},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"ddeef17620d1515a91bb6d195e627438","len":4194372},"recovery20":{"type":"PAR 
2.0\u0000RecvSlic","md5":"e03cdfc92253706ac7392426134ce613","len":4194372},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"05d9de37cb8619790dabc4686a6465ed","len":4194372},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"ddd9a65e990a88b69ba76360900162bd","len":4194372},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"79a1dac3c4b04b02bd63db1169606f96","len":4194372},"creator":{"type":"PAR 2.0\u0000Creator","md5":"4b6de922b9d6c30abe117f55842b19ef","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"76a2c8053eca00b765ada9fdcde12245","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"49439ce8a8009014fd812470983aedfe","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"3da3ef1c9cde24fce3e996add1d608a9","len":400},"creator":{"type":"PAR 2.0\u0000Creator","md5":"4b6de922b9d6c30abe117f55842b19ef","len":104}}],"6":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"74520873b88f74d1de0e6b8f5c54006f","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac1164d1c1d45c3e959976aa8d9c82ed","len":1048644},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"09a680944b9e3751e0397455e7f0561b","len":1048644},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"cd4b284c1cfb31f1f9dc4d2cb3b46f74","len":1048644},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"f86624cb3556a6c9ad0f97831984fd4b","len":1048644},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 
2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"0282fb1156b97afeb15889463fe6fb38","len":1048644},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"f4366cec10c3c406ecf39967a3488d00","len":1048644},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"debbeee2d37ef4bb964a68adfda5a836","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"1ca06454a70ccef99e374792f85ccb8c","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"f38a39a88bfcf792866e55a8103347d7","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"962eb1b20273dc63ac3e1afa4995ab10","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"58f929cac57b0a76a830ea74a0eac1c4","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"8b6c986373a5068a2afc6857243f2b7f","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"22bcda081f15c8d4ae9297dfe0ac4f5e","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee302032bc01cd26256dbf9e984d5f6f","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"084d81c51717cf4b55eb2510bfab58eb","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac7539f5e66f2a7c029b596fa2ef20fa","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"749080994e1077637af1a102f5734400","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"b66e22198f11e1eb81c03fb83ac9f243","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"6b2a3aaf0a6af110d986adc5dcbbb4d9","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"b4c20ebe89a5409958eeab01b7fdc6d7","len":1048644},"recovery21":{"type":"PAR 
2.0\u0000RecvSlic","md5":"41d9af908d53898507506f5bff21a546","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"693981d690733efd35c8594f2b88afe3","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b5b5257da59294665c41eb6ef8d6d72","len":1048644},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"bd20bc56ef956ab4ae3bccead46ff165","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"6156ade2b63b682f8719e01cd98802c9","len":1048644},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff7336302eea1d8c43a55084ddfbbf3b","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"5d0621c20b187eced33a3de5d14fa1b0","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"9c492e6e9078f3d5221d0de69a638ac2","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4b0c879fe80608a1ff5f460330ecb48","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"63516826cd3746a78cd797f5a849bdb4","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"35cce3895f113abc206b169fcee3dd0b","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"97f39c926596f591f608439ec2d6e50b","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e6dcf153973245f9f41f2f6d68dcf3e","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"ff89baf219572f14f59a7f08a193c155","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"d72a195ba2006e3d70e3d970d9710daa","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"7461a500af7f1e129bc9d347a60c0a62","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"dbb3c6a53d9d9ca3e152ecb21f5cefe9","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"89b986d2b3aa9550a1d32f046436cc23","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f0454501952dac6bc2d1242629bd829","len":1048644},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"52818a8855a4f9452bf7505f73351c79","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"69beb9ebafc6b9d17ebd20d73291f900","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"fce642a756dfbd6298ced47a6364adf3","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"33eccf584ac19db13bb5c10e6ee0bac1","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"cdd697b1320c885fa1b315c0e1913670","len":1360},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"532e5c58590e5efb9f6e36fb3dbaaeaa","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f1e08faaba02c5163b2ae24f2c4d84e1","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"fa5bf7092a30b550e90dbb486dc0445a","len":104}}],"7":[{"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"13d75e3ff2871a1f3554d0810d758a9b","len":12292},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"0d61059e69c4520710a1bf6157339902","len":12292},"main":{"type":"PAR 2.0\u0000Main","md5":"b546f34896688623139361b73f7466da","len":140},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"ad306d05c1f843b71793db9705949994","len":12292},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"9df88ae2f95bb7ac4d7756cff03be42e","len":12292},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"bdf7fd79dc9be3242911d615013b4675","len":132},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"019ef2edd1afe0d9e3a4fae44fcc4dcb","len":12292},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"9a296997ecfb7742bd8adcb4408f3b1e","len":12292},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"1074e6bd114064d4010cba8850279936","len":132},"recovery13":{"type":"PAR 
2.0\u0000RecvSlic","md5":"396efc5068f49b51ff1c78438defd032","len":12292},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"e29e2769b588fdd6073366f0c69ba755","len":12292},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"6c37323d51a7cfcc68cd519a949ed0b9","len":132},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"59175e7ac2d1d2364a469e6213bae609","len":12292},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"10df8b27966f711989172bc5bb351909","len":132},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"423e090521d2e6e445f3ab47a591f170","len":12292},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"2ce148ae06221e99da47f28ef4fa0c2e","len":12292},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"939568408c85444683327985736473f1","len":200},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"61c196fcab5efb2423e912493deb174f","len":12292},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"2a95543e8ad6e15bd03ee56837995b56","len":12292},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"4b33cc66f96237779dac4a2920e5c36b","len":100},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"3b48bff16a3a8eeebaef276c142aa66b","len":12292},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"6018b4d628e4267080dfbfbaa11f8482","len":12292},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"7900826aa0cdcd35f5b1b6d309285106","len":100},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"6355f957b5299352ceb471cc7aaab9de","len":12292},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"b0f14355deaf0bae05b3a601788f952c","len":12292},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"89550dc87a9e0098a8035d643c7c1fa9","len":22400},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a9553081f2bcb2568cc3d49e817c12d","len":12292},"recovery25":{"type":"PAR 
2.0\u0000RecvSlic","md5":"64c64c2c2aac4fa2cf1dc5abe9e79e40","len":12292},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"11e3ecabd3e549dc7bd276d0ef06e4f2","len":12292},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"045ba81e58ecde2ae2e5e10d6ff52deb","len":12292},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"7c2a0122a173e9c24288c6fdff476f75","len":12292},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"7fada7e5fd40793968feb8708724e76f","len":12292},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"6051d8fff3ba7d1a9ef0b89dc3c099fe","len":12292},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"a20b372c84071403679473808447e1d1","len":12292},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"a073c7ab64609655101b3f9d43f83217","len":12292},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef752e17c8404a77522e886979da238e","len":12292},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"9489fae5a303376fcf21bdeca134a01e","len":12292},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"cd47dc979bcdc24667a1743097d6d57b","len":12292},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"6576e487f28903ad92ac1bb9afde1cfa","len":12292},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"a77c05ca3f1390283d1fbb1c0332a0d0","len":12292},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"3923f44f59d0ba6e84ce1d30a44962b7","len":12292},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"d956db1617f11f33b3663c81e560bae8","len":12292},"recovery40":{"type":"PAR 2.0\u0000RecvSlic","md5":"740ded4b4c20fe1f6799961a9f4a863e","len":12292},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"90781f2934736ba360ceb99d95652fde","len":12292},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"9a4bdc4e888708e7639a073817e96be3","len":12292},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"ad671f3f22846043eee50812b7841d19","len":12292},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"370998274b1b0348d66f79f98f277707","len":12292},"recovery45":{"type":"PAR 
2.0\u0000RecvSlic","md5":"c8491de0561f52f26a01dd5b1fc0a680","len":12292},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"9108adfad172584acad344c253d6407c","len":12292},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"64d94e06c9c8076fb98148977aabada7","len":12292},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"934a2da041551ca688d4cd65201de9de","len":12292},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"47ad1b8f9e972ff43bc55d8a7c7c8b82","len":12292},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"dff77ec66b2bb8226d40075e7dafd65d","len":12292},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"947c5b6bded6872b18f8b0eafbb75a82","len":12292},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"9dc6162f9bf3e41821ff1aa49b02fd7c","len":12292},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"0936daced4a42c3bdd97bcf7bed560be","len":12292},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"3d3733e28add7241ce6c160afc499514","len":12292},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"3e457b7d118e652d35420859f0c2cb0c","len":12292},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"89751e1de0aea82ecff902443fc29a65","len":12292},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"0bcb7ff73a2b4185a25634b41fd282db","len":12292},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"0051a97153a602a5da921ce4cc39ab8c","len":12292},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"c33f2d2904d814872fe43dcb1ba83fa6","len":12292},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"44d12ca49b89389205fa867b31ef821f","len":12292},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"908fa6ddb6e46fbc79cd7e8f36fdc91c","len":12292},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"e01263d2374aca3358fafd4f1e31dd50","len":12292},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"d87a878179763b3b553850bcc3f37463","len":12292},"recovery64":{"type":"PAR 2.0\u0000RecvSlic","md5":"4748999b7a65523b2c40bdba325fa0d3","len":12292},"recovery65":{"type":"PAR 
2.0\u0000RecvSlic","md5":"abf213d33e195ca39e86262a57f5a335","len":12292},"recovery66":{"type":"PAR 2.0\u0000RecvSlic","md5":"ba9e9db844838a23d3a845cc180aa7a9","len":12292},"recovery67":{"type":"PAR 2.0\u0000RecvSlic","md5":"4598550a4ee7bb79427d2baa85ed5341","len":12292},"recovery68":{"type":"PAR 2.0\u0000RecvSlic","md5":"3aeb560438ea23e78d2d990b70579d84","len":12292},"recovery69":{"type":"PAR 2.0\u0000RecvSlic","md5":"b21c0f8558b48322af337d9502ff7410","len":12292},"recovery70":{"type":"PAR 2.0\u0000RecvSlic","md5":"f965345c3ebc04d23a8a7343d3f4573a","len":12292},"recovery71":{"type":"PAR 2.0\u0000RecvSlic","md5":"1ad9c5792fcc6ed1d0d6889b8067ebc1","len":12292},"recovery72":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b73c50d6e875d041d47d371bebdeaff","len":12292},"recovery73":{"type":"PAR 2.0\u0000RecvSlic","md5":"d3bbd4599eaf7e264910b462fa59ee48","len":12292},"recovery74":{"type":"PAR 2.0\u0000RecvSlic","md5":"052c9e79b7f175a837a46923f0a80ee2","len":12292},"recovery75":{"type":"PAR 2.0\u0000RecvSlic","md5":"be0f6bfabf1bf6ba6508237f216475a1","len":12292},"recovery76":{"type":"PAR 2.0\u0000RecvSlic","md5":"ea169208817c6d7fe336c901bf4a5ea0","len":12292},"recovery77":{"type":"PAR 2.0\u0000RecvSlic","md5":"625407da280dfbfc7cacc8fb4f8bb265","len":12292},"recovery78":{"type":"PAR 2.0\u0000RecvSlic","md5":"1c32d547d7d52d43cebc4d996c262111","len":12292},"recovery79":{"type":"PAR 2.0\u0000RecvSlic","md5":"cedc3a8750dfef198ad10d9dccefe339","len":12292},"recovery80":{"type":"PAR 2.0\u0000RecvSlic","md5":"ec4ef290224420846245b9375b814f8b","len":12292},"recovery81":{"type":"PAR 2.0\u0000RecvSlic","md5":"4bad80c7f5a56d2fc87937642b18aa0c","len":12292},"recovery82":{"type":"PAR 2.0\u0000RecvSlic","md5":"c4b0db1068256aa8c1e07f98a2d16339","len":12292},"recovery83":{"type":"PAR 2.0\u0000RecvSlic","md5":"375911da8153e31766960340b1626520","len":12292},"recovery84":{"type":"PAR 2.0\u0000RecvSlic","md5":"38b8edbd15145ef6126699807dc73da5","len":12292},"recovery85":{"type":"PAR 
2.0\u0000RecvSlic","md5":"6e66ffe7118eb850b326d9a3daba5d97","len":12292},"recovery86":{"type":"PAR 2.0\u0000RecvSlic","md5":"d12c98c5352c1db7a19cd928513e66c3","len":12292},"recovery87":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef8dc2866ac48b1831afacf9e0750164","len":12292},"recovery88":{"type":"PAR 2.0\u0000RecvSlic","md5":"bd23bf25bed444e4bdc22b9fc65288bd","len":12292},"recovery89":{"type":"PAR 2.0\u0000RecvSlic","md5":"a82f0d4f7663ac83d47ca10ac3eac1a7","len":12292},"recovery90":{"type":"PAR 2.0\u0000RecvSlic","md5":"340d4ae8bbe9f403176a4a001ecede09","len":12292},"recovery91":{"type":"PAR 2.0\u0000RecvSlic","md5":"1465427d8499f1cc00977b1bde040381","len":12292},"recovery92":{"type":"PAR 2.0\u0000RecvSlic","md5":"961b476e9df27b019b129aa37e9b4c87","len":12292},"recovery93":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4743689d9571c1ecc2c4341a8400e40","len":12292},"recovery94":{"type":"PAR 2.0\u0000RecvSlic","md5":"4f6d7fead6d5b60a7fdbe453abd63fa5","len":12292},"recovery95":{"type":"PAR 2.0\u0000RecvSlic","md5":"b56ed13c80fed786dfaec38591a7f849","len":12292},"recovery96":{"type":"PAR 2.0\u0000RecvSlic","md5":"3dffd1a0424af8dae4cf74d936aff7ea","len":12292},"recovery97":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8b1e226311ae44d16bcc0f56c68c1a3","len":12292},"recovery98":{"type":"PAR 2.0\u0000RecvSlic","md5":"d3fc2ca676e4c262dd57e61cf68c5361","len":12292},"recovery99":{"type":"PAR 2.0\u0000RecvSlic","md5":"42b3a719315c551d3c91ee822c2da7ed","len":12292},"recovery100":{"type":"PAR 2.0\u0000RecvSlic","md5":"928fc76675423931cbabc3c07b638294","len":12292},"recovery101":{"type":"PAR 2.0\u0000RecvSlic","md5":"e3928b4db0b5b11d3cafba8c20f77376","len":12292},"recovery102":{"type":"PAR 2.0\u0000RecvSlic","md5":"69092ea4065e6a6cbec1bf3841321cc4","len":12292},"recovery103":{"type":"PAR 2.0\u0000RecvSlic","md5":"011dcea0d6d2d576f76d0a06f7f745b8","len":12292},"recovery104":{"type":"PAR 2.0\u0000RecvSlic","md5":"e48dc08c95b0176c34116600eb14961c","len":12292},"recovery105":{"type":"PAR 
2.0\u0000RecvSlic","md5":"51ecf96adae6cab2ddb04d94f1e31d78","len":12292},"recovery106":{"type":"PAR 2.0\u0000RecvSlic","md5":"8fa7ffc715bf00106a5d7bc6d0da3ab2","len":12292},"recovery107":{"type":"PAR 2.0\u0000RecvSlic","md5":"c4cf770599853908d4cea098a1298970","len":12292},"recovery108":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac084faa95146eeed6664cc3e029e6ce","len":12292},"recovery109":{"type":"PAR 2.0\u0000RecvSlic","md5":"52f195fc483df0404a6c742750023de0","len":12292},"recovery110":{"type":"PAR 2.0\u0000RecvSlic","md5":"61ce493a2aa951375af43e9d1ca79d4f","len":12292},"recovery111":{"type":"PAR 2.0\u0000RecvSlic","md5":"fefb4a742be24ac3731afe307f5b82ba","len":12292},"recovery112":{"type":"PAR 2.0\u0000RecvSlic","md5":"9c4e137dade4e9642b809f09dc050eaa","len":12292},"recovery113":{"type":"PAR 2.0\u0000RecvSlic","md5":"af7592f4713e63de24ff2a591dd4fe26","len":12292},"recovery114":{"type":"PAR 2.0\u0000RecvSlic","md5":"c9b995c9fe0601e5e333f734fb387047","len":12292},"recovery115":{"type":"PAR 2.0\u0000RecvSlic","md5":"7f0bc8e940bfc1ee4a8088af4d36ea80","len":12292},"recovery116":{"type":"PAR 2.0\u0000RecvSlic","md5":"1a3fef678dfb898d2a6a2fc349ab8885","len":12292},"recovery117":{"type":"PAR 2.0\u0000RecvSlic","md5":"e4c5d8d29385549ee22e2e3a796ecbce","len":12292},"recovery118":{"type":"PAR 2.0\u0000RecvSlic","md5":"a94ce91cec61f2add2f780412f6688dd","len":12292},"recovery119":{"type":"PAR 2.0\u0000RecvSlic","md5":"45117210792cfdca35871f0625f29f50","len":12292},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a1304668f393486b4bf99741bcc8c0e4","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"b546f34896688623139361b73f7466da","len":140},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"bdf7fd79dc9be3242911d615013b4675","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"1074e6bd114064d4010cba8850279936","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 
2.0\u0000FileDesc","md5":"6c37323d51a7cfcc68cd519a949ed0b9","len":132},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"10df8b27966f711989172bc5bb351909","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"939568408c85444683327985736473f1","len":200},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"4b33cc66f96237779dac4a2920e5c36b","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"7900826aa0cdcd35f5b1b6d309285106","len":100},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"89550dc87a9e0098a8035d643c7c1fa9","len":22400},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a1304668f393486b4bf99741bcc8c0e4","len":104}}],"8":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"8f490677c7370c81405b5dc720de5259","len":76},"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}},{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab4b40f2f8c79be1b72fc3c297034e18","len":76},"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 
2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}}],"9":[{"main":{"type":"PAR 2.0\u0000Main","md5":"d2e9f5f81e8780b703db895c249cbd68","len":92},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"4a2d665a1e6879cd1b9d04025a7a5f80","len":132},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"59a78d4061c80c5b686bc85f1ae871e1","len":120},"creator":{"type":"PAR 2.0\u0000Creator","md5":"aef6213477d5a93d8932e106b971214e","len":104}}],"10":[{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"2b7cd021389fdb5ed3791087cab67848","len":16777284},"main":{"type":"PAR 2.0\u0000Main","md5":"c3e44363bd4b65f4942581572936dff4","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"c295b42879c2d2a3ee4843a3bb0f5ee3","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"13b6852f097a72fc7cc04761a10a480c","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 
2.0\u0000FileDesc","md5":"1ad880202a49781b87e335d349930f26","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"910c17cd5227f95a8f2c837c237ed405","len":160},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"8cbc4d1dad7ea1f63678571057c323a1","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f5a9392a75d31501b020918c799fab52","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a19208469e9ef2c68f2ce37e0cb36780","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"c3e44363bd4b65f4942581572936dff4","len":124},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"c295b42879c2d2a3ee4843a3bb0f5ee3","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"13b6852f097a72fc7cc04761a10a480c","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"1ad880202a49781b87e335d349930f26","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"910c17cd5227f95a8f2c837c237ed405","len":160},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"8cbc4d1dad7ea1f63678571057c323a1","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"f5a9392a75d31501b020918c799fab52","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"a19208469e9ef2c68f2ce37e0cb36780","len":104}}],"11":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"5ae3fa27bb30388ee1999b3e5d295b1e","len":1048644},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"76549588cac2b8d3f6ffb53ef2e5e5d5","len":1048644},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"46a19207e25504383c733859fa23a5d3","len":1048644},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"e39bd46ac1405579ea58ad6990a52e5f","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"3796a7821994867eb08d0b9ecfb1f022","len":92},"recovery4":{"type":"PAR 
2.0\u0000RecvSlic","md5":"3aa647a57f56be00b66830ebeed8d766","len":1048644},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"8e3df5e94c2602a027a43df9a79340fa","len":1048644},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"97253b6d0f8d827181326470066333e4","len":1048644},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"7388491d9d881d220d94b5a26306cbdc","len":132},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"5d3dc575cd28880a5218b9c5d0762efb","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"696a55821a69dc48cde7e7b7dbfe18e7","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"4f8aa5bf1983752b0122f91c6dc46dae","len":1048644},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"dc3363b470db43fc863d728a48decd34","len":340},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"60ab3ac1c0b22bb2660ec3b1a44a454f","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"32b0c0ccf4da378145ec2881d06a0104","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"fe3cf4681ff6dab4fe48baadeefa8b9f","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"75a0660ce94daa96fefcdb8ecb7ffe2d","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"e9fc5ad5e0214fb90588695cf1bea1d4","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a7a7c816d43b31ffd1f1ab7ff93cb78","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"6514fb821fb9d012cedb09a5cdb5ce81","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"c55c09da6d1858b24764007cc42d01e1","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"5ee9880f40b8062e5748abe8b5fb2579","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"329cb0d4712fc629414fbb2aa1c15454","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8b03860153ecb1c0b15b785422cf36c","len":1048644},"recovery21":{"type":"PAR 
2.0\u0000RecvSlic","md5":"b4ffc6740802ea6274134e905fe8ec1b","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"36c944effcf8db4f5098fdef4d7f9fb7","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"33dddc46138a869d45473597bd8a0885","len":1048644},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"52e988fed07f5c02b94695f129ffee05","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"860ce0c08f14146f6545f8b455388457","len":1048644},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"02fd2a64a0440a35e571383c3d0a62ef","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"557a895ab8b60101fc681cd5908d4bfe","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"2045f82f13b16f36b30986653111655d","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"ce6fad602d7745fc6b87ce0cf9adf74b","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"f916de512a753ad776ee91d1cd0c5e88","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"5f9b5225d53a5df264e98d57ef41488f","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"b35b8b7c45908e260ff729d0a927ed93","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"f976fa312351c40ff2c6fb8022aabf6d","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef96b2f79e199d7bb037fe0b05056211","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"96647c04ece1d25fb0f6b1eca3ae2219","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"7bf3716a84aed5b1321b19e18a445eb4","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"257500d3e77cf74b1efa67606d4ec1d0","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"696781c6156f397df6106e2d51819aa4","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"9046f4b89e3a13e3947e785eae760ea7","len":1048644},"recovery40":{"type":"PAR 
2.0\u0000RecvSlic","md5":"7fad2133a7b1672ed31694f89fe8b1c1","len":1048644},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"285897a69aa8e07e6f4f82025f25455f","len":1048644},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"424f74e6c405f183cb635524ae086d36","len":1048644},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"d7285bfef36a2314e8d5cc2323a91625","len":1048644},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"bf8285ec53c4c06737ba3f1ffd792760","len":1048644},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef6616466b9fefebfaddfadcf25b8fa9","len":1048644},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"3b64f6fc3816ff762953d256f6be5c09","len":1048644},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a6690f87f54eb8769c236e5972f4817","len":1048644},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"9fcdc6307e18029a7306c24d6e23a798","len":1048644},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"291516ea56111c0400a6a12f676e5a53","len":1048644},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"c87b641d60c32ea5d2883509304abc60","len":1048644},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8a1d99820b3030161654ce66e732522","len":1048644},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b7465b4e0aa16ed2531a4a10297c4d3","len":1048644},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"6a0f4220cc76d6ab4aa7d537c1f7c77b","len":1048644},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"9143733d0076ab08719a05ca13f1f422","len":1048644},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"5266852e8cce0de780bce15fb1a608dd","len":1048644},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"19a85bb67233bbe31319d61afa482c25","len":1048644},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"777e26b540a27ea9480817bad4ba5eb1","len":1048644},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"a6559b6d4dc1280cbcf748e968b4dbae","len":1048644},"recovery59":{"type":"PAR 
2.0\u0000RecvSlic","md5":"29a393bd18579d5ae612a73b5ad5471f","len":1048644},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"0c61aafcedeeb70a5677bac1f64ace26","len":1048644},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"18c227628a2b46e865649fbe7ed1c9ed","len":1048644},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"4dea15252f8943bc37ccfd8dede9ca4d","len":1048644},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"a437635c37da22690a51bcca8af94066","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"79b7e4a62fce4bbc39cb1d35a3b28150","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"3796a7821994867eb08d0b9ecfb1f022","len":92},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"7388491d9d881d220d94b5a26306cbdc","len":132},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"dc3363b470db43fc863d728a48decd34","len":340},"creator":{"type":"PAR 2.0\u0000Creator","md5":"79b7e4a62fce4bbc39cb1d35a3b28150","len":104}}],"14":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"bdb6fb8f2a0d2a5902cabcde8b859035","len":4294967364},"main":{"type":"PAR 2.0\u0000Main","md5":"854e212b116ec286bc7a0254029f405c","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"43a52a9af71e0d95624092ff681fbdd6","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"0af69ba8da0e6e6ac51e55f8d36c4b10","len":100},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b2721e854623f19ede417796298e409","len":4294967364},"creator":{"type":"PAR 2.0\u0000Creator","md5":"f85ecf559c229c58166c2b3a837817d5","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"854e212b116ec286bc7a0254029f405c","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"43a52a9af71e0d95624092ff681fbdd6","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"0af69ba8da0e6e6ac51e55f8d36c4b10","len":100},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"f85ecf559c229c58166c2b3a837817d5","len":104}}],"18":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"5c59db66d27af0dd33c5c76649c80596","len":268435528},"main":{"type":"PAR 2.0\u0000Main","md5":"d4aaedb226e0b58bad779c6228832593","len":92},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000FileDesc","md5":"413399233805cfe58ec209a1bd76bd06","len":136},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"cd00ea091d2134ba1a49b9f0683b3367","len":260},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"b997d60937956d94164d0d4afbb9efe3","len":268435528},"creator":{"type":"PAR 2.0\u0000Creator","md5":"66b1394753c32007408e5bfd2d973ec4","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"d4aaedb226e0b58bad779c6228832593","len":92},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000FileDesc","md5":"413399233805cfe58ec209a1bd76bd06","len":136},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"cd00ea091d2134ba1a49b9f0683b3367","len":260},"creator":{"type":"PAR 2.0\u0000Creator","md5":"66b1394753c32007408e5bfd2d973ec4","len":104}}],"20":[{"main":{"type":"PAR 2.0\u0000Main","md5":"8818ce3d940fd2ee906eb7372a9d8706","len":92},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"5d6e51774704c147c0eb2b21d63f8d6c","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"f90490a6bce853105ed49210ed38c6de","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"1762e1d053a8628c6311f1cfa802c0f4","len":104}}],"21":[{"main":{"type":"PAR 2.0\u0000Main","md5":"979bf683edbeb3c67b54eefe72ca796e","len":92},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"572fd34269714193e5755b1c9b4e3dcd","len":132},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"787536bb15eb4279f56212f01d1fdc1d","len":100},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"0769f180278adcffd6dcdfbaa0987882","len":104}}],"22":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"dc21985927d59404c3a10a1523e53e0c","len":262212},"main":{"type":"PAR 2.0\u0000Main","md5":"819d570e0ad997c9e3852a8df6bb59a8","len":140},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"bbacaf20c3142b7a9fcd00fe2653e19e","len":132},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"8ec3f9507a26ff1d780cbac99bc97fd8","len":132},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"2042cbf8d665fba64349918e12425f02","len":262212},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000FileDesc","md5":"7c287f4caeaee8170bee952ec02c6248","len":136},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"a2c6b03ef481ef57b2e7d70dc964ee13","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"e670a9d5df380732a1518e3f5be7422f","len":5200},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"d3baa6921143e1a203ac14a502ab6c8a","len":100},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"010700d8dce400b258d57b93189f6261","len":262212},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"35caeff6d05c414457ba3fa8973417bd","len":176080},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"ded26a35cec9f7aafdaf3f2fc53d5378","len":1120},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"a0871e7345520f1e61b2a99292b01a18","len":262212},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"6e9c8c7d20fe75165ee4f0812ebfd434","len":262212},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"f6b02899978c2d52177f754cb1708bc1","len":262212},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"55ec6d9b1af64f6fa6ceda6fa43fd722","len":262212},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"1b50a45fdef6a9878bcf7de3e56cc23a","len":262212},"recovery8":{"type":"PAR 
2.0\u0000RecvSlic","md5":"36d262d667af95827a870ca0379516fb","len":262212},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"d0a7a6cecea8434b3de7934ecb982e53","len":262212},"creator":{"type":"PAR 2.0\u0000Creator","md5":"8cb6790dad8ccd9f59af46152131e65a","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"819d570e0ad997c9e3852a8df6bb59a8","len":140},"desc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000FileDesc","md5":"bbacaf20c3142b7a9fcd00fe2653e19e","len":132},"desc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000FileDesc","md5":"8ec3f9507a26ff1d780cbac99bc97fd8","len":132},"desc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000FileDesc","md5":"7c287f4caeaee8170bee952ec02c6248","len":136},"desc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000FileDesc","md5":"a2c6b03ef481ef57b2e7d70dc964ee13","len":132},"ifsc89a8cbd5f6d267e2a5e7a0a1e30d9422":{"type":"PAR 2.0\u0000IFSC","md5":"e670a9d5df380732a1518e3f5be7422f","len":5200},"ifsc6f0a4233b67c9828947bb27bd84dd028":{"type":"PAR 2.0\u0000IFSC","md5":"d3baa6921143e1a203ac14a502ab6c8a","len":100},"ifsc84612f1ae1cb84f1b796e424861c3d46":{"type":"PAR 2.0\u0000IFSC","md5":"35caeff6d05c414457ba3fa8973417bd","len":176080},"ifsc91d0a18c04c94c19e37e69b1df84a1ee":{"type":"PAR 2.0\u0000IFSC","md5":"ded26a35cec9f7aafdaf3f2fc53d5378","len":1120},"creator":{"type":"PAR 2.0\u0000Creator","md5":"8cb6790dad8ccd9f59af46152131e65a","len":104}}]} \ No newline at end of file +{"0":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"03970ede6d760d2bf5d6cb33b38b008c","len":262152},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"a09b1093677a95f91a6e12f30c96d465","len":262152},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"f3172fabbb103c06cf765dac0ab61938","len":262152},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"4f9744132ce0f40d202a35620f660cf4","len":262152},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"afde5ce9c99744bf99890c2c907be1a0","len":262152},"recovery5":{"type":"PAR 
2.0\u0000RecvSlic","md5":"0383fdc3bf825deed285ad41e72dad0f","len":262152},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"49861fef773046c995ce1b470a408786","len":262152},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"8c7b9b0852558fbf8671ecb317087352","len":262152},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"698f229cbf8b133ae4a8e7291ef1ece7","len":262152},"main":{"type":"PAR 2.0\u0000Main","md5":"89c5482dbd2f87c50e63dd1d6fe57acb","len":92},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"7d5a13921d239a4c41c1a717b8cfff80","len":262152},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"2169e55a9542206a54e61a1bdc497b81","len":262152},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"9ac590d0f6eb4acf2e4722b1e0886f9d","len":262152},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"0f04c18cb08ae75f14a1bfcdd93fef33","len":262152},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"9520fde4a0a015846a26b27e6fa8ffc9","len":262152},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"cd75fe1a450ec13d5deb291ec7a6c06e","len":262152},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"0df5e975fd4d26d4645efcf277b97864","len":262152},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"da0633552876e6952b343e9e1475a07d","len":262152},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"52fb5aaf6b778813e2457d51678ebe56","len":132},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"c8a5773035c82e3919e1fa630397f4e3","len":262152},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"887d8999ab30bda143c7c44d5db8b2ef","len":262152},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"042606927322c90d5a47d3db0940d20f","len":262152},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a1cf4b6cbf163175568c2ba6cd59e54","len":262152},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"aab5ff9e96bf7166715848629553e1d7","len":262152},"recovery22":{"type":"PAR 
2.0\u0000RecvSlic","md5":"99dee1834fe4bbf362867cdc1f2657cc","len":262152},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"4aad3e6f408fdb71d8da0ae89cb8a6d8","len":262152},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"aa0cfff6648596cc55d43cdf7edcc811","len":262152},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cfcc0ca48bb20a25e91d1b117b1ab518","len":5220},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"f0da77095c905c3f12473d95b228a833","len":262152},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"e995cb583e47d8062b8a570ae3d89890","len":262152},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"0d95da53406f7f05b3df9fd772fddbb2","len":262152},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"081a934272ce2b045588c3e8315dab85","len":262152},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"210a791f5f1abb3c766a9afb868046ff","len":262152},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"a054993798a060f9158bb6ee5429ba39","len":262152},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"ea9bdbfd00fb4d92a5ad7e12806a07e3","len":262152},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"71095456144a043238aac78b9404cf31","len":262152},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"161d6a70a1f9660ddaee08b2f8ff07c5","len":262152},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"5fbc1fbe505af3935a6626704443f95f","len":262152},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"b137effeecf7508303909cc1f4c4744a","len":262152},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"4bd21a197729254883c39639ab1d1d2c","len":262152},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"421dab830bd4008cb6b2fbae4b4acac4","len":262152},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"a414ec0c642cbd9696617549ac233fcf","len":262152},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"aa1231c5f3b62b97aaabab8360687d69","len":262152},"recovery40":{"type":"PAR 
2.0\u0000RecvSlic","md5":"75ec845540c1623cb23d923ff1599c62","len":262152},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"125cf5ecbca7b2cec06c2aeb34bb0c45","len":262152},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"83bcfdab942c36d87c16a86854b7126a","len":262152},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"05edd2b3d1b0956ee0a746c3ac4e89f2","len":262152},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"5e298933bc064994adafa95427d9cc16","len":262152},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"eb9dbce2b3cedfde6ff13bda272c9492","len":262152},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"777fa36408c9baff7d53124a155e9b70","len":262152},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"0d51af8d47e332b8b7588bc6b41cb90f","len":262152},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"c304b5026423dae46d152ea39fdcba2e","len":262152},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"a3b6c7d58ae173aa8b5e427e52ba5ccb","len":262152},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"3a79c70ba528b971d431549a6ab780b7","len":262152},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"343fb9b3384b5e33f1ee51361a5998c4","len":262152},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"d8074380558ea76ffd244dfc0c7f43a3","len":262152},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"275a845bd65e643230f8fd898e5011b7","len":262152},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"fef14352bee0d22ffa74386828f3ccbd","len":262152},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"82f3994148d2892b8f1b8843cf335295","len":262152},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"cb516d90ba09d6075ac74892b50618db","len":262152},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"d5ecab451b59a28d40239111294a39af","len":262152},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"b20f9e576ffdb89fab5e192e80ee7a33","len":262152},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"1287eef5baaff56af11ca84e8d3c1748","len":262152},"recovery60":{"type":"PAR 
2.0\u0000RecvSlic","md5":"54687f28deea74ad577c42aff9f76e78","len":262152},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"8cb4a241f39bc369936757cf033a1eb5","len":262152},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"bba478514161683f5822e4b58cb8454e","len":262152},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"16893c1739ef5d4261f1cf5c309f7b79","len":262152},"recovery64":{"type":"PAR 2.0\u0000RecvSlic","md5":"0735e832cbfeeebd6f9c96be8b23cf51","len":262152},"recovery65":{"type":"PAR 2.0\u0000RecvSlic","md5":"70509929a1adbc65f16109b5bf09f63e","len":262152},"recovery66":{"type":"PAR 2.0\u0000RecvSlic","md5":"fa0e8cb3bf86b47dc92058df73c7d004","len":262152},"recovery67":{"type":"PAR 2.0\u0000RecvSlic","md5":"331fc5d6ca5bfdd47656af08be93b0fd","len":262152},"recovery68":{"type":"PAR 2.0\u0000RecvSlic","md5":"12c947862c6378a86c6e6b06d5498589","len":262152},"recovery69":{"type":"PAR 2.0\u0000RecvSlic","md5":"cfb0553db13af90a358a7d170570bdfa","len":262152},"recovery70":{"type":"PAR 2.0\u0000RecvSlic","md5":"6a4c6e95258bd09d23607a9410943f39","len":262152},"recovery71":{"type":"PAR 2.0\u0000RecvSlic","md5":"3d92e9d78ad8e50271f5bfa55948743c","len":262152},"recovery72":{"type":"PAR 2.0\u0000RecvSlic","md5":"d123c3ec100da92fe45cd2a5af2ec7fc","len":262152},"recovery73":{"type":"PAR 2.0\u0000RecvSlic","md5":"039f147b63cc0777d6e1d805fdafbee2","len":262152},"recovery74":{"type":"PAR 2.0\u0000RecvSlic","md5":"1be0fc177323345fd62a7c22f2a94853","len":262152},"recovery75":{"type":"PAR 2.0\u0000RecvSlic","md5":"bc493ce8d6f38631383bf2979bc01067","len":262152},"recovery76":{"type":"PAR 2.0\u0000RecvSlic","md5":"b2961018c462f5981cef07267176f04c","len":262152},"recovery77":{"type":"PAR 2.0\u0000RecvSlic","md5":"3f87e937dea4e4664a872645746db8ba","len":262152},"recovery78":{"type":"PAR 2.0\u0000RecvSlic","md5":"8de28f1862f9a086d04a9aa960b43a12","len":262152},"recovery79":{"type":"PAR 2.0\u0000RecvSlic","md5":"1f44c18c155bdbdd2341bc31e37082ee","len":262152},"recovery80":{"type":"PAR 
2.0\u0000RecvSlic","md5":"e2fd62569e0cbb5b572880819f5f201f","len":262152},"recovery81":{"type":"PAR 2.0\u0000RecvSlic","md5":"347a0aa5b8f4447262aeeaa00f53bce4","len":262152},"recovery82":{"type":"PAR 2.0\u0000RecvSlic","md5":"24031e3e370225bfea2d86072b79cf26","len":262152},"recovery83":{"type":"PAR 2.0\u0000RecvSlic","md5":"1e177641661d08ec6f0a42cd4b41cc15","len":262152},"recovery84":{"type":"PAR 2.0\u0000RecvSlic","md5":"988b050937b5ae4910a31432fefc875e","len":262152},"recovery85":{"type":"PAR 2.0\u0000RecvSlic","md5":"6dda853f4ccc46c65262e9af10246ea4","len":262152},"recovery86":{"type":"PAR 2.0\u0000RecvSlic","md5":"8badc2fa06aa5d56409a90ff28335101","len":262152},"recovery87":{"type":"PAR 2.0\u0000RecvSlic","md5":"7a1b2d483182384822640f2d1c8c624a","len":262152},"recovery88":{"type":"PAR 2.0\u0000RecvSlic","md5":"63f2bfc85026efe8a791208a7d8c3089","len":262152},"recovery89":{"type":"PAR 2.0\u0000RecvSlic","md5":"106bdd07f0dc559e30784a840ce2059d","len":262152},"recovery90":{"type":"PAR 2.0\u0000RecvSlic","md5":"663360b546f956b7aa0a1487b6cf55ae","len":262152},"recovery91":{"type":"PAR 2.0\u0000RecvSlic","md5":"f80076528b533e0f05d16c09091138f7","len":262152},"recovery92":{"type":"PAR 2.0\u0000RecvSlic","md5":"3f0b031ac855ae6194a0d226ab404f4d","len":262152},"recovery93":{"type":"PAR 2.0\u0000RecvSlic","md5":"f97921024f81af02f9ee5d4de57482a9","len":262152},"recovery94":{"type":"PAR 2.0\u0000RecvSlic","md5":"e0d93d56835eeaa7be21514d3649b750","len":262152},"recovery95":{"type":"PAR 2.0\u0000RecvSlic","md5":"3ac9c411a5f7a1c4a340f9963d8e9600","len":262152},"recovery96":{"type":"PAR 2.0\u0000RecvSlic","md5":"ce73db440330f0a2bb0376af3c4993cc","len":262152},"recovery97":{"type":"PAR 2.0\u0000RecvSlic","md5":"286de36d1b78d7b80d31b6f17a648185","len":262152},"recovery98":{"type":"PAR 2.0\u0000RecvSlic","md5":"48451352ab94a5022853ce548c665da8","len":262152},"recovery99":{"type":"PAR 
2.0\u0000RecvSlic","md5":"a15cfb01e9080e979c370c258d71d784","len":262152},"recovery100":{"type":"PAR 2.0\u0000RecvSlic","md5":"90bc078b3558a063e4b3d863036c6e30","len":262152},"recovery101":{"type":"PAR 2.0\u0000RecvSlic","md5":"1d6aa5f278b70abac0173874216eca34","len":262152},"recovery102":{"type":"PAR 2.0\u0000RecvSlic","md5":"bdbdd9d8d8b6e554960b88fd458b6a45","len":262152},"recovery103":{"type":"PAR 2.0\u0000RecvSlic","md5":"4ec684e6bab962b228409487a0d4984c","len":262152},"recovery104":{"type":"PAR 2.0\u0000RecvSlic","md5":"2e462ac7f98db990ea3ed78239df0f1c","len":262152},"recovery105":{"type":"PAR 2.0\u0000RecvSlic","md5":"7bcd383088e1c74c0e403aef0892b806","len":262152},"recovery106":{"type":"PAR 2.0\u0000RecvSlic","md5":"fa220540df29315fb7dbd3681b094310","len":262152},"recovery107":{"type":"PAR 2.0\u0000RecvSlic","md5":"c5be37a007252a75765caab04115a9f8","len":262152},"recovery108":{"type":"PAR 2.0\u0000RecvSlic","md5":"92e1cc5b4b87395214afe2fa361422e6","len":262152},"recovery109":{"type":"PAR 2.0\u0000RecvSlic","md5":"2d60d115f1f2bf3ff2ca6a351b72bbd7","len":262152},"recovery110":{"type":"PAR 2.0\u0000RecvSlic","md5":"cf6b7a3892e01ca2909ccd1462bd4c20","len":262152},"recovery111":{"type":"PAR 2.0\u0000RecvSlic","md5":"5c6decd7cec310b857010c848077aef9","len":262152},"recovery112":{"type":"PAR 2.0\u0000RecvSlic","md5":"a43027f82a38b9c78e5fc3c2f00731f6","len":262152},"recovery113":{"type":"PAR 2.0\u0000RecvSlic","md5":"91c953ee943d0d89ca50ee73053305dc","len":262152},"recovery114":{"type":"PAR 2.0\u0000RecvSlic","md5":"1b4e8c8065d7c190d54f51cdde021058","len":262152},"recovery115":{"type":"PAR 2.0\u0000RecvSlic","md5":"8d6675acb4ebd061ac9598fd01dae7b4","len":262152},"recovery116":{"type":"PAR 2.0\u0000RecvSlic","md5":"78524ff0643436fd95d771664da8b12f","len":262152},"recovery117":{"type":"PAR 2.0\u0000RecvSlic","md5":"9f688409071efcdb9ea196eec65be8b3","len":262152},"recovery118":{"type":"PAR 
2.0\u0000RecvSlic","md5":"a69818a17a03aa13375cc10a07f616d2","len":262152},"recovery119":{"type":"PAR 2.0\u0000RecvSlic","md5":"30f3355ccf75fc559138cc706c9c04d5","len":262152},"recovery120":{"type":"PAR 2.0\u0000RecvSlic","md5":"4dfcbcb0851fb544530c4c4099fd053d","len":262152},"recovery121":{"type":"PAR 2.0\u0000RecvSlic","md5":"c7a364ecbceb8c9491dffe835e09f7bb","len":262152},"recovery122":{"type":"PAR 2.0\u0000RecvSlic","md5":"abaa58c8e90a572bf76e49a7e4845646","len":262152},"recovery123":{"type":"PAR 2.0\u0000RecvSlic","md5":"d3e9332163a0bec102eec6bb74569474","len":262152},"recovery124":{"type":"PAR 2.0\u0000RecvSlic","md5":"48a5bec73284a3e683cb1e88f6b4dff1","len":262152},"recovery125":{"type":"PAR 2.0\u0000RecvSlic","md5":"b751ff190f59c1d807d088ca5f9d544e","len":262152},"recovery126":{"type":"PAR 2.0\u0000RecvSlic","md5":"efb5922f8842d49d836e54b3c230f47d","len":262152},"recovery127":{"type":"PAR 2.0\u0000RecvSlic","md5":"9a6a008bd479eba9f3bf9ac723f9d3a6","len":262152},"recovery128":{"type":"PAR 2.0\u0000RecvSlic","md5":"872f97d2b9da3c002a036e9142c7f481","len":262152},"recovery129":{"type":"PAR 2.0\u0000RecvSlic","md5":"e005e9f222f3061cfc5c79cfccb2799a","len":262152},"recovery130":{"type":"PAR 2.0\u0000RecvSlic","md5":"aeb4683c1a61c30756d52c15b4a25a19","len":262152},"recovery131":{"type":"PAR 2.0\u0000RecvSlic","md5":"72aa837c40a87a4ce9c2db41b36d17f8","len":262152},"recovery132":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac87c0cecb67c4ef0befe5c7193d71d2","len":262152},"recovery133":{"type":"PAR 2.0\u0000RecvSlic","md5":"fcaad629446fdf03a295e2fc729fa956","len":262152},"recovery134":{"type":"PAR 2.0\u0000RecvSlic","md5":"1b2dbd073bf9a16c19d980ccb3c3cf36","len":262152},"recovery135":{"type":"PAR 2.0\u0000RecvSlic","md5":"e1075aef3398e5021a8acbe3196e019d","len":262152},"recovery136":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef67403febd7eb60a1220317bad008a8","len":262152},"recovery137":{"type":"PAR 
2.0\u0000RecvSlic","md5":"9541bc64807293499de6abbd1da9d633","len":262152},"recovery138":{"type":"PAR 2.0\u0000RecvSlic","md5":"ca40c2c9aed94b57fa53fab122abc95c","len":262152},"recovery139":{"type":"PAR 2.0\u0000RecvSlic","md5":"4ef6a42a467cdea12815b5c150a5f743","len":262152},"recovery140":{"type":"PAR 2.0\u0000RecvSlic","md5":"e0f5ce4dc6e0cadc5f3dddca5b93c005","len":262152},"recovery141":{"type":"PAR 2.0\u0000RecvSlic","md5":"8b5c43d4b1a0c46990f3f333a0898054","len":262152},"recovery142":{"type":"PAR 2.0\u0000RecvSlic","md5":"d5eed530038d77dac6eab507dcf47fa2","len":262152},"recovery143":{"type":"PAR 2.0\u0000RecvSlic","md5":"529c34fd816ede1899e40f536e0c8203","len":262152},"recovery144":{"type":"PAR 2.0\u0000RecvSlic","md5":"8a6cb458f1f508547f2a9184855c79dc","len":262152},"recovery145":{"type":"PAR 2.0\u0000RecvSlic","md5":"7be5eac2be255b4adcdfcc3d58b622bf","len":262152},"recovery146":{"type":"PAR 2.0\u0000RecvSlic","md5":"f8799c463c2efda9dbdb4ddc9bddcf54","len":262152},"recovery147":{"type":"PAR 2.0\u0000RecvSlic","md5":"80140c6dddb23f99f49a231f0f710b64","len":262152},"recovery148":{"type":"PAR 2.0\u0000RecvSlic","md5":"19bc507b8ec8c8260ea526036b381547","len":262152},"recovery149":{"type":"PAR 2.0\u0000RecvSlic","md5":"6214b5e6747bceb3b135b1ab9b91c6bc","len":262152},"recovery150":{"type":"PAR 2.0\u0000RecvSlic","md5":"e63a5aaa79fb2c561c3e89983fe217b7","len":262152},"recovery151":{"type":"PAR 2.0\u0000RecvSlic","md5":"3d50287fe67bbab640739a3b9f85f978","len":262152},"recovery152":{"type":"PAR 2.0\u0000RecvSlic","md5":"7a9be033a85fafaa8122980e4ddb4947","len":262152},"recovery153":{"type":"PAR 2.0\u0000RecvSlic","md5":"a4a4267103497dc3e7061219c0342a71","len":262152},"recovery154":{"type":"PAR 2.0\u0000RecvSlic","md5":"ae8fe2ccaf48ddf415d154cb74f27961","len":262152},"recovery155":{"type":"PAR 2.0\u0000RecvSlic","md5":"29ee8f8377b984f3d91c787092777399","len":262152},"recovery156":{"type":"PAR 
2.0\u0000RecvSlic","md5":"98d05ea372a553c0dfa1f15fb91860b1","len":262152},"recovery157":{"type":"PAR 2.0\u0000RecvSlic","md5":"96189de63a41a05d863fa42f3badf79b","len":262152},"recovery158":{"type":"PAR 2.0\u0000RecvSlic","md5":"df1f2f861783d35897b02b43afe62db3","len":262152},"recovery159":{"type":"PAR 2.0\u0000RecvSlic","md5":"1e97fd5e5fc588e3424bccaba08ff3a9","len":262152},"recovery160":{"type":"PAR 2.0\u0000RecvSlic","md5":"894e9bae976951d893f88faf3b539be3","len":262152},"recovery161":{"type":"PAR 2.0\u0000RecvSlic","md5":"a6debea18ff12b7c152cdf8539a8b84c","len":262152},"recovery162":{"type":"PAR 2.0\u0000RecvSlic","md5":"446cb123074c7435ed1a883ffb578e62","len":262152},"recovery163":{"type":"PAR 2.0\u0000RecvSlic","md5":"af617de168be8b15a2e30aa5b11a630d","len":262152},"recovery164":{"type":"PAR 2.0\u0000RecvSlic","md5":"caeaeda3c01970907014b4cb6be2e0e9","len":262152},"recovery165":{"type":"PAR 2.0\u0000RecvSlic","md5":"b72d7c84ee2f6480e66c5451c305361f","len":262152},"recovery166":{"type":"PAR 2.0\u0000RecvSlic","md5":"bddcb7c296fa98f0f93c6233b5971cab","len":262152},"recovery167":{"type":"PAR 2.0\u0000RecvSlic","md5":"c5c6024a222934cde4a08ad92cfb7c4e","len":262152},"recovery168":{"type":"PAR 2.0\u0000RecvSlic","md5":"ea2a639a62ca0356d05eb81c5f1e51e3","len":262152},"recovery169":{"type":"PAR 2.0\u0000RecvSlic","md5":"cb81b5ea915c74803315125ef6b35082","len":262152},"recovery170":{"type":"PAR 2.0\u0000RecvSlic","md5":"4f860ad42df81289117e7599b54c7eb7","len":262152},"recovery171":{"type":"PAR 2.0\u0000RecvSlic","md5":"a28214640f8b91fb23b8ac7c31781808","len":262152},"recovery172":{"type":"PAR 2.0\u0000RecvSlic","md5":"0ce2938fd11eb52f82458e81375c5a87","len":262152},"recovery173":{"type":"PAR 2.0\u0000RecvSlic","md5":"9aafee82dd491549c94c1d094898dd4a","len":262152},"recovery174":{"type":"PAR 2.0\u0000RecvSlic","md5":"12852b57d924c0caf9f1a0380431434b","len":262152},"recovery175":{"type":"PAR 
2.0\u0000RecvSlic","md5":"39e2197c88713c38df03b7ed1bcdf3f7","len":262152},"recovery176":{"type":"PAR 2.0\u0000RecvSlic","md5":"c61db631d2776f27c18fc2afe3a5da22","len":262152},"recovery177":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e92440d523b464790094b6edf7d9ebf","len":262152},"recovery178":{"type":"PAR 2.0\u0000RecvSlic","md5":"dd7203dab76dcf651a159fffea1da80f","len":262152},"recovery179":{"type":"PAR 2.0\u0000RecvSlic","md5":"2c3ddd4864fbc7f9102326252c5155d4","len":262152},"recovery180":{"type":"PAR 2.0\u0000RecvSlic","md5":"a586d5f39701672f9c030fd24c8fcb76","len":262152},"recovery181":{"type":"PAR 2.0\u0000RecvSlic","md5":"9904b5bdaaf4a4b9592bd00f981dbb3b","len":262152},"recovery182":{"type":"PAR 2.0\u0000RecvSlic","md5":"8ffb4c94544b1b24582e29539ad3005a","len":262152},"recovery183":{"type":"PAR 2.0\u0000RecvSlic","md5":"7287c960ac6b695db428449fbca96281","len":262152},"recovery184":{"type":"PAR 2.0\u0000RecvSlic","md5":"883a3c3f3d0f9df49abf107c3847af28","len":262152},"recovery185":{"type":"PAR 2.0\u0000RecvSlic","md5":"93e9c487429c1befc4bbd9ab268593da","len":262152},"recovery186":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ccbaa2bb58dc2a99f9975526ef8bc80","len":262152},"recovery187":{"type":"PAR 2.0\u0000RecvSlic","md5":"f745e05a999b05e38c55e4376581c488","len":262152},"recovery188":{"type":"PAR 2.0\u0000RecvSlic","md5":"bdc31d49a5d632f5a63f254bfa941581","len":262152},"recovery189":{"type":"PAR 2.0\u0000RecvSlic","md5":"7521bc07a1963e4e752c9b9762cf1886","len":262152},"recovery190":{"type":"PAR 2.0\u0000RecvSlic","md5":"0273545e7fd5bec03732379f53affe0c","len":262152},"recovery191":{"type":"PAR 2.0\u0000RecvSlic","md5":"4b0135fd6e027d865fb67b993c8b25c0","len":262152},"recovery192":{"type":"PAR 2.0\u0000RecvSlic","md5":"9cf0b02b61ff390c9075abc5b0b07940","len":262152},"recovery193":{"type":"PAR 2.0\u0000RecvSlic","md5":"c993b05b5f88490514cf00a7379c027c","len":262152},"recovery194":{"type":"PAR 
2.0\u0000RecvSlic","md5":"0854447904b8ba72f87e4bca81ba7c36","len":262152},"recovery195":{"type":"PAR 2.0\u0000RecvSlic","md5":"41b20d0d135d0d9e1e69f075d7415511","len":262152},"recovery196":{"type":"PAR 2.0\u0000RecvSlic","md5":"9b36c343aeef12aa5aebef859afb7939","len":262152},"recovery197":{"type":"PAR 2.0\u0000RecvSlic","md5":"566559dbe69de697c566139016380d23","len":262152},"recovery198":{"type":"PAR 2.0\u0000RecvSlic","md5":"37e47ff9f55e0cc4f4c2a219a92b5ca2","len":262152},"recovery199":{"type":"PAR 2.0\u0000RecvSlic","md5":"2395c640cf1ea0a5a607c8b191045299","len":262152},"creator":{"type":"PAR 2.0\u0000Creator","md5":"89ad6e4b397f77c2695f641c56d1b889","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"89c5482dbd2f87c50e63dd1d6fe57acb","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"52fb5aaf6b778813e2457d51678ebe56","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cfcc0ca48bb20a25e91d1b117b1ab518","len":5220},"creator":{"type":"PAR 2.0\u0000Creator","md5":"89ad6e4b397f77c2695f641c56d1b889","len":104}}],"1":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"2ac05d18f3edb8fbf441a9f3a6e4020f","len":65608},"main":{"type":"PAR 2.0\u0000Main","md5":"588dc7bd3ba772f311d29a7969658d81","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"7615919203c5f0ba8accba4877e1aa55","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"762e98460db2f630a54c36d8e3a15807","len":20560},"creator":{"type":"PAR 2.0\u0000Creator","md5":"8e109e9880f96796975db1e8977b2890","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"588dc7bd3ba772f311d29a7969658d81","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"7615919203c5f0ba8accba4877e1aa55","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"762e98460db2f630a54c36d8e3a15807","len":20560},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"8e109e9880f96796975db1e8977b2890","len":104}}],"2":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"c4e872922149a549d977d81640ceab55","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"4c1bdd66e1fc7f4e74160196fefb3e02","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e0835cf9781a1c2f70fa240cb1abf172","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cbba72beebff6608daac2e379c2acdf2","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"e5bacb1af1948e08af87b41e4f067ff7","len":104}},{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"b85a4b084b9d52d116f333a1d5b19e19","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"4c1bdd66e1fc7f4e74160196fefb3e02","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e0835cf9781a1c2f70fa240cb1abf172","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cbba72beebff6608daac2e379c2acdf2","len":1360},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"59adbd88d4c1bc4319b425a546cf89d8","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"e5bacb1af1948e08af87b41e4f067ff7","len":104}},{"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"1814cf6eb1e1ace8332be23cd15eefd1","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"4c1bdd66e1fc7f4e74160196fefb3e02","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e0835cf9781a1c2f70fa240cb1abf172","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"9ffcbdd906b9ff145708f777748d03b5","len":1048644},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cbba72beebff6608daac2e379c2acdf2","len":1360},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"dd8421495e090fd5c060556086b84c32","len":1048644},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"76ec256a0f5572fcd0fc51ad0276877f","len":1048644},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"e5bacb1af1948e08af87b41e4f067ff7","len":104}},{"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"4d1bb22d29b6cb751b1fa24f5a815fb1","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"4c1bdd66e1fc7f4e74160196fefb3e02","len":92},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"ea7f112eb738f044695834348c2e0cfa","len":1048644},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e0835cf9781a1c2f70fa240cb1abf172","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cbba72beebff6608daac2e379c2acdf2","len":1360},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"ea31249d845a9a3303e2a0d706a9dff0","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"399718d48d5c703f63d4f8c3d61fd6c7","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"f507373d364c93b47de1f01eb44d3d61","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"a834bfee7edcedd0328f0b89ed2332b8","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"12f17a64a1486c7b076ae9053a07457a","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"41459752a93ac1f4f6686d4fb1db904d","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"e5bacb1af1948e08af87b41e4f067ff7","len":104}},{"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"b977f212e7d954a4ff9e679fa3240a50","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"4c1bdd66e1fc7f4e74160196fefb3e02","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e0835cf9781a1c2f70fa240cb1abf172","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cbba72beebff6608daac2e379c2acdf2","len":1360},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"742f7601e141c6853d6ac6c7f7b9d0ab","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"e5bacb1af1948e08af87b41e4f067ff7","len":104}},{"main":{"type":"PAR 
2.0\u0000Main","md5":"4c1bdd66e1fc7f4e74160196fefb3e02","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e0835cf9781a1c2f70fa240cb1abf172","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cbba72beebff6608daac2e379c2acdf2","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"e5bacb1af1948e08af87b41e4f067ff7","len":104}}],"3":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"5ec1b43947a219587684fbf415f9b610","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"fee51ffcec529f55044d9012eec50f04","len":124},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"f6a13ffd2f75ce6c75ac4028baaf39ca","len":1048644},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"a0c91ed55a96152ad69b4392d466fe5d","len":132},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"70205e891ea5cf723375e8b710a10678","len":1048644},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"3d1587e90e9ef06582e1ab54f2c4e730","len":132},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"2436f22b6d754e6ea2c20fefa2600cc5","len":1048644},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"77aa638c222a72c7910c80366ba93fa4","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"bc1f6c6d31a8e0ac1da75822406a8183","len":1048644},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"9740a7da4885f568a5ecb92f6c40b542","len":1360},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"2c9f5e668b354ba002563567c415410a","len":1048644},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"dae559f4ad022bce1bd7e565562d27fb","len":100},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"4c36084e21c5aa8043a9af0bbe2f40b1","len":1048644},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"a33898e46cecf57b80666d9a75e2ca98","len":100},"recovery7":{"type":"PAR 
2.0\u0000RecvSlic","md5":"0e0a6b21b13b40d970bbf3d68cef8749","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"923493bb8645068ce59031656a6a81be","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"24f3affd0a3c433979e520fe164af3a2","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"2007e274e35d6592233a68a944ed564d","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"4dad701e5314dbf13b703a0f2bf1b462","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"0a7398c38a1927b36ec9fc4974dc0c6a","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee398a2258c8e389ba78b782cb0bcc67","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"b0f51d8bdeb8d22405eb473efae46343","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"b276b7929c15f1ae6cf6d40e8376d4e1","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"075682015f58a65d763dbc9f0ee513e2","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"ce527a46c4740a6061dfdc4341b75ef8","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"6cae9d1f1b7a5fe33e797d2f06a7fbf9","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"518201534e884cd9aca3371613be161f","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"468de55acbe2330f7d410c208c9d74c7","len":1048644},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"11c5e20152902ec61007121531fcd640","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"d98a1d2fadc180e15253cb38e1a63cb8","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"23d0dabf638bbd3296eff194897c5d7c","len":1048644},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"fdaca43a4141a1d49ba22108b8811f31","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"57851eaa8bca48269261f4b6298e2e4f","len":1048644},"recovery26":{"type":"PAR 
2.0\u0000RecvSlic","md5":"b8be0336b8787be4fa86041f9d38ca6d","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"312b9e44906da959cf77f49e04f52095","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"d268da0938bdcd1210126898b124188e","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"5b66e1272d272cefd8608856755ffb19","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"b5f1a350dd4c18d3c9d8680d084abaea","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"b4e0b201fb86d09bbe9cf86903c17a82","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"a022d11b27ad0ca6beccb0e9aa1579b3","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"0de6017fd8d80d67e780eb679b9318f1","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"0e52bc14774ca31fa04edd8c5e9cd66f","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"50218f80d454d5c8e3c61698a5c500fb","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"e85705a19cca00706effad30d2e75c07","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"e3c10df5b6c5154300f69f7027b48b62","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"e7a98e885bb4f2393f8c1b560f2a56ab","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"602321ff1c2be2949f99be571fc7e6f1","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"ee0f6c5886993df8609c98d12aacc444","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"fee51ffcec529f55044d9012eec50f04","len":124},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"a0c91ed55a96152ad69b4392d466fe5d","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"3d1587e90e9ef06582e1ab54f2c4e730","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"77aa638c222a72c7910c80366ba93fa4","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 
2.0\u0000IFSC","md5":"9740a7da4885f568a5ecb92f6c40b542","len":1360},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"dae559f4ad022bce1bd7e565562d27fb","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"a33898e46cecf57b80666d9a75e2ca98","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"ee0f6c5886993df8609c98d12aacc444","len":104}}],"4":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"1f43fb52f1e4d733c3d2791ff276d1c8","len":2097304},"main":{"type":"PAR 2.0\u0000Main","md5":"15e29c28b0c942b71bf8d72c41f417e4","len":124},"desc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000FileDesc","md5":"2b695fbbe4cffbd85107376fdebe4755","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"94c27b82ae09326b1d706f0a697bf363","len":132},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ec874e384853036e7db4c8f9e5daf295","len":2097304},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000FileDesc","md5":"253b5bbe92cfb3c63dad4803d0eac1c8","len":132},"ifsc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000IFSC","md5":"bd4dedc8e22c169bb6d1e7a5e88e1c5a","len":100},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"ebf38ffca9179af5a839caed9afc70a2","len":100},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"0ae9e4a6f98a778a295426b6a0b18fce","len":2097304},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000IFSC","md5":"a3909310e9fa86d03b262e342c37eb46","len":220},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"025c93f5177f838bf02e3a62082b3cf9","len":2097304},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"8897558cd15484c5302aa09ebc257785","len":2097304},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"dcce7fba8e1775f5c94cc63f9031ea9a","len":2097304},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab2f333b9017c251b68841d2c8ae5e32","len":2097304},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"7bb60d3f320fb55987f5df188a263775","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"15e29c28b0c942b71bf8d72c41f417e4","len":124},"desc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000FileDesc","md5":"2b695fbbe4cffbd85107376fdebe4755","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"94c27b82ae09326b1d706f0a697bf363","len":132},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000FileDesc","md5":"253b5bbe92cfb3c63dad4803d0eac1c8","len":132},"ifsc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000IFSC","md5":"bd4dedc8e22c169bb6d1e7a5e88e1c5a","len":100},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"ebf38ffca9179af5a839caed9afc70a2","len":100},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000IFSC","md5":"a3909310e9fa86d03b262e342c37eb46","len":220},"creator":{"type":"PAR 2.0\u0000Creator","md5":"7bb60d3f320fb55987f5df188a263775","len":104}}],"5":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"4e9db1f3183bfee308f5b0916d01bf00","len":4194372},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"df2509789f3dc55df1987f2f92bb3fb7","len":4194372},"main":{"type":"PAR 2.0\u0000Main","md5":"dda74ad77c703c3b6bd3238d9609f4e6","len":92},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"5561f26c5da0d8aa979e4d20f219c2fc","len":4194372},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"1f4d785c8c4191bb867e53d6153f1a9b","len":4194372},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e826d9027b81b9c644a4846cc492e075","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"1e992b76a71d37375b85f658b0219484","len":4194372},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"1e2014a0ae283f00fe92273afb399c27","len":400},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"7acd330e30a81a3b06466db8d173cbbd","len":4194372},"recovery6":{"type":"PAR 
2.0\u0000RecvSlic","md5":"8c573ac66702d2f291fb274aa0807f60","len":4194372},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"e20112cc349e06f57cf69a4a4641d92d","len":4194372},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"88e4c154c27d77d92e628957cb75aa16","len":4194372},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"88f0286f334757303fd9e07740173ac5","len":4194372},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"97eea69ad75b5e84584002f9bec4f3be","len":4194372},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"a7ce99ec586dfaaec53f5c23bd6f11c7","len":4194372},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"baf2a0ffaf51aa6c6c79fb9ccbadfec4","len":4194372},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"a557d8b87d5fe7c931f0f842c6bf0442","len":4194372},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"8847c3bde0e6e789f862fb4bcb554554","len":4194372},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"cac77778f5f9468c4ac9d08c065bcbc5","len":4194372},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"178ad22283c6e76c44831fa5b25f13af","len":4194372},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"6776395d84a158b5b25fa69a7b9463fc","len":4194372},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"290f0106b684c7b0efcc4bc7b86ecd0a","len":4194372},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"9a5f230aa2dbf521be5ec054a8400452","len":4194372},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"2470711c92d349f1163b397692d7b330","len":4194372},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"8103d5f2ff25dd792bdbb177a3fe46ae","len":4194372},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"f52ffd659931f247f43ffdd0f11caf93","len":4194372},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8270285b6c635e60c4b4dee42bf05c1","len":4194372},"creator":{"type":"PAR 2.0\u0000Creator","md5":"16cbf752046052fdc30197e209799d64","len":104}},{"main":{"type":"PAR 
2.0\u0000Main","md5":"dda74ad77c703c3b6bd3238d9609f4e6","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e826d9027b81b9c644a4846cc492e075","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"1e2014a0ae283f00fe92273afb399c27","len":400},"creator":{"type":"PAR 2.0\u0000Creator","md5":"16cbf752046052fdc30197e209799d64","len":104}}],"6":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"5ec1b43947a219587684fbf415f9b610","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"fee51ffcec529f55044d9012eec50f04","len":124},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"f6a13ffd2f75ce6c75ac4028baaf39ca","len":1048644},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"a0c91ed55a96152ad69b4392d466fe5d","len":132},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"70205e891ea5cf723375e8b710a10678","len":1048644},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"3d1587e90e9ef06582e1ab54f2c4e730","len":132},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"2436f22b6d754e6ea2c20fefa2600cc5","len":1048644},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"77aa638c222a72c7910c80366ba93fa4","len":132},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"bc1f6c6d31a8e0ac1da75822406a8183","len":1048644},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"9740a7da4885f568a5ecb92f6c40b542","len":1360},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"2c9f5e668b354ba002563567c415410a","len":1048644},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"dae559f4ad022bce1bd7e565562d27fb","len":100},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"4c36084e21c5aa8043a9af0bbe2f40b1","len":1048644},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"a33898e46cecf57b80666d9a75e2ca98","len":100},"recovery7":{"type":"PAR 
2.0\u0000RecvSlic","md5":"0e0a6b21b13b40d970bbf3d68cef8749","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"923493bb8645068ce59031656a6a81be","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"24f3affd0a3c433979e520fe164af3a2","len":1048644},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"2007e274e35d6592233a68a944ed564d","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"4dad701e5314dbf13b703a0f2bf1b462","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"0a7398c38a1927b36ec9fc4974dc0c6a","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee398a2258c8e389ba78b782cb0bcc67","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"b0f51d8bdeb8d22405eb473efae46343","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"b276b7929c15f1ae6cf6d40e8376d4e1","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"075682015f58a65d763dbc9f0ee513e2","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"ce527a46c4740a6061dfdc4341b75ef8","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"6cae9d1f1b7a5fe33e797d2f06a7fbf9","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"518201534e884cd9aca3371613be161f","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"468de55acbe2330f7d410c208c9d74c7","len":1048644},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"11c5e20152902ec61007121531fcd640","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"d98a1d2fadc180e15253cb38e1a63cb8","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"23d0dabf638bbd3296eff194897c5d7c","len":1048644},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"fdaca43a4141a1d49ba22108b8811f31","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"57851eaa8bca48269261f4b6298e2e4f","len":1048644},"recovery26":{"type":"PAR 
2.0\u0000RecvSlic","md5":"b8be0336b8787be4fa86041f9d38ca6d","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"312b9e44906da959cf77f49e04f52095","len":1048644},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"d268da0938bdcd1210126898b124188e","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"5b66e1272d272cefd8608856755ffb19","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"b5f1a350dd4c18d3c9d8680d084abaea","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"b4e0b201fb86d09bbe9cf86903c17a82","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"a022d11b27ad0ca6beccb0e9aa1579b3","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"0de6017fd8d80d67e780eb679b9318f1","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"0e52bc14774ca31fa04edd8c5e9cd66f","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"50218f80d454d5c8e3c61698a5c500fb","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"e85705a19cca00706effad30d2e75c07","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"e3c10df5b6c5154300f69f7027b48b62","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"e7a98e885bb4f2393f8c1b560f2a56ab","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"602321ff1c2be2949f99be571fc7e6f1","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"ee0f6c5886993df8609c98d12aacc444","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"fee51ffcec529f55044d9012eec50f04","len":124},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"a0c91ed55a96152ad69b4392d466fe5d","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"3d1587e90e9ef06582e1ab54f2c4e730","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"77aa638c222a72c7910c80366ba93fa4","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 
2.0\u0000IFSC","md5":"9740a7da4885f568a5ecb92f6c40b542","len":1360},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"dae559f4ad022bce1bd7e565562d27fb","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"a33898e46cecf57b80666d9a75e2ca98","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"ee0f6c5886993df8609c98d12aacc444","len":104}}],"7":[{"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b005924c173f828f388d03710c8bfcf","len":12292},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"10ee04a4d28a19d6c0e88d58515e1ef1","len":12292},"main":{"type":"PAR 2.0\u0000Main","md5":"5e7183ec1456a6f15e072fed0f78b9ca","len":140},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"c6c047f94fcc5a2d6d0dbafe7e633c39","len":12292},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"a03a9d24991dfd875d3ec30272366de5","len":12292},"desc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000FileDesc","md5":"a2f96fdc0a29fc04ca17e31c20e6fc3f","len":132},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"1369390495de64f6640e5b8e7be37f40","len":12292},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"4622df73a1e90846931c16c2209768c9","len":12292},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"c4ee087ce65d3beb7eeeaa194ab7bef8","len":132},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"6434dffa3a3e653035ec03b1e371d564","len":12292},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"f5df0e311c0a7cd67535d75dee491d87","len":12292},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000FileDesc","md5":"f11c6620a21f01950727de8dad1687f6","len":132},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"efb7ce56dcc860ae904aeaeae59a2ece","len":12292},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"dff92dc77555b6085e02423f106c4fd5","len":132},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"19a8eea83a8907a69f069f455550d359","len":12292},"recovery17":{"type":"PAR 
2.0\u0000RecvSlic","md5":"1a8e498c90635320fb4c3a88a0189665","len":12292},"ifsc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000IFSC","md5":"f6aa8895439563d682fb9fe2df0e1c9b","len":200},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"4a2b82f1404d9f2c83883ef6a43e6a81","len":12292},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"d409a0d5436982476d0a3b9bf19e3c8d","len":12292},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"1ffce628da93d13e375591ff67bc654b","len":100},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"b46ab703bca1431991ca000043f84ea2","len":12292},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"21cb7408d0ba065287cd8e4053d61063","len":12292},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000IFSC","md5":"0933688bb6983474179824e77f29b250","len":22400},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"6b94f8141601721b6f240d521fc22a56","len":12292},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"6a4508896659f168b3e1fef517fb636e","len":12292},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"21252c20b9aed4d71890b7d5854ccdb2","len":100},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"05880fb4f3e515af747090399210bbed","len":12292},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"efacb826b39b78dd72e73ea4554edda1","len":12292},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"623f9c56d059a1e578f4625322980c5e","len":12292},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"aa90ff8cd4a601e4b4cf20e61b35d083","len":12292},"recovery28":{"type":"PAR 2.0\u0000RecvSlic","md5":"fb4c6a48ffe06cd2d77a61f1bee63667","len":12292},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"375cd39fdab776888186847c949f43f0","len":12292},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"52006dc0cdbd3b368237437c602aac6c","len":12292},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"e150767a0391b4fda00804f29eff96b7","len":12292},"recovery32":{"type":"PAR 
2.0\u0000RecvSlic","md5":"4509170ce60b05ed4a7acc5d585db77e","len":12292},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"0e593087e7c9fae63066b8731a1f0ee1","len":12292},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"b1a80cefac3d5afd7fe58a1b344339ae","len":12292},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"d9065479d517927ec148b2519a09b08c","len":12292},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"82b9dcd7ccd15be338307666bd270064","len":12292},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"8ac4426e3cb2cff66b57ec356202890a","len":12292},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"d652a19d4c7dcfff936c634aa089fba8","len":12292},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"5b05c818bc77bda7fa2e7f51798b977c","len":12292},"recovery40":{"type":"PAR 2.0\u0000RecvSlic","md5":"3217f0a91b66e3602c91723346a00d8f","len":12292},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"6046ec1511827d2425f3ce4cdcabd937","len":12292},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"53a6a420f0b16ada5b75efddd4ebe79c","len":12292},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"914df5e88258380e9efab604addaf6ca","len":12292},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"9a326bf24356ab8cb9585783e8344cff","len":12292},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"db20b5be20b70a83a3f8cafceaa4c330","len":12292},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"f028d797702c7bff3e83458be486d555","len":12292},"recovery47":{"type":"PAR 2.0\u0000RecvSlic","md5":"5ee1623a1d7aa875f656b6c4775bb8f9","len":12292},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"ef1528e5ed4152c56b89a82bd3828dde","len":12292},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"01b9506a11bf8eec8a0ff39231c6bd0c","len":12292},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"f3ed5f3d7303078d06e3db993b095be1","len":12292},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"85d7f8cc42b4db916247c4ae3d2131b3","len":12292},"recovery52":{"type":"PAR 
2.0\u0000RecvSlic","md5":"85a61ece2caede278f5843acb51dde24","len":12292},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"7298e77277cd28ea5d03ecb7a8bf4e4b","len":12292},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"cdaffa6e46f13946f6be00cbae471a80","len":12292},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"a3dd9157d4fca3de8e070c3c5cb9d40a","len":12292},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"432d6bd89e4ed25d63ca7d7977503d45","len":12292},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"fd5c47a8c4e7571b90d1c7f26a498226","len":12292},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"4450fd57ec66606c74bc11073f4cc585","len":12292},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"c0f77d7ff4bd61b8ab8f76db2d97d95c","len":12292},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"bea6fe996d8e248a3765402e7d4d8d8a","len":12292},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"04d07195645d6a762d2073bdcd47222b","len":12292},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"5311502407c1f6897d2d3fce2a95a64e","len":12292},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"a18929e0350f8e2fb17116074acc713d","len":12292},"recovery64":{"type":"PAR 2.0\u0000RecvSlic","md5":"ae658203da57104cd100410480b1136f","len":12292},"recovery65":{"type":"PAR 2.0\u0000RecvSlic","md5":"550aa49e56358209671fde8c7d344bd2","len":12292},"recovery66":{"type":"PAR 2.0\u0000RecvSlic","md5":"ee727c6ba693c86d5d8c8fbdcc440f6d","len":12292},"recovery67":{"type":"PAR 2.0\u0000RecvSlic","md5":"d996d134917f84d902dc9c82af8c733b","len":12292},"recovery68":{"type":"PAR 2.0\u0000RecvSlic","md5":"643ad681de2963723f4456020b065f6a","len":12292},"recovery69":{"type":"PAR 2.0\u0000RecvSlic","md5":"2fea3ec825ac290c913f438be3a01ff8","len":12292},"recovery70":{"type":"PAR 2.0\u0000RecvSlic","md5":"aefdfd2895d3e4a0fbd94988e6bfa868","len":12292},"recovery71":{"type":"PAR 2.0\u0000RecvSlic","md5":"3154e06a28cc37addd261c3c6de5a3af","len":12292},"recovery72":{"type":"PAR 
2.0\u0000RecvSlic","md5":"2a85acb9643df46c1627cc1a8b1bad59","len":12292},"recovery73":{"type":"PAR 2.0\u0000RecvSlic","md5":"20d3d32dea37223ccd90bb2d7b08c2cf","len":12292},"recovery74":{"type":"PAR 2.0\u0000RecvSlic","md5":"f2411605c2db3293b7a2d2d62494e958","len":12292},"recovery75":{"type":"PAR 2.0\u0000RecvSlic","md5":"3c890fbfb07da5b73a041d7322144348","len":12292},"recovery76":{"type":"PAR 2.0\u0000RecvSlic","md5":"852c9e4f9c86e2233998598df7ab330c","len":12292},"recovery77":{"type":"PAR 2.0\u0000RecvSlic","md5":"7737ca9add281e4f3ab131d174401010","len":12292},"recovery78":{"type":"PAR 2.0\u0000RecvSlic","md5":"7226ad8c693edc67a61f42d8ebb7a96a","len":12292},"recovery79":{"type":"PAR 2.0\u0000RecvSlic","md5":"91f51723204f5bf8544e89bd0bd30c13","len":12292},"recovery80":{"type":"PAR 2.0\u0000RecvSlic","md5":"d6a172d61a9a686a31985e1ae45e3f13","len":12292},"recovery81":{"type":"PAR 2.0\u0000RecvSlic","md5":"3454ea5ce87263b0d63af8d70af18988","len":12292},"recovery82":{"type":"PAR 2.0\u0000RecvSlic","md5":"870191c25f0f1aa43e69e5b7ebbf71ff","len":12292},"recovery83":{"type":"PAR 2.0\u0000RecvSlic","md5":"08cd7f192ee853ab5fef5d8930a06ede","len":12292},"recovery84":{"type":"PAR 2.0\u0000RecvSlic","md5":"a651ca28bf96b5ee3ffdd6e58db168f7","len":12292},"recovery85":{"type":"PAR 2.0\u0000RecvSlic","md5":"802c3adbf700cdc78b7d20ac503a3f04","len":12292},"recovery86":{"type":"PAR 2.0\u0000RecvSlic","md5":"0071f0ac02f83574e307b83a6c34c2c0","len":12292},"recovery87":{"type":"PAR 2.0\u0000RecvSlic","md5":"1a5656df4b21663954a9ef6a93895a64","len":12292},"recovery88":{"type":"PAR 2.0\u0000RecvSlic","md5":"317a089ebb628a02e47626fabb66c757","len":12292},"recovery89":{"type":"PAR 2.0\u0000RecvSlic","md5":"15c639f76ca5bf5bad1cebd95e575c41","len":12292},"recovery90":{"type":"PAR 2.0\u0000RecvSlic","md5":"682bc4e6f85516b926ac797bef70810c","len":12292},"recovery91":{"type":"PAR 2.0\u0000RecvSlic","md5":"7864ca2aa50b2bd02fad16497cf611bc","len":12292},"recovery92":{"type":"PAR 
2.0\u0000RecvSlic","md5":"a781789f8de0cf9698b353ff9f316e8d","len":12292},"recovery93":{"type":"PAR 2.0\u0000RecvSlic","md5":"c184199949b8cf8a9c7ab1edee2ed8bc","len":12292},"recovery94":{"type":"PAR 2.0\u0000RecvSlic","md5":"3808131bb9c9e8a9265bc2a4fc3e4630","len":12292},"recovery95":{"type":"PAR 2.0\u0000RecvSlic","md5":"8fd9bdcc8c75efc129366221c931486d","len":12292},"recovery96":{"type":"PAR 2.0\u0000RecvSlic","md5":"b2b69333769ed73d2725830c5f7c4fa2","len":12292},"recovery97":{"type":"PAR 2.0\u0000RecvSlic","md5":"d126a7d127450fbc98bf683e4e6a9607","len":12292},"recovery98":{"type":"PAR 2.0\u0000RecvSlic","md5":"5df32336acf571e882b0a67c0cafd8ea","len":12292},"recovery99":{"type":"PAR 2.0\u0000RecvSlic","md5":"6a1cd80fe0422faa74deb2040ad8512d","len":12292},"recovery100":{"type":"PAR 2.0\u0000RecvSlic","md5":"2552769b53f92e48a2543d1f2b3136b4","len":12292},"recovery101":{"type":"PAR 2.0\u0000RecvSlic","md5":"c6ae6ae2dd2d59a93cdc83ec8e1a62f8","len":12292},"recovery102":{"type":"PAR 2.0\u0000RecvSlic","md5":"fae1d02640a88a5b4850932f209107db","len":12292},"recovery103":{"type":"PAR 2.0\u0000RecvSlic","md5":"bef5f91e22020b07dba8aa3745868e12","len":12292},"recovery104":{"type":"PAR 2.0\u0000RecvSlic","md5":"315adc56c4394db656591b6d57ec644b","len":12292},"recovery105":{"type":"PAR 2.0\u0000RecvSlic","md5":"c3fa310ab903ce125624e0fe1e4ca288","len":12292},"recovery106":{"type":"PAR 2.0\u0000RecvSlic","md5":"5dabc0bf499aacfbd85306b0e66d835f","len":12292},"recovery107":{"type":"PAR 2.0\u0000RecvSlic","md5":"15c7ec5674b88d1e3a7738b2c4b82699","len":12292},"recovery108":{"type":"PAR 2.0\u0000RecvSlic","md5":"4ec8e38dcc549f86442d8598c7810114","len":12292},"recovery109":{"type":"PAR 2.0\u0000RecvSlic","md5":"a3c4c001b720eb41feb7fbb5c96c76c0","len":12292},"recovery110":{"type":"PAR 2.0\u0000RecvSlic","md5":"975ac885338b2b4cc1d448f75c84cddc","len":12292},"recovery111":{"type":"PAR 2.0\u0000RecvSlic","md5":"92d2c61159fdb633f95d57bc41a5930f","len":12292},"recovery112":{"type":"PAR 
2.0\u0000RecvSlic","md5":"57aa0f0b879ad7659452a0e5b1a99922","len":12292},"recovery113":{"type":"PAR 2.0\u0000RecvSlic","md5":"92eb3c634d9da1396cf3f14c49724a9f","len":12292},"recovery114":{"type":"PAR 2.0\u0000RecvSlic","md5":"5eb6738f6b97fbca8d990c125cabdbee","len":12292},"recovery115":{"type":"PAR 2.0\u0000RecvSlic","md5":"14cb0db32c4c2ab5c0b111063d7f78fd","len":12292},"recovery116":{"type":"PAR 2.0\u0000RecvSlic","md5":"1f1bfb1b31c59b76501e3c5bf0c4ac6e","len":12292},"recovery117":{"type":"PAR 2.0\u0000RecvSlic","md5":"11dce209195d56a10cf47d1fa358bd17","len":12292},"recovery118":{"type":"PAR 2.0\u0000RecvSlic","md5":"be401876f735ef36bc79be2624cf55e8","len":12292},"recovery119":{"type":"PAR 2.0\u0000RecvSlic","md5":"ac561be858d730b458a89417fc8cd472","len":12292},"creator":{"type":"PAR 2.0\u0000Creator","md5":"33e747ebfbcf20640dc2c135668fedad","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"5e7183ec1456a6f15e072fed0f78b9ca","len":140},"desc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000FileDesc","md5":"a2f96fdc0a29fc04ca17e31c20e6fc3f","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"c4ee087ce65d3beb7eeeaa194ab7bef8","len":132},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000FileDesc","md5":"f11c6620a21f01950727de8dad1687f6","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"dff92dc77555b6085e02423f106c4fd5","len":132},"ifsc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000IFSC","md5":"f6aa8895439563d682fb9fe2df0e1c9b","len":200},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"1ffce628da93d13e375591ff67bc654b","len":100},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000IFSC","md5":"0933688bb6983474179824e77f29b250","len":22400},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"21252c20b9aed4d71890b7d5854ccdb2","len":100},"creator":{"type":"PAR 
2.0\u0000Creator","md5":"33e747ebfbcf20640dc2c135668fedad","len":104}}],"8":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"8f490677c7370c81405b5dc720de5259","len":76},"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}},{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ab4b40f2f8c79be1b72fc3c297034e18","len":76},"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"e3dcbbad791c824b08deb1c570891096","len":108},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"b00de7d8184b9282d2fab59a09f4317a","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"3f2544814d4dae3cd3bc0c89602e85f9","len":132},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 
2.0\u0000IFSC","md5":"d451ff5d82856339d2b7e616128d51e5","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"5425f856110771270c05e595c89172fb","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"af1d8653ae55de4ace006367e6339891","len":104}}],"9":[{"main":{"type":"PAR 2.0\u0000Main","md5":"d2e9f5f81e8780b703db895c249cbd68","len":92},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"4a2d665a1e6879cd1b9d04025a7a5f80","len":132},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"59a78d4061c80c5b686bc85f1ae871e1","len":120},"creator":{"type":"PAR 2.0\u0000Creator","md5":"aef6213477d5a93d8932e106b971214e","len":104}}],"10":[{"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"c98eac33ebcacab03a93c097875db85f","len":16777284},"main":{"type":"PAR 2.0\u0000Main","md5":"fa4e9795952daa58ec0c79f3bb486daa","len":124},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"1c0f2b78a1f547f026c817dcb3f78cfa","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000FileDesc","md5":"292f79fd2bede637da2b4a771c3e43de","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"46018ec469e1cdd39cffa769c532d229","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"3e007505ad34d9349ff9ec322bb276b1","len":160},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"bc61781d8a0b4d7835b0b7af7bc5a0b3","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"d2aea9e97389c23fb277e75ebae5789f","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"67e37e25db5bd8526d39916e2af81195","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"fa4e9795952daa58ec0c79f3bb486daa","len":124},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"1c0f2b78a1f547f026c817dcb3f78cfa","len":132},"desc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 
2.0\u0000FileDesc","md5":"292f79fd2bede637da2b4a771c3e43de","len":132},"descb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000FileDesc","md5":"46018ec469e1cdd39cffa769c532d229","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"3e007505ad34d9349ff9ec322bb276b1","len":160},"ifsc2c1f1a96dd9188a12a539499b0e06358":{"type":"PAR 2.0\u0000IFSC","md5":"bc61781d8a0b4d7835b0b7af7bc5a0b3","len":100},"ifscb79416c29031824332921edcc94cb2d1":{"type":"PAR 2.0\u0000IFSC","md5":"d2aea9e97389c23fb277e75ebae5789f","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"67e37e25db5bd8526d39916e2af81195","len":104}}],"11":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"27492ff0236d2b9f708c20c9037bb477","len":1048644},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"ca610c37276b236032385de6cfe31ad1","len":1048644},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"681a77c9df99be565a1d80059faa7849","len":1048644},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"869049b0db80bd673dc2e216b1bb558c","len":1048644},"main":{"type":"PAR 2.0\u0000Main","md5":"2014ba94366054b731ce6b7a09be65bc","len":92},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"e63821530a0ee31edbbeb1ea7b34d6eb","len":1048644},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"40943184042218658f4b5f2c42865457","len":1048644},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"d4a6a758a335e861e4653dbd042bb0e9","len":1048644},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000FileDesc","md5":"f1590ab293fde02149689a0c428cf36a","len":132},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"ebfa4c4d38ea7ba2b730b09b05125a9f","len":1048644},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"d8acf1a902681c73f2410df6f973ff19","len":1048644},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"05e99361f97158fdfb083898b7b1fee4","len":1048644},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 
2.0\u0000IFSC","md5":"3fcb592bb781bd4f2dacafa15e64c282","len":340},"recovery10":{"type":"PAR 2.0\u0000RecvSlic","md5":"64705b161c0f222f3979de574c5292e5","len":1048644},"recovery11":{"type":"PAR 2.0\u0000RecvSlic","md5":"6661d09aa0869100e4c7ac009d732f22","len":1048644},"recovery12":{"type":"PAR 2.0\u0000RecvSlic","md5":"662fcae4ec9e8f216ed800402a86a19f","len":1048644},"recovery13":{"type":"PAR 2.0\u0000RecvSlic","md5":"1aeb37755b286c499e72f16d6b2997e3","len":1048644},"recovery14":{"type":"PAR 2.0\u0000RecvSlic","md5":"c1a04b4cb237cf7d2fcf96f65fa3707a","len":1048644},"recovery15":{"type":"PAR 2.0\u0000RecvSlic","md5":"c502f0e383a4913adf41bddc47188fa5","len":1048644},"recovery16":{"type":"PAR 2.0\u0000RecvSlic","md5":"f79753d9573717cb313ad28d49b58957","len":1048644},"recovery17":{"type":"PAR 2.0\u0000RecvSlic","md5":"6718a475be7d48d4505a816e607ac0cf","len":1048644},"recovery18":{"type":"PAR 2.0\u0000RecvSlic","md5":"b5e08b73c0a116a186200ceb5eee12e1","len":1048644},"recovery19":{"type":"PAR 2.0\u0000RecvSlic","md5":"26f49122ec0c65aaeee91f02fe3b298a","len":1048644},"recovery20":{"type":"PAR 2.0\u0000RecvSlic","md5":"f5cc151476af228e309246f1135a868c","len":1048644},"recovery21":{"type":"PAR 2.0\u0000RecvSlic","md5":"287dd184771362431860f2e79abf6b71","len":1048644},"recovery22":{"type":"PAR 2.0\u0000RecvSlic","md5":"15103320ebc4a1d287809b3f550cf466","len":1048644},"recovery23":{"type":"PAR 2.0\u0000RecvSlic","md5":"8a1b9c9e29c02f11e7efe347238dce56","len":1048644},"recovery24":{"type":"PAR 2.0\u0000RecvSlic","md5":"5bc916db93b8499680444eaa8922c71a","len":1048644},"recovery25":{"type":"PAR 2.0\u0000RecvSlic","md5":"81e31a0b3c5f6e52ebef05ac7ebf05ef","len":1048644},"recovery26":{"type":"PAR 2.0\u0000RecvSlic","md5":"894270cb9860bf959b94503998c2aa45","len":1048644},"recovery27":{"type":"PAR 2.0\u0000RecvSlic","md5":"ecd365739fa43e31fb3b8ad4a0c702aa","len":1048644},"recovery28":{"type":"PAR 
2.0\u0000RecvSlic","md5":"e9839ce30516291840c500c62c54c7f3","len":1048644},"recovery29":{"type":"PAR 2.0\u0000RecvSlic","md5":"6e779c716007d2e2e5c13206b61ed242","len":1048644},"recovery30":{"type":"PAR 2.0\u0000RecvSlic","md5":"0b7edfa25e88e2d693acbc5c4ee7e3ab","len":1048644},"recovery31":{"type":"PAR 2.0\u0000RecvSlic","md5":"1ec91f3d8dbb68e15ee288967d4bda70","len":1048644},"recovery32":{"type":"PAR 2.0\u0000RecvSlic","md5":"069d8f9fc22a257a3087528f5f0d3e4b","len":1048644},"recovery33":{"type":"PAR 2.0\u0000RecvSlic","md5":"7ff092dc292066a6f16ebff3abcddaec","len":1048644},"recovery34":{"type":"PAR 2.0\u0000RecvSlic","md5":"e8396bbe2e1c7cd808a0c19b9e44366d","len":1048644},"recovery35":{"type":"PAR 2.0\u0000RecvSlic","md5":"49c529ecd0b1fa399ae7ab878560c92c","len":1048644},"recovery36":{"type":"PAR 2.0\u0000RecvSlic","md5":"8bebdbd3d808c72bb1dff32e304797da","len":1048644},"recovery37":{"type":"PAR 2.0\u0000RecvSlic","md5":"b5ba7fc114a2f51d4ca31eab75df5967","len":1048644},"recovery38":{"type":"PAR 2.0\u0000RecvSlic","md5":"90f2adae0315250cd0b3e2e061e145eb","len":1048644},"recovery39":{"type":"PAR 2.0\u0000RecvSlic","md5":"36fec8721806963a50db398bfe5d3530","len":1048644},"recovery40":{"type":"PAR 2.0\u0000RecvSlic","md5":"f7037497d34f5aaf8c80f390c3162a34","len":1048644},"recovery41":{"type":"PAR 2.0\u0000RecvSlic","md5":"c68e28d6e559c377c2d641d465e04c74","len":1048644},"recovery42":{"type":"PAR 2.0\u0000RecvSlic","md5":"c1c7deb85b51c16ff84a3b8b0f153db2","len":1048644},"recovery43":{"type":"PAR 2.0\u0000RecvSlic","md5":"9c9437f2aaa7eefb38cb9d2d3b5250da","len":1048644},"recovery44":{"type":"PAR 2.0\u0000RecvSlic","md5":"d33f3bc5820245947f80cfbbc9a5650a","len":1048644},"recovery45":{"type":"PAR 2.0\u0000RecvSlic","md5":"f989930092b5e09335b0d1744f5cd020","len":1048644},"recovery46":{"type":"PAR 2.0\u0000RecvSlic","md5":"abcf091241c99df80cf38d263f233b98","len":1048644},"recovery47":{"type":"PAR 
2.0\u0000RecvSlic","md5":"5eb7e2ab745961fb460c6313224164db","len":1048644},"recovery48":{"type":"PAR 2.0\u0000RecvSlic","md5":"f6e1903b08958717ec3f45fb30a6ea18","len":1048644},"recovery49":{"type":"PAR 2.0\u0000RecvSlic","md5":"f7eed30e7e18813e48e9ef3072d1bf06","len":1048644},"recovery50":{"type":"PAR 2.0\u0000RecvSlic","md5":"b6f879be0f4dc7a2a69945b5078b2e2d","len":1048644},"recovery51":{"type":"PAR 2.0\u0000RecvSlic","md5":"15a4aab4c0452968a8ea76e360fab4e4","len":1048644},"recovery52":{"type":"PAR 2.0\u0000RecvSlic","md5":"6427967242a9cb6e6f87e77d9afe8c76","len":1048644},"recovery53":{"type":"PAR 2.0\u0000RecvSlic","md5":"8bdf763300c24964bc063a06c92c0311","len":1048644},"recovery54":{"type":"PAR 2.0\u0000RecvSlic","md5":"e53b551ec383f9a2e5a0869838c5cb62","len":1048644},"recovery55":{"type":"PAR 2.0\u0000RecvSlic","md5":"12f2b1472523b981fddeafa8587fb023","len":1048644},"recovery56":{"type":"PAR 2.0\u0000RecvSlic","md5":"01cae744d4718c396afd6dd2a66ea3cb","len":1048644},"recovery57":{"type":"PAR 2.0\u0000RecvSlic","md5":"7fc330e37834451f80325895fc0c05c2","len":1048644},"recovery58":{"type":"PAR 2.0\u0000RecvSlic","md5":"15000e384e5cc3035964779684a8a63d","len":1048644},"recovery59":{"type":"PAR 2.0\u0000RecvSlic","md5":"4b5584869ec45296ac2ec2ba5020dcbf","len":1048644},"recovery60":{"type":"PAR 2.0\u0000RecvSlic","md5":"21e6bd8d4d894064564b667206c4339b","len":1048644},"recovery61":{"type":"PAR 2.0\u0000RecvSlic","md5":"27b4816b7409ab074398958dc2b30d05","len":1048644},"recovery62":{"type":"PAR 2.0\u0000RecvSlic","md5":"9418ecd6483c380158ebe517c994d0f2","len":1048644},"recovery63":{"type":"PAR 2.0\u0000RecvSlic","md5":"373aeeefecdc31b6b003e673d7ce15c3","len":1048644},"creator":{"type":"PAR 2.0\u0000Creator","md5":"721763935dac2e9935321d25fad86412","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"2014ba94366054b731ce6b7a09be65bc","len":92},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 
2.0\u0000FileDesc","md5":"f1590ab293fde02149689a0c428cf36a","len":132},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000IFSC","md5":"3fcb592bb781bd4f2dacafa15e64c282","len":340},"creator":{"type":"PAR 2.0\u0000Creator","md5":"721763935dac2e9935321d25fad86412","len":104}}],"14":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"2b7b3d05808018d22a6e51d7b4541f31","len":4294967364},"main":{"type":"PAR 2.0\u0000Main","md5":"9e8e86abd4093d642d36088b69fa93c0","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"bf520a3d185331665df4cefc0d8b2099","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"6c39df8daf2eec6742268a7ba4d70a68","len":100},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"22f4dae24dcee3451e6d66f05fa8ea04","len":4294967364},"creator":{"type":"PAR 2.0\u0000Creator","md5":"7d88aad2e7641f57678a0c84af8f2ebf","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"9e8e86abd4093d642d36088b69fa93c0","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"bf520a3d185331665df4cefc0d8b2099","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"6c39df8daf2eec6742268a7ba4d70a68","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"7d88aad2e7641f57678a0c84af8f2ebf","len":104}}],"18":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"60ca3542a3b114933c184af989c66e52","len":268435528},"main":{"type":"PAR 2.0\u0000Main","md5":"afdda0f6f4c81efa247b9ca7fe8a48dd","len":92},"desc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 2.0\u0000FileDesc","md5":"15badf8997726cd74a70383d12ee7df4","len":136},"ifsc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 2.0\u0000IFSC","md5":"3e75633d27da2cc7de064a08c3bc0e89","len":260},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"c3ece8b87332cb09588c073ca9d54df8","len":268435528},"creator":{"type":"PAR 2.0\u0000Creator","md5":"0711905a8aa594280a3f4826abc74dad","len":104}},{"main":{"type":"PAR 
2.0\u0000Main","md5":"afdda0f6f4c81efa247b9ca7fe8a48dd","len":92},"desc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 2.0\u0000FileDesc","md5":"15badf8997726cd74a70383d12ee7df4","len":136},"ifsc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 2.0\u0000IFSC","md5":"3e75633d27da2cc7de064a08c3bc0e89","len":260},"creator":{"type":"PAR 2.0\u0000Creator","md5":"0711905a8aa594280a3f4826abc74dad","len":104}}],"20":[{"main":{"type":"PAR 2.0\u0000Main","md5":"4c1bdd66e1fc7f4e74160196fefb3e02","len":92},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"e0835cf9781a1c2f70fa240cb1abf172","len":132},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"cbba72beebff6608daac2e379c2acdf2","len":1360},"creator":{"type":"PAR 2.0\u0000Creator","md5":"e5bacb1af1948e08af87b41e4f067ff7","len":104}}],"21":[{"main":{"type":"PAR 2.0\u0000Main","md5":"ac75848fa5f4490a90eac71937d2ae59","len":92},"desc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000FileDesc","md5":"0251d0bfed34934dff4d8ba49f6e7fdd","len":132},"ifsc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000IFSC","md5":"fffa78966785f1d47dfe63bb2b5aaf6f","len":100},"creator":{"type":"PAR 2.0\u0000Creator","md5":"94709bacceb3c6434a9cf90e822d7792","len":104}}],"22":[{"recovery0":{"type":"PAR 2.0\u0000RecvSlic","md5":"81b7b225c9636e2e948798bad4297bc3","len":262212},"main":{"type":"PAR 2.0\u0000Main","md5":"cbff33656e8183081812de0c17dadf53","len":140},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"8eab9660d91a0b7d093c50a66b9355f9","len":132},"desc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000FileDesc","md5":"b0455e6f6d54c2789a9e80f7d33e24cf","len":132},"recovery1":{"type":"PAR 2.0\u0000RecvSlic","md5":"4afc8cfaccb54ea53674eb7130b8d975","len":262212},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000FileDesc","md5":"102ee7c2b1dd5c778b422ca005f70768","len":132},"desc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 
2.0\u0000FileDesc","md5":"4de6d7fd893ecbaea7974ff5d7fdd92c","len":136},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"7b643fb29da65e845878166ef9c16481","len":5200},"ifsc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000IFSC","md5":"1a68f1cf8dbb9bc94e967868cf528f7f","len":100},"recovery2":{"type":"PAR 2.0\u0000RecvSlic","md5":"3559df428b57c8df3f10d97519ca272e","len":262212},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000IFSC","md5":"a49dd85cdc26defa02f1c21662cabc3b","len":1120},"ifsc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 2.0\u0000IFSC","md5":"1212c757606d123ca2e279fb303034cf","len":176080},"recovery3":{"type":"PAR 2.0\u0000RecvSlic","md5":"ad1f197f813b2a966ab39e324249b936","len":262212},"recovery4":{"type":"PAR 2.0\u0000RecvSlic","md5":"43216e38f48a7d4a2f3041d6cb4edc3c","len":262212},"recovery5":{"type":"PAR 2.0\u0000RecvSlic","md5":"28d7d71e06c5fb7b049a4d21397bd017","len":262212},"recovery6":{"type":"PAR 2.0\u0000RecvSlic","md5":"421e1eb75721816b7017047ee0a5df55","len":262212},"recovery7":{"type":"PAR 2.0\u0000RecvSlic","md5":"9d855f5994aac01a34e38281f20c0e85","len":262212},"recovery8":{"type":"PAR 2.0\u0000RecvSlic","md5":"a4090de0a584011e0f2edf591989cca5","len":262212},"recovery9":{"type":"PAR 2.0\u0000RecvSlic","md5":"aeb7833d06dfded9b2c799bb7b705c0a","len":262212},"creator":{"type":"PAR 2.0\u0000Creator","md5":"c2132720a90fa23abd935f59c5ec96ab","len":104}},{"main":{"type":"PAR 2.0\u0000Main","md5":"cbff33656e8183081812de0c17dadf53","len":140},"desc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000FileDesc","md5":"8eab9660d91a0b7d093c50a66b9355f9","len":132},"desc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000FileDesc","md5":"b0455e6f6d54c2789a9e80f7d33e24cf","len":132},"desc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000FileDesc","md5":"102ee7c2b1dd5c778b422ca005f70768","len":132},"desc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 
2.0\u0000FileDesc","md5":"4de6d7fd893ecbaea7974ff5d7fdd92c","len":136},"ifsc0ece5ac9cc4bd721ae876affd5884b16":{"type":"PAR 2.0\u0000IFSC","md5":"7b643fb29da65e845878166ef9c16481","len":5200},"ifsc76d7f11d6b1621aaecd3d74b1442be16":{"type":"PAR 2.0\u0000IFSC","md5":"1a68f1cf8dbb9bc94e967868cf528f7f","len":100},"ifsc148b12e16fb3ba135f591cbad669b476":{"type":"PAR 2.0\u0000IFSC","md5":"a49dd85cdc26defa02f1c21662cabc3b","len":1120},"ifsc8d1dfea8582c9e7fe47516f0d4b7ccc6":{"type":"PAR 2.0\u0000IFSC","md5":"1212c757606d123ca2e279fb303034cf","len":176080},"creator":{"type":"PAR 2.0\u0000Creator","md5":"c2132720a90fa23abd935f59c5ec96ab","len":104}}]} \ No newline at end of file diff --git a/test/par-compare.js b/test/par-compare.js index 79c2ab72..e3b2de35 100644 --- a/test/par-compare.js +++ b/test/par-compare.js @@ -194,7 +194,7 @@ function compare_files(file1, file2) { //console.log('Packet mismatch for ' + k, file1[k], file2[k]); var err = new Error('Packet mismatch for ' + k); //err.pkts = [file1[k], file2[k]]; - console.log("Packet dump:", file1[k], file2[k]); + console.log("Packet dump (expected/actual):", file1[k], file2[k]); throw err; } } @@ -314,8 +314,6 @@ function writeRndFile(name, size) { } writeRndFile('test64m.bin', 64*1048576); writeRndFile('test2200m.bin', 2200*1048576); -if(!fastTest) - writeRndFile('test4100m.bin', 4100*1048576); // >4GB to test 32-bit overflows // we don't test 0 byte files - different implementations seem to treat it differently: // - par2cmdline: skips all 0 byte files @@ -328,6 +326,8 @@ fs.writeFileSync(tmpDir + 'test8b.bin', '01234567'); writeRndFile('test65k.bin', 65521); writeRndFile('test13m.bin', 13631477); +if(!fastTest) // ensure this is last to make input files consistent between fast/slow tests + writeRndFile('test4100m.bin', 4100*1048576); // >4GB to test 32-bit overflows var cachedResults = {}; var setCacheKeys = {}; @@ -493,8 +493,7 @@ var allTests = [ blocks: 2, singleFile: true, cacheKey: '18' - }, - + } ]; 
if(!fastTest) { allTests.push( @@ -546,7 +545,7 @@ if(!fastTest) { blocks: 2, singleFile: true, cacheKey: '19' - }, + } ); if(is64bPlatform) { allTests.push({ // recovery > 4GB in memory [https://github.com/animetosho/par2cmdline-turbo/issues/7] From 403737ced71140e123ca1ccc71f62ae08b477284 Mon Sep 17 00:00:00 2001 From: animetosho Date: Wed, 30 Aug 2023 20:48:00 +1000 Subject: [PATCH 91/91] Test workflow fixes --- .github/workflows/test-full.yml | 2 +- .github/workflows/test.yml | 3 ++- test/par-compare.js | 8 ++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-full.yml b/.github/workflows/test-full.yml index 562917d9..1a47a8f4 100644 --- a/.github/workflows/test-full.yml +++ b/.github/workflows/test-full.yml @@ -9,7 +9,7 @@ jobs: fail-fast: false matrix: include: - - version: '0.10.40' + - version: '0.10.48' flags: '' python2: true - version: '4.9.1' diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2e1c3534..548ea67d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,7 +15,8 @@ jobs: steps: - uses: ilammy/setup-nasm@v1 - uses: petarpetrovt/setup-sde@v2.1 - sdeVersion: 8.69.1 + with: + sdeVersion: 8.69.1 - uses: actions/checkout@v3 - run: | mkdir test\gf16\build diff --git a/test/par-compare.js b/test/par-compare.js index e3b2de35..0f149562 100644 --- a/test/par-compare.js +++ b/test/par-compare.js @@ -299,13 +299,17 @@ console.log('Creating random input files...'); function writeRndFile(name, size) { if(skipFileCreate && fs.existsSync(tmpDir + name)) return; var fd = fs.openSync(tmpDir + name, 'w'); - var rand = require('crypto').createCipheriv('rc4', 'my_incredibly_strong_password' + name, ''); + var rand = crypto.createCipheriv('rc4', 'my_incredibly_strong_password' + name, ''); rand.setAutoPadding(false); var nullBuf = allocBuffer(1024*16); nullBuf.fill(0); var written = 0; while(written < size) { - var b = bufferSlice.call(rand.update(nullBuf), 0, 
Math.min(1024*16, size-written)); + var b = rand.update(nullBuf); + if(b.subarray) + b = bufferSlice.call(b, 0, Math.min(1024*16, size-written)); + else // on Node v0.10.x, rand is a SlowBuffer, so calling Buffer.slice on it won't work + b = b.slice(0, Math.min(1024*16, size-written)); fsWriteSync(fd, b); written += b.length; }